feat: nym signers monitor (#5933)

* initialise nym-signers-monitor

* creating nyxd client

* performing checks

* sending notifications on failure

* rate limitting on notifications + clippy
This commit is contained in:
Jędrzej Stuczyński
2025-09-02 09:27:09 +01:00
committed by GitHub
parent 2c4b5f168b
commit baddaaac22
13 changed files with 744 additions and 12 deletions
+31
View File
@@ -0,0 +1,31 @@
[package]
name = "nym-signers-monitor"
version = "0.1.0"
authors.workspace = true
repository.workspace = true
homepage.workspace = true
documentation.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
readme.workspace = true
[dependencies]
anyhow = { workspace = true }
clap = { workspace = true, features = ["cargo", "derive", "env", "string"] }
humantime = { workspace = true }
itertools = { workspace = true }
time = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
tracing = { workspace = true }
url = { workspace = true }
nym-bin-common = { path = "../common/bin-common", features = ["output_format", "basic_tracing"] }
nym-ecash-signer-check = { path = "../common/ecash-signer-check" }
nym-network-defaults = { path = "../common/network-defaults" }
nym-task = { path = "../common/task" }
nym-validator-client = { path = "../common/client-libs/validator-client" }
zulip-client = { path = "../common/zulip-client" }
[lints]
workspace = true
+15
View File
@@ -0,0 +1,15 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
use nym_bin_common::bin_info_owned;
use nym_bin_common::output_format::OutputFormat;
#[derive(clap::Args, Debug)]
pub(crate) struct Args {
#[arg(short, long, default_value_t = OutputFormat::default())]
output: OutputFormat,
}
pub(crate) fn execute(args: Args) {
println!("{}", args.output.format(&bin_info_owned!()))
}
+20
View File
@@ -0,0 +1,20 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
pub(crate) mod vars {
pub(crate) const ZULIP_BOT_EMAIL_ARG: &str = "ZULIP_BOT_EMAIL";
pub(crate) const ZULIP_BOT_API_KEY_ARG: &str = "ZULIP_BOT_API_KEY";
pub(crate) const ZULIP_SERVER_URL_ARG: &str = "ZULIP_SERVER_URL";
pub(crate) const ZULIP_NOTIFICATION_CHANNEL_ID_ARG: &str = "ZULIP_NOTIFICATION_CHANNEL_ID";
pub(crate) const ZULIP_NOTIFICATION_CHANNEL_TOPIC_ARG: &str =
"ZULIP_NOTIFICATION_CHANNEL_TOPIC";
pub(crate) const SIGNERS_MONITOR_CHECK_INTERVAL_ARG: &str = "SIGNERS_MONITOR_CHECK_INTERVAL";
pub(crate) const SIGNERS_MONITOR_MIN_NOTIFICATION_DELAY_ARG: &str =
"SIGNERS_MONITOR_MIN_NOTIFICATION_DELAY";
pub(crate) const KNOWN_NETWORK_NAME_ARG: &str = "KNOWN_NETWORK_NAME";
pub(crate) const NYXD_CLIENT_CONFIG_ENV_FILE_ARG: &str = "NYXD_CLIENT_CONFIG_ENV_FILE";
pub(crate) const NYXD_RPC_ENDPOINT_ARG: &str = "NYXD_RPC_ENDPOINT";
pub(crate) const NYXD_DKG_CONTRACT_ADDRESS_ARG: &str = "NYXD_DKG_CONTRACT_ADDRESS";
}
+43
View File
@@ -0,0 +1,43 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
use clap::{Parser, Subcommand};
use nym_bin_common::bin_info;
use std::sync::OnceLock;
pub(crate) mod build_info;
pub(crate) mod env;
pub(crate) mod run;
// Helper for passing LONG_VERSION to clap
fn pretty_build_info_static() -> &'static str {
static PRETTY_BUILD_INFORMATION: OnceLock<String> = OnceLock::new();
PRETTY_BUILD_INFORMATION.get_or_init(|| bin_info!().pretty_print())
}
#[derive(Parser, Debug)]
#[clap(author = "Nymtech", version, long_version = pretty_build_info_static(), about)]
pub(crate) struct Cli {
#[clap(subcommand)]
command: Commands,
}
impl Cli {
pub async fn execute(self) -> anyhow::Result<()> {
match self.command {
Commands::BuildInfo(args) => build_info::execute(args),
Commands::Run(args) => run::execute(*args).await?,
}
Ok(())
}
}
#[derive(Subcommand, Debug)]
pub(crate) enum Commands {
/// Show build information of this binary
BuildInfo(build_info::Args),
/// Start signers monitor and send notifications on any failures
Run(Box<run::Args>),
}
+167
View File
@@ -0,0 +1,167 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
use crate::cli::env::vars::*;
use crate::monitor::SignersMonitor;
use anyhow::{bail, Context};
use clap::ArgGroup;
use nym_network_defaults::{setup_env, NymNetworkDetails};
use nym_validator_client::nyxd::AccountId;
use nym_validator_client::QueryHttpRpcNyxdClient;
use std::time::Duration;
use url::Url;
#[derive(Debug, clap::Args)]
pub(crate) struct NyxdConnectionArgs {
// for well-known networks, such mainnet, we can use hardcoded values
/// Name of a well known network (such as 'mainnet') that has well-known
/// pre-configured setup values
#[clap(long, env = KNOWN_NETWORK_NAME_ARG)]
pub(crate) known_network_name: Option<String>,
/// Path pointing to an env file that configures the nyxd client.
#[clap(
short,
long,
env = NYXD_CLIENT_CONFIG_ENV_FILE_ARG
)]
pub(crate) config_env_file: Option<std::path::PathBuf>,
/// For unknown networks (or if one wishes to override defaults),
/// specify the RPC endpoint of a node from which signer information should be retrieved
#[clap(long, env = NYXD_RPC_ENDPOINT_ARG)]
pub(crate) nyxd_rpc_endpoint: Option<Url>,
/// For unknown networks, specify address of the DKG contract to pull signer information from.
#[clap(
long,
requires("nyxd_rpc_endpoint"),
env = NYXD_DKG_CONTRACT_ADDRESS_ARG
)]
pub(crate) dkg_contract_address: Option<AccountId>,
// if needed down the line (not sure why), we could define additional args
// for specifying denoms, etc.
// #[clap(long, requires("dkg_contract_address"))]
// pub(crate) mix_denom: Option<String>,
}
impl NyxdConnectionArgs {
fn get_minimal_nym_network_details(&self) -> anyhow::Result<NymNetworkDetails> {
if let Some(known_network_name) = &self.known_network_name {
match known_network_name.as_str() {
"mainnet" => return Ok(NymNetworkDetails::new_mainnet()),
other => bail!("{other} is not a known network name - please use another method of setting up chain connection"),
}
}
if let Some(config_env_file) = &self.config_env_file {
setup_env(Some(config_env_file));
return Ok(NymNetworkDetails::new_from_env());
}
// SAFETY: clap ensures at least one of the fields is set
#[allow(clippy::unwrap_used)]
let dkg_contract = self.dkg_contract_address.as_ref().unwrap();
// use mainnet's chain details (i.e. prefixes, denoms, etc)
let mainnet_chain_details = NymNetworkDetails::new_mainnet().chain_details;
Ok(NymNetworkDetails::new_empty()
.with_chain_details(mainnet_chain_details)
.with_coconut_dkg_contract(Some(dkg_contract.to_string())))
}
pub(crate) fn try_create_nyxd_client(&self) -> anyhow::Result<QueryHttpRpcNyxdClient> {
let network_details = self.get_minimal_nym_network_details()?;
let nyxd_endpoint = match &self.nyxd_rpc_endpoint {
Some(nyxd_rpc_endpoint) => nyxd_rpc_endpoint.clone(),
None => network_details
.endpoints
.first()
.context("no nyxd endpoints provided")?
.nyxd_url
.parse()?,
};
Ok(QueryHttpRpcNyxdClient::connect_with_network_details(
nyxd_endpoint.as_str(),
network_details,
)?)
}
}
#[derive(clap::Args, Debug)]
#[command(group(
ArgGroup::new("nyxd_connection")
.multiple(true)
.required(true)
.args([
"known_network_name",
"config_env_file",
"nyxd_rpc_endpoint"
])
))]
pub(crate) struct Args {
/// Specify email address for the bot responsible for sending notifications to the zulip server
/// in case 'upgrade' mode is detected
#[clap(
long,
env = ZULIP_BOT_EMAIL_ARG
)]
pub(crate) zulip_bot_email: String,
/// Specify the API key for the bot responsible for sending notifications to the zulip server
/// in case 'upgrade' mode is detected
#[clap(
long,
env = ZULIP_BOT_API_KEY_ARG
)]
pub(crate) zulip_bot_api_key: String,
/// Specify the sever endpoint for the bot responsible for sending notifications
/// in case 'upgrade' mode is detected
#[clap(
long,
env = ZULIP_SERVER_URL_ARG
)]
pub(crate) zulip_server_url: Url,
/// Specify the channel id for where the notification is going to be sent
#[clap(
long,
env = ZULIP_NOTIFICATION_CHANNEL_ID_ARG
)]
pub(crate) zulip_notification_channel_id: u32,
/// Optionally specify the channel topic for where the notification is going to be sent
#[clap(
long,
env = ZULIP_NOTIFICATION_CHANNEL_TOPIC_ARG
)]
pub(crate) zulip_notification_topic: Option<String>,
/// Specify the delay between subsequent signers checks
#[clap(
long,
env = SIGNERS_MONITOR_CHECK_INTERVAL_ARG,
value_parser = humantime::parse_duration,
default_value = "15m"
)]
pub(crate) signers_check_interval: Duration,
/// Specify the minimum delay between two subsequent notifications
#[clap(
long,
env = SIGNERS_MONITOR_MIN_NOTIFICATION_DELAY_ARG,
value_parser = humantime::parse_duration,
default_value = "1h"
)]
pub(crate) minimum_notification_delay: Duration,
#[clap(flatten)]
pub(crate) nyxd_connection: NyxdConnectionArgs,
}
pub(crate) async fn execute(args: Args) -> anyhow::Result<()> {
SignersMonitor::new(args)?.run().await
}
+24
View File
@@ -0,0 +1,24 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
use crate::cli::Cli;
use clap::Parser;
use nym_bin_common::bin_info_owned;
use nym_bin_common::logging::setup_tracing_logger;
use tracing::{info, trace};
mod cli;
mod monitor;
pub(crate) mod test_result;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
setup_tracing_logger();
let cli = Cli::parse();
trace!("args: {cli:#?}");
let bin_info = bin_info_owned!();
info!("using the following version: {bin_info}");
cli.execute().await
}
+223
View File
@@ -0,0 +1,223 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
use crate::cli::run;
use crate::test_result::{DisplayableSignerResult, Summary, TestResult};
use nym_ecash_signer_check::check_signers_with_client;
use nym_task::ShutdownManager;
use nym_validator_client::QueryHttpRpcNyxdClient;
use std::time::Duration;
use time::OffsetDateTime;
use tokio::time::interval;
use tracing::{error, info};
const LOST_QUORUM_MSG: &str = r#"
# 🔥🔥🔥 LOST SIGNING QUORUM 🔥🔥🔥
We seem to have lost the signing quorum - check if we should enable the 'upgrade' mode!
"#;
const UNKNOWN_QUORUM_MSG: &str = r#"
# ❓❓❓ UNKNOWN SIGNING QUORUM ❓❓❓
We can't determine the signing quroum - if we're not undergoing DKG exchange check if we should enable the 'upgrade' mode!
"#;
pub(crate) struct SignersMonitor {
zulip_client: zulip_client::Client,
nyxd_client: QueryHttpRpcNyxdClient,
notification_channel_id: u32,
notification_topic: Option<String>,
check_interval: Duration,
min_notification_delay: Duration,
last_notification_sent: Option<OffsetDateTime>,
}
impl SignersMonitor {
pub(crate) fn new(args: run::Args) -> anyhow::Result<Self> {
let zulip_client = zulip_client::Client::builder(
args.zulip_bot_email,
args.zulip_bot_api_key,
args.zulip_server_url,
)?
.build()?;
let nyxd_client = args.nyxd_connection.try_create_nyxd_client()?;
Ok(SignersMonitor {
zulip_client,
nyxd_client,
notification_channel_id: args.zulip_notification_channel_id,
notification_topic: args.zulip_notification_topic,
check_interval: args.signers_check_interval,
min_notification_delay: args.minimum_notification_delay,
last_notification_sent: None,
})
}
async fn check_signers(&mut self) -> anyhow::Result<TestResult> {
info!("starting signer check...");
let check_result = check_signers_with_client(&self.nyxd_client).await?;
let mut unreachable_signers = 0;
let mut unknown_local_chain_status = 0;
let mut stalled_local_chain = 0;
let mut working_local_chain = 0;
let mut unknown_credential_issuance_status = 0;
let mut working_credential_issuance = 0;
let mut unavailable_credential_issuance = 0;
let mut fully_working = 0;
let mut signers = Vec::new();
for result in &check_result.results {
if result.signer_unreachable() {
unreachable_signers += 1;
}
if result.unknown_chain_status() {
unknown_local_chain_status += 1;
}
if result.chain_available() {
working_local_chain += 1;
}
if result.chain_provably_stalled() || result.chain_unprovably_stalled() {
stalled_local_chain += 1;
}
if result.unknown_signing_status() {
unknown_credential_issuance_status += 1;
}
if result.signing_available() {
working_credential_issuance += 1;
}
if result.signing_provably_unavailable() || result.signing_unprovably_unavailable() {
unavailable_credential_issuance += 1;
}
let signing_available = if result.unknown_signing_status() {
None
} else {
Some(result.signing_available())
};
let chain_not_stalled = if result.unknown_chain_status() {
None
} else {
Some(result.chain_available())
};
if (result.chain_available()) && (result.signing_available()) {
fully_working += 1;
}
signers.push(DisplayableSignerResult {
version: result
.try_get_test_result()
.map(|r| r.reported_version.clone()),
url: result.information.announce_address.clone(),
signing_available,
chain_not_stalled,
})
}
let signing_quorum_available = check_result.threshold.map(|threshold| {
(working_local_chain as u64) >= threshold
&& (working_credential_issuance as u64) >= threshold
});
signers.sort_by_key(|s| s.version.clone());
let summary = Summary {
signing_quorum_available,
fully_working,
unreachable_signers,
registered_signers: check_result.results.len(),
unknown_local_chain_status,
stalled_local_chain,
working_local_chain,
unknown_credential_issuance_status,
working_credential_issuance,
unavailable_credential_issuance,
threshold: check_result.threshold,
};
Ok(TestResult { summary, signers })
}
async fn perform_signer_check(&mut self) -> anyhow::Result<()> {
let result = self.check_signers().await?;
let result_md = result.results_to_markdown_message();
let msg = if result.quorum_unavailable() {
Some(format!("{LOST_QUORUM_MSG}\n\n{result_md}",))
} else if result.quorum_unknown() {
Some(format!("{UNKNOWN_QUORUM_MSG}\n\n{result_md}",))
} else {
None
};
if let Some(msg) = msg {
self.maybe_notify_about_failure(&msg).await?
}
Ok(())
}
async fn maybe_notify_about_failure(&mut self, message: &String) -> anyhow::Result<()> {
if let Some(last_notification_sent) = self.last_notification_sent {
if last_notification_sent + self.min_notification_delay > OffsetDateTime::now_utc() {
info!("too soon to send another notification");
return Ok(());
}
}
self.send_zulip_notification(message).await?;
self.last_notification_sent = Some(OffsetDateTime::now_utc());
Ok(())
}
async fn send_zulip_notification(&self, message: &String) -> anyhow::Result<()> {
self.zulip_client
.send_channel_message((
self.notification_channel_id,
message,
&self.notification_topic,
))
.await?;
Ok(())
}
async fn send_shutdown_notification(&self) -> anyhow::Result<()> {
println!("here be sending shutdown notification");
Ok(())
}
pub(crate) async fn run(&mut self) -> anyhow::Result<()> {
let shutdown_manager =
ShutdownManager::new("nym-signers-monitor").with_default_shutdown_signals()?;
let mut check_interval = interval(self.check_interval);
while !shutdown_manager.root_token.is_cancelled() {
tokio::select! {
biased;
_ = shutdown_manager.root_token.cancelled() => {
info!("received shutdown");
break;
}
_ = check_interval.tick() => {
if let Err(err) = self.perform_signer_check().await {
error!("failed to check signers: {err}");
}
}
}
}
shutdown_manager.close();
shutdown_manager.run_until_shutdown().await;
if let Err(err) = self.send_shutdown_notification().await {
error!("failed to send shutdown notification: {err}");
}
Ok(())
}
}
+115
View File
@@ -0,0 +1,115 @@
// Copyright 2025 - Nym Technologies SA <contact@nymtech.net>
// SPDX-License-Identifier: GPL-3.0-only
use itertools::Itertools;
fn maybe_bool_to_emoji_string(maybe_bool: Option<bool>) -> String {
match maybe_bool {
None => "⚠️ unknown".into(),
Some(true) => "✅ yes".into(),
Some(false) => "❌ no".into(),
}
}
pub(crate) struct DisplayableSignerResult {
pub(crate) url: String,
pub(crate) version: Option<String>,
pub(crate) signing_available: Option<bool>,
pub(crate) chain_not_stalled: Option<bool>,
}
impl DisplayableSignerResult {
fn to_markdown_table_row(&self) -> String {
format!(
"| {} | {} | {} | {} |",
self.url,
self.version.as_deref().unwrap_or("unknown"),
maybe_bool_to_emoji_string(self.signing_available),
maybe_bool_to_emoji_string(self.chain_not_stalled)
)
}
}
pub(crate) struct TestResult {
pub(crate) summary: Summary,
pub(crate) signers: Vec<DisplayableSignerResult>,
}
impl TestResult {
pub(crate) fn quorum_unavailable(&self) -> bool {
self.summary.signing_quorum_available.unwrap_or(false)
}
pub(crate) fn quorum_unknown(&self) -> bool {
self.summary.signing_quorum_available.is_none()
}
pub(crate) fn results_to_markdown_message(&self) -> String {
let p_available = format!(
"{:.2}",
(self.summary.fully_working as f32 / self.summary.registered_signers as f32) * 100.
);
format!(
r#"
## Summary
- quorum available: {} ({p_available}% of signers fully available)
- signers fully working: {}
- signing threshold: {}
- registered signers: {}
- unreachable signers: {}
### Chain Status
- unknown status: {}
- working chain: {}
- stalled chain: {}
### Credential Issuance Status
(note: signers below 1.1.64 do not return fully reliable results)
- unknown status: {}
- working issuance: {}
- unavailable issuance: {}
## Detailed Results
| address | version | chain working | issuance (maybe) available |
| - | - | - | - |
{}
"#,
maybe_bool_to_emoji_string(self.summary.signing_quorum_available),
self.summary.fully_working,
self.summary
.threshold
.map(|threshold| threshold.to_string())
.unwrap_or("???".to_string()),
self.summary.registered_signers,
self.summary.unreachable_signers,
self.summary.unknown_local_chain_status,
self.summary.working_local_chain,
self.summary.stalled_local_chain,
self.summary.unknown_credential_issuance_status,
self.summary.working_credential_issuance,
self.summary.unavailable_credential_issuance,
self.signers
.iter()
.map(|r| r.to_markdown_table_row())
.join("\n")
)
}
}
pub(crate) struct Summary {
pub(crate) signing_quorum_available: Option<bool>,
pub(crate) fully_working: usize,
pub(crate) threshold: Option<u64>,
pub(crate) registered_signers: usize,
pub(crate) unreachable_signers: usize,
pub(crate) unknown_local_chain_status: usize,
pub(crate) stalled_local_chain: usize,
pub(crate) working_local_chain: usize,
pub(crate) unknown_credential_issuance_status: usize,
pub(crate) working_credential_issuance: usize,
pub(crate) unavailable_credential_issuance: usize,
}