From 2faddf96bc22e1aedc1f72f929cb1a64b520db6a Mon Sep 17 00:00:00 2001 From: Jayant Krishnamurthy Date: Fri, 10 Jan 2025 14:04:29 -0800 Subject: [PATCH] fix(fortuna): Better success and failure metrics (#2241) * better success and failure metrics * comment --- apps/fortuna/Cargo.lock | 2 +- apps/fortuna/Cargo.toml | 2 +- apps/fortuna/src/keeper.rs | 90 +++++++++++++++++++++++++------------- 3 files changed, 62 insertions(+), 32 deletions(-) diff --git a/apps/fortuna/Cargo.lock b/apps/fortuna/Cargo.lock index eb343e900..d9add4027 100644 --- a/apps/fortuna/Cargo.lock +++ b/apps/fortuna/Cargo.lock @@ -1503,7 +1503,7 @@ dependencies = [ [[package]] name = "fortuna" -version = "6.8.0" +version = "6.8.1" dependencies = [ "anyhow", "axum", diff --git a/apps/fortuna/Cargo.toml b/apps/fortuna/Cargo.toml index cd63aed9f..ba0577b4e 100644 --- a/apps/fortuna/Cargo.toml +++ b/apps/fortuna/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fortuna" -version = "6.8.0" +version = "6.8.1" edition = "2021" [dependencies] diff --git a/apps/fortuna/src/keeper.rs b/apps/fortuna/src/keeper.rs index b5c9e67f2..2c47dd9fa 100644 --- a/apps/fortuna/src/keeper.rs +++ b/apps/fortuna/src/keeper.rs @@ -73,6 +73,8 @@ pub struct KeeperMetrics { pub total_gas_spent: Family>, pub requests: Family, pub requests_processed: Family, + pub requests_processed_success: Family, + pub requests_processed_failure: Family, pub requests_reprocessed: Family, pub reveals: Family, pub request_duration_ms: Family, @@ -89,6 +91,8 @@ impl Default for KeeperMetrics { total_gas_spent: Family::default(), requests: Family::default(), requests_processed: Family::default(), + requests_processed_success: Family::default(), + requests_processed_failure: Family::default(), requests_reprocessed: Family::default(), reveals: Family::default(), request_duration_ms: Family::new_with_constructor(|| { @@ -133,6 +137,18 @@ impl KeeperMetrics { keeper_metrics.requests_processed.clone(), ); + writable_registry.register( + "requests_processed_success", + "Number of requests processed successfully", + keeper_metrics.requests_processed_success.clone(), + ); + + writable_registry.register( + "requests_processed_failure", + "Number of requests processed with failure", + keeper_metrics.requests_processed_failure.clone(), + ); + writable_registry.register( "reveal", "Number of reveals", @@ -171,7 +187,7 @@ impl KeeperMetrics { writable_registry.register( "request_duration_ms", - "Time taken to process each callback request in milliseconds", + "Time taken to process each successful callback request in milliseconds", keeper_metrics.request_duration_ms.clone(), ); @@ -382,14 +398,12 @@ pub async fn process_event_with_backoff( metrics: Arc, ) { let start_time = std::time::Instant::now(); + let account_label = AccountLabel { + chain_id: chain_state.id.clone(), + address: chain_state.provider_address.to_string(), + }; - metrics - .requests - .get_or_create(&AccountLabel { - chain_id: chain_state.id.clone(), - address: chain_state.provider_address.to_string(), - }) - .inc(); + metrics.requests.get_or_create(&account_label).inc(); tracing::info!("Started processing event"); let backoff = ExponentialBackoff { max_elapsed_time: Some(Duration::from_secs(300)), // retry for 5 minutes @@ -398,7 +412,7 @@ pub async fn process_event_with_backoff( let current_multiplier = Arc::new(AtomicU64::new(DEFAULT_GAS_ESTIMATE_MULTIPLIER_PCT)); - match backoff::future::retry_notify( + let success = backoff::future::retry_notify( backoff, || async { let multiplier = current_multiplier.load(std::sync::atomic::Ordering::Relaxed); @@ -426,32 +440,48 @@ pub async fn process_event_with_backoff( ); }, ) - .await - { - Ok(()) => { - tracing::info!("Processed event",); - } - Err(e) => { - tracing::error!("Failed to process event: {:?}", e); - } - } + .await; - let duration_ms = start_time.elapsed().as_millis() as f64; - metrics - .request_duration_ms - .get_or_create(&AccountLabel { - chain_id: chain_state.id.clone(), - address: chain_state.provider_address.to_string(), - }) - .observe(duration_ms); + let duration = start_time.elapsed(); metrics .requests_processed - .get_or_create(&AccountLabel { - chain_id: chain_state.id.clone(), - address: chain_state.provider_address.to_string(), - }) + .get_or_create(&account_label) .inc(); + + match success { + Ok(()) => { + tracing::info!("Processed event successfully in {:?}", duration); + + metrics + .requests_processed_success + .get_or_create(&account_label) + .inc(); + + metrics + .request_duration_ms + .get_or_create(&account_label) + .observe(duration.as_millis() as f64); + } + Err(e) => { + // In case the callback did not succeed, we double-check that the request is still on-chain. + // If the request is no longer on-chain, one of the transactions we sent likely succeeded, but + // the RPC gave us an error anyway. + let req = chain_state + .contract + .get_request(event.provider_address, event.sequence_number) + .await; + tracing::error!("Failed to process event: {:?}. Request: {:?}", e, req); + + // We only count failures for cases where we are completely certain that the callback failed. + if req.is_ok_and(|x| x.is_some()) { + metrics + .requests_processed_failure + .get_or_create(&account_label) + .inc(); + } + } + } } const TX_CONFIRMATION_TIMEOUT_SECS: u64 = 30;