diff --git a/backend-rust/Cargo.lock b/backend-rust/Cargo.lock index 78a255f3..d2929af1 100644 --- a/backend-rust/Cargo.lock +++ b/backend-rust/Cargo.lock @@ -983,7 +983,7 @@ dependencies = [ [[package]] name = "concordium-scan" -version = "0.1.17" +version = "0.1.18" dependencies = [ "anyhow", "async-graphql", diff --git a/backend-rust/Cargo.toml b/backend-rust/Cargo.toml index c67ea531..c09c3aef 100644 --- a/backend-rust/Cargo.toml +++ b/backend-rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "concordium-scan" -version = "0.1.17" +version = "0.1.18" edition = "2021" description = "CCDScan: Indexer and API for the Concordium blockchain" authors = ["Concordium "] diff --git a/backend-rust/src/bin/ccdscan-api.rs b/backend-rust/src/bin/ccdscan-api.rs index be76c010..abb17822 100644 --- a/backend-rust/src/bin/ccdscan-api.rs +++ b/backend-rust/src/bin/ccdscan-api.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use axum::{http::StatusCode, Json}; use clap::Parser; use concordium_scan::{ graphql_api, graphql_api::node_status::NodeInfoReceiver, migrations, router, @@ -161,7 +162,7 @@ async fn main() -> anyhow::Result<()> { let (subscription, subscription_listener) = graphql_api::Subscription::new(cli.database_retry_delay_secs); let (nodes_status_sender, nodes_status_receiver) = tokio::sync::watch::channel(None); - let block_receiver_health = nodes_status_receiver.clone(); + let node_status_receiver = nodes_status_receiver.clone(); let mut pgnotify_listener = { let pool = pool.clone(); let stop_signal = cancel_token.child_token(); @@ -183,10 +184,23 @@ async fn main() -> anyhow::Result<()> { info!("Server is running at {:?}", cli.listen); tokio::spawn(async move { service.serve(tcp_listener, stop_signal).await }) }; + let mut node_collector_task = { + let stop_signal = cancel_token.child_token(); + let service = graphql_api::node_status::Service::new( + nodes_status_sender, + &cli.node_collector_backend_origin, + Duration::from_secs(cli.node_collector_backend_pull_frequency_sec), + client, + cli.node_collector_connection_max_content_length, + stop_signal, + &mut registry, + ); + tokio::spawn(service.serve()) + }; let mut monitoring_task = { let state = HealthState { pool, - node_status_receiver: block_receiver_health, + node_status_receiver, }; let health_routes = axum::Router::new().route("/", axum::routing::get(health)).with_state(state); @@ -197,18 +211,6 @@ async fn main() -> anyhow::Result<()> { info!("Monitoring server is running at {:?}", cli.monitoring_listen); tokio::spawn(router::serve(registry, tcp_listener, stop_signal, health_routes)) }; - let mut node_collector_task = { - let stop_signal = cancel_token.child_token(); - let service = graphql_api::node_status::Service::new( - nodes_status_sender, - &cli.node_collector_backend_origin, - Duration::from_secs(cli.node_collector_backend_pull_frequency_sec), - client, - cli.node_collector_connection_max_content_length, - stop_signal, - ); - tokio::spawn(service.serve()) - }; // Await for signal to shutdown or any of the tasks to stop. tokio::select! { @@ -265,15 +267,25 @@ struct HealthState { /// Verifying the API service state is as expected. async fn health( axum::extract::State(state): axum::extract::State, -) -> axum::Json { +) -> (StatusCode, Json) { let node_status_connected = state.node_status_receiver.borrow().is_some(); let database_connected = migrations::ensure_compatible_schema_version(&state.pool, SUPPORTED_SCHEMA_VERSION) .await .is_ok(); - axum::Json(json!({ - "status": if node_status_connected && database_connected {"ok"} else {"error"}, - "node_status": if node_status_connected {"connected"} else {"not connected"}, - "database": if database_connected {"connected"} else {"not connected"}, - })) + + let is_healthy = node_status_connected && database_connected; + + let status_code = if is_healthy { + StatusCode::OK + } else { + StatusCode::INTERNAL_SERVER_ERROR + }; + ( + status_code, + Json(json!({ + "node_status": if node_status_connected {"connected"} else {"not connected"}, + "database_status": if database_connected {"connected"} else {"not connected"}, + })), + ) } diff --git a/backend-rust/src/bin/ccdscan-indexer.rs b/backend-rust/src/bin/ccdscan-indexer.rs index 6d5c2cc1..a84a82d4 100644 --- a/backend-rust/src/bin/ccdscan-indexer.rs +++ b/backend-rust/src/bin/ccdscan-indexer.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use axum::{http::StatusCode, Json}; use clap::Parser; use concordium_rust_sdk::v2; use concordium_scan::{ @@ -177,15 +178,17 @@ async fn main() -> anyhow::Result<()> { /// Verifying the indexer service state is as expected. async fn health( axum::extract::State(pool): axum::extract::State, -) -> axum::Json { - match migrations::ensure_latest_schema_version(&pool).await { - Ok(_) => axum::Json(json!({ - "status": "ok", - "database": "connected" - })), - Err(err) => axum::Json(json!({ - "status": "error", - "database": format!("not connected: {}", err) +) -> (StatusCode, Json) { + let database_connected = migrations::ensure_latest_schema_version(&pool).await.is_ok(); + let status_code = if database_connected { + StatusCode::OK + } else { + StatusCode::INTERNAL_SERVER_ERROR + }; + ( + status_code, + Json(json!({ + "database_status": if database_connected {"connected"} else {"not connected"}, })), - } + ) } diff --git a/backend-rust/src/graphql_api/node_status.rs b/backend-rust/src/graphql_api/node_status.rs index fb86191e..6acaf5bb 100644 --- a/backend-rust/src/graphql_api/node_status.rs +++ b/backend-rust/src/graphql_api/node_status.rs @@ -1,6 +1,7 @@ use super::{ApiError, ApiResult}; use crate::connection::connection_from_slice; use async_graphql::{connection, types, ComplexObject, Context, Enum, Object, SimpleObject}; +use prometheus_client::{metrics::counter::Counter, registry::Registry}; use reqwest::{Client, StatusCode}; use serde::{Deserialize, Serialize}; use std::{cmp::Ordering::Equal, time::Duration}; @@ -81,10 +82,11 @@ enum NodeSortDirection { } pub struct Service { - sender: Sender>>, + sender: Sender>>, node_collector_backend: NodeCollectorBackendClient, - pull_frequency: Duration, - cancellation_token: CancellationToken, + pull_frequency: Duration, + cancellation_token: CancellationToken, + failed_node_status_fetch_counter: Counter, } impl Service { @@ -95,7 +97,15 @@ impl Service { client: Client, max_content_length: u64, cancellation_token: CancellationToken, + registry: &mut Registry, ) -> Self { + let failed_node_status_fetch_counter = Counter::default(); + registry.register( + "failed_node_status_fetch_counter", + "Number of failed attempts to retrieve data from the node status collector", + failed_node_status_fetch_counter.clone(), + ); + let node_collector_backend = NodeCollectorBackendClient::new(client, origin, max_content_length); Self { @@ -103,6 +113,7 @@ impl Service { node_collector_backend, pull_frequency, cancellation_token, + failed_node_status_fetch_counter, } } @@ -121,6 +132,7 @@ impl Service { } } Err(err) => { + self.failed_node_status_fetch_counter.inc(); error!("Error querying node summary: {}", err); } }