From dfe28b9dc3c789776694446f4717ec9291c48cd4 Mon Sep 17 00:00:00 2001 From: Johnny Graettinger Date: Mon, 4 Nov 2024 18:50:49 -0600 Subject: [PATCH] gazette/runtime: use HTTP/2 keep-alive intervals HTTP/2 keep-alive sends a PING frame every interval, and fails the connection if the peer doesn't respond in time. This verifies the end-to-end health of the HTTP/2 transport and catches issues like servers which have bound sockets but aren't actively listening. Also use HTTP/2 keep-alive when connecting to local containers. We've observed that `podman` can fail in ways that leave the reactor believing it has an established connection to flow-connector-init, even though the container has failed and the network namespace has been torn down. --- crates/gazette/src/lib.rs | 7 +++---- crates/runtime/src/container.rs | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/gazette/src/lib.rs b/crates/gazette/src/lib.rs index db36bb6910..204d734f64 100644 --- a/crates/gazette/src/lib.rs +++ b/crates/gazette/src/lib.rs @@ -87,11 +87,10 @@ pub fn dial_channel(endpoint: &str) -> Result { // Note this connect_timeout accounts only for TCP connection time and // does not apply to time required for TLS or HTTP/2 transport start, // which can block indefinitely if the server is bound but not listening. - // Callers MUST implement per-RPC timeouts if that's important. - // This timeout is only a best-effort sanity check. .connect_timeout(Duration::from_secs(5)) - .keep_alive_timeout(Duration::from_secs(120)) - .keep_alive_while_idle(true) + // HTTP/2 keep-alive sends a PING frame every interval to confirm the + // health of the end-to-end HTTP/2 transport. 
+ .http2_keep_alive_interval(std::time::Duration::from_secs(5)) .tls_config( tonic::transport::ClientTlsConfig::new() .with_native_roots() diff --git a/crates/runtime/src/container.rs b/crates/runtime/src/container.rs index 7abe1277b0..ca2303175e 100644 --- a/crates/runtime/src/container.rs +++ b/crates/runtime/src/container.rs @@ -227,6 +227,7 @@ pub async fn start( let channel = tonic::transport::Endpoint::new(init_address.clone()) .expect("formatting endpoint address") .connect_timeout(std::time::Duration::from_secs(5)) + .http2_keep_alive_interval(std::time::Duration::from_secs(5)) .connect() .await .with_context(|| {