From 9fdeb873baa873c46c0da27b6ef2ddbb99c452e7 Mon Sep 17 00:00:00 2001 From: Omer Tuchfeld Date: Thu, 25 Apr 2024 12:37:09 +0200 Subject: [PATCH] Retry on etcd too many requests error # tl;dr A fix for a rare error: ``` Error: finalizing Caused by: 0: commiting etcd cache to actual etcd 1: grpc request error: status: Unknown, message: "etcdserver: too many requests", details: [], metadata: MetadataMap { headers: {"content-type": "application/grpc"} } ``` # Background When committing our in-memory etcd representation to actual etcd, we send all delete requests concurrently (and there are many of them). # Issue Sometimes this leads to etcd returning an error that says "etcdserver: too many requests". Recert treated this as a hard error and exited as a result. # Solution Compare the error message to this exact phrasing (there doesn't seem to be a more robust error code we can check; the status just says `Unknown`) and, if it matches, retry the request. Hopefully all requests will eventually go through. 
--- src/k8s_etcd.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/k8s_etcd.rs b/src/k8s_etcd.rs index 259ec1e7..9165fefc 100644 --- a/src/k8s_etcd.rs +++ b/src/k8s_etcd.rs @@ -59,7 +59,19 @@ impl InMemoryK8sEtcd { let key = key.clone(); let etcd_client = Arc::clone(etcd_client); tokio::spawn(async move { - etcd_client.kv_client().delete(key.as_bytes(), None).await?; + loop { + let delete_response = etcd_client.kv_client().delete(key.as_bytes(), None).await; + + if is_too_many_requests_error(&delete_response) { + continue; + } + + match delete_response { + Ok(_) => break, + Err(_) => delete_response.context(format!("during etcd delete {}", key))?, + }; + } + anyhow::Ok(()) }) }) @@ -166,6 +178,16 @@ impl InMemoryK8sEtcd { } } +fn is_too_many_requests_error(delete_response: &std::prelude::v1::Result) -> bool { + match delete_response { + Ok(_) => false, + Err(err) => match err { + etcd_client::Error::GRpcStatus(status) => status.message() == "etcdserver: too many requests", + _ => false, + }, + } +} + pub(crate) async fn get_etcd_json(client: &InMemoryK8sEtcd, k8slocation: &K8sResourceLocation) -> Result> { let etcd_result = client .get(k8slocation.as_etcd_key())