From 72577c32498ffec0a2ffd7d7e72ad0da08b4b98e Mon Sep 17 00:00:00 2001 From: Marco Hofstetter Date: Mon, 4 Dec 2023 11:56:52 +0100 Subject: [PATCH] connectivity test: check for deleted cilium agent pod in health probe Currently, the health probe connectivity test fails forever if a Cilium Agent Pod that existed when starting the connectivity tests no longer exists. The reason is that the health probe uses the list of Cilium Agent pods that got fetched at the beginning of the connectivity test run. Therefore, this commit adds a check whether the Pod still exists. If not, the health probe check fails. The underlying reason is often that an underlying K8s node has been deleted in the meantime (since starting the tests). Signed-off-by: Marco Hofstetter --- connectivity/tests/health.go | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/connectivity/tests/health.go b/connectivity/tests/health.go index 87c3413ee0..17847bab6f 100644 --- a/connectivity/tests/health.go +++ b/connectivity/tests/health.go @@ -11,6 +11,8 @@ import ( "strings" "time" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/util/jsonpath" "github.com/cilium/cilium-cli/connectivity/check" @@ -32,12 +34,12 @@ func (s *ciliumHealth) Run(ctx context.Context, t *check.Test) { for name, pod := range t.Context().CiliumPods() { pod := pod t.NewAction(s, name, &pod, nil, features.IPFamilyAny).Run(func(a *check.Action) { - runHealthProbe(ctx, t.Context(), &pod) + runHealthProbe(ctx, t, &pod) }) } } -func runHealthProbe(ctx context.Context, t *check.ConnectivityTest, pod *check.Pod) { +func runHealthProbe(ctx context.Context, t *check.Test, pod *check.Pod) { cmd := []string{"cilium-health", "status", "--probe", "-o=json"} done := ctx.Done() @@ -45,20 +47,25 @@ func runHealthProbe(ctx context.Context, t *check.ConnectivityTest, pod *check.P for { retryTimer := time.After(time.Second) + if _, err := 
pod.K8sClient.GetPod(ctx, pod.Pod.Namespace, pod.Pod.Name, metav1.GetOptions{}); k8serrors.IsNotFound(err) { + t.Failf("cilium-health validation failed. Cilium Agent Pod %s/%s no longer exists", pod.Pod.Namespace, pod.Pod.Name) + return + } + stdout, err := pod.K8sClient.ExecInPod(ctx, pod.Pod.Namespace, pod.Pod.Name, defaults.AgentContainerName, cmd) if err != nil { - t.Warnf("cilium-health probe failed: %q, stdout: %q, retrying...", err, stdout) + t.Context().Warnf("cilium-health probe failed: %q, stdout: %q, retrying...", err, stdout) } else { - err = validateHealthStatus(t, pod, stdout) + err = validateHealthStatus(t.Context(), pod, stdout) if err == nil { return } - t.Warnf("cilium-health validation failed: %q, retrying...", err) + t.Context().Warnf("cilium-health validation failed: %q, retrying...", err) } // Wait until it's time to retry or context is cancelled. select { case <-done: - t.Fatalf("cilium-health probe on '%s' failed: %s", pod.Name(), err) + t.Context().Fatalf("cilium-health probe on '%s' failed: %s", pod.Name(), err) return case <-retryTimer: }