Skip to content

Commit

Permalink
connectivity test: check for deleted cilium agent pod in health probe
Browse files Browse the repository at this point in the history
Currently, the health probe connectivity test fails forever if a
Cilium Agent Pod that existed when the connectivity tests started no
longer exists.

The reason is that the health probe uses the list of Cilium Agent Pods
that was fetched at the beginning of the connectivity test run.

Therefore, this commit adds a check for whether the Pod still exists. If
it does not, the health probe check fails immediately instead of retrying.

The root cause is often that the underlying K8s node has been deleted
in the meantime (since the tests were started).

Signed-off-by: Marco Hofstetter <[email protected]>
  • Loading branch information
mhofstetter authored and tklauser committed Dec 5, 2023
1 parent de4a110 commit 04f8372
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions connectivity/tests/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"strings"
"time"

k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/jsonpath"

"github.com/cilium/cilium-cli/connectivity/check"
Expand All @@ -32,33 +34,38 @@ func (s *ciliumHealth) Run(ctx context.Context, t *check.Test) {
for name, pod := range t.Context().CiliumPods() {
pod := pod
t.NewAction(s, name, &pod, nil, features.IPFamilyAny).Run(func(a *check.Action) {
runHealthProbe(ctx, t.Context(), &pod)
runHealthProbe(ctx, t, &pod)
})
}
}

func runHealthProbe(ctx context.Context, t *check.ConnectivityTest, pod *check.Pod) {
func runHealthProbe(ctx context.Context, t *check.Test, pod *check.Pod) {
cmd := []string{"cilium-health", "status", "--probe", "-o=json"}
done := ctx.Done()

// Probe health status until it passes checks or timeout is reached.
for {
retryTimer := time.After(time.Second)

if _, err := pod.K8sClient.GetPod(ctx, pod.Pod.Namespace, pod.Pod.Name, metav1.GetOptions{}); k8serrors.IsNotFound(err) {
t.Failf("cilium-health validation failed. Cilium Agent Pod %s/%s no longer exists", pod.Pod.Namespace, pod.Pod.Name)
return
}

stdout, err := pod.K8sClient.ExecInPod(ctx, pod.Pod.Namespace, pod.Pod.Name, defaults.AgentContainerName, cmd)
if err != nil {
t.Warnf("cilium-health probe failed: %q, stdout: %q, retrying...", err, stdout)
t.Context().Warnf("cilium-health probe failed: %q, stdout: %q, retrying...", err, stdout)
} else {
err = validateHealthStatus(t, pod, stdout)
err = validateHealthStatus(t.Context(), pod, stdout)
if err == nil {
return
}
t.Warnf("cilium-health validation failed: %q, retrying...", err)
t.Context().Warnf("cilium-health validation failed: %q, retrying...", err)
}
// Wait until it's time to retry or context is cancelled.
select {
case <-done:
t.Fatalf("cilium-health probe on '%s' failed: %s", pod.Name(), err)
t.Context().Fatalf("cilium-health probe on '%s' failed: %s", pod.Name(), err)
return
case <-retryTimer:
}
Expand Down

0 comments on commit 04f8372

Please sign in to comment.