Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

connectivity test: check for deleted cilium agent pod in health probe #2146

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions connectivity/tests/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"strings"
"time"

+ k8serrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/jsonpath"

"github.com/cilium/cilium-cli/connectivity/check"
Expand All @@ -32,33 +34,38 @@ func (s *ciliumHealth) Run(ctx context.Context, t *check.Test) {
for name, pod := range t.Context().CiliumPods() {
pod := pod
t.NewAction(s, name, &pod, nil, features.IPFamilyAny).Run(func(a *check.Action) {
- runHealthProbe(ctx, t.Context(), &pod)
+ runHealthProbe(ctx, t, &pod)
})
}
}

- func runHealthProbe(ctx context.Context, t *check.ConnectivityTest, pod *check.Pod) {
+ func runHealthProbe(ctx context.Context, t *check.Test, pod *check.Pod) {
cmd := []string{"cilium-health", "status", "--probe", "-o=json"}
done := ctx.Done()

// Probe health status until it passes checks or timeout is reached.
for {
retryTimer := time.After(time.Second)

+ if _, err := pod.K8sClient.GetPod(ctx, pod.Pod.Namespace, pod.Pod.Name, metav1.GetOptions{}); k8serrors.IsNotFound(err) {
+ t.Failf("cilium-health validation failed. Cilium Agent Pod %s/%s no longer exists", pod.Pod.Namespace, pod.Pod.Name)
+ return
+ }
+
stdout, err := pod.K8sClient.ExecInPod(ctx, pod.Pod.Namespace, pod.Pod.Name, defaults.AgentContainerName, cmd)
if err != nil {
- t.Warnf("cilium-health probe failed: %q, stdout: %q, retrying...", err, stdout)
+ t.Context().Warnf("cilium-health probe failed: %q, stdout: %q, retrying...", err, stdout)
} else {
- err = validateHealthStatus(t, pod, stdout)
+ err = validateHealthStatus(t.Context(), pod, stdout)
if err == nil {
return
}
- t.Warnf("cilium-health validation failed: %q, retrying...", err)
+ t.Context().Warnf("cilium-health validation failed: %q, retrying...", err)
}
// Wait until it's time to retry or context is cancelled.
select {
case <-done:
- t.Fatalf("cilium-health probe on '%s' failed: %s", pod.Name(), err)
+ t.Context().Fatalf("cilium-health probe on '%s' failed: %s", pod.Name(), err)
return
case <-retryTimer:
}
Expand Down