diff --git a/.vscode/launch.json b/.vscode/launch.json
index 83fd9326..386c1169 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -4,6 +4,16 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "name": "Go: Launch e2e",
+            "type": "go",
+            "request": "launch",
+            "mode": "test",
+            "program": "${workspaceFolder}/tests/e2e",
+            "args": ["-kubeconfig","~/.kube/config", "-chart","./../../deployment/","-image-repository","nvidia/dcgm-exporter","-arguments","{-f=/etc/dcgm-exporter/dcp-metrics-included.csv,--enable-dcgm-log=true,--dcgm-log-level=ERROR}"],
+            "env": {},
+            "buildFlags": "-tags=e2e"
+        },
         {
             "name": "Run Debug",
             "type": "go",
diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile
index c799bbac..3679a0b2 100644
--- a/tests/e2e/Makefile
+++ b/tests/e2e/Makefile
@@ -19,15 +19,27 @@ IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter"
 IMAGE_TAG ?= "3.3.5-3.4.0-ubuntu22.04"
 KUBECONFIG ?= "~/.kube/config"
 
-.PHONY: e2e-test
-e2e-test:
+define TEST_CMD
 	@if [ -z ${KUBECONFIG} ]; then \
 		echo "[ERR] KUBECONFIG is missing, must be set"; \
 		exit 1; \
 	fi
-	$(GO_CMD) test --tags=e2e -v . -args \
+	$(GO_CMD) test --tags=e2e -v . \
+	-args \
 	-kubeconfig=$(KUBECONFIG) \
 	-chart="$(CHART)" \
 	-namespace=$(NAMESPACE) \
 	-image-repository=$(IMAGE_REPOSITORY) \
 	-image-tag=$(IMAGE_TAG)
+endef
+
+.PHONY: e2e-test
+e2e-test:
+	@$(TEST_CMD)
+
+
+.PHONY: e2e-test-no-profiling
+e2e-test-no-profiling:
+	@$(TEST_CMD) \
+	-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}"
+
diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go
index 2ba24a0a..9db91dc2 100644
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -64,6 +64,7 @@ type suiteConfig struct {
 	chart           string
 	imageRepository string
 	imageTag        string
+	arguments       string
 }
 
 type Suite struct {
@@ -159,6 +160,10 @@ func (s *Suite) SetupSuite() {
 		fmt.Sprintf("serviceMonitor.enabled=%v", false),
 	}
 
+	if s.arguments != "" {
+		values = append(values, fmt.Sprintf("arguments=%s", s.arguments))
+	}
+
 	if s.imageRepository != "" {
 		values = append(values, fmt.Sprintf("image.repository=%s", s.imageRepository))
 	}
@@ -190,11 +195,9 @@ func (s *Suite) TearDownSuite() {
 	s.T().Log("Starting tear down E2E test setup...")
 
 	if s.workloadPod != nil {
-		s.T().Logf("Starting delete of the workload pod: %s...", s.workloadPod.Name)
 		err = s.k8SClient.DeletePod(s.ctx, s.namespace, s.workloadPod.Name)
 		s.Assert().NoErrorf(err, "Failed to delete pod: %s", s.workloadPod.Name)
-
 		if err == nil {
 			s.T().Logf("Workload pod: %s is deleted.", s.workloadPod.Name)
 		}
 	}
@@ -203,19 +206,18 @@ func (s *Suite) TearDownSuite() {
 	s.T().Logf("Starting uninstall of the helm chart: %s...", s.chart)
 
 	err = s.helmClient.Uninstall(s.helmReleaseName)
-	s.Assert().NoError(err, "Failed to uninstall helm chart")
-
+	s.Assert().NoErrorf(err, "Failed to uninstall release: %s with error: %v", s.helmReleaseName, err)
 	if err == nil {
 		s.T().Logf("The helm chart: %s is uninstalled", s.chart)
 	}
 
 	err = s.helmClient.Cleanup()
+	if err != nil {
+		s.T().Logf("Failed to clean up directories used by helm client: %v", err)
+	}
-	s.Assert().NoError(err, "Failed to clean up directories used by helm client")
-
-	s.T().Logf("Deleting namespace: %s...", s.namespace)
 	err = s.k8SClient.DeleteNamespace(s.ctx, s.namespace)
-	s.Assert().NoError(err, "Cannot delete namespace")
+	s.Assert().NoErrorf(err, "Failed to delete namespace %q with error: %v", s.namespace, err)
 	if err == nil {
 		s.T().Logf("Namespace: %q deleted", s.namespace)
 	}
 
@@ -226,8 +228,9 @@ func (s *Suite) TearDownSuite() {
 func (s *Suite) TestDCGMExporter() {
 	s.DCGMExporterPrechecks()
 
-	s.T().Run("Create workload pod", func(t *testing.T) {
-		t.Log("Creating a workload pod...")
+	s.Run("Create workload pod", func() {
+
+		s.T().Log("Creating a workload pod...")
 
 		var err error
 
@@ -239,18 +242,21 @@ func (s *Suite) TestDCGMExporter() {
 			workloadImage,
 		)
-		require.NoError(t, err, "Cannot create workload pod")
+		s.Require().NoError(err, "Cannot create workload pod")
 
-		require.Eventuallyf(t, func() bool {
+		s.Require().Eventuallyf(func() bool {
 			isReady, err := s.k8SClient.CheckPodCondition(s.ctx, s.namespace, s.workloadPod.Name, corev1.PodScheduled)
-			assert.NoErrorf(t, err, "Cannot get pod status: %v", err)
+			s.Require().NoErrorf(err, "Cannot get pod status: %v", err)
 			return isReady
-		}, 15*time.Minute, 5*time.Second, "Failed to create pod: %s", s.workloadPod.Name)
+		}, 15*time.Minute, time.Second, "Failed to create pod: %s", s.workloadPod.Name)
 
-		t.Log("The workload was created.")
+		s.T().Log("The workload was created.")
 	})
 
 	s.T().Run("Verify metrics", func(t *testing.T) {
+		if s.T().Failed() {
+			s.T().Skip("Test skipped, because previous step failed")
+		}
 		require.EventuallyWithT(t, func(c *assert.CollectT) {
 			metrics, err := s.k8SClient.DoHttpRequest(s.ctx,
 				s.namespace,
@@ -292,44 +298,50 @@ func (s *Suite) TestDCGMExporter() {
 						ptr.Deref(metricFamily.Name, ""), expectedLabels, metric.Label)
 				}
 			}
-		}, 60*time.Second, 30*time.Second)
+		}, time.Minute, time.Second)
 	})
 }
 
 func (s *Suite) DCGMExporterPrechecks() {
-	s.T().Run("Checking pre-requisite: dcgm-exporter is up and running",
-		func(t *testing.T) {
-			t.Log("Checking the dcgm-exporter pod....")
-			t.Log("It can take up to the 15 minutes.")
+	s.Run("Checking pre-requisite: dcgm-exporter is up and running",
+		func() {
+			s.T().Log("Checking the dcgm-exporter pod....")
+			s.T().Log("It can take up to the 15 minutes.")
 
 			labelMap := map[string]string{dcgmExporterPodNameLabel: dcgmExporterPodNameLabelValue}
 			var pod *corev1.Pod
 
-			require.Eventuallyf(t, func() bool {
+			s.Require().Eventuallyf(func() bool {
 				pods, err := s.k8SClient.GetPodsByLabel(s.ctx, s.namespace, labelMap)
 				if err != nil {
 					log.Warnf("Error retrieving pods: %v", err)
 					return false
 				}
 
-				require.Lenf(t, pods, 1, "Expected a one pod only")
+				s.Require().Lenf(pods, 1, "Expected a one pod only")
 				pod = &pods[0]
 
 				return true
-			}, 15*time.Minute, 5*time.Second, "The pod was not created")
+			}, 15*time.Minute, time.Second, "The pod was not created")
 
-			require.NotNil(t, pod, "Nil value is not expected after pod created")
+			s.Require().NotNil(pod, "Nil value is not expected after pod created")
 
-			require.Eventuallyf(t, func() bool {
+			var errs error
+			s.Require().Eventuallyf(func() bool {
 				isReady, err := s.k8SClient.CheckPodCondition(s.ctx, s.namespace, pod.Name, corev1.PodReady)
-				assert.NoErrorf(t, err, "Cannot get pod status: %v", err)
+				if err != nil {
+					errs = err
+					return true
+				}
 				s.dcgmExpPod = pod
 				return isReady
-			}, 15*time.Minute, 5*time.Second, "The %s pod is not running", pod.Name)
+			}, time.Minute, time.Second, "The %s pod is not running", pod.Name)
+
+			s.Require().NoError(errs)
 
-			t.Log("The dcgm-exporter pod is running")
+			s.T().Log("The dcgm-exporter pod is running")
 		},
 	)
 }
diff --git a/tests/e2e/internal/framework/kube.go b/tests/e2e/internal/framework/kube.go
index f01c6f1f..f1182cfb 100644
--- a/tests/e2e/internal/framework/kube.go
+++ b/tests/e2e/internal/framework/kube.go
@@ -92,6 +92,12 @@ func (c *KubeClient) CheckPodCondition(ctx context.Context,
 		}
 	}
 
+	for _, c := range pod.Status.ContainerStatuses {
+		if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" {
+			return false, fmt.Errorf("pod %s in namespace %s is in CrashLoopBackOff", pod.Name, pod.Namespace)
+		}
+	}
+
 	return false, nil
 }
diff --git a/tests/e2e/main_test.go b/tests/e2e/main_test.go
index b0e88dbe..bff71939 100644
--- a/tests/e2e/main_test.go
+++ b/tests/e2e/main_test.go
@@ -66,6 +66,11 @@ func TestMain(m *testing.M) {
 		"",
 		"DCGM-exporter image tag to use")
 
+	flag.StringVar(&suiteCfg.arguments,
+		"arguments",
+		"",
+		`DCGM-exporter command line parameters. Example: -arguments={-f=/etc/dcgm-exporter/default-counters.csv}`)
+
 	flag.Parse()
 	os.Exit(m.Run())
 }
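A usage sketch for the new Makefile targets and the -arguments flag, assuming the commands run from the repository root with the Makefile defaults shown above (KUBECONFIG=~/.kube/config); the chart path below is taken from the launch configuration and may differ per checkout:

    # run the e2e suite with the default (profiling) counters
    make -C tests/e2e e2e-test

    # run the suite with profiling counters disabled (new target)
    make -C tests/e2e e2e-test-no-profiling

    # equivalent direct invocation; flags after -args go to the test binary
    cd tests/e2e && go test --tags=e2e -v . -args \
        -kubeconfig=$HOME/.kube/config \
        -chart="./../../deployment/" \
        -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}"

The braces in -arguments are passed through intact: SetupSuite forwards the string verbatim as the chart's arguments value, where Helm's set-value syntax treats {a,b} as a list of dcgm-exporter command line parameters.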