Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

E2E tests can be run with default-counters.csv #295

Merged
3 commits merged on Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Go: Launch e2e",
"type": "go",
"request": "launch",
"mode": "test",
"program": "${workspaceFolder}/tests/e2e",
"args": ["-kubeconfig","~/.kube/config", "-chart","./../../deployment/","-image-repository","nvidia/dcgm-exporter","-arguments","{-f=/etc/dcgm-exporter/dcp-metrics-included.csv,--enable-dcgm-log=true,--dcgm-log-level=ERROR}"],
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
"env": {},
"buildFlags": "-tags=e2e"
},
{
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
"name": "Run Debug",
"type": "go",
Expand Down
18 changes: 15 additions & 3 deletions tests/e2e/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,27 @@ IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter"
IMAGE_TAG ?= "3.3.5-3.4.0-ubuntu22.04"
KUBECONFIG ?= "~/.kube/config"

.PHONY: e2e-test
e2e-test:
define TEST_CMD
@if [ -z ${KUBECONFIG} ]; then \
echo "[ERR] KUBECONFIG is missing, must be set"; \
exit 1; \
fi
$(GO_CMD) test --tags=e2e -v . -args \
$(GO_CMD) test --tags=e2e -v . \
-args \
-kubeconfig=$(KUBECONFIG) \
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
-chart="$(CHART)" \
-namespace=$(NAMESPACE) \
-image-repository=$(IMAGE_REPOSITORY) \
-image-tag=$(IMAGE_TAG)
endef

.PHONY: e2e-test
e2e-test:
@$(TEST_CMD)


.PHONY: e2e-test-no-profiling
e2e-test-no-profiling:
@$(TEST_CMD) \
-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}"

68 changes: 40 additions & 28 deletions tests/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ type suiteConfig struct {
chart string
imageRepository string
imageTag string
arguments string
}

type Suite struct {
Expand Down Expand Up @@ -159,6 +160,10 @@ func (s *Suite) SetupSuite() {
fmt.Sprintf("serviceMonitor.enabled=%v", false),
}

if s.arguments != "" {
values = append(values, fmt.Sprintf("arguments=%s", s.arguments))
}

if s.imageRepository != "" {
values = append(values, fmt.Sprintf("image.repository=%s", s.imageRepository))
}
Expand Down Expand Up @@ -190,11 +195,9 @@ func (s *Suite) TearDownSuite() {
s.T().Log("Starting tear down E2E test setup...")

if s.workloadPod != nil {
s.T().Logf("Starting delete of the workload pod: %s...", s.workloadPod.Name)
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved

err = s.k8SClient.DeletePod(s.ctx, s.namespace, s.workloadPod.Name)
s.Assert().NoErrorf(err, "Failed to delete pod: %s", s.workloadPod.Name)

if err == nil {
s.T().Logf("Workload pod: %s is deleted.", s.workloadPod.Name)
}
Expand All @@ -203,19 +206,18 @@ func (s *Suite) TearDownSuite() {
s.T().Logf("Starting uninstall of the helm chart: %s...", s.chart)

nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
err = s.helmClient.Uninstall(s.helmReleaseName)
s.Assert().NoError(err, "Failed to uninstall helm chart")

s.Assert().NoErrorf(err, "Failed to uninstall release: %s with error: %v", s.helmReleaseName, err)
if err == nil {
s.T().Logf("The helm chart: %s is uninstalled", s.chart)
}

err = s.helmClient.Cleanup()
if err != nil {
s.T().Logf("Failed to clean up directories used by helm client: %v", err)
}

s.Assert().NoError(err, "Failed to clean up directories used by helm client")

s.T().Logf("Deleting namespace: %s...", s.namespace)
err = s.k8SClient.DeleteNamespace(s.ctx, s.namespace)
s.Assert().NoError(err, "Cannot delete namespace")
s.Assert().NoErrorf(err, "Failed to delete namespace %q with error: %v", s.namespace, err)
if err == nil {
s.T().Logf("Namespace: %q deleted", s.namespace)
}
Expand All @@ -226,8 +228,9 @@ func (s *Suite) TearDownSuite() {
func (s *Suite) TestDCGMExporter() {
s.DCGMExporterPrechecks()

s.T().Run("Create workload pod", func(t *testing.T) {
t.Log("Creating a workload pod...")
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
s.Run("Create workload pod", func() {

s.T().Log("Creating a workload pod...")

var err error

Expand All @@ -239,18 +242,21 @@ func (s *Suite) TestDCGMExporter() {
workloadImage,
)

require.NoError(t, err, "Cannot create workload pod")
s.Require().NoError(err, "Cannot create workload pod")

require.Eventuallyf(t, func() bool {
s.Require().Eventuallyf(func() bool {
isReady, err := s.k8SClient.CheckPodCondition(s.ctx, s.namespace, s.workloadPod.Name, corev1.PodScheduled)
assert.NoErrorf(t, err, "Cannot get pod status: %v", err)
s.Require().NoErrorf(err, "Cannot get pod status: %v", err)
return isReady
}, 15*time.Minute, 5*time.Second, "Failed to create pod: %s", s.workloadPod.Name)
}, 15*time.Minute, time.Second, "Failed to create pod: %s", s.workloadPod.Name)

t.Log("The workload was created.")
s.T().Log("The workload was created.")
})

s.T().Run("Verify metrics", func(t *testing.T) {
if s.T().Failed() {
s.T().Skip("Test skipped, because previous step failed")
}
require.EventuallyWithT(t, func(c *assert.CollectT) {
metrics, err := s.k8SClient.DoHttpRequest(s.ctx,
s.namespace,
Expand Down Expand Up @@ -292,44 +298,50 @@ func (s *Suite) TestDCGMExporter() {
ptr.Deref(metricFamily.Name, ""), expectedLabels, metric.Label)
}
}
}, 60*time.Second, 30*time.Second)
}, time.Minute, time.Second)
})
}

func (s *Suite) DCGMExporterPrechecks() {
s.T().Run("Checking pre-requisite: dcgm-exporter is up and running",
func(t *testing.T) {
t.Log("Checking the dcgm-exporter pod....")
t.Log("It can take up to the 15 minutes.")
s.Run("Checking pre-requisite: dcgm-exporter is up and running",
func() {
s.T().Log("Checking the dcgm-exporter pod....")
s.T().Log("It can take up to the 15 minutes.")
labelMap := map[string]string{dcgmExporterPodNameLabel: dcgmExporterPodNameLabelValue}

var pod *corev1.Pod

require.Eventuallyf(t, func() bool {
s.Require().Eventuallyf(func() bool {
pods, err := s.k8SClient.GetPodsByLabel(s.ctx, s.namespace, labelMap)
if err != nil {
log.Warnf("Error retrieving pods: %v", err)
return false
}

require.Lenf(t, pods, 1, "Expected a one pod only")
s.Require().Lenf(pods, 1, "Expected a one pod only")
pod = &pods[0]

return true
}, 15*time.Minute, 5*time.Second, "The pod was not created")
}, 15*time.Minute, time.Second, "The pod was not created")

require.NotNil(t, pod, "Nil value is not expected after pod created")
s.Require().NotNil(pod, "Nil value is not expected after pod created")

require.Eventuallyf(t, func() bool {
var errs error
s.Require().Eventuallyf(func() bool {
isReady, err := s.k8SClient.CheckPodCondition(s.ctx, s.namespace, pod.Name, corev1.PodReady)
assert.NoErrorf(t, err, "Cannot get pod status: %v", err)
if err != nil {
errs = err
return true
}

s.dcgmExpPod = pod

return isReady
}, 15*time.Minute, 5*time.Second, "The %s pod is not running", pod.Name)
}, time.Minute, time.Second, "The %s pod is not running", pod.Name)

s.Require().NoError(errs)

t.Log("The dcgm-exporter pod is running")
s.T().Log("The dcgm-exporter pod is running")
},
)
}
6 changes: 6 additions & 0 deletions tests/e2e/internal/framework/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ func (c *KubeClient) CheckPodCondition(ctx context.Context,
}
}

for _, c := range pod.Status.ContainerStatuses {
if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" {
return false, fmt.Errorf("pod %s in namespace %s is in CrashLoopBackOff", pod.Name, pod.Namespace)
}
}

nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
return false, nil
}

Expand Down
5 changes: 5 additions & 0 deletions tests/e2e/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ func TestMain(m *testing.M) {
"",
"DCGM-exporter image tag to use")

flag.StringVar(&suiteCfg.arguments,
"arguments",
"",
`DCGM-exporter command line parameters. Example: -arguments={-f=/etc/dcgm-exporter/default-counters.csv}`)

flag.Parse()
nvvfedorov marked this conversation as resolved.
Show resolved Hide resolved
os.Exit(m.Run())
}
Expand Down
Loading