From b14e7f8892bb89bb93116e7a26520eac3e0880b4 Mon Sep 17 00:00:00 2001 From: Douglas Wightman Date: Tue, 7 Jan 2025 09:41:18 -0700 Subject: [PATCH] DCGM-Exporter 4.0.0 - Update to DCGM 4.0.0 - Major refactor to enable clean mock testing - Refactor metric collection to align with prometheus best practices - Many more bug fixes and improvements --- .devcontainer/Dockerfile | 70 +- .github/workflows/go.yml | 2 +- .gitignore | 6 +- .vscode/launch.json | 22 +- Jenkinsfile | 64 - Makefile | 76 +- README.md | 5 +- cmd/dcgm-exporter/main.go | 10 +- dcgm-exporter.yaml | 12 +- deployment/Chart.yaml | 4 +- deployment/templates/_helpers.tpl | 20 + deployment/templates/daemonset.yaml | 37 + deployment/templates/tls-secret.yaml | 43 + .../templates/web-config-configmap.yaml | 40 + deployment/values.yaml | 42 +- docker/{Dockerfile.ubi9 => Dockerfile.ubi} | 77 +- docker/Dockerfile.ubuntu | 98 + docker/Dockerfile.ubuntu22.04 | 88 - etc/default-counters.csv | 27 +- go.mod | 142 +- go.sum | 353 +-- hack/VERSION | 8 +- .../mocks/pkg/collector/mock_collector.go | 81 + .../mocks/pkg/dcgmprovider/mock_client.go | 507 +++ .../mocks/pkg/deviceinfo/mock_device_info.go | 266 ++ .../pkg/devicewatcher/mock_device_watcher.go | 87 + .../mock_device_watchlist_manager.go | 85 + internal/mocks/pkg/elf/mock_elf.go | 69 + internal/mocks/pkg/exec/mock_cmd.go | 68 + internal/mocks/pkg/exec/mock_exec.go | 73 + .../mocks/pkg/nvmlprovider/mock_client.go | 81 + .../os/{dir_entry.go => mock_dir_entry.go} | 2 +- .../os/{file_info.go => mock_file_info.go} | 2 +- internal/mocks/pkg/os/{os.go => mock_os.go} | 14 +- .../pkg/transformation/mock_transformer.go | 84 + .../pkg/transformations/mock_transformer.go | 83 + internal/pkg/appconfig/const.go | 26 + .../pkg/appconfig/types.go | 19 +- internal/pkg/collector/base_collector.go | 98 + .../pkg/collector}/clock_events_collector.go | 49 +- .../collector/clock_events_collector_test.go | 798 +++++ internal/pkg/collector/collector_factory.go | 172 ++ 
.../pkg/collector/collector_factory_test.go | 580 ++++ internal/pkg/collector/const.go | 24 + internal/pkg/collector/expcollector.go | 138 + .../pkg/collector}/gpu_collector.go | 209 +- internal/pkg/collector/gpu_collector_test.go | 168 + .../pkg/collector/gpu_health_collector.go | 386 +++ .../collector/gpu_health_collector_test.go | 364 +++ internal/pkg/collector/types.go | 92 + .../pkg/collector/variables.go | 2 +- internal/pkg/collector/xid_collector.go | 81 + internal/pkg/collector/xid_collector_test.go | 534 ++++ .../pkg/collector}/xid_errors.go | 2 +- internal/pkg/counters/const.go | 28 + .../pkg/counters/counter_config.go | 50 +- .../pkg/counters/counter_config_test.go | 32 +- .../pkg/counters/exporter_counters.go | 17 +- .../pkg/counters/exporter_counters_test.go | 4 +- internal/pkg/counters/types.go | 50 + internal/pkg/counters/variables.go | 29 + internal/pkg/dcgmprovider/dcgm.go | 256 ++ internal/pkg/dcgmprovider/types.go | 59 + internal/pkg/deviceinfo/device_info.go | 597 ++++ internal/pkg/deviceinfo/device_info_test.go | 2749 +++++++++++++++++ internal/pkg/deviceinfo/testutils.go | 196 ++ internal/pkg/deviceinfo/types.go | 72 + internal/pkg/devicemonitoring/const.go | 21 + .../pkg/devicemonitoring/device_monitoring.go | 251 ++ .../device_monitoring_test.go | 1610 ++++++++++ .../pkg/devicemonitoring/types.go | 28 +- internal/pkg/devicewatcher/const.go | 24 + internal/pkg/devicewatcher/device_watcher.go | 295 ++ .../pkg/devicewatcher/device_watcher_test.go | 1951 ++++++++++++ internal/pkg/devicewatcher/types.go | 31 + internal/pkg/devicewatcher/variables.go | 21 + .../device_watchlist_manager.go | 151 + .../device_watchlist_manager_test.go | 780 +++++ internal/pkg/devicewatchlistmanager/types.go | 30 + internal/pkg/elf/README.md | 3 + internal/pkg/elf/elf.go | 29 + internal/pkg/elf/types.go | 24 + internal/pkg/exec/README.md | 3 + internal/pkg/exec/exec.go | 48 + internal/pkg/hostname/hostname.go | 57 + internal/pkg/hostname/hostname_test.go | 130 + 
.../pkg/integration_test/collector_test.go | 1117 +++++++ .../integration_test/transformation_test.go | 105 + internal/pkg/logging/const.go | 28 + internal/pkg/logging/logger_adapter.go | 76 - internal/pkg/logging/logger_adapter_test.go | 114 - internal/pkg/nvmlprovider/provider.go | 165 +- internal/pkg/nvmlprovider/provider_test.go | 101 +- internal/pkg/nvmlprovider/types.go | 24 + internal/pkg/os/os.go | 9 +- internal/pkg/prerequisites/dcgmlib_rule.go | 86 + .../pkg/prerequisites/dcgmlib_rule_test.go | 210 ++ internal/pkg/prerequisites/types.go | 21 + internal/pkg/prerequisites/validation.go | 32 + internal/pkg/prerequisites/validation_test.go | 99 + internal/pkg/prerequisites/variables.go | 42 + internal/pkg/registry/registry.go | 120 + internal/pkg/registry/registry_test.go | 141 + internal/pkg/registry/types.go | 26 + internal/pkg/rendermetrics/render_metrics.go | 155 + .../pkg/rendermetrics/render_metrics_test.go | 132 + internal/pkg/server/server.go | 195 ++ internal/pkg/server/server_test.go | 277 ++ internal/pkg/server/types.go | 42 + {pkg => internal/pkg}/stdout/capture.go | 9 +- {pkg => internal/pkg}/stdout/capture_test.go | 32 +- .../pkg}/stdout/capture_test_wrapper.go | 36 +- .../pkg}/stdout/stdoutprocessor.go | 0 internal/pkg/testutils/const.go | 49 + internal/pkg/testutils/test_utils.go | 313 ++ internal/pkg/testutils/testutils.go | 61 - internal/pkg/testutils/types.go | 22 + internal/pkg/testutils/variables.go | 145 + internal/pkg/transformation/const.go | 30 + .../pkg/transformation}/hpc.go | 43 +- .../pkg/transformation}/hpc_test.go | 75 +- .../pkg/transformation}/kubernetes.go | 53 +- .../pkg/transformation/kubernetes_test.go | 217 ++ internal/pkg/transformation/transformer.go | 37 + .../pkg/transformation/transformer_test.go | 67 + internal/pkg/transformation/types.go | 40 + internal/pkg/transformation/variables.go | 25 + .../pkg/utils}/utils.go | 26 +- .../pkg/utils}/utils_test.go | 75 +- .../systemd/nvidia-dcgm-exporter.service | 33 + 
pkg/cmd/app.go | 243 +- pkg/cmd/app_test.go | 85 +- pkg/{dcgmexporter => cmd}/const.go | 21 +- .../clock_events_collector_test.go | 483 --- pkg/dcgmexporter/dcgm.go | 131 - pkg/dcgmexporter/expcollector.go | 260 -- .../field_entity_group_system_info.go | 95 - pkg/dcgmexporter/gpu_collector_test.go | 486 --- pkg/dcgmexporter/kubernetes_test.go | 337 -- pkg/dcgmexporter/pipeline.go | 377 --- pkg/dcgmexporter/pipeline_test.go | 208 -- pkg/dcgmexporter/registry.go | 92 - pkg/dcgmexporter/registry_test.go | 108 - pkg/dcgmexporter/server.go | 166 - pkg/dcgmexporter/system_info.go | 890 ------ pkg/dcgmexporter/system_info_test.go | 671 ---- pkg/dcgmexporter/types.go | 160 - pkg/dcgmexporter/xid_collector.go | 68 - pkg/dcgmexporter/xid_collector_test.go | 312 -- scripts/test_coverage.sh | 23 +- service-monitor.yaml | 4 +- tests/e2e/Makefile | 28 +- tests/e2e/e2e_actions_test.go | 108 +- tests/e2e/e2e_suite_test.go | 255 +- .../e2e_verify_default_configuration_test.go | 178 ++ tests/e2e/e2e_verify_http_basic_auth_test.go | 134 + tests/e2e/e2e_verify_tls_test.go | 118 + tests/e2e/internal/framework/helm.go | 23 +- tests/e2e/internal/framework/kube.go | 106 +- tests/e2e/main_test.go | 10 + tests/integration/start_read_test.go | 5 +- tests/integration/start_with_tls_test.go | 8 +- .../integration/testdata/default-counters.csv | 77 + 163 files changed, 20574 insertions(+), 6517 deletions(-) delete mode 100644 Jenkinsfile create mode 100644 deployment/templates/tls-secret.yaml create mode 100644 deployment/templates/web-config-configmap.yaml rename docker/{Dockerfile.ubi9 => Dockerfile.ubi} (58%) create mode 100644 docker/Dockerfile.ubuntu delete mode 100644 docker/Dockerfile.ubuntu22.04 create mode 100644 internal/mocks/pkg/collector/mock_collector.go create mode 100644 internal/mocks/pkg/dcgmprovider/mock_client.go create mode 100644 internal/mocks/pkg/deviceinfo/mock_device_info.go create mode 100644 internal/mocks/pkg/devicewatcher/mock_device_watcher.go create mode 100644 
internal/mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go create mode 100644 internal/mocks/pkg/elf/mock_elf.go create mode 100644 internal/mocks/pkg/exec/mock_cmd.go create mode 100644 internal/mocks/pkg/exec/mock_exec.go create mode 100644 internal/mocks/pkg/nvmlprovider/mock_client.go rename internal/mocks/pkg/os/{dir_entry.go => mock_dir_entry.go} (96%) rename internal/mocks/pkg/os/{file_info.go => mock_file_info.go} (96%) rename internal/mocks/pkg/os/{os.go => mock_os.go} (93%) create mode 100644 internal/mocks/pkg/transformation/mock_transformer.go create mode 100644 internal/mocks/pkg/transformations/mock_transformer.go create mode 100644 internal/pkg/appconfig/const.go rename pkg/dcgmexporter/config.go => internal/pkg/appconfig/types.go (84%) create mode 100644 internal/pkg/collector/base_collector.go rename {pkg/dcgmexporter => internal/pkg/collector}/clock_events_collector.go (77%) create mode 100644 internal/pkg/collector/clock_events_collector_test.go create mode 100644 internal/pkg/collector/collector_factory.go create mode 100644 internal/pkg/collector/collector_factory_test.go create mode 100644 internal/pkg/collector/const.go create mode 100644 internal/pkg/collector/expcollector.go rename {pkg/dcgmexporter => internal/pkg/collector}/gpu_collector.go (58%) create mode 100644 internal/pkg/collector/gpu_collector_test.go create mode 100644 internal/pkg/collector/gpu_health_collector.go create mode 100644 internal/pkg/collector/gpu_health_collector_test.go create mode 100644 internal/pkg/collector/types.go rename pkg/dcgmexporter/os.go => internal/pkg/collector/variables.go (97%) create mode 100644 internal/pkg/collector/xid_collector.go create mode 100644 internal/pkg/collector/xid_collector_test.go rename {pkg/dcgmexporter => internal/pkg/collector}/xid_errors.go (99%) create mode 100644 internal/pkg/counters/const.go rename pkg/dcgmexporter/parser.go => internal/pkg/counters/counter_config.go (74%) rename 
pkg/dcgmexporter/parser_test.go => internal/pkg/counters/counter_config_test.go (81%) rename pkg/dcgmexporter/exporter_metrics.go => internal/pkg/counters/exporter_counters.go (82%) rename pkg/dcgmexporter/exporter_metrics_test.go => internal/pkg/counters/exporter_counters_test.go (95%) create mode 100644 internal/pkg/counters/types.go create mode 100644 internal/pkg/counters/variables.go create mode 100644 internal/pkg/dcgmprovider/dcgm.go create mode 100644 internal/pkg/dcgmprovider/types.go create mode 100644 internal/pkg/deviceinfo/device_info.go create mode 100644 internal/pkg/deviceinfo/device_info_test.go create mode 100644 internal/pkg/deviceinfo/testutils.go create mode 100644 internal/pkg/deviceinfo/types.go create mode 100644 internal/pkg/devicemonitoring/const.go create mode 100644 internal/pkg/devicemonitoring/device_monitoring.go create mode 100644 internal/pkg/devicemonitoring/device_monitoring_test.go rename pkg/dcgmexporter/test_utils.go => internal/pkg/devicemonitoring/types.go (60%) create mode 100644 internal/pkg/devicewatcher/const.go create mode 100644 internal/pkg/devicewatcher/device_watcher.go create mode 100644 internal/pkg/devicewatcher/device_watcher_test.go create mode 100644 internal/pkg/devicewatcher/types.go create mode 100644 internal/pkg/devicewatcher/variables.go create mode 100644 internal/pkg/devicewatchlistmanager/device_watchlist_manager.go create mode 100644 internal/pkg/devicewatchlistmanager/device_watchlist_manager_test.go create mode 100644 internal/pkg/devicewatchlistmanager/types.go create mode 100644 internal/pkg/elf/README.md create mode 100644 internal/pkg/elf/elf.go create mode 100644 internal/pkg/elf/types.go create mode 100644 internal/pkg/exec/README.md create mode 100644 internal/pkg/exec/exec.go create mode 100644 internal/pkg/hostname/hostname.go create mode 100644 internal/pkg/hostname/hostname_test.go create mode 100644 internal/pkg/integration_test/collector_test.go create mode 100644 
internal/pkg/integration_test/transformation_test.go create mode 100644 internal/pkg/logging/const.go delete mode 100644 internal/pkg/logging/logger_adapter.go delete mode 100644 internal/pkg/logging/logger_adapter_test.go create mode 100644 internal/pkg/nvmlprovider/types.go create mode 100644 internal/pkg/prerequisites/dcgmlib_rule.go create mode 100644 internal/pkg/prerequisites/dcgmlib_rule_test.go create mode 100644 internal/pkg/prerequisites/types.go create mode 100644 internal/pkg/prerequisites/validation.go create mode 100644 internal/pkg/prerequisites/validation_test.go create mode 100644 internal/pkg/prerequisites/variables.go create mode 100644 internal/pkg/registry/registry.go create mode 100644 internal/pkg/registry/registry_test.go create mode 100644 internal/pkg/registry/types.go create mode 100644 internal/pkg/rendermetrics/render_metrics.go create mode 100644 internal/pkg/rendermetrics/render_metrics_test.go create mode 100644 internal/pkg/server/server.go create mode 100644 internal/pkg/server/server_test.go create mode 100644 internal/pkg/server/types.go rename {pkg => internal/pkg}/stdout/capture.go (84%) rename {pkg => internal/pkg}/stdout/capture_test.go (76%) rename {pkg => internal/pkg}/stdout/capture_test_wrapper.go (56%) rename {pkg => internal/pkg}/stdout/stdoutprocessor.go (100%) create mode 100644 internal/pkg/testutils/const.go create mode 100644 internal/pkg/testutils/test_utils.go delete mode 100644 internal/pkg/testutils/testutils.go create mode 100644 internal/pkg/testutils/types.go create mode 100644 internal/pkg/testutils/variables.go create mode 100644 internal/pkg/transformation/const.go rename {pkg/dcgmexporter => internal/pkg/transformation}/hpc.go (62%) rename {pkg/dcgmexporter => internal/pkg/transformation}/hpc_test.go (77%) rename {pkg/dcgmexporter => internal/pkg/transformation}/kubernetes.go (75%) create mode 100644 internal/pkg/transformation/kubernetes_test.go create mode 100644 
internal/pkg/transformation/transformer.go create mode 100644 internal/pkg/transformation/transformer_test.go create mode 100644 internal/pkg/transformation/types.go create mode 100644 internal/pkg/transformation/variables.go rename {pkg/dcgmexporter => internal/pkg/utils}/utils.go (73%) rename {pkg/dcgmexporter => internal/pkg/utils}/utils_test.go (50%) create mode 100644 packaging/config-files/systemd/nvidia-dcgm-exporter.service rename pkg/{dcgmexporter => cmd}/const.go (73%) delete mode 100644 pkg/dcgmexporter/clock_events_collector_test.go delete mode 100644 pkg/dcgmexporter/dcgm.go delete mode 100644 pkg/dcgmexporter/expcollector.go delete mode 100644 pkg/dcgmexporter/field_entity_group_system_info.go delete mode 100644 pkg/dcgmexporter/gpu_collector_test.go delete mode 100644 pkg/dcgmexporter/kubernetes_test.go delete mode 100644 pkg/dcgmexporter/pipeline.go delete mode 100644 pkg/dcgmexporter/pipeline_test.go delete mode 100644 pkg/dcgmexporter/registry.go delete mode 100644 pkg/dcgmexporter/registry_test.go delete mode 100644 pkg/dcgmexporter/server.go delete mode 100644 pkg/dcgmexporter/system_info.go delete mode 100644 pkg/dcgmexporter/system_info_test.go delete mode 100644 pkg/dcgmexporter/types.go delete mode 100644 pkg/dcgmexporter/xid_collector.go delete mode 100644 pkg/dcgmexporter/xid_collector_test.go create mode 100644 tests/e2e/e2e_verify_default_configuration_test.go create mode 100644 tests/e2e/e2e_verify_http_basic_auth_test.go create mode 100644 tests/e2e/e2e_verify_tls_test.go create mode 100644 tests/integration/testdata/default-counters.csv diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 462bf5f6..e7882b15 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,13 +1,12 @@ -FROM nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04 -ARG GOLANG_VERSION=1.21.5 +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04 +ARG GOLANG_VERSION=1.22.5 ARG USERNAME=developer ARG USER_UID=1000 ARG USER_GID=1000 -ARG 
DCGM_VERSION=3.3.3 # Create a user 'developer' with UID=1000, add to 'developer' group, and add to 'sudo' group RUN groupadd -g $USER_GID $USERNAME && \ - useradd -m -u $USER_GID -g $USERNAME -s /bin/bash $USERNAME && \ - usermod -aG sudo $USERNAME + useradd -m -u $USER_GID -g $USERNAME -s /bin/bash $USERNAME && \ + usermod -aG sudo $USERNAME # Allow 'developer' to use sudo without a password RUN echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers @@ -15,17 +14,26 @@ RUN --mount=type=cache,target=/var/cache/apt \ set -eux; \ apt-get update; \ apt-get install -y --no-install-recommends \ - git \ - ca-certificates \ - g++ \ - gcc \ - libc6-dev \ - make \ - pkg-config \ - wget \ - datacenter-gpu-manager=1:${DCGM_VERSION} \ - libcap2-bin \ - && apt-get autoremove -y \ + git \ + ca-certificates \ + g++ \ + gcc \ + libc6-dev \ + make \ + pkg-config \ + wget \ + datacenter-gpu-manager-4-core \ + libcap2-bin \ + && install -m 0755 -d /etc/apt/keyrings \ + && wget -O /etc/apt/keyrings/docker.asc https://download.docker.com/linux/ubuntu/gpg \ + && chmod a+r /etc/apt/keyrings/docker.asc \ + && echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null \ + && apt-get update \ + && apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io docker-buildx-plugin \ + && apt-get autoremove -y \ && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \ # DCGM exporter doesn't use libdcgm_cublas_proxy*.so. 
&& rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \ @@ -36,25 +44,25 @@ RUN set -eux; \ url=; \ echo "$arch"; \ case "$arch" in \ - 'amd64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ - ;; \ - 'arm64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ - ;; \ - *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ + 'amd64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ + ;; \ + 'arm64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ + ;; \ + *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ esac; \ build=; \ if [ -z "$url" ]; then \ -# https://github.com/golang/go/issues/38536#issuecomment-616897960 - build=1; \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ - echo >&2; \ - echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \ - echo >&2; \ + # https://github.com/golang/go/issues/38536#issuecomment-616897960 + build=1; \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ + echo >&2; \ + echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \ + echo >&2; \ fi; \ - wget -O go.tgz "$url" --progress=dot:giga; \ - tar -C /usr/local -xzf go.tgz; \ + wget -O go.tgz "$url" --progress=dot:giga; \ + tar -C /usr/local -xzf go.tgz; \ rm go.tgz ENV GOTOOLCHAIN=local ENV GOPATH /go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index eb423060..39fb9c5f 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -16,7 +16,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v2 with: - go-version: 1.21 + go-version: 1.22 - name: Build run: make binary diff --git a/.gitignore b/.gitignore index 2b06b62b..6864811c 100644 --- a/.gitignore +++ 
b/.gitignore @@ -11,7 +11,9 @@ tests.cov test_results.json .scannerwork dist/ -.run/ +.run +dist/ + ############################################################################### # JetBrains # https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore @@ -236,4 +238,4 @@ $RECYCLE.BIN/ *.msp # Windows shortcuts -*.lnk +*.lnk \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 8d941056..0e7c9609 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,12 +10,18 @@ "request": "launch", "mode": "test", "program": "${workspaceFolder}/tests/e2e", - "args": ["-test.v", - "--ginkgo.v", - "-kubeconfig","~/.kube/config", - "-chart","./../../deployment/", - "-image-repository","nvidia/dcgm-exporter", - "-arguments","{-f=/etc/dcgm-exporter/default-counters.csv,--enable-dcgm-log=true,--dcgm-log-level=ERROR}"], + "args": [ + "-test.v", + "--ginkgo.v", + "-kubeconfig", + "~/.kube/config", + "-chart", + "./../../deployment/", + "-image-repository", + "nvidia/dcgm-exporter", + "-arguments", + "{-f=/etc/dcgm-exporter/default-counters.csv}" + ], "env": {}, "buildFlags": "-tags=e2e" }, @@ -30,8 +36,8 @@ "-f", "./etc/default-counters.csv", "--debug", - "--enable-dcgm-log", - "--dcgm-log-level=INFO" + "-r", + "localhost:5555" ] } ] diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index c5233875..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,64 +0,0 @@ -@Library(['shared-libs']) _ - -pipeline { - - agent { - dockerfile { - label 'docker' - filename 'Dockerfile' - args '-v /etc/passwd:/etc/passwd:ro -v /var/run/docker.sock:/var/run/docker.sock:rw' - } - } - - options { - ansiColor('xterm') - timestamps() - timeout(time: 1, unit: 'HOURS') - gitLabConnection('GitLab Master') - buildDiscarder(logRotator(numToKeepStr: '100', artifactNumToKeepStr: '10')) - } - - environment { - HOME="${WORKSPACE}" - PYTHONUNBUFFERED=1 - } - - parameters { - string(name: 'REF', defaultValue: '\${gitlabBranch}', description: 'Commit 
to build') - } - - stages { - stage('Prep') { - steps { - script { - updateGitlabCommitStatus(name: 'Jenkins CI', state: 'running') - } - } - } - stage('Compile') { - steps { - echo "building" - sh "make binary" - } - } - stage('Test') { - steps { - echo "Running tests" - // Tests require supported GPU - // make test-main - sh "make check-format" - } - } - } - post { - always { - script{ - String status = (currentBuild.currentResult == "SUCCESS") ? "success" : "failed" - updateGitlabCommitStatus(name: 'Jenkins CI', state: status) - } - } - cleanup { - cleanWs() - } - } -} diff --git a/Makefile b/Makefile index 98c95003..2d9f5103 100644 --- a/Makefile +++ b/Makefile @@ -18,30 +18,29 @@ REGISTRY ?= nvidia GO ?= go MKDIR ?= mkdir GOLANGCILINT_TIMEOUT ?= 10m +IMAGE_TAG ?= "" DCGM_VERSION := $(NEW_DCGM_VERSION) -GOLANG_VERSION := 1.22.5 +GOLANG_VERSION := 1.22.9 VERSION := $(NEW_EXPORTER_VERSION) FULL_VERSION := $(DCGM_VERSION)-$(VERSION) OUTPUT := type=oci,dest=/dev/null PLATFORMS := linux/amd64,linux/arm64 -DOCKERCMD := docker buildx build +DOCKERCMD := docker --debug buildx build MODULE := github.com/NVIDIA/dcgm-exporter - .PHONY: all binary install check-format local all: update-version ubuntu22.04 ubi9 -binary: generate update-version +binary: update-version cd cmd/dcgm-exporter; $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}" -test-main: +test-main: generate $(GO) test ./... 
-short install: binary install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv - install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv check-format: test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0 @@ -58,23 +57,71 @@ else $(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build' endif -TARGETS = ubuntu22.04 ubi9 +ubi%: DOCKERFILE = docker/Dockerfile.ubi +ubi%: --docker-build-% + @ +ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubi9 +ubi9: IMAGE_TAG = ubi9 + +ubuntu%: DOCKERFILE = docker/Dockerfile.ubuntu +ubuntu%: --docker-build-% + @ +ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04 +ubuntu22.04: IMAGE_TAG = ubuntu22.04 -DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04 -DOCKERFILE.ubi9 = docker/Dockerfile.ubi9 -$(TARGETS): +--docker-build-%: + @echo "Building for $@" + DOCKER_BUILDKIT=1 \ $(DOCKERCMD) --pull \ --output $(OUTPUT) \ + --progress=plain \ --platform $(PLATFORMS) \ + --build-arg BASEIMAGE="$(BASE_IMAGE)" \ --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ --build-arg "VERSION=$(VERSION)" \ - --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \ - --file $(DOCKERFILE.$@) . + --tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if $(IMAGE_TAG),-$(IMAGE_TAG)) \ + --file $(DOCKERFILE) . 
+ +.PHONY: packages package-arm64 package-amd64 +packages: package-amd64 package-arm64 + +package-arm64: + $(MAKE) package-build PLATFORMS=linux/arm64 + +package-amd64: + $(MAKE) package-build PLATFORMS=linux/amd64 + +package-build: IMAGE_TAG = ubuntu22.04 +package-build: + ARCH=`echo $(PLATFORMS) | cut -d'/' -f2`; \ + if [ "$$ARCH" = "amd64" ]; then \ + ARCH="x86-64"; \ + fi; \ + if [ "$$ARCH" = "arm64" ]; then \ + ARCH="sbsa"; \ + fi; \ + export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \ + export COMPONENT_NAME="dcgm_exporter"; \ + $(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \ + $(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \ + $(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \ + $(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \ + I=`docker create $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$(IMAGE_TAG)` && \ + docker cp $$I:/usr/bin/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin/ && \ + docker cp $$I:/etc/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/ && \ + cp ./LICENSE /tmp/$$DIST_NAME/$$COMPONENT_NAME && \ + mkdir -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/ && \ + cp ./packaging/config-files/systemd/nvidia-dcgm-exporter.service \ + /tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/nvidia-dcgm-exporter.service && \ + docker rm -f $$I && \ + $(MKDIR) -p $(CURDIR)/dist && \ + cd "/tmp/$$DIST_NAME" && tar -czf $(CURDIR)/dist/$$DIST_NAME.tar.gz `ls -A` && \ + rm -rf "/tmp/$$DIST_NAME"; .PHONY: integration -test-integration: +test-integration: generate go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/ test-coverage: @@ -83,7 +130,7 @@ test-coverage: .PHONY: lint lint: - golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix + golangci-lint run ./... 
--timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 .PHONY: validate-modules validate-modules: @@ -99,6 +146,7 @@ tools: ## Install required tools and utilities go install github.com/axw/gocov/gocov@latest go install golang.org/x/tools/cmd/goimports@latest go install mvdan.cc/gofumpt@latest + go install github.com/wadey/gocovmerge@latest fmt: find . -name '*.go' | xargs gofumpt -l -w diff --git a/README.md b/README.md index fa13ec5e..7e4a1e27 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https To gather metrics on a GPU node, simply start the `dcgm-exporter` container: ```shell -docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04 +docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04 curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge @@ -111,8 +111,9 @@ To enable GPU-to-job mapping on the DCGM-exporter side, users must run the DCGM- In order to build dcgm-exporter ensure you have the following: -* [Golang >= 1.21 installed](https://golang.org/) +* [Golang >= 1.22 installed](https://golang.org/) * [DCGM installed](https://developer.nvidia.com/dcgm) +* A Linux machine with a GPU compatible with DCGM. 
```shell git clone https://github.com/NVIDIA/dcgm-exporter.git diff --git a/cmd/dcgm-exporter/main.go b/cmd/dcgm-exporter/main.go index 2dedaae1..94505f48 100644 --- a/cmd/dcgm-exporter/main.go +++ b/cmd/dcgm-exporter/main.go @@ -17,22 +17,20 @@ package main import ( + "log/slog" "os" - "github.com/sirupsen/logrus" - _ "go.uber.org/automaxprocs" "github.com/NVIDIA/dcgm-exporter/pkg/cmd" ) -var ( - BuildVersion = "Filled by the build system" -) +var BuildVersion = "Filled by the build system" func main() { app := cmd.NewApp(BuildVersion) if err := app.Run(os.Args); err != nil { - logrus.Fatal(err) + slog.Error(err.Error()) + os.Exit(1) } } diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index d919dc83..8a6c8d6b 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,23 +18,23 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" name: "dcgm-exporter" spec: containers: - - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04" + - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -66,11 +66,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" ports: - name: "metrics" port: 9400 diff --git a/deployment/Chart.yaml b/deployment/Chart.yaml index a297d602..f8face93 100644 --- a/deployment/Chart.yaml +++ b/deployment/Chart.yaml @@ 
-1,9 +1,9 @@ apiVersion: v2 name: dcgm-exporter description: A Helm chart for DCGM exporter -version: "3.7.0" +version: "4.0.0" kubeVersion: ">= 1.19.0-0" -appVersion: "3.6.1" +appVersion: "4.0.0" sources: - https://github.com/nvidia/dcgm-exporter home: https://github.com/nvidia/dcgm-exporter/ diff --git a/deployment/templates/_helpers.tpl b/deployment/templates/_helpers.tpl index ff71dd04..c14ede3a 100644 --- a/deployment/templates/_helpers.tpl +++ b/deployment/templates/_helpers.tpl @@ -73,3 +73,23 @@ Create the name of the service account to use {{ default "default" .Values.serviceAccount.name }} {{- end -}} {{- end -}} + + +{{/* +Create the name of the tls secret to use +*/}} +{{- define "dcgm-exporter.tlsCertsSecretName" -}} +{{- if .Values.tlsServerConfig.existingSecret -}} + {{- printf "%s" (tpl .Values.tlsServerConfig.existingSecret $) -}} +{{- else -}} + {{ printf "%s-tls" (include "dcgm-exporter.fullname" .) }} +{{- end -}} +{{- end -}} + + +{{/* +Create the name of the web-config configmap name to use +*/}} +{{- define "dcgm-exporter.webConfigConfigMap" -}} + {{ printf "%s-web-config.yml" (include "dcgm-exporter.fullname" .) }} +{{- end -}} \ No newline at end of file diff --git a/deployment/templates/daemonset.yaml b/deployment/templates/daemonset.yaml index c662d3e3..103f09cb 100644 --- a/deployment/templates/daemonset.yaml +++ b/deployment/templates/daemonset.yaml @@ -78,6 +78,18 @@ spec: - name: "pod-gpu-resources" hostPath: path: {{ .Values.kubeletPath }} + {{- if and .Values.tlsServerConfig.enabled }} + - name: "tls" + secret: + secretName: {{ include "dcgm-exporter.tlsCertsSecretName" . }} + defaultMode: 0664 + {{- end }} + {{- if or .Values.tlsServerConfig.enabled $.Values.basicAuth.users}} + - name: "web-config-yaml" + configMap: + name: {{ include "dcgm-exporter.webConfigConfigMap" . 
}} + defaultMode: 0664 + {{- end }} {{- range .Values.extraHostVolumes }} - name: {{ .name | quote }} hostPath: @@ -109,6 +121,10 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + {{- if or .Values.tlsServerConfig.enabled $.Values.basicAuth.users}} + - name: "DCGM_EXPORTER_WEB_CONFIG_FILE" + value: /etc/dcgm-exporter/web-config.yaml + {{- end }} {{- if .Values.extraEnv }} {{- toYaml .Values.extraEnv | nindent 8 }} {{- end }} @@ -119,19 +135,40 @@ spec: - name: "pod-gpu-resources" readOnly: true mountPath: "/var/lib/kubelet/pod-resources" + {{- if and .Values.tlsServerConfig.enabled }} + - name: "tls" + mountPath: /etc/dcgm-exporter/tls + {{- end }} + {{- if or .Values.tlsServerConfig.enabled $.Values.basicAuth.users}} + - name: "web-config-yaml" + mountPath: /etc/dcgm-exporter/web-config.yaml + subPath: web-config.yaml + {{- end }} {{- if .Values.extraVolumeMounts }} {{- toYaml .Values.extraVolumeMounts | nindent 8 }} {{- end }} livenessProbe: + {{- if not $.Values.basicAuth.users }} httpGet: path: /health port: {{ .Values.service.port }} + scheme: {{ ternary "HTTPS" "HTTP" $.Values.tlsServerConfig.enabled }} + {{- else }} + tcpSocket: + port: {{ .Values.service.port }} + {{- end }} initialDelaySeconds: 45 periodSeconds: 5 readinessProbe: + {{- if not $.Values.basicAuth.users }} httpGet: path: /health port: {{ .Values.service.port }} + scheme: {{ ternary "HTTPS" "HTTP" $.Values.tlsServerConfig.enabled }} + {{- else }} + tcpSocket: + port: {{ .Values.service.port }} + {{- end }} initialDelaySeconds: 45 {{- if .Values.resources }} resources: diff --git a/deployment/templates/tls-secret.yaml b/deployment/templates/tls-secret.yaml new file mode 100644 index 00000000..0762eb04 --- /dev/null +++ b/deployment/templates/tls-secret.yaml @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +{{- if and .Values.tlsServerConfig.enabled (not .Values.tlsServerConfig.existingSecret) }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ (include "dcgm-exporter.tlsCertsSecretName" .) }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + app.kubernetes.io/component: "dcgm-exporter" + {{- include "dcgm-exporter.labels" . | nindent 4 }} +type: Opaque +data: + {{- if .Values.tlsServerConfig.autoGenerated }} + {{- $ca := genCA "dcgm-exporter-ca" 365 }} + {{- $hostname := printf "%s" (include "dcgm-exporter.fullname" .) }} + {{- $cert := genSignedCert $hostname nil (list $hostname) 365 $ca }} + {{ .Values.tlsServerConfig.certFilename }}: {{ $cert.Cert | b64enc | quote }} + {{ .Values.tlsServerConfig.keyFilename }}: {{ $cert.Key | b64enc | quote }} + {{- if .Values.tlsServerConfig.clientAuthType }} + {{ .Values.tlsServerConfig.caFilename }}: {{ $ca.Cert | b64enc | quote }} + {{- end }} + {{- else }} + {{ .Values.tlsServerConfig.certFilename }}: {{ required "'tlsServerConfig.cert' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.cert | b64enc | quote }} + {{ .Values.tlsServerConfig.keyFilename }}: {{ required "'tlsServerConfig.key' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.key | b64enc | quote }} + {{- if .Values.tlsServerConfig.clientAuthType }} + {{ .Values.tlsServerConfig.caFilename }}: {{ required "'tlsServerConfig.ca' is required when 'tlsServerConfig.clientAuthType' is provided" .Values.tlsServerConfig.ca | b64enc | quote }} + {{- end }} + {{- end }} +{{- end }} \ No newline at 
end of file diff --git a/deployment/templates/web-config-configmap.yaml b/deployment/templates/web-config-configmap.yaml new file mode 100644 index 00000000..af21dfd1 --- /dev/null +++ b/deployment/templates/web-config-configmap.yaml @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +{{- if or .Values.tlsServerConfig.enabled .Values.basicAuth.users }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "dcgm-exporter.webConfigConfigMap" . }} + namespace: {{ include "dcgm-exporter.namespace" . }} + labels: + app.kubernetes.io/component: "dcgm-exporter" + {{- include "dcgm-exporter.labels" . 
| nindent 4 }} +data: + web-config.yaml: | +{{- if .Values.tlsServerConfig.enabled }} + tls_server_config: + cert_file: {{ required "'tlsServerConfig.certFilename' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.certFilename | printf "/etc/dcgm-exporter/tls/%s" | quote }} + key_file: {{ required "'tlsServerConfig.keyFilename' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.keyFilename | printf "/etc/dcgm-exporter/tls/%s" | quote }} + {{- if .Values.tlsServerConfig.clientAuthType }} + client_auth_type: {{ .Values.tlsServerConfig.clientAuthType }} + client_ca_file: {{ required "'tlsServerConfig.caFilename' is required when 'tlsServerConfig.clientAuthType' is provided" .Values.tlsServerConfig.caFilename | printf "/etc/dcgm-exporter/tls/%s" | quote }} + {{- end }} +{{- end }} +{{- if .Values.basicAuth.users }} + basic_auth_users: + {{- range $user, $password := .Values.basicAuth.users }} + {{ $user }}: {{ (split ":" (htpasswd $user $password))._1 }} + {{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/deployment/values.yaml b/deployment/values.yaml index bf2d5be7..ba1e1e66 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -17,11 +17,11 @@ image: pullPolicy: IfNotPresent # Image tag defaults to AppVersion, but you can use the tag key # for the image tag, e.g: - tag: 3.3.9-3.6.1-ubuntu22.04 + tag: 4.0.0-4.0.0-ubuntu22.04 # Change the following reference to "/etc/dcgm-exporter/default-counters.csv" # to stop profiling metrics from DCGM -arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"] +arguments: ["-f", "/etc/dcgm-exporter/default-counters.csv"] # NOTE: in general, add any command line arguments to arguments above # and they will be passed through. 
# Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine @@ -146,12 +146,12 @@ extraConfigMapVolumes: name: exporter-metrics-config-map items: - key: metrics - path: dcp-metrics-included.csv + path: default-counters.csv extraVolumeMounts: - name: exporter-metrics-volume - mountPath: /etc/dcgm-exporter/dcp-metrics-included.csv - subPath: dcp-metrics-included.csv + mountPath: /etc/dcgm-exporter/default-counters.csv + subPath: default-counters.csv extraEnv: [] #- name: EXTRA_VAR @@ -160,6 +160,38 @@ extraEnv: [] # Path to the kubelet socket for /pod-resources kubeletPath: "/var/lib/kubelet/pod-resources" +# HTTPS configuration +tlsServerConfig: + # Enable or disable HTTPS configuration + enabled: false + # Use autogenerated self-signed TLS certificates. Not recommended for production environments. + autoGenerated: true + # Existing secret containing your own server key and certificate + existingSecret: "" + # Certificate file name + certFilename: "tls.crt" + # Key file name + keyFilename: "tls.key" + # CA certificate file name + caFilename: "ca.crt" + # Server policy for client authentication. Maps to ClientAuth Policies. + # For more detail on clientAuth options: + # https://golang.org/pkg/crypto/tls/#ClientAuthType + # + # NOTE: If you want to enable client authentication, you need to use + # RequireAndVerifyClientCert. Other values are insecure. + clientAuthType: "" + # TLS Key for HTTPS - ignored if existingSecret is provided + key: "" + # TLS Certificate for HTTPS - ignored if existingSecret is provided + cert: "" + # CA Certificate for HTTPS - ignored if existingSecret is provided + ca: "" + +basicAuth: + #Object containing <user>:<password> key-value pairs for each user that will have access via basic authentication + users: {} + # Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list. # Must be the complete list and is not additive. If unset, the default list will take effect.
# customMetrics: | diff --git a/docker/Dockerfile.ubi9 b/docker/Dockerfile.ubi similarity index 58% rename from docker/Dockerfile.ubi9 rename to docker/Dockerfile.ubi index e6154d21..a073d9fc 100644 --- a/docker/Dockerfile.ubi9 +++ b/docker/Dockerfile.ubi @@ -1,15 +1,29 @@ -FROM nvcr.io/nvidia/cuda:12.6.2-base-ubi9 AS builder -ARG GOLANG_VERSION=1.22.4 +ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.6.3-base-ubi9 + +FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder + +ARG GOLANG_VERSION=1.22.5 + WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter -RUN set -eux; \ - dnf clean expire-cache; \ - dnf install -y go-toolset make wget -RUN dnf clean all && rm -rf /usr/bin/go +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ + apt-get update \ + && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + git \ + build-essential \ + gcc \ + gcc-aarch64-linux-gnu \ + qemu-user \ + qemu-system-arm \ + libc6-dev-arm64-cross \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 /lib/ld-linux-aarch64.so.1 -# Install Go official release RUN set -eux; \ + arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \ url=; \ - arch=$(uname -m) && if [ "${arch}" = "x86_64" ]; then arch=amd64; fi && if [ "${arch}" = "aarch64" ]; then arch=arm64; fi; \ case "$arch" in \ 'amd64') \ url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ @@ -19,42 +33,37 @@ RUN set -eux; \ ;; \ *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ esac; \ - build=; \ - if [ -z "$url" ]; then \ - # https://github.com/golang/go/issues/38536#issuecomment-616897960 - build=1; \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ - echo >&2; \ - echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \ - echo >&2; \ - fi; \ wget -O go.tgz "$url" --progress=dot:giga; \ tar 
-C /usr/local -xzf go.tgz; \ rm go.tgz; -ENV GOTOOLCHAIN=local -ENV GOPATH /go -ENV PATH $GOPATH/bin:$PATH +ENV GOTOOLCHAIN=local GOPATH=/go +ENV PATH=$GOPATH/bin:$PATH RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH" -ENV PATH $PATH:/usr/local/go/bin +ENV PATH=$PATH:/usr/local/go/bin COPY . . +ARG TARGETOS +ARG TARGETARCH +RUN --mount=type=cache,target=/root/.cache/go-build \ + --mount=type=cache,target=/go/pkg \ + # when building aarch64 we have to target aarch64-linux-gnu-gcc compiler + if [ "$TARGETARCH" = "arm64" ]; then \ + export CC=aarch64-linux-gnu-gcc; \ + export LD_LIBRARY_PATH=/usr/aarch64-linux-gnu/lib:$LD_LIBRARY_PATH; \ + fi && \ + GOOS=$TARGETOS GOARCH=$TARGETARCH CC=$CC CGO_ENABLED=1 make install -RUN make binary check-format -FROM nvcr.io/nvidia/cuda:12.6.2-base-ubi9 +FROM ${BASEIMAGE} ARG DCGM_VERSION ARG VERSION ARG DIST_DIR LABEL io.k8s.display-name="NVIDIA DCGM Exporter" -LABEL name="NVIDIA DCGM Exporter" -LABEL vendor="NVIDIA" -LABEL version="${VERSION}" -LABEL release="N/A" -LABEL summary="Exports GPU Metrics to Prometheus" -LABEL description="See summary" + +ARG DCGM_VERSION RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ubi-9-baseos-rpms -y \ - && dnf install --nodocs --setopt=install_weak_deps=False -y datacenter-gpu-manager-${DCGM_VERSION} libcap \ + && dnf install --nodocs --setopt=install_weak_deps=False -y datacenter-gpu-manager-4-core libcap \ && dnf -y clean all\ && rm -rf /var/cache/yum\ && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \ @@ -64,11 +73,11 @@ RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ub && rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \ && rm -rf /var/lib/rpm/rpmdb.sqlite /var/cache/* /var/lib/dnf/history.* /var/log/* /tmp/* /var/tmp/* \ && rm -rf /usr/share/doc && rm -rf /usr/share/man \ - && dnf remove openssl + && ldconfig COPY ./LICENSE 
./licenses/LICENSE -COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/ -COPY ./etc /etc/dcgm-exporter +COPY --from=builder /usr/bin/dcgm-exporter /usr/bin/ +COPY etc /etc/dcgm-exporter ENV NVIDIA_VISIBLE_DEVICES=all # disable all constraints on the configurations required by NVIDIA container toolkit @@ -76,7 +85,7 @@ ENV NVIDIA_DISABLE_REQUIRE="true" # Required for DCP metrics ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 -ENV NO_SETCAP= +ENV NO_SETCAP="" COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu new file mode 100644 index 00000000..a8b9c1a7 --- /dev/null +++ b/docker/Dockerfile.ubuntu @@ -0,0 +1,98 @@ +ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04 + +FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder + +ARG GOLANG_VERSION=1.22.5 + +WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter +RUN apt-get -qq update \ + && apt-get -qq install -y --no-install-recommends \ + wget \ + ca-certificates \ + git \ + build-essential \ + gcc \ + gcc-aarch64-linux-gnu \ + qemu-user \ + qemu-system-arm \ + libc6-dev-arm64-cross \ + && ln -sf /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 /lib/ld-linux-aarch64.so.1 + +RUN set -eux; \ + arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \ + url=; \ + case "$arch" in \ + 'amd64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ + ;; \ + 'arm64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ + ;; \ + *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ + esac; \ + wget -q -O go.tgz "$url"; \ + tar -C /usr/local -xzf go.tgz; \ + rm go.tgz; +ENV GOTOOLCHAIN=local GOPATH=/go +ENV PATH=$GOPATH/bin:$PATH +RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH" +ENV 
PATH=$PATH:/usr/local/go/bin +COPY go.mod go.sum ./ +COPY . . +RUN go mod download +# when building aarch64 we have to target aarch64-linux-gnu-gcc compiler +ARG TARGETOS +ARG TARGETARCH +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + export CC=aarch64-linux-gnu-gcc; \ + export LD_LIBRARY_PATH=/usr/aarch64-linux-gnu/lib:$LD_LIBRARY_PATH; \ + fi && \ + GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=1 make install +RUN apt-get update && apt-get install -y file && rm -rf /var/lib/apt/lists/* +RUN file /usr/bin/dcgm-exporter + +FROM --platform=$TARGETARCH ${BASEIMAGE} + +ARG VERSION +ARG DCGM_VERSION +ARG DIST_DIR +ARG TARGETARCH + +LABEL io.k8s.display-name="NVIDIA DCGM Exporter" +LABEL name="NVIDIA DCGM Exporter" +LABEL vendor="NVIDIA" +LABEL version="${VERSION}" +LABEL release="N/A" +LABEL summary="Exports GPU Metrics to Prometheus" +LABEL description="See summary" + +COPY ./LICENSE ./licenses/LICENSE +COPY --from=builder /usr/bin/dcgm-exporter /usr/bin/ +COPY etc /etc/dcgm-exporter +ENV DEBIAN_FRONTEND=noninteractive +RUN echo "$TARGETARCH" && apt-get -qq update && apt-get -qq install -y --no-install-recommends \ + datacenter-gpu-manager-4-core libcap2-bin \ + && apt-get -qq purge --autoremove -y openssl \ + && apt-get -qq -y clean \ + && apt-get -qq -y autoclean \ + && apt-get -qq autoremove -y \ + && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \ + # DCGM exporter doesn't use libdcgm_cublas_proxy*.so. 
+ && rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \ + && rm -rf /usr/local/dcgm/scripts \ + && rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \ + && rm -rf /var/cache/debconf/* /var/lib/apt/lists/* /var/log/* /tmp/* /var/tmp/* \ + && rm -rf /usr/share/doc && rm -rf /usr/share/man \ + && ldconfig +# Required for DCP metrics +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 +# disable all constraints on the configurations required by NVIDIA container toolkit +ENV NVIDIA_DISABLE_REQUIRE="true" +ENV NVIDIA_VISIBLE_DEVICES=all + +ENV NO_SETCAP="" +COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh +RUN uname -a +RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh + +ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] diff --git a/docker/Dockerfile.ubuntu22.04 b/docker/Dockerfile.ubuntu22.04 deleted file mode 100644 index 1cd1c3cc..00000000 --- a/docker/Dockerfile.ubuntu22.04 +++ /dev/null @@ -1,88 +0,0 @@ -FROM nvcr.io/nvidia/cuda:12.6.2-base-ubuntu22.04 AS builder -ARG GOLANG_VERSION=1.22.4 -WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter -RUN set -eux; \ - apt-get update; \ - apt-get install -y --no-install-recommends \ - g++ \ - gcc \ - libc6-dev \ - make \ - pkg-config \ - wget \ - ; \ - rm -rf /var/lib/apt/lists/* -RUN set -eux; \ - arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \ - url=; \ - case "$arch" in \ - 'amd64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ - ;; \ - 'arm64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ - ;; \ - *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ - esac; \ - build=; \ - if [ -z "$url" ]; then \ - # https://github.com/golang/go/issues/38536#issuecomment-616897960 - build=1; \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ - echo >&2; \ - echo >&2 "warning: current architecture ($arch) 
does not have a compatible Go binary release; will be building from source"; \ - echo >&2; \ - fi; \ - wget -O go.tgz "$url" --progress=dot:giga; \ - tar -C /usr/local -xzf go.tgz; \ - rm go.tgz; -ENV GOTOOLCHAIN=local -ENV GOPATH /go -ENV PATH $GOPATH/bin:$PATH -RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH" -ENV PATH $PATH:/usr/local/go/bin -COPY . . - -RUN make binary check-format - -FROM nvcr.io/nvidia/cuda:12.6.2-base-ubuntu22.04 - -ARG VERSION -ARG DCGM_VERSION -ARG DIST_DIR - -LABEL io.k8s.display-name="NVIDIA DCGM Exporter" -LABEL name="NVIDIA DCGM Exporter" -LABEL vendor="NVIDIA" -LABEL version="${VERSION}" -LABEL release="N/A" -LABEL summary="Exports GPU Metrics to Prometheus" -LABEL description="See summary" - -COPY ./LICENSE ./licenses/LICENSE -COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/ -COPY etc /etc/dcgm-exporter - -RUN apt-get update && apt-get install -y --no-install-recommends \ - datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl \ - && apt-get -y clean \ - && apt-get -y autoclean \ - && apt-get autoremove -y \ - && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \ - # DCGM exporter doesn't use libdcgm_cublas_proxy*.so. 
- && rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \ - && rm -rf /usr/local/dcgm/scripts \ - && rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \ - && rm -rf /var/cache/debconf/* /var/lib/apt/lists/* /var/log/* /tmp/* /var/tmp/* \ - && rm -rf /usr/share/doc && rm -rf /usr/share/man -# Required for DCP metrics -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 -# disable all constraints on the configurations required by NVIDIA container toolkit -ENV NVIDIA_DISABLE_REQUIRE="true" -ENV NVIDIA_VISIBLE_DEVICES=all - -ENV NO_SETCAP= -COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh -RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh - -ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] diff --git a/etc/default-counters.csv b/etc/default-counters.csv index ad949dd2..aa77a4e4 100644 --- a/etc/default-counters.csv +++ b/etc/default-counters.csv @@ -5,7 +5,6 @@ # Clocks DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). -# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param). # Temperature DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). @@ -16,8 +15,8 @@ DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). # PCIE -DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. -DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +# DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +# DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. 
# Utilization (the sample period varies depending on the product) @@ -34,10 +33,10 @@ DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encoun # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). -# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). + # Memory usage -DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). -DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). +DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). +DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). # ECC # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. @@ -55,7 +54,8 @@ DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. 
# VGPU License status DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status @@ -75,3 +75,16 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version # DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version # DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version # DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device + +# Datacenter Profiling (DCP) metrics +# NOTE: supported on Nvidia datacenter Volta GPUs and newer +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. +# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. +# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. +# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. +# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. +# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. +DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
diff --git a/go.mod b/go.mod index 9436d873..fa1efc33 100644 --- a/go.mod +++ b/go.mod @@ -2,57 +2,59 @@ module github.com/NVIDIA/dcgm-exporter go 1.22.0 -toolchain go1.22.4 +toolchain go1.22.9 require ( - github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f - github.com/NVIDIA/go-nvml v0.12.0-2 - github.com/avast/retry-go/v4 v4.5.1 - github.com/bits-and-blooms/bitset v1.13.0 - github.com/go-kit/log v0.2.1 + github.com/NVIDIA/go-dcgm v0.0.0-20250106155650-850266c9c8a5 + github.com/NVIDIA/go-nvml v0.12.4-0 + github.com/avast/retry-go/v4 v4.6.0 + github.com/bits-and-blooms/bitset v1.17.0 github.com/google/uuid v1.6.0 github.com/gorilla/mux v1.8.1 - github.com/mittwald/go-helm-client v0.12.9 - github.com/onsi/ginkgo/v2 v2.15.0 - github.com/onsi/gomega v1.32.0 - github.com/prometheus/client_model v0.6.0 - github.com/prometheus/common v0.47.0 - github.com/prometheus/exporter-toolkit v0.11.0 + github.com/mittwald/go-helm-client v0.12.14 + github.com/onsi/ginkgo/v2 v2.22.0 + github.com/onsi/gomega v1.36.0 + github.com/pkg/errors v0.9.1 + github.com/prometheus/client_model v0.6.1 + github.com/prometheus/common v0.60.1 + github.com/prometheus/exporter-toolkit v0.13.1 github.com/sirupsen/logrus v1.9.3 - github.com/stretchr/testify v1.8.4 + github.com/stretchr/testify v1.10.0 github.com/urfave/cli/v2 v2.27.1 go.uber.org/automaxprocs v1.5.3 go.uber.org/mock v0.4.0 - golang.org/x/sync v0.7.0 - google.golang.org/grpc v1.64.1 - k8s.io/api v0.30.2 - k8s.io/apimachinery v0.30.2 - k8s.io/client-go v0.30.2 + golang.org/x/sync v0.8.0 + google.golang.org/grpc v1.65.0 + k8s.io/api v0.31.1 + k8s.io/apimachinery v0.31.1 + k8s.io/client-go v0.31.1 k8s.io/kubelet v0.30.2 - k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 + k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 ) require ( + dario.cat/mergo v1.0.1 // indirect github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 // indirect github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect 
github.com/BurntSushi/toml v1.3.2 // indirect github.com/MakeNowJust/heredoc v1.0.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver v1.5.0 // indirect - github.com/Masterminds/semver/v3 v3.2.1 // indirect - github.com/Masterminds/sprig/v3 v3.2.3 // indirect + github.com/Masterminds/semver/v3 v3.3.0 // indirect + github.com/Masterminds/sprig/v3 v3.3.0 // indirect github.com/Masterminds/squirrel v1.5.4 // indirect github.com/Microsoft/hcsshim v0.11.4 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beorn7/perks v1.0.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.2 // indirect github.com/containerd/containerd v1.7.12 // indirect github.com/containerd/log v0.1.0 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect - github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect - github.com/cyphar/filepath-securejoin v0.2.4 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect + github.com/cyphar/filepath-securejoin v0.3.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.5.0 // indirect github.com/docker/cli v26.1.4+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect @@ -60,20 +62,20 @@ require ( github.com/docker/docker-credential-helpers v0.8.0 // indirect github.com/docker/go-connections v0.5.0 // indirect github.com/docker/go-metrics v0.0.1 // indirect - github.com/emicklei/go-restful/v3 v3.11.1 // indirect - github.com/evanphx/json-patch v5.7.0+incompatible // indirect + github.com/emicklei/go-restful/v3 v3.12.1 // indirect + github.com/evanphx/json-patch v5.9.0+incompatible // indirect github.com/exponent-io/jsonpath 
v0.0.0-20210407135951-1de76d718b3f // indirect github.com/fatih/color v1.16.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-errors/errors v1.5.1 // indirect github.com/go-gorp/gorp/v3 v3.1.0 // indirect - github.com/go-logfmt/logfmt v0.6.0 // indirect - github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.20.2 // indirect github.com/go-openapi/jsonreference v0.20.4 // indirect github.com/go-openapi/swag v0.22.7 // indirect - github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect @@ -81,21 +83,21 @@ require ( github.com/google/gnostic-models v0.6.8 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect + github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/gorilla/websocket v1.5.1 // indirect github.com/gosuri/uitable v0.0.4 // indirect github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect - github.com/huandu/xstrings v1.4.0 // indirect + github.com/huandu/xstrings v1.5.0 // indirect github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jmoiron/sqlx v1.3.5 // indirect + github.com/jmoiron/sqlx v1.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - 
github.com/klauspost/compress v1.17.4 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect github.com/lib/pq v1.10.9 // indirect @@ -104,11 +106,13 @@ require ( github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect + github.com/mdlayher/socket v0.4.1 // indirect + github.com/mdlayher/vsock v1.2.1 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/moby/locker v1.0.1 // indirect - github.com/moby/spdystream v0.2.0 // indirect + github.com/moby/spdystream v0.4.0 // indirect github.com/moby/term v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect @@ -117,58 +121,56 @@ require ( github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.0-rc6 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect - github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.18.0 // indirect - github.com/prometheus/procfs v0.12.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_golang v1.20.4 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/rivo/uniseg v0.4.4 // indirect - github.com/rubenv/sql-migrate v1.6.0 // indirect + github.com/rubenv/sql-migrate v1.7.0 // indirect 
github.com/russross/blackfriday/v2 v2.1.0 // indirect - github.com/shopspring/decimal v1.3.1 // indirect - github.com/spf13/cast v1.6.0 // indirect - github.com/spf13/cobra v1.8.0 // indirect + github.com/shopspring/decimal v1.4.0 // indirect + github.com/spf13/cast v1.7.0 // indirect + github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/stretchr/objx v0.5.0 // indirect + github.com/stretchr/objx v0.5.2 // indirect + github.com/x448/float16 v0.8.4 // indirect github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect github.com/xeipuuv/gojsonschema v1.2.0 // indirect github.com/xlab/treeprint v1.2.0 // indirect github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 // indirect - go.opentelemetry.io/otel v1.21.0 // indirect - go.opentelemetry.io/otel/metric v1.21.0 // indirect - go.opentelemetry.io/otel/trace v1.21.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect + go.opentelemetry.io/otel v1.28.0 // indirect + go.opentelemetry.io/otel/metric v1.28.0 // indirect + go.opentelemetry.io/otel/trace v1.28.0 // indirect go.starlark.net v0.0.0-20231121155337-90ade8b19d09 // indirect - golang.org/x/crypto v0.24.0 // indirect - golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect - golang.org/x/net v0.26.0 // indirect - golang.org/x/oauth2 v0.18.0 // indirect - golang.org/x/sys v0.21.0 // indirect - golang.org/x/term v0.21.0 // indirect - golang.org/x/text v0.16.0 // indirect + golang.org/x/crypto v0.28.0 // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/oauth2 v0.23.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools 
v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/appengine v1.6.8 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect - google.golang.org/protobuf v1.33.0 // indirect - gopkg.in/evanphx/json-patch.v5 v5.7.0 // indirect + golang.org/x/tools v0.26.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect + google.golang.org/protobuf v1.35.1 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - helm.sh/helm/v3 v3.15.2 // indirect - k8s.io/apiextensions-apiserver v0.30.0 // indirect - k8s.io/apiserver v0.30.2 // indirect - k8s.io/cli-runtime v0.30.0 // indirect - k8s.io/component-base v0.30.2 // indirect - k8s.io/klog/v2 v2.120.1 // indirect + helm.sh/helm/v3 v3.16.1 // indirect + k8s.io/apiextensions-apiserver v0.31.1 // indirect + k8s.io/apiserver v0.31.1 // indirect + k8s.io/cli-runtime v0.31.1 // indirect + k8s.io/component-base v0.31.1 // indirect + k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect - k8s.io/kubectl v0.30.0 // indirect + k8s.io/kubectl v0.31.0 // indirect oras.land/oras-go v1.2.5 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/kustomize/api v0.16.0 // indirect - sigs.k8s.io/kustomize/kyaml v0.16.0 // indirect + sigs.k8s.io/kustomize/api v0.17.2 // indirect + sigs.k8s.io/kustomize/kyaml v0.17.1 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/go.sum b/go.sum index 459f9df0..0f30e6bb 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= +dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +filippo.io/edwards25519 v1.1.0 
h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= +filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU= github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= @@ -12,21 +16,20 @@ github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJ github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= -github.com/Masterminds/semver/v3 v3.2.0/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= -github.com/Masterminds/semver/v3 v3.2.1 h1:RN9w6+7QoMeJVGyfmbcgs28Br8cvmnucEXnY0rYXWg0= -github.com/Masterminds/semver/v3 v3.2.1/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= -github.com/Masterminds/sprig/v3 v3.2.3 h1:eL2fZNezLomi0uOLqjQoN6BfsDD+fyLtgbJMAj9n6YA= -github.com/Masterminds/sprig/v3 v3.2.3/go.mod h1:rXcFaZ2zZbLRJv/xSysmlgIM1u11eBaRMhvYXJNkGuM= +github.com/Masterminds/semver/v3 v3.3.0 h1:B8LGeaivUe71a5qox1ICM/JLl0NqZSW5CHyL+hmvYS0= +github.com/Masterminds/semver/v3 v3.3.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= +github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio 
v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= -github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8= -github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4= -github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY= -github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= +github.com/NVIDIA/go-dcgm v0.0.0-20250106155650-850266c9c8a5 h1:+HrFl/XGrOqfX8tgvJTCHfuDzbZbpdEQmbOdcDR53Ew= +github.com/NVIDIA/go-dcgm v0.0.0-20250106155650-850266c9c8a5/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4= +github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg= +github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= @@ -35,14 +38,16 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= -github.com/avast/retry-go/v4 v4.5.1 h1:AxIx0HGi4VZ3I02jr78j5lZ3M6x1E0Ivxa6b0pUUh7o= -github.com/avast/retry-go/v4 v4.5.1/go.mod 
h1:/sipNsvNB3RRuT5iNcb6h73nw3IBmXJ/H3XrCQYSOpc= +github.com/avast/retry-go/v4 v4.6.0 h1:K9xNA+KeB8HHc2aWFuLb25Offp+0iVRXEvFx8IinRJA= +github.com/avast/retry-go/v4 v4.6.0/go.mod h1:gvWlPhBVsvBbLkVGDg/KwvBv0bEkCOLRRSHKIr2PyOE= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= -github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.17.0 h1:1X2TS7aHz1ELcC0yU1y2stUs/0ig5oMU6STFZGrhvHI= +github.com/bits-and-blooms/bitset v1.17.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bshuster-repo/logrus-logstash-hook v1.0.0 h1:e+C0SB5R1pu//O4MQ3f9cFuPGoOVeF2fE4Og9otCc70= github.com/bshuster-repo/logrus-logstash-hook v1.0.0/go.mod h1:zsTqEiSzDgAa/8GZR7E1qaXrhYNDKBYy5/dWPTIflbk= github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd h1:rFt+Y/IK1aEZkEHchZRSq9OQbsSzIT/OrI8YFFmRIng= @@ -51,13 +56,10 @@ github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b h1:otBG+dV+YK+Soembj github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50= github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0 h1:nvj0OLI3YqYXer/kZD8Ri1aaunCxIEsOst1BVJswV0o= github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= 
-github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.2 h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= -github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/containerd v1.7.12 h1:+KQsnv4VnzyxWcfO9mlxxELaoztsDEjOuCMPAuPqgU0= @@ -68,15 +70,16 @@ github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpuguy83/go-md2man/v2 v2.0.3 h1:qMCsGGgs+MAzDFyp9LpAe1Lqy/fY/qCovCm0qnXZOBM= -github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= -github.com/cyphar/filepath-securejoin v0.2.4 
h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= -github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= +github.com/cyphar/filepath-securejoin v0.3.1 h1:1V7cHiaW+C+39wEfpH6XlLBQo3j/PciWFrgfCLS8XrE= +github.com/cyphar/filepath-securejoin v0.3.1/go.mod h1:F7i41x/9cBF7lzCrVsYs9fuzwRZm4NQsGTBdpp6mETc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/distribution/v3 v3.0.0-20221208165359-362910506bc2 h1:aBfCb7iqHmDEIp6fBvC/hQUddQfg+3qdYjwzaiP9Hnc= github.com/distribution/distribution/v3 v3.0.0-20221208165359-362910506bc2/go.mod h1:WHNsWjnIn2V1LYOrME7e8KxSeKunYHsxEm4am0BUtcI= github.com/distribution/reference v0.5.0 h1:/FUIFXtfc/x2gpa5/VGfiGLuOIdYa1t65IKK2OFGvA0= @@ -97,34 +100,32 @@ github.com/docker/go-metrics v0.0.1 h1:AgB/0SvBxihN0X8OR4SjsblXkbMvalQ8cjmtKQ2rQ github.com/docker/go-metrics v0.0.1/go.mod h1:cG1hvH2utMXtqgqqYE9plW6lDxS3/5ayHzueweSI3Vw= github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 h1:ZClxb8laGDf5arXfYcAtECDFgAgHklGI8CxgjHnXKJ4= github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE= -github.com/emicklei/go-restful/v3 v3.11.1 h1:S+9bSbua1z3FgCnV0KKOSSZ3mDthb5NyEPL5gEpCvyk= -github.com/emicklei/go-restful/v3 v3.11.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= 
+github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= +github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= +github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4= github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f/go.mod h1:OSYXu++VVOHnXeitef/D8n/6y4QV8uLHSFXX4NeXMGc= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/foxcpp/go-mockdns v1.0.0 h1:7jBqxd3WDWwi/6WhDvacvH1XsN3rOLXyHM1uhvIx6FI= -github.com/foxcpp/go-mockdns v1.0.0/go.mod h1:lgRN6+KxQBawyIghpnl5CezHFGS9VLzvtVlwxvzXTQ4= +github.com/foxcpp/go-mockdns v1.1.0 h1:jI0rD8M0wuYAxL7r/ynTrCQQq0BVqfB99Vgk7DlmewI= +github.com/foxcpp/go-mockdns v1.1.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= +github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk= github.com/go-errors/errors v1.5.1/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-gorp/gorp/v3 v3.1.0 h1:ItKF/Vbuj31dmV4jxA1qblpSwkl9g1typ24xoe70IGs= github.com/go-gorp/gorp/v3 v3.1.0/go.mod 
h1:dLEjIyyRNiXvNZ8PSmzpt1GsWAUK8kjVhEpjH8TixEw= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= -github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= -github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= -github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q= @@ -133,11 +134,11 @@ github.com/go-openapi/jsonreference v0.20.4 h1:bKlDxQxQJgwpUSgOENiMPzCTBVuc7vTdX github.com/go-openapi/jsonreference v0.20.4/go.mod h1:5pZJyJP2MnYCpoeoMAql78cCHauHj0V9Lhc506VOpw4= github.com/go-openapi/swag v0.22.7 h1:JWrc1uc/P9cSomxfnsFSVWoE1FW6bNbrVPmpQYpCcR8= github.com/go-openapi/swag v0.22.7/go.mod h1:Gl91UqO+btAM0plGGxHqJcQZ1ZTy6jbmridBTsDy8A0= -github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= -github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= +github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= +github.com/go-sql-driver/mysql 
v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -149,8 +150,6 @@ github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4er github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/gomodule/redigo v1.8.2 h1:H5XSIre1MB5NbPYFp+i1NBbb5qN1W8Y8YAQoAYbkm8k= @@ -160,25 +159,22 @@ github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl76 github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 
-github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= +github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= -github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/handlers v1.5.1 h1:9lRY6j8DEeeBT10CvO9hGW0gmky0BprnvDI5vfhUHH4= github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gorilla/websocket v1.5.1 
h1:gmztn0JnHVt9JZquRuzLw3g4wouNVzKL15iLr/zn/QY= github.com/gorilla/websocket v1.5.1/go.mod h1:x3kM2JMyaluk02fnUJpQuwD2dCS5NDG2ZHL0uE0tcaY= github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= @@ -192,17 +188,14 @@ github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+l github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= -github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= -github.com/huandu/xstrings v1.4.0 h1:D17IlohoQq4UcpqD7fDk80P7l+lwAmlFaBHgOipl2FU= -github.com/huandu/xstrings v1.4.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= -github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= +github.com/huandu/xstrings v1.5.0 h1:2ag3IFq9ZDANvthTwTiqSSZLjDc+BedvHPAp5tJy2TI= +github.com/huandu/xstrings v1.5.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/jmoiron/sqlx v1.3.5 h1:vFFPA71p1o5gAeqtEAwLU4dnX2napprKtHr7PYIcN3g= -github.com/jmoiron/sqlx v1.3.5/go.mod h1:nRVWtLre0KfCLJvgxzCsLVMogSvQ1zNJtpYr2Ccp0mQ= +github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= +github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/josharian/intern v1.0.0 
h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= @@ -214,19 +207,20 @@ github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHm github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= -github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0/go.mod h1:dXGbAdH5GtBTC4WfIxhKZfyBF/HBFgRZSWwZ9g/He9o= github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 
h1:P6pPBnrTSX3DEVR4fDembhRWSsG5rVo6hYhAB/ADZrk= github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6FmdpVm2joNMFikkuWg0EoCKLGUMNw= -github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= @@ -240,26 +234,27 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= -github.com/mattn/go-sqlite3 v1.14.15 h1:vfoHhTN1af61xCRSWzFIWzx2YskyMTwHLrExkBOjvxI= -github.com/mattn/go-sqlite3 v1.14.15/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= +github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/miekg/dns v1.1.25 h1:dFwPR6SfLtrSwgDcIq2bcU/gVutB4sNApq2HBdqcakg= -github.com/miekg/dns v1.1.25/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= -github.com/mitchellh/copystructure v1.0.0/go.mod h1:SNtv71yrdKgLRyLFxmLdkAbkKEFWgYaq1OVrnRcwhnw= +github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= +github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= +github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= +github.com/mdlayher/vsock v1.2.1/go.mod 
h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE= +github.com/miekg/dns v1.1.57 h1:Jzi7ApEIzwEPLHWRcafCN9LZSBbqQpxjt/wpgvg7wcM= +github.com/miekg/dns v1.1.57/go.mod h1:uqRjCRUuEAA6qsOiJvDd+CFo/vW+y5WR6SNmHE55hZk= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= -github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= -github.com/mittwald/go-helm-client v0.12.9 h1:tfI5ECgrbfAolA9TnlCeA5F2TEIvdsOxVmoSyW80lCI= -github.com/mittwald/go-helm-client v0.12.9/go.mod h1:ukR3Et5zbfBij7bFL1ZnLvPytsbBXCrI2qQYr2yVi9I= +github.com/mittwald/go-helm-client v0.12.14 h1:az3GJ4kRmFK609Ic3iHXveNtg92n9jWG0YpKKTIK4oo= +github.com/mittwald/go-helm-client v0.12.14/go.mod h1:2VogAupgnV7FiuoPqtpCYKS/RrMh9fFA3/pD/OmTaLc= github.com/moby/locker v1.0.1 h1:fOXqR41zeveg4fFODix+1Ch4mj/gT0NE1XJbp/epuBg= github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc= -github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= -github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/spdystream v0.4.0 h1:Vy79D6mHeJJjiPdFEL2yku1kl0chZpJfZcPpb16BRl8= +github.com/moby/spdystream v0.4.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vygl78= github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/moby/term v0.5.0 
h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= @@ -280,14 +275,14 @@ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY= -github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM= -github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= -github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= +github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= +github.com/onsi/gomega v1.36.0 h1:Pb12RlruUtj4XUuPUqeEWc6j5DkVVVA49Uf6YLfC95Y= +github.com/onsi/gomega v1.36.0/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.0-rc6 h1:XDqvyKsJEbRtATzkgItUqBA7QHk58yxX1Ov9HERHNqU= -github.com/opencontainers/image-spec v1.1.0-rc6/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= +github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= +github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/phayes/freeport 
v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI= @@ -295,8 +290,9 @@ github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rK github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/poy/onpar v1.1.2 h1:QaNrNiZx0+Nar5dLgTVp5mXkyoVFIbepjyEoGSnhbAY= github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjzg= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= @@ -304,63 +300,59 @@ github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g= -github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= -github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= +github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI= +github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= 
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos= -github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc= -github.com/prometheus/common v0.47.0 h1:p5Cz0FNHo7SnWOmWmoRozVcjEp0bIVU8cV7OShpjL1k= -github.com/prometheus/common v0.47.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= -github.com/prometheus/exporter-toolkit v0.11.0 h1:yNTsuZ0aNCNFQ3aFTD2uhPOvr4iD7fdBvKPAEGkNf+g= -github.com/prometheus/exporter-toolkit v0.11.0/go.mod h1:BVnENhnNecpwoTLiABx7mrPB/OLRIgN74qlQbV+FK1Q= +github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPAaSc= +github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw= +github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04= +github.com/prometheus/exporter-toolkit v0.13.1/go.mod h1:ujdv2YIOxtdFxxqtloLpbqmxd5J0Le6IITUvIRSWjj0= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ= -github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= -github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/prometheus/procfs v0.15.1 
h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/rubenv/sql-migrate v1.6.0 h1:IZpcTlAx/VKXphWEpwWJ7BaMq05tYtE80zYz+8a5Il8= -github.com/rubenv/sql-migrate v1.6.0/go.mod h1:m3ilnKP7sNb4eYkLsp6cGdPOl4OBcXM6rcbzU+Oqc5k= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/rubenv/sql-migrate v1.7.0 h1:HtQq1xyTN2ISmQDggnh0c9U3JlP8apWh8YO2jzlXpTI= +github.com/rubenv/sql-migrate v1.7.0/go.mod h1:S4wtDEG1CKn+0ShpTtzWhFpHHI5PvCUtiGI+C+Z2THE= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= -github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= -github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= -github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= -github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= +github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= +github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= 
+github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= -github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= -github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0= -github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho= +github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w= +github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod 
h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/urfave/cli/v2 v2.27.1 h1:8xSQ6szndafKVRmfyeUMxkNUJQMjL1F2zmsZ+qHpfho= github.com/urfave/cli/v2 v2.27.1/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= @@ -374,7 +366,6 @@ github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRT github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod 
h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43 h1:+lm10QQTNSBd8DVTNGHx7o/IKu9HYDvLMffDhbyLccI= github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43/go.mod h1:aX5oPXxHm3bOH+xeAttToC8pqch2ScQN/JoXYupl6xs= github.com/yvasiyarov/gorelic v0.0.0-20141212073537-a9bba5b9ab50 h1:hlE8//ciYMztlGpl/VA+Zm1AcTPHYkHJPbHqE6WJUXE= @@ -383,14 +374,14 @@ github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f h1 github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f/go.mod h1:GlGEuHIJweS1mbCqG+7vt2nvWLzLLnRHbXz5JKd/Qbg= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 h1:aFJWCqJMNjENlcleuuOkGAPH82y0yULBScfXcIEdS24= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1/go.mod h1:sEGXWArGqc3tVa+ekntsN65DmVbVeW+7lTKTjZF3/Fo= -go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc= -go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= -go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4= -go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM= -go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc= -go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= +go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= +go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= 
+go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= +go.opentelemetry.io/otel/metric v1.28.0/go.mod h1:Fb1eVBFZmLVTMb6PPohq3TO9IIhUisDsbJoL/+uQW4s= +go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g= +go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI= go.starlark.net v0.0.0-20231121155337-90ade8b19d09 h1:hzy3LFnSN8kuQK8h9tHl4ndF6UruMj47OqwqsS+/Ai4= go.starlark.net v0.0.0-20231121155337-90ade8b19d09/go.mod h1:LcLNIzVOMp4oV+uusnpk+VU+SzXaJakUuBjoCSWH5dM= go.uber.org/automaxprocs v1.5.3 h1:kWazyxZUrS3Gs4qUpbwo5kEIMGe/DAvi5Z4tl2NW4j8= @@ -403,103 +394,75 @@ golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnf golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= -golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= -golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc h1:ao2WRsKSzW6KuUY9IWPwWahcHCgR0s52IfwutMfEbdM= -golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= +golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= +golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod 
v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= -golang.org/x/oauth2 v0.18.0 h1:09qnuIAgzdx1XplqJvW6CQqMCtGZykZWcXzPMPUusvI= -golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi8= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= +golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync 
v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190801041406-cbf593c0f2f3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.16.0 
h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= +golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= -google.golang.org/genproto/googleapis/rpc 
v0.0.0-20240318140521-94a12d6c2237 h1:NnYq6UN9ReLM9/Y01KWNOWyI5xQ9kbIms5GGJVwS/Yc= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY= -google.golang.org/grpc v1.64.1 h1:LKtvyfbX3UGVPFcGqJ9ItpVWW6oN/2XqTxfAnwRRXiA= -google.golang.org/grpc v1.64.1/go.mod h1:hiQF4LFZelK2WKaP6W0L92zGHtiQdZxk8CrSdvyjeP0= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 h1:BwIjyKYGsK9dMCBOorzRri8MQwmi7mT9rGHsCEinZkA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v5 v5.7.0 h1:dGKGylPlZ/jus2g1YqhhyzfH0gPy2R8/MYUpW/OslTY= -gopkg.in/evanphx/json-patch.v5 v5.7.0/go.mod h1:/kvTRh1TVm5wuM6OkHxqXtE/1nUZZpihg29RtuIyfvk= +gopkg.in/evanphx/json-patch.v4 v4.12.0 
h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -507,40 +470,40 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= -helm.sh/helm/v3 v3.15.2 h1:/3XINUFinJOBjQplGnjw92eLGpgXXp1L8chWPkCkDuw= -helm.sh/helm/v3 v3.15.2/go.mod h1:FzSIP8jDQaa6WAVg9F+OkKz7J0ZmAga4MABtTbsb9WQ= -k8s.io/api v0.30.2 h1:+ZhRj+28QT4UOH+BKznu4CBgPWgkXO7XAvMcMl0qKvI= -k8s.io/api v0.30.2/go.mod h1:ULg5g9JvOev2dG0u2hig4Z7tQ2hHIuS+m8MNZ+X6EmI= -k8s.io/apiextensions-apiserver v0.30.0 h1:jcZFKMqnICJfRxTgnC4E+Hpcq8UEhT8B2lhBcQ+6uAs= -k8s.io/apiextensions-apiserver v0.30.0/go.mod h1:N9ogQFGcrbWqAY9p2mUAL5mGxsLqwgtUce127VtRX5Y= -k8s.io/apimachinery v0.30.2 h1:fEMcnBj6qkzzPGSVsAZtQThU62SmQ4ZymlXRC5yFSCg= -k8s.io/apimachinery v0.30.2/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= -k8s.io/apiserver v0.30.2 h1:ACouHiYl1yFI2VFI3YGM+lvxgy6ir4yK2oLOsLI1/tw= -k8s.io/apiserver v0.30.2/go.mod h1:BOTdFBIch9Sv0ypSEcUR6ew/NUFGocRFNl72Ra7wTm8= 
-k8s.io/cli-runtime v0.30.0 h1:0vn6/XhOvn1RJ2KJOC6IRR2CGqrpT6QQF4+8pYpWQ48= -k8s.io/cli-runtime v0.30.0/go.mod h1:vATpDMATVTMA79sZ0YUCzlMelf6rUjoBzlp+RnoM+cg= -k8s.io/client-go v0.30.2 h1:sBIVJdojUNPDU/jObC+18tXWcTJVcwyqS9diGdWHk50= -k8s.io/client-go v0.30.2/go.mod h1:JglKSWULm9xlJLx4KCkfLLQ7XwtlbflV6uFFSHTMgVs= -k8s.io/component-base v0.30.2 h1:pqGBczYoW1sno8q9ObExUqrYSKhtE5rW3y6gX88GZII= -k8s.io/component-base v0.30.2/go.mod h1:yQLkQDrkK8J6NtP+MGJOws+/PPeEXNpwFixsUI7h/OE= -k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= -k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +helm.sh/helm/v3 v3.16.1 h1:cER6tI/8PgUAsaJaQCVBUg3VI9KN4oVaZJgY60RIc0c= +helm.sh/helm/v3 v3.16.1/go.mod h1:r+xBHHP20qJeEqtvBXMf7W35QDJnzY/eiEBzt+TfHps= +k8s.io/api v0.31.1 h1:Xe1hX/fPW3PXYYv8BlozYqw63ytA92snr96zMW9gWTU= +k8s.io/api v0.31.1/go.mod h1:sbN1g6eY6XVLeqNsZGLnI5FwVseTrZX7Fv3O26rhAaI= +k8s.io/apiextensions-apiserver v0.31.1 h1:L+hwULvXx+nvTYX/MKM3kKMZyei+UiSXQWciX/N6E40= +k8s.io/apiextensions-apiserver v0.31.1/go.mod h1:tWMPR3sgW+jsl2xm9v7lAyRF1rYEK71i9G5dRtkknoQ= +k8s.io/apimachinery v0.31.1 h1:mhcUBbj7KUjaVhyXILglcVjuS4nYXiwC+KKFBgIVy7U= +k8s.io/apimachinery v0.31.1/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/apiserver v0.31.1 h1:Sars5ejQDCRBY5f7R3QFHdqN3s61nhkpaX8/k1iEw1c= +k8s.io/apiserver v0.31.1/go.mod h1:lzDhpeToamVZJmmFlaLwdYZwd7zB+WYRYIboqA1kGxM= +k8s.io/cli-runtime v0.31.1 h1:/ZmKhmZ6hNqDM+yf9s3Y4KEYakNXUn5sod2LWGGwCuk= +k8s.io/cli-runtime v0.31.1/go.mod h1:pKv1cDIaq7ehWGuXQ+A//1OIF+7DI+xudXtExMCbe9U= +k8s.io/client-go v0.31.1 h1:f0ugtWSbWpxHR7sjVpQwuvw9a3ZKLXX0u0itkFXufb0= +k8s.io/client-go v0.31.1/go.mod h1:sKI8871MJN2OyeqRlmA4W4KM9KBdBUpDLu/43eGemCg= +k8s.io/component-base v0.31.1 h1:UpOepcrX3rQ3ab5NB6g5iP0tvsgJWzxTyAo20sgYSy8= +k8s.io/component-base v0.31.1/go.mod h1:WGeaw7t/kTsqpVTaCoVEtillbqAhF2/JgvO0LDOMa0w= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 
v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= -k8s.io/kubectl v0.30.0 h1:xbPvzagbJ6RNYVMVuiHArC1grrV5vSmmIcSZuCdzRyk= -k8s.io/kubectl v0.30.0/go.mod h1:zgolRw2MQXLPwmic2l/+iHs239L49fhSeICuMhQQXTI= +k8s.io/kubectl v0.31.0 h1:kANwAAPVY02r4U4jARP/C+Q1sssCcN/1p9Nk+7BQKVg= +k8s.io/kubectl v0.31.0/go.mod h1:pB47hhFypGsaHAPjlwrNbvhXgmuAr01ZBvAIIUaI8d4= k8s.io/kubelet v0.30.2 h1:Ck4E/pHndI20IzDXxS57dElhDGASPO5pzXF7BcKfmCY= k8s.io/kubelet v0.30.2/go.mod h1:DSwwTbLQmdNkebAU7ypIALR4P9aXZNFwgRmedojUE94= -k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak= -k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= oras.land/oras-go v1.2.5 h1:XpYuAwAb0DfQsunIyMfeET92emK8km3W4yEzZvUbsTo= oras.land/oras-go v1.2.5/go.mod h1:PuAwRShRZCsZb7g8Ar3jKKQR/2A/qN+pkYxIOd/FAoo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/kustomize/api v0.16.0 h1:/zAR4FOQDCkgSDmVzV2uiFbuy9bhu3jEzthrHCuvm1g= -sigs.k8s.io/kustomize/api v0.16.0/go.mod h1:MnFZ7IP2YqVyVwMWoRxPtgl/5hpA+eCCrQR/866cm5c= -sigs.k8s.io/kustomize/kyaml v0.16.0 h1:6J33uKSoATlKZH16unr2XOhDI+otoe2sR3M8PDzW3K0= -sigs.k8s.io/kustomize/kyaml v0.16.0/go.mod h1:xOK/7i+vmE14N2FdFyugIshB8eF6ALpy7jI87Q2nRh4= +sigs.k8s.io/kustomize/api v0.17.2 h1:E7/Fjk7V5fboiuijoZHgs4aHuexi5Y2loXlVOAVAG5g= +sigs.k8s.io/kustomize/api v0.17.2/go.mod 
h1:UWTz9Ct+MvoeQsHcJ5e+vziRRkwimm3HytpZgIYqye0= +sigs.k8s.io/kustomize/kyaml v0.17.1 h1:TnxYQxFXzbmNG6gOINgGWQt09GghzgTP6mIurOgrLCQ= +sigs.k8s.io/kustomize/kyaml v0.17.1/go.mod h1:9V0mCjIEYjlXuCdYsSXvyoy2BTsLESH7TlGV81S282U= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/hack/VERSION b/hack/VERSION index 17b0cffa..4dfd2130 100644 --- a/hack/VERSION +++ b/hack/VERSION @@ -1,4 +1,4 @@ -OLD_DCGM_VERSION=3.3.8 -OLD_EXPORTER_VERSION=3.6.0 -NEW_DCGM_VERSION=3.3.9 -NEW_EXPORTER_VERSION=3.6.1 +OLD_DCGM_VERSION=3.3.9 +OLD_EXPORTER_VERSION=3.6.1 +NEW_DCGM_VERSION=4.0.0 +NEW_EXPORTER_VERSION=4.0.0 diff --git a/internal/mocks/pkg/collector/mock_collector.go b/internal/mocks/pkg/collector/mock_collector.go new file mode 100644 index 00000000..395ebb8b --- /dev/null +++ b/internal/mocks/pkg/collector/mock_collector.go @@ -0,0 +1,81 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/collector (interfaces: Collector) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/collector/mock_collector.go -package=collector -copyright_file=../../../hack/header.txt . 
Collector +// + +// Package collector is a generated GoMock package. +package collector + +import ( + reflect "reflect" + + collector "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + gomock "go.uber.org/mock/gomock" +) + +// MockCollector is a mock of Collector interface. +type MockCollector struct { + ctrl *gomock.Controller + recorder *MockCollectorMockRecorder +} + +// MockCollectorMockRecorder is the mock recorder for MockCollector. +type MockCollectorMockRecorder struct { + mock *MockCollector +} + +// NewMockCollector creates a new mock instance. +func NewMockCollector(ctrl *gomock.Controller) *MockCollector { + mock := &MockCollector{ctrl: ctrl} + mock.recorder = &MockCollectorMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockCollector) EXPECT() *MockCollectorMockRecorder { + return m.recorder +} + +// Cleanup mocks base method. +func (m *MockCollector) Cleanup() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Cleanup") +} + +// Cleanup indicates an expected call of Cleanup. +func (mr *MockCollectorMockRecorder) Cleanup() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockCollector)(nil).Cleanup)) +} + +// GetMetrics mocks base method. +func (m *MockCollector) GetMetrics() (collector.MetricsByCounter, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetMetrics") + ret0, _ := ret[0].(collector.MetricsByCounter) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetMetrics indicates an expected call of GetMetrics. 
+func (mr *MockCollectorMockRecorder) GetMetrics() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetMetrics", reflect.TypeOf((*MockCollector)(nil).GetMetrics)) +} diff --git a/internal/mocks/pkg/dcgmprovider/mock_client.go b/internal/mocks/pkg/dcgmprovider/mock_client.go new file mode 100644 index 00000000..8229c26d --- /dev/null +++ b/internal/mocks/pkg/dcgmprovider/mock_client.go @@ -0,0 +1,507 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider (interfaces: DCGM) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/dcgmprovider/mock_client.go -package=dcgmprovider -copyright_file=../../../hack/header.txt . DCGM +// + +// Package dcgmprovider is a generated GoMock package. +package dcgmprovider + +import ( + reflect "reflect" + time "time" + + dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" + gomock "go.uber.org/mock/gomock" +) + +// MockDCGM is a mock of DCGM interface. +type MockDCGM struct { + ctrl *gomock.Controller + recorder *MockDCGMMockRecorder +} + +// MockDCGMMockRecorder is the mock recorder for MockDCGM. +type MockDCGMMockRecorder struct { + mock *MockDCGM +} + +// NewMockDCGM creates a new mock instance. 
+func NewMockDCGM(ctrl *gomock.Controller) *MockDCGM { + mock := &MockDCGM{ctrl: ctrl} + mock.recorder = &MockDCGMMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockDCGM) EXPECT() *MockDCGMMockRecorder { + return m.recorder +} + +// AddEntityToGroup mocks base method. +func (m *MockDCGM) AddEntityToGroup(arg0 dcgm.GroupHandle, arg1 dcgm.Field_Entity_Group, arg2 uint) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddEntityToGroup", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// AddEntityToGroup indicates an expected call of AddEntityToGroup. +func (mr *MockDCGMMockRecorder) AddEntityToGroup(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddEntityToGroup", reflect.TypeOf((*MockDCGM)(nil).AddEntityToGroup), arg0, arg1, arg2) +} + +// AddLinkEntityToGroup mocks base method. +func (m *MockDCGM) AddLinkEntityToGroup(arg0 dcgm.GroupHandle, arg1, arg2 uint) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AddLinkEntityToGroup", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// AddLinkEntityToGroup indicates an expected call of AddLinkEntityToGroup. +func (mr *MockDCGMMockRecorder) AddLinkEntityToGroup(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddLinkEntityToGroup", reflect.TypeOf((*MockDCGM)(nil).AddLinkEntityToGroup), arg0, arg1, arg2) +} + +// Cleanup mocks base method. +func (m *MockDCGM) Cleanup() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Cleanup") +} + +// Cleanup indicates an expected call of Cleanup. +func (mr *MockDCGMMockRecorder) Cleanup() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockDCGM)(nil).Cleanup)) +} + +// CreateFakeEntities mocks base method. 
+func (m *MockDCGM) CreateFakeEntities(arg0 []dcgm.MigHierarchyInfo) ([]uint, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CreateFakeEntities", arg0) + ret0, _ := ret[0].([]uint) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CreateFakeEntities indicates an expected call of CreateFakeEntities and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) CreateFakeEntities(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateFakeEntities", reflect.TypeOf((*MockDCGM)(nil).CreateFakeEntities), arg0) +} + +// CreateGroup mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) CreateGroup(arg0 string) (dcgm.GroupHandle, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CreateGroup", arg0) + ret0, _ := ret[0].(dcgm.GroupHandle) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CreateGroup indicates an expected call of CreateGroup and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) CreateGroup(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateGroup", reflect.TypeOf((*MockDCGM)(nil).CreateGroup), arg0) +} + +// DestroyGroup mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) DestroyGroup(arg0 dcgm.GroupHandle) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DestroyGroup", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// DestroyGroup indicates an expected call of DestroyGroup and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) DestroyGroup(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DestroyGroup", reflect.TypeOf((*MockDCGM)(nil).DestroyGroup), arg0) +} + +// EntitiesGetLatestValues mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockDCGM) EntitiesGetLatestValues(arg0 []dcgm.GroupEntityPair, arg1 []dcgm.Short, arg2 uint) ([]dcgm.FieldValue_v2, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EntitiesGetLatestValues", arg0, arg1, arg2) + ret0, _ := ret[0].([]dcgm.FieldValue_v2) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// EntitiesGetLatestValues indicates an expected call of EntitiesGetLatestValues and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) EntitiesGetLatestValues(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EntitiesGetLatestValues", reflect.TypeOf((*MockDCGM)(nil).EntitiesGetLatestValues), arg0, arg1, arg2) +} + +// EntityGetLatestValues mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) EntityGetLatestValues(arg0 dcgm.Field_Entity_Group, arg1 uint, arg2 []dcgm.Short) ([]dcgm.FieldValue_v1, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EntityGetLatestValues", arg0, arg1, arg2) + ret0, _ := ret[0].([]dcgm.FieldValue_v1) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// EntityGetLatestValues indicates an expected call of EntityGetLatestValues and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) EntityGetLatestValues(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EntityGetLatestValues", reflect.TypeOf((*MockDCGM)(nil).EntityGetLatestValues), arg0, arg1, arg2) +} + +// FieldGetById mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) FieldGetById(arg0 dcgm.Short) dcgm.FieldMeta { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FieldGetById", arg0) + ret0, _ := ret[0].(dcgm.FieldMeta) + return ret0 +} + +// FieldGetById indicates an expected call of FieldGetById and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) FieldGetById(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FieldGetById", reflect.TypeOf((*MockDCGM)(nil).FieldGetById), arg0) +} + +// FieldGroupCreate mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockDCGM) FieldGroupCreate(arg0 string, arg1 []dcgm.Short) (dcgm.FieldHandle, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FieldGroupCreate", arg0, arg1) + ret0, _ := ret[0].(dcgm.FieldHandle) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// FieldGroupCreate indicates an expected call of FieldGroupCreate and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) FieldGroupCreate(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FieldGroupCreate", reflect.TypeOf((*MockDCGM)(nil).FieldGroupCreate), arg0, arg1) +} + +// FieldGroupDestroy mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) FieldGroupDestroy(arg0 dcgm.FieldHandle) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "FieldGroupDestroy", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// FieldGroupDestroy indicates an expected call of FieldGroupDestroy and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) FieldGroupDestroy(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FieldGroupDestroy", reflect.TypeOf((*MockDCGM)(nil).FieldGroupDestroy), arg0) +} + +// Fv2_String mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) Fv2_String(arg0 dcgm.FieldValue_v2) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Fv2_String", arg0) + ret0, _ := ret[0].(string) + return ret0 +} + +// Fv2_String indicates an expected call of Fv2_String and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) Fv2_String(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Fv2_String", reflect.TypeOf((*MockDCGM)(nil).Fv2_String), arg0) +} + +// GetAllDeviceCount mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetAllDeviceCount() (uint, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetAllDeviceCount") + ret0, _ := ret[0].(uint) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetAllDeviceCount indicates an expected call of GetAllDeviceCount and returns the resulting *gomock.Call.
+func (mr *MockDCGMMockRecorder) GetAllDeviceCount() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllDeviceCount", reflect.TypeOf((*MockDCGM)(nil).GetAllDeviceCount)) +} + +// GetCpuHierarchy mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetCpuHierarchy() (dcgm.CpuHierarchy_v1, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetCpuHierarchy") + ret0, _ := ret[0].(dcgm.CpuHierarchy_v1) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetCpuHierarchy indicates an expected call of GetCpuHierarchy and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GetCpuHierarchy() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetCpuHierarchy", reflect.TypeOf((*MockDCGM)(nil).GetCpuHierarchy)) +} + +// GetDeviceInfo mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetDeviceInfo(arg0 uint) (dcgm.Device, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetDeviceInfo", arg0) + ret0, _ := ret[0].(dcgm.Device) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetDeviceInfo indicates an expected call of GetDeviceInfo and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GetDeviceInfo(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceInfo", reflect.TypeOf((*MockDCGM)(nil).GetDeviceInfo), arg0) +} + +// GetEntityGroupEntities mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetEntityGroupEntities(arg0 dcgm.Field_Entity_Group) ([]uint, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetEntityGroupEntities", arg0) + ret0, _ := ret[0].([]uint) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetEntityGroupEntities indicates an expected call of GetEntityGroupEntities and returns the resulting *gomock.Call.
+func (mr *MockDCGMMockRecorder) GetEntityGroupEntities(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetEntityGroupEntities", reflect.TypeOf((*MockDCGM)(nil).GetEntityGroupEntities), arg0) +} + +// GetGpuInstanceHierarchy mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetGpuInstanceHierarchy() (dcgm.MigHierarchy_v2, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetGpuInstanceHierarchy") + ret0, _ := ret[0].(dcgm.MigHierarchy_v2) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetGpuInstanceHierarchy indicates an expected call of GetGpuInstanceHierarchy and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GetGpuInstanceHierarchy() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetGpuInstanceHierarchy", reflect.TypeOf((*MockDCGM)(nil).GetGpuInstanceHierarchy)) +} + +// GetGroupInfo mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetGroupInfo(arg0 dcgm.GroupHandle) (*dcgm.GroupInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetGroupInfo", arg0) + ret0, _ := ret[0].(*dcgm.GroupInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetGroupInfo indicates an expected call of GetGroupInfo and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GetGroupInfo(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetGroupInfo", reflect.TypeOf((*MockDCGM)(nil).GetGroupInfo), arg0) +} + +// GetNvLinkLinkStatus mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetNvLinkLinkStatus() ([]dcgm.NvLinkStatus, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetNvLinkLinkStatus") + ret0, _ := ret[0].([]dcgm.NvLinkStatus) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetNvLinkLinkStatus indicates an expected call of GetNvLinkLinkStatus and returns the resulting *gomock.Call.
+func (mr *MockDCGMMockRecorder) GetNvLinkLinkStatus() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNvLinkLinkStatus", reflect.TypeOf((*MockDCGM)(nil).GetNvLinkLinkStatus)) +} + +// GetSupportedDevices mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetSupportedDevices() ([]uint, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetSupportedDevices") + ret0, _ := ret[0].([]uint) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetSupportedDevices indicates an expected call of GetSupportedDevices and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GetSupportedDevices() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetSupportedDevices", reflect.TypeOf((*MockDCGM)(nil).GetSupportedDevices)) +} + +// GetSupportedMetricGroups mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetSupportedMetricGroups(arg0 uint) ([]dcgm.MetricGroup, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetSupportedMetricGroups", arg0) + ret0, _ := ret[0].([]dcgm.MetricGroup) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetSupportedMetricGroups indicates an expected call of GetSupportedMetricGroups and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GetSupportedMetricGroups(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetSupportedMetricGroups", reflect.TypeOf((*MockDCGM)(nil).GetSupportedMetricGroups), arg0) +} + +// GetValuesSince mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GetValuesSince(arg0 dcgm.GroupHandle, arg1 dcgm.FieldHandle, arg2 time.Time) ([]dcgm.FieldValue_v2, time.Time, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetValuesSince", arg0, arg1, arg2) + ret0, _ := ret[0].([]dcgm.FieldValue_v2) + ret1, _ := ret[1].(time.Time) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 +} + +// GetValuesSince indicates an expected call of GetValuesSince and returns the resulting *gomock.Call.
+func (mr *MockDCGMMockRecorder) GetValuesSince(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetValuesSince", reflect.TypeOf((*MockDCGM)(nil).GetValuesSince), arg0, arg1, arg2) +} + +// GroupAllGPUs mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) GroupAllGPUs() dcgm.GroupHandle { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GroupAllGPUs") + ret0, _ := ret[0].(dcgm.GroupHandle) + return ret0 +} + +// GroupAllGPUs indicates an expected call of GroupAllGPUs and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) GroupAllGPUs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GroupAllGPUs", reflect.TypeOf((*MockDCGM)(nil).GroupAllGPUs)) +} + +// HealthCheck mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) HealthCheck(arg0 dcgm.GroupHandle) (dcgm.HealthResponse, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "HealthCheck", arg0) + ret0, _ := ret[0].(dcgm.HealthResponse) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// HealthCheck indicates an expected call of HealthCheck and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) HealthCheck(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HealthCheck", reflect.TypeOf((*MockDCGM)(nil).HealthCheck), arg0) +} + +// HealthGet mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) HealthGet(arg0 dcgm.GroupHandle) (dcgm.HealthSystem, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "HealthGet", arg0) + ret0, _ := ret[0].(dcgm.HealthSystem) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// HealthGet indicates an expected call of HealthGet and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) HealthGet(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HealthGet", reflect.TypeOf((*MockDCGM)(nil).HealthGet), arg0) +} + +// HealthSet mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockDCGM) HealthSet(arg0 dcgm.GroupHandle, arg1 dcgm.HealthSystem) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "HealthSet", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// HealthSet indicates an expected call of HealthSet and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) HealthSet(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HealthSet", reflect.TypeOf((*MockDCGM)(nil).HealthSet), arg0, arg1) +} + +// InjectFieldValue mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) InjectFieldValue(arg0, arg1, arg2 uint, arg3 int, arg4 int64, arg5 any) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "InjectFieldValue", arg0, arg1, arg2, arg3, arg4, arg5) + ret0, _ := ret[0].(error) + return ret0 +} + +// InjectFieldValue indicates an expected call of InjectFieldValue and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) InjectFieldValue(arg0, arg1, arg2, arg3, arg4, arg5 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InjectFieldValue", reflect.TypeOf((*MockDCGM)(nil).InjectFieldValue), arg0, arg1, arg2, arg3, arg4, arg5) +} + +// LinkGetLatestValues mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) LinkGetLatestValues(arg0, arg1 uint, arg2 []dcgm.Short) ([]dcgm.FieldValue_v1, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "LinkGetLatestValues", arg0, arg1, arg2) + ret0, _ := ret[0].([]dcgm.FieldValue_v1) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// LinkGetLatestValues indicates an expected call of LinkGetLatestValues and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) LinkGetLatestValues(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LinkGetLatestValues", reflect.TypeOf((*MockDCGM)(nil).LinkGetLatestValues), arg0, arg1, arg2) +} + +// NewDefaultGroup mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockDCGM) NewDefaultGroup(arg0 string) (dcgm.GroupHandle, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NewDefaultGroup", arg0) + ret0, _ := ret[0].(dcgm.GroupHandle) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// NewDefaultGroup indicates an expected call of NewDefaultGroup and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) NewDefaultGroup(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NewDefaultGroup", reflect.TypeOf((*MockDCGM)(nil).NewDefaultGroup), arg0) +} + +// UpdateAllFields mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) UpdateAllFields() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateAllFields") + ret0, _ := ret[0].(error) + return ret0 +} + +// UpdateAllFields indicates an expected call of UpdateAllFields and returns the resulting *gomock.Call. +func (mr *MockDCGMMockRecorder) UpdateAllFields() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateAllFields", reflect.TypeOf((*MockDCGM)(nil).UpdateAllFields)) +} + +// WatchFieldsWithGroupEx mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockDCGM) WatchFieldsWithGroupEx(arg0 dcgm.FieldHandle, arg1 dcgm.GroupHandle, arg2 int64, arg3 float64, arg4 int32) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "WatchFieldsWithGroupEx", arg0, arg1, arg2, arg3, arg4) + ret0, _ := ret[0].(error) + return ret0 +} + +// WatchFieldsWithGroupEx indicates an expected call of WatchFieldsWithGroupEx and returns the resulting *gomock.Call.
+func (mr *MockDCGMMockRecorder) WatchFieldsWithGroupEx(arg0, arg1, arg2, arg3, arg4 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WatchFieldsWithGroupEx", reflect.TypeOf((*MockDCGM)(nil).WatchFieldsWithGroupEx), arg0, arg1, arg2, arg3, arg4) +} diff --git a/internal/mocks/pkg/deviceinfo/mock_device_info.go b/internal/mocks/pkg/deviceinfo/mock_device_info.go new file mode 100644 index 00000000..0d87396b --- /dev/null +++ b/internal/mocks/pkg/deviceinfo/mock_device_info.go @@ -0,0 +1,266 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo (interfaces: Provider) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/deviceinfo/mock_device_info.go -package=deviceinfo -copyright_file=../../../hack/header.txt . Provider +// + +// Package deviceinfo is a generated GoMock package. +package deviceinfo + +import ( + reflect "reflect" + + appconfig "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + deviceinfo "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" + gomock "go.uber.org/mock/gomock" +) + +// MockProvider is a mock of Provider interface; it pairs a gomock controller with its call recorder.
+type MockProvider struct { + ctrl *gomock.Controller + recorder *MockProviderMockRecorder +} + +// MockProviderMockRecorder is the mock recorder for MockProvider; it keeps a back-reference to the mock. +type MockProviderMockRecorder struct { + mock *MockProvider +} + +// NewMockProvider creates a new mock instance bound to ctrl and attaches its recorder. +func NewMockProvider(ctrl *gomock.Controller) *MockProvider { + mock := &MockProvider{ctrl: ctrl} + mock.recorder = &MockProviderMockRecorder{mock} + return mock +} + +// EXPECT returns the recorder, which allows the caller to indicate expected use. +func (m *MockProvider) EXPECT() *MockProviderMockRecorder { + return m.recorder +} + +// COpts mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) COpts() appconfig.DeviceOptions { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "COpts") + ret0, _ := ret[0].(appconfig.DeviceOptions) + return ret0 +} + +// COpts indicates an expected call of COpts and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) COpts() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "COpts", reflect.TypeOf((*MockProvider)(nil).COpts)) +} + +// CPU mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) CPU(arg0 uint) deviceinfo.CPUInfo { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CPU", arg0) + ret0, _ := ret[0].(deviceinfo.CPUInfo) + return ret0 +} + +// CPU indicates an expected call of CPU and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) CPU(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CPU", reflect.TypeOf((*MockProvider)(nil).CPU), arg0) +} + +// CPUs mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) CPUs() []deviceinfo.CPUInfo { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CPUs") + ret0, _ := ret[0].([]deviceinfo.CPUInfo) + return ret0 +} + +// CPUs indicates an expected call of CPUs and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) CPUs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CPUs", reflect.TypeOf((*MockProvider)(nil).CPUs)) +} + +// GOpts mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockProvider) GOpts() appconfig.DeviceOptions { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GOpts") + ret0, _ := ret[0].(appconfig.DeviceOptions) + return ret0 +} + +// GOpts indicates an expected call of GOpts and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) GOpts() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GOpts", reflect.TypeOf((*MockProvider)(nil).GOpts)) +} + +// GPU mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) GPU(arg0 uint) deviceinfo.GPUInfo { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GPU", arg0) + ret0, _ := ret[0].(deviceinfo.GPUInfo) + return ret0 +} + +// GPU indicates an expected call of GPU and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) GPU(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GPU", reflect.TypeOf((*MockProvider)(nil).GPU), arg0) +} + +// GPUCount mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) GPUCount() uint { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GPUCount") + ret0, _ := ret[0].(uint) + return ret0 +} + +// GPUCount indicates an expected call of GPUCount and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) GPUCount() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GPUCount", reflect.TypeOf((*MockProvider)(nil).GPUCount)) +} + +// GPUs mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) GPUs() []deviceinfo.GPUInfo { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GPUs") + ret0, _ := ret[0].([]deviceinfo.GPUInfo) + return ret0 +} + +// GPUs indicates an expected call of GPUs and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) GPUs() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GPUs", reflect.TypeOf((*MockProvider)(nil).GPUs)) +} + +// InfoType mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockProvider) InfoType() dcgm.Field_Entity_Group { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "InfoType") + ret0, _ := ret[0].(dcgm.Field_Entity_Group) + return ret0 +} + +// InfoType indicates an expected call of InfoType and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) InfoType() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InfoType", reflect.TypeOf((*MockProvider)(nil).InfoType)) +} + +// IsCPUWatched mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) IsCPUWatched(arg0 uint) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsCPUWatched", arg0) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsCPUWatched indicates an expected call of IsCPUWatched and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) IsCPUWatched(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsCPUWatched", reflect.TypeOf((*MockProvider)(nil).IsCPUWatched), arg0) +} + +// IsCoreWatched mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) IsCoreWatched(arg0, arg1 uint) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsCoreWatched", arg0, arg1) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsCoreWatched indicates an expected call of IsCoreWatched and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) IsCoreWatched(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsCoreWatched", reflect.TypeOf((*MockProvider)(nil).IsCoreWatched), arg0, arg1) +} + +// IsLinkWatched mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) IsLinkWatched(arg0, arg1 uint) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsLinkWatched", arg0, arg1) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsLinkWatched indicates an expected call of IsLinkWatched and returns the resulting *gomock.Call.
+func (mr *MockProviderMockRecorder) IsLinkWatched(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsLinkWatched", reflect.TypeOf((*MockProvider)(nil).IsLinkWatched), arg0, arg1) +} + +// IsSwitchWatched mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) IsSwitchWatched(arg0 uint) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsSwitchWatched", arg0) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsSwitchWatched indicates an expected call of IsSwitchWatched and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) IsSwitchWatched(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsSwitchWatched", reflect.TypeOf((*MockProvider)(nil).IsSwitchWatched), arg0) +} + +// SOpts mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) SOpts() appconfig.DeviceOptions { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SOpts") + ret0, _ := ret[0].(appconfig.DeviceOptions) + return ret0 +} + +// SOpts indicates an expected call of SOpts and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) SOpts() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SOpts", reflect.TypeOf((*MockProvider)(nil).SOpts)) +} + +// Switch mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) Switch(arg0 uint) deviceinfo.SwitchInfo { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Switch", arg0) + ret0, _ := ret[0].(deviceinfo.SwitchInfo) + return ret0 +} + +// Switch indicates an expected call of Switch and returns the resulting *gomock.Call. +func (mr *MockProviderMockRecorder) Switch(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Switch", reflect.TypeOf((*MockProvider)(nil).Switch), arg0) +} + +// Switches mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockProvider) Switches() []deviceinfo.SwitchInfo { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Switches") + ret0, _ := ret[0].([]deviceinfo.SwitchInfo) + return ret0 +} + +// Switches indicates an expected call of Switches and returns the resulting *gomock.Call.
+func (mr *MockProviderMockRecorder) Switches() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Switches", reflect.TypeOf((*MockProvider)(nil).Switches)) +} diff --git a/internal/mocks/pkg/devicewatcher/mock_device_watcher.go b/internal/mocks/pkg/devicewatcher/mock_device_watcher.go new file mode 100644 index 00000000..563decf8 --- /dev/null +++ b/internal/mocks/pkg/devicewatcher/mock_device_watcher.go @@ -0,0 +1,87 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher (interfaces: Watcher) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/devicewatcher/mock_device_watcher.go -package=devicewatcher -copyright_file=../../../hack/header.txt . Watcher +// + +// Package devicewatcher is a generated GoMock package. +package devicewatcher + +import ( + reflect "reflect" + + counters "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + deviceinfo "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" + gomock "go.uber.org/mock/gomock" +) + +// MockWatcher is a mock of Watcher interface; it pairs a gomock controller with its call recorder. +type MockWatcher struct { + ctrl *gomock.Controller + recorder *MockWatcherMockRecorder +} + +// MockWatcherMockRecorder is the mock recorder for MockWatcher; it keeps a back-reference to the mock.
+type MockWatcherMockRecorder struct { + mock *MockWatcher +} + +// NewMockWatcher creates a new mock instance bound to ctrl and attaches its recorder. +func NewMockWatcher(ctrl *gomock.Controller) *MockWatcher { + mock := &MockWatcher{ctrl: ctrl} + mock.recorder = &MockWatcherMockRecorder{mock} + return mock +} + +// EXPECT returns the recorder, which allows the caller to indicate expected use. +func (m *MockWatcher) EXPECT() *MockWatcherMockRecorder { + return m.recorder +} + +// GetDeviceFields mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockWatcher) GetDeviceFields(arg0 []counters.Counter, arg1 dcgm.Field_Entity_Group) []dcgm.Short { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetDeviceFields", arg0, arg1) + ret0, _ := ret[0].([]dcgm.Short) + return ret0 +} + +// GetDeviceFields indicates an expected call of GetDeviceFields and returns the resulting *gomock.Call. +func (mr *MockWatcherMockRecorder) GetDeviceFields(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceFields", reflect.TypeOf((*MockWatcher)(nil).GetDeviceFields), arg0, arg1) +} + +// WatchDeviceFields mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockWatcher) WatchDeviceFields(arg0 []dcgm.Short, arg1 deviceinfo.Provider, arg2 int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "WatchDeviceFields", arg0, arg1, arg2) + ret0, _ := ret[0].([]dcgm.GroupHandle) + ret1, _ := ret[1].(dcgm.FieldHandle) + ret2, _ := ret[2].([]func()) + ret3, _ := ret[3].(error) + return ret0, ret1, ret2, ret3 +} + +// WatchDeviceFields indicates an expected call of WatchDeviceFields and returns the resulting *gomock.Call.
+func (mr *MockWatcherMockRecorder) WatchDeviceFields(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WatchDeviceFields", reflect.TypeOf((*MockWatcher)(nil).WatchDeviceFields), arg0, arg1, arg2) +} diff --git a/internal/mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go b/internal/mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go new file mode 100644 index 00000000..9d98c66e --- /dev/null +++ b/internal/mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go @@ -0,0 +1,85 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager (interfaces: Manager) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go -package=devicewatchlistmanager -copyright_file=../../../hack/header.txt . Manager +// + +// Package devicewatchlistmanager is a generated GoMock package; it holds the Manager mock.
+package devicewatchlistmanager + +import ( + reflect "reflect" + + devicewatcher "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" + devicewatchlistmanager "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" + gomock "go.uber.org/mock/gomock" +) + +// MockManager is a mock of Manager interface; it pairs a gomock controller with its call recorder. +type MockManager struct { + ctrl *gomock.Controller + recorder *MockManagerMockRecorder +} + +// MockManagerMockRecorder is the mock recorder for MockManager; it keeps a back-reference to the mock. +type MockManagerMockRecorder struct { + mock *MockManager +} + +// NewMockManager creates a new mock instance bound to ctrl and attaches its recorder. +func NewMockManager(ctrl *gomock.Controller) *MockManager { + mock := &MockManager{ctrl: ctrl} + mock.recorder = &MockManagerMockRecorder{mock} + return mock +} + +// EXPECT returns the recorder, which allows the caller to indicate expected use. +func (m *MockManager) EXPECT() *MockManagerMockRecorder { + return m.recorder +} + +// CreateEntityWatchList mocks base method by dispatching to ctrl.Call and type-asserting its results. +func (m *MockManager) CreateEntityWatchList(arg0 dcgm.Field_Entity_Group, arg1 devicewatcher.Watcher, arg2 int64) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CreateEntityWatchList", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// CreateEntityWatchList indicates an expected call of CreateEntityWatchList and returns the resulting *gomock.Call. +func (mr *MockManagerMockRecorder) CreateEntityWatchList(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateEntityWatchList", reflect.TypeOf((*MockManager)(nil).CreateEntityWatchList), arg0, arg1, arg2) +} + +// EntityWatchList mocks base method by dispatching to ctrl.Call and type-asserting its results.
+func (m *MockManager) EntityWatchList(arg0 dcgm.Field_Entity_Group) (devicewatchlistmanager.WatchList, bool) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EntityWatchList", arg0) + ret0, _ := ret[0].(devicewatchlistmanager.WatchList) + ret1, _ := ret[1].(bool) + return ret0, ret1 +} + +// EntityWatchList indicates an expected call of EntityWatchList. +func (mr *MockManagerMockRecorder) EntityWatchList(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EntityWatchList", reflect.TypeOf((*MockManager)(nil).EntityWatchList), arg0) +} diff --git a/internal/mocks/pkg/elf/mock_elf.go b/internal/mocks/pkg/elf/mock_elf.go new file mode 100644 index 00000000..a39eda43 --- /dev/null +++ b/internal/mocks/pkg/elf/mock_elf.go @@ -0,0 +1,69 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/elf (interfaces: ELF) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/elf/mock_elf.go -package=elf -copyright_file=../../../hack/header.txt . ELF +// + +// Package elf is a generated GoMock package. +package elf + +import ( + elf "debug/elf" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockELF is a mock of ELF interface. 
+type MockELF struct { + ctrl *gomock.Controller + recorder *MockELFMockRecorder +} + +// MockELFMockRecorder is the mock recorder for MockELF. +type MockELFMockRecorder struct { + mock *MockELF +} + +// NewMockELF creates a new mock instance. +func NewMockELF(ctrl *gomock.Controller) *MockELF { + mock := &MockELF{ctrl: ctrl} + mock.recorder = &MockELFMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockELF) EXPECT() *MockELFMockRecorder { + return m.recorder +} + +// Open mocks base method. +func (m *MockELF) Open(arg0 string) (*elf.File, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Open", arg0) + ret0, _ := ret[0].(*elf.File) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Open indicates an expected call of Open. +func (mr *MockELFMockRecorder) Open(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Open", reflect.TypeOf((*MockELF)(nil).Open), arg0) +} diff --git a/internal/mocks/pkg/exec/mock_cmd.go b/internal/mocks/pkg/exec/mock_cmd.go new file mode 100644 index 00000000..85f81647 --- /dev/null +++ b/internal/mocks/pkg/exec/mock_cmd.go @@ -0,0 +1,68 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. 
+// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/exec (interfaces: Cmd) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/exec/mock_cmd.go -package=exec -copyright_file=../../../hack/header.txt . Cmd +// + +// Package exec is a generated GoMock package. +package exec + +import ( + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockCmd is a mock of Cmd interface. +type MockCmd struct { + ctrl *gomock.Controller + recorder *MockCmdMockRecorder +} + +// MockCmdMockRecorder is the mock recorder for MockCmd. +type MockCmdMockRecorder struct { + mock *MockCmd +} + +// NewMockCmd creates a new mock instance. +func NewMockCmd(ctrl *gomock.Controller) *MockCmd { + mock := &MockCmd{ctrl: ctrl} + mock.recorder = &MockCmdMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockCmd) EXPECT() *MockCmdMockRecorder { + return m.recorder +} + +// Output mocks base method. +func (m *MockCmd) Output() ([]byte, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Output") + ret0, _ := ret[0].([]byte) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Output indicates an expected call of Output. +func (mr *MockCmdMockRecorder) Output() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Output", reflect.TypeOf((*MockCmd)(nil).Output)) +} diff --git a/internal/mocks/pkg/exec/mock_exec.go b/internal/mocks/pkg/exec/mock_exec.go new file mode 100644 index 00000000..18423b6e --- /dev/null +++ b/internal/mocks/pkg/exec/mock_exec.go @@ -0,0 +1,73 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/exec (interfaces: Exec) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/exec/mock_exec.go -package=exec -copyright_file=../../../hack/header.txt . Exec +// + +// Package exec is a generated GoMock package. +package exec + +import ( + reflect "reflect" + + exec "github.com/NVIDIA/dcgm-exporter/internal/pkg/exec" + gomock "go.uber.org/mock/gomock" +) + +// MockExec is a mock of Exec interface. +type MockExec struct { + ctrl *gomock.Controller + recorder *MockExecMockRecorder +} + +// MockExecMockRecorder is the mock recorder for MockExec. +type MockExecMockRecorder struct { + mock *MockExec +} + +// NewMockExec creates a new mock instance. +func NewMockExec(ctrl *gomock.Controller) *MockExec { + mock := &MockExec{ctrl: ctrl} + mock.recorder = &MockExecMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockExec) EXPECT() *MockExecMockRecorder { + return m.recorder +} + +// Command mocks base method. +func (m *MockExec) Command(arg0 string, arg1 ...string) exec.Cmd { + m.ctrl.T.Helper() + varargs := []any{arg0} + for _, a := range arg1 { + varargs = append(varargs, a) + } + ret := m.ctrl.Call(m, "Command", varargs...) + ret0, _ := ret[0].(exec.Cmd) + return ret0 +} + +// Command indicates an expected call of Command. 
+func (mr *MockExecMockRecorder) Command(arg0 any, arg1 ...any) *gomock.Call { + mr.mock.ctrl.T.Helper() + varargs := append([]any{arg0}, arg1...) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Command", reflect.TypeOf((*MockExec)(nil).Command), varargs...) +} diff --git a/internal/mocks/pkg/nvmlprovider/mock_client.go b/internal/mocks/pkg/nvmlprovider/mock_client.go new file mode 100644 index 00000000..da770340 --- /dev/null +++ b/internal/mocks/pkg/nvmlprovider/mock_client.go @@ -0,0 +1,81 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider (interfaces: NVML) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/nvmlprovider/mock_client.go -package=nvmlprovider -copyright_file=../../../hack/header.txt . NVML +// + +// Package nvmlprovider is a generated GoMock package. +package nvmlprovider + +import ( + reflect "reflect" + + nvmlprovider "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" + gomock "go.uber.org/mock/gomock" +) + +// MockNVML is a mock of NVML interface. +type MockNVML struct { + ctrl *gomock.Controller + recorder *MockNVMLMockRecorder +} + +// MockNVMLMockRecorder is the mock recorder for MockNVML. +type MockNVMLMockRecorder struct { + mock *MockNVML +} + +// NewMockNVML creates a new mock instance. 
+func NewMockNVML(ctrl *gomock.Controller) *MockNVML { + mock := &MockNVML{ctrl: ctrl} + mock.recorder = &MockNVMLMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockNVML) EXPECT() *MockNVMLMockRecorder { + return m.recorder +} + +// Cleanup mocks base method. +func (m *MockNVML) Cleanup() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Cleanup") +} + +// Cleanup indicates an expected call of Cleanup. +func (mr *MockNVMLMockRecorder) Cleanup() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockNVML)(nil).Cleanup)) +} + +// GetMIGDeviceInfoByID mocks base method. +func (m *MockNVML) GetMIGDeviceInfoByID(arg0 string) (*nvmlprovider.MIGDeviceInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetMIGDeviceInfoByID", arg0) + ret0, _ := ret[0].(*nvmlprovider.MIGDeviceInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetMIGDeviceInfoByID indicates an expected call of GetMIGDeviceInfoByID. +func (mr *MockNVMLMockRecorder) GetMIGDeviceInfoByID(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetMIGDeviceInfoByID", reflect.TypeOf((*MockNVML)(nil).GetMIGDeviceInfoByID), arg0) +} diff --git a/internal/mocks/pkg/os/dir_entry.go b/internal/mocks/pkg/os/mock_dir_entry.go similarity index 96% rename from internal/mocks/pkg/os/dir_entry.go rename to internal/mocks/pkg/os/mock_dir_entry.go index d6271885..29113576 100644 --- a/internal/mocks/pkg/os/dir_entry.go +++ b/internal/mocks/pkg/os/mock_dir_entry.go @@ -17,7 +17,7 @@ // // Generated by this command: // -// mockgen -destination=../../mocks/pkg/os/dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry +// mockgen -destination=../../mocks/pkg/os/mock_dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry // // Package os is a generated GoMock package. 
diff --git a/internal/mocks/pkg/os/file_info.go b/internal/mocks/pkg/os/mock_file_info.go similarity index 96% rename from internal/mocks/pkg/os/file_info.go rename to internal/mocks/pkg/os/mock_file_info.go index b2ef5282..d4e35f94 100644 --- a/internal/mocks/pkg/os/file_info.go +++ b/internal/mocks/pkg/os/mock_file_info.go @@ -17,7 +17,7 @@ // // Generated by this command: // -// mockgen -destination=../../mocks/pkg/os/file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo +// mockgen -destination=../../mocks/pkg/os/mock_file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo // // Package os is a generated GoMock package. diff --git a/internal/mocks/pkg/os/os.go b/internal/mocks/pkg/os/mock_os.go similarity index 93% rename from internal/mocks/pkg/os/os.go rename to internal/mocks/pkg/os/mock_os.go index f3b77f42..70c9a186 100644 --- a/internal/mocks/pkg/os/os.go +++ b/internal/mocks/pkg/os/mock_os.go @@ -17,7 +17,7 @@ // // Generated by this command: // -// mockgen -destination=../../mocks/pkg/os/os.go -package=os -copyright_file=../../../hack/header.txt . OS +// mockgen -destination=../../mocks/pkg/os/mock_os.go -package=os -copyright_file=../../../hack/header.txt . OS // // Package os is a generated GoMock package. @@ -69,6 +69,18 @@ func (mr *MockOSMockRecorder) CreateTemp(arg0, arg1 any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateTemp", reflect.TypeOf((*MockOS)(nil).CreateTemp), arg0, arg1) } +// Exit mocks base method. +func (m *MockOS) Exit(arg0 int) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Exit", arg0) +} + +// Exit indicates an expected call of Exit. +func (mr *MockOSMockRecorder) Exit(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Exit", reflect.TypeOf((*MockOS)(nil).Exit), arg0) +} + // Getenv mocks base method. 
func (m *MockOS) Getenv(arg0 string) string { m.ctrl.T.Helper() diff --git a/internal/mocks/pkg/transformation/mock_transformer.go b/internal/mocks/pkg/transformation/mock_transformer.go new file mode 100644 index 00000000..1ef2e87d --- /dev/null +++ b/internal/mocks/pkg/transformation/mock_transformer.go @@ -0,0 +1,84 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/transformations (interfaces: Transform) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/transformations/mock_transformer.go -package=transformation -copyright_file=../../../hack/header.txt . Transform +// + +// Package transformation is a generated GoMock package. +package transformation + +import ( + reflect "reflect" + + gomock "go.uber.org/mock/gomock" + + collector "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + deviceinfo "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +// MockTransform is a mock of Transform interface. +type MockTransform struct { + ctrl *gomock.Controller + recorder *MockTransformMockRecorder +} + +// MockTransformMockRecorder is the mock recorder for MockTransform. +type MockTransformMockRecorder struct { + mock *MockTransform +} + +// NewMockTransform creates a new mock instance. 
+func NewMockTransform(ctrl *gomock.Controller) *MockTransform { + mock := &MockTransform{ctrl: ctrl} + mock.recorder = &MockTransformMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockTransform) EXPECT() *MockTransformMockRecorder { + return m.recorder +} + +// Name mocks base method. +func (m *MockTransform) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. +func (mr *MockTransformMockRecorder) Name() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockTransform)(nil).Name)) +} + +// Process mocks base method. +func (m *MockTransform) Process(arg0 collector.MetricsByCounter, arg1 deviceinfo.Provider) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Process", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// Process indicates an expected call of Process. +func (mr *MockTransformMockRecorder) Process(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Process", reflect.TypeOf((*MockTransform)(nil).Process), arg0, arg1) +} diff --git a/internal/mocks/pkg/transformations/mock_transformer.go b/internal/mocks/pkg/transformations/mock_transformer.go new file mode 100644 index 00000000..bfd858f9 --- /dev/null +++ b/internal/mocks/pkg/transformations/mock_transformer.go @@ -0,0 +1,83 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/transformation (interfaces: Transform) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/transformations/mock_transformer.go -package=transformation -copyright_file=../../../hack/header.txt . Transform +// + +// Package transformation is a generated GoMock package. +package transformation + +import ( + reflect "reflect" + + collector "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + deviceinfo "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + gomock "go.uber.org/mock/gomock" +) + +// MockTransform is a mock of Transform interface. +type MockTransform struct { + ctrl *gomock.Controller + recorder *MockTransformMockRecorder +} + +// MockTransformMockRecorder is the mock recorder for MockTransform. +type MockTransformMockRecorder struct { + mock *MockTransform +} + +// NewMockTransform creates a new mock instance. +func NewMockTransform(ctrl *gomock.Controller) *MockTransform { + mock := &MockTransform{ctrl: ctrl} + mock.recorder = &MockTransformMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockTransform) EXPECT() *MockTransformMockRecorder { + return m.recorder +} + +// Name mocks base method. +func (m *MockTransform) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. 
+func (mr *MockTransformMockRecorder) Name() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockTransform)(nil).Name)) +} + +// Process mocks base method. +func (m *MockTransform) Process(arg0 collector.MetricsByCounter, arg1 deviceinfo.Provider) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Process", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// Process indicates an expected call of Process. +func (mr *MockTransformMockRecorder) Process(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Process", reflect.TypeOf((*MockTransform)(nil).Process), arg0, arg1) +} diff --git a/internal/pkg/appconfig/const.go b/internal/pkg/appconfig/const.go new file mode 100644 index 00000000..53fa246e --- /dev/null +++ b/internal/pkg/appconfig/const.go @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package appconfig + +const ( + GPUUID KubernetesGPUIDType = "uid" + DeviceName KubernetesGPUIDType = "device-name" + + NvidiaResourceName = "nvidia.com/gpu" + NvidiaMigResourcePrefix = "nvidia.com/mig-" + MIG_UUID_PREFIX = "MIG-" +) diff --git a/pkg/dcgmexporter/config.go b/internal/pkg/appconfig/types.go similarity index 84% rename from pkg/dcgmexporter/config.go rename to internal/pkg/appconfig/types.go index f13c91db..6d369845 100644 --- a/pkg/dcgmexporter/config.go +++ b/internal/pkg/appconfig/types.go @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -14,17 +14,14 @@ * limitations under the License. */ -package dcgmexporter +package appconfig -import "github.com/NVIDIA/go-dcgm/pkg/dcgm" +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" +) type KubernetesGPUIDType string -const ( - GPUUID KubernetesGPUIDType = "uid" - DeviceName KubernetesGPUIDType = "device-name" -) - type DeviceOptions struct { Flex bool // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled. 
MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all @@ -41,9 +38,9 @@ type Config struct { UseOldNamespace bool UseRemoteHE bool RemoteHEInfo string - GPUDevices DeviceOptions - SwitchDevices DeviceOptions - CPUDevices DeviceOptions + GPUDeviceOptions DeviceOptions + SwitchDeviceOptions DeviceOptions + CPUDeviceOptions DeviceOptions NoHostname bool UseFakeGPUs bool ConfigMapData string diff --git a/internal/pkg/collector/base_collector.go b/internal/pkg/collector/base_collector.go new file mode 100644 index 00000000..e8b2c43c --- /dev/null +++ b/internal/pkg/collector/base_collector.go @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package collector
+
+import (
+	"fmt"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager"
+)
+
+// baseExpCollector carries the state shared by the collectors that embed it.
+type baseExpCollector struct {
+	deviceWatchList devicewatchlistmanager.WatchList // Device info plus the fields watched for counters and labels
+	counter         counters.Counter                 // The counter this collector type reports
+	labelsCounters  []counters.Counter               // Counters that are exposed as labels
+	hostname        string                           // Hostname copied into every metric
+	config          *appconfig.Config                // Configuration settings
+	cleanups        []func()                         // Functions run by Cleanup
+}
+
+// createMetric assembles a Metric for the collector's counter from the monitored
+// device described by mi. val is rendered as the metric value, uuid is copied
+// verbatim and labels is attached as-is (the map is not copied).
+func (c *baseExpCollector) createMetric(
+	labels map[string]string, mi devicemonitoring.Info, uuid string, val int,
+) Metric {
+	device := mi.DeviceInfo
+	metric := Metric{
+		Counter:      c.counter,
+		Value:        fmt.Sprint(val),
+		UUID:         uuid,
+		GPU:          fmt.Sprintf("%d", device.GPU),
+		GPUUUID:      device.UUID,
+		GPUDevice:    fmt.Sprintf("nvidia%d", device.GPU),
+		GPUModelName: getGPUModel(device, c.config.ReplaceBlanksInModelName),
+		GPUPCIBusID:  device.PCI.BusID,
+		Hostname:     c.hostname,
+
+		Labels:     labels,
+		Attributes: map[string]string{},
+	}
+	// Without instance info the MIG fields simply keep their zero value ("").
+	if instance := mi.InstanceInfo; instance != nil {
+		metric.MigProfile = instance.ProfileName
+		metric.GPUInstanceID = fmt.Sprintf("%d", instance.Info.NvmlInstanceId)
+	}
+	return metric
+}
+
+// getLabelsFromCounters reads the latest values of the watch list's label fields
+// for the entity in mi and stores each usable one in labels, keyed by the field
+// name. labels is written in place; a failed DCGM query is returned unchanged.
+func (c *baseExpCollector) getLabelsFromCounters(mi devicemonitoring.Info, labels map[string]string) error {
+	fieldValues, err := dcgmprovider.Client().EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId,
+		c.deviceWatchList.LabelDeviceFields())
+	if err != nil {
+		return err
+	}
+
+	for _, fv := range fieldValues {
+		text := toString(fv)
+		// Skip fields that have no value or are ignored for this entity.
+		if text == skipDCGMValue {
+			continue
+		}
+
+		// A field with no matching label counter is skipped, not reported as an error.
+		if label, err := findCounterField(c.labelsCounters, fv.FieldId); err == nil && label.IsLabel() {
+			labels[label.FieldName] = text
+		}
+	}
+	return nil
+}
+
+// Cleanup invokes every function stored in c.cleanups, in slice order.
+func (c *baseExpCollector) Cleanup() {
+	for i := range c.cleanups {
+		c.cleanups[i]()
+	}
+}
diff --git a/pkg/dcgmexporter/clock_events_collector.go b/internal/pkg/collector/clock_events_collector.go
similarity index 77%
rename from pkg/dcgmexporter/clock_events_collector.go
rename to internal/pkg/collector/clock_events_collector.go
index 31eb0ff6..827c63c5 100644
--- a/pkg/dcgmexporter/clock_events_collector.go
+++ b/internal/pkg/collector/clock_events_collector.go
@@ -14,21 +14,25 @@
  * limitations under the License.
  */
 
-package dcgmexporter
+package collector
 
 import (
 	"fmt"
+	"log/slog"
 	"slices"
 
 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
-	"github.com/sirupsen/logrus"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager"
 )
 
 // IsDCGMExpClockEventsCountEnabled checks if the DCGM_EXP_CLOCK_EVENTS_COUNT counter exists
-func IsDCGMExpClockEventsCountEnabled(counters []Counter) bool {
-	return slices.ContainsFunc(counters,
-		func(c Counter) bool {
-			return c.FieldName == dcgmExpClockEventsCount
+func IsDCGMExpClockEventsCountEnabled(counterList counters.CounterList) bool {
+	return slices.ContainsFunc(counterList,
+		func(c counters.Counter) bool {
+			return c.FieldName == counters.DCGMExpClockEventsCount
 		})
 }
 
@@ -50,7 +54,7 @@ const (
 	DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clockEventBitmask = 0x0000000000000008
 	// DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost
 	DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clockEventBitmask = 0x0000000000000010
-	//SW Thermal Slowdown
+	// SW Thermal Slowdown
 	DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clockEventBitmask = 0x0000000000000020
 	// 
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clockEventBitmask = 0x0000000000000040 @@ -82,26 +86,33 @@ func (c *clockEventsCollector) GetMetrics() (MetricsByCounter, error) { return c.expCollector.getMetrics() } -func NewClockEventsCollector(counters []Counter, +func NewClockEventsCollector( + counterList counters.CounterList, hostname string, - config *Config, - fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) { - if !IsDCGMExpClockEventsCountEnabled(counters) { - logrus.Error(dcgmExpClockEventsCount + " collector is disabled") - return nil, fmt.Errorf(dcgmExpClockEventsCount + " collector is disabled") + config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, +) (Collector, error) { + if !IsDCGMExpClockEventsCountEnabled(counterList) { + slog.Error(counters.DCGMExpClockEventsCount + " collector is disabled") + return nil, fmt.Errorf(counters.DCGMExpClockEventsCount + " collector is disabled") } collector := clockEventsCollector{} - collector.expCollector = newExpCollector( - counters, + var err error + deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS}) + + collector.expCollector, err = newExpCollector( + counterList.LabelCounters(), hostname, - []dcgm.Short{dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS}, config, - fieldEntityGroupTypeSystemInfo, + deviceWatchList, ) + if err != nil { + return nil, err + } - collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool { - return c.FieldName == dcgmExpClockEventsCount + collector.counter = counterList[slices.IndexFunc(counterList, func(c counters.Counter) bool { + return c.FieldName == counters.DCGMExpClockEventsCount })] collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) { diff --git a/internal/pkg/collector/clock_events_collector_test.go 
b/internal/pkg/collector/clock_events_collector_test.go new file mode 100644 index 00000000..a5eaa417 --- /dev/null +++ b/internal/pkg/collector/clock_events_collector_test.go @@ -0,0 +1,798 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "fmt" + "slices" + "testing" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + mockdevicewatcher "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +const invalidClockEventValue = 10000 + +func TestIsDCGMExpClockEventsCountEnabled(t *testing.T) { + tests := []struct { + name string + arg counters.CounterList + want bool + }{ + { + name: "empty", + arg: counters.CounterList{}, + want: false, + }, + { + name: "counter event count disabled", + arg: counters.CounterList{ + counters.Counter{ + FieldID: 1, + FieldName: "random1", + }, + counters.Counter{ + FieldID: 2, + FieldName: 
"random2", + }, + }, + want: false, + }, + { + name: "counter event count enabled", + arg: counters.CounterList{ + counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpClockEventsCount, + }, + counters.Counter{ + FieldID: 2, + FieldName: "random2", + }, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.want, IsDCGMExpClockEventsCountEnabled(tt.arg), "unexpected response") + }) + } +} + +func TestNewClockEventsCollector(t *testing.T) { + ctrl := gomock.NewController(t) + mockDeviceWatcher := mockdevicewatcher.NewMockWatcher(ctrl) + + sampleDeviceInfo := &deviceinfo.Info{} + sampleDeviceFields := []dcgm.Short{42} + sampleCollectorInterval := int64(1) + sampleConfig := appconfig.Config{} + sampleHostname := "localhost" + var sampleCleanups []func() + + sampleDCGMExpClockEventsCounter := counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpClockEventsCount, + } + + sampleOtherCounter := counters.Counter{ + FieldID: 2, + FieldName: "random2", + } + + sampleLabelCounter := counters.Counter{ + FieldID: 3, + FieldName: "random2", + PromType: "label", + } + + type args struct { + counterList counters.CounterList + hostname string + config *appconfig.Config + deviceWatchList *devicewatchlistmanager.WatchList + } + tests := []struct { + name string + args args + conditions func(watcher *mockdevicewatcher.MockWatcher) + want func(string, *appconfig.Config, devicewatchlistmanager.WatchList) Collector + wantErr bool + }{ + { + name: "counter is disabled ", + args: args{ + counterList: counters.CounterList{}, + hostname: sampleHostname, + config: nil, + deviceWatchList: &devicewatchlistmanager.WatchList{}, + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) {}, + want: func( + _ string, _ *appconfig.Config, + _ devicewatchlistmanager.WatchList, + ) Collector { + return nil + }, + wantErr: true, + }, + { + name: "new clock events collector watcher fails", + args: args{ + counterList: 
counters.CounterList{ + sampleDCGMExpClockEventsCounter, + sampleOtherCounter, + sampleLabelCounter, + }, + hostname: sampleHostname, + config: &sampleConfig, + deviceWatchList: devicewatchlistmanager.NewWatchList(sampleDeviceInfo, sampleDeviceFields, nil, + mockDeviceWatcher, sampleCollectorInterval), + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return(nil, + dcgm.FieldHandle{}, + sampleCleanups, fmt.Errorf("some error")) + }, + want: func( + _ string, _ *appconfig.Config, + _ devicewatchlistmanager.WatchList, + ) Collector { + return nil + }, + wantErr: true, + }, + { + name: "new clock events collector ", + args: args{ + counterList: counters.CounterList{ + sampleDCGMExpClockEventsCounter, + sampleOtherCounter, + sampleLabelCounter, + }, + hostname: sampleHostname, + config: &sampleConfig, + deviceWatchList: devicewatchlistmanager.NewWatchList(sampleDeviceInfo, sampleDeviceFields, nil, + mockDeviceWatcher, sampleCollectorInterval), + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return(nil, + dcgm.FieldHandle{}, + sampleCleanups, nil) + }, + want: func( + hostname string, config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, + ) Collector { + deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS}) + return &clockEventsCollector{ + expCollector{ + baseExpCollector: baseExpCollector{ + deviceWatchList: deviceWatchList, + counter: sampleDCGMExpClockEventsCounter, + labelsCounters: []counters.Counter{sampleLabelCounter}, + hostname: hostname, + config: config, + cleanups: sampleCleanups, + }, + windowSize: config.ClockEventsCountWindowSize, + }, + } + }, + wantErr: false, + }, + { + name: "new clock events collector with no label counters", + args: args{ + counterList: counters.CounterList{ + 
sampleDCGMExpClockEventsCounter, + sampleOtherCounter, + }, + hostname: sampleHostname, + config: &sampleConfig, + deviceWatchList: devicewatchlistmanager.NewWatchList(sampleDeviceInfo, sampleDeviceFields, nil, + mockDeviceWatcher, sampleCollectorInterval), + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return(nil, + dcgm.FieldHandle{}, + sampleCleanups, nil) + }, + want: func( + hostname string, config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, + ) Collector { + deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS}) + return &clockEventsCollector{ + expCollector{ + baseExpCollector: baseExpCollector{ + deviceWatchList: deviceWatchList, + counter: sampleDCGMExpClockEventsCounter, + labelsCounters: nil, + hostname: hostname, + config: config, + cleanups: sampleCleanups, + }, + windowSize: config.ClockEventsCountWindowSize, + }, + } + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.conditions(mockDeviceWatcher) + + got, err := NewClockEventsCollector(tt.args.counterList, tt.args.hostname, tt.args.config, + *tt.args.deviceWatchList) + want := tt.want(tt.args.hostname, tt.args.config, *tt.args.deviceWatchList) + + if !tt.wantErr { + assert.NoError(t, err, "unexpected error") + + wantAttrs := testutils.GetFields(&want.(*clockEventsCollector).expCollector, testutils.Fields) + gotAttrs := testutils.GetFields(&got.(*clockEventsCollector).expCollector, testutils.Fields) + assert.Equal(t, wantAttrs, gotAttrs, "unexpected result") + + gotFuncAttrs := testutils.GetFields(&got.(*clockEventsCollector).expCollector, testutils.Functions) + for functionName, value := range gotFuncAttrs { + assert.NotNilf(t, value, "unexpected %s to be not nil", functionName) + } + } else { + assert.Error(t, err, "expected error") + assert.Equal(t, want, got, "unexpected result") + } + }) + 
} +} + +func clockEventMetricsCreator( + counter counters.Counter, gpuID uint, value, hostname, mockFieldName, + mockFieldLabelValue string, mockClockEvent uint64, useOldNamespace bool, +) Metric { + uuid := "UUID" + if useOldNamespace { + uuid = "uuid" + } + + labels := map[string]string{ + windowSizeInMSLabel: "0", + mockFieldName: mockFieldLabelValue, + } + + if mockClockEvent != invalidClockEventValue { + labels["clock_event"] = clockEventBitmask(mockClockEvent).String() + } + + return Metric{ + Counter: counter, + Value: value, + GPU: fmt.Sprintf("%d", gpuID), + GPUUUID: "", + GPUDevice: fmt.Sprintf("nvidia%d", gpuID), + GPUModelName: "", + UUID: uuid, + MigProfile: "", + GPUInstanceID: "", + Hostname: hostname, + Labels: labels, + Attributes: map[string]string{}, + } +} + +func sortClockEventMetrics(metrics []Metric) { + slices.SortFunc(metrics, func(a, b Metric) int { + if a.GPU < b.GPU { + return -1 + } else if a.GPU == b.GPU { + if a.Labels["clock_event"] < b.Labels["clock_event"] { + return -1 + } + } + return 1 + }) +} + +func Test_clockEventsCollector_GetMetrics(t *testing.T) { + /******* Mock DCGM *************/ + ctrl := gomock.NewController(t) + mockDCGM := mockdcgm.NewMockDCGM(ctrl) + mockDeviceWatcher := mockdevicewatcher.NewMockWatcher(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGM) + + /******** Mock Counters ************/ + mockDCGMExpClockEventsCounter := counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpClockEventsCount, + } + + mockOtherCounter := counters.Counter{ + FieldID: 2, + FieldName: "random2", + } + + mockLabelDeviceField := dcgm.Short(3) + mockFieldName := "random3" + mockLabelValue := "this is mock label" + mockLabelCounter := counters.Counter{ + FieldID: mockLabelDeviceField, + FieldName: mockFieldName, + PromType: "label", + } + + /******** Mock Device Info *********/ + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + 
mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + /******** Other Mock Inputs ************/ + gpuID1 := uint(0) + gpuID2 := uint(1) + + mockDeviceFields := []dcgm.Short{42} + mockCollectorInterval := int64(1) + mockConfig := appconfig.Config{} + mockHostname := "localhost" + cleanupCalled := 0 + mockCleanups := []func(){ + func() { + cleanupCalled++ + }, + } + + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + mockLatestValues := []dcgm.FieldValue_v1{ + { + FieldId: 150, + FieldType: dcgm.DCGM_FT_INT64, + Value: [4096]byte{42}, + }, + { + FieldId: uint(mockLabelDeviceField), + FieldType: dcgm.DCGM_FT_STRING, + Value: testutils.StrToByteArray(mockLabelValue), + }, + { + FieldId: uint(mockLabelDeviceField), + FieldType: dcgm.DCGM_FT_STRING, + Value: testutils.StrToByteArray(dcgm.DCGM_FT_STR_NOT_FOUND), + }, + } + + tests := []struct { + name string + collector func() Collector + conditions func(*mockdevicewatcher.MockWatcher, byte, byte) + want func() (MetricsByCounter, byte, byte) + wantErr bool + }{ + { + name: "clock events collector with single clock events", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + sampleConfig := appconfig.Config{UseOldNamespace: true} + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &sampleConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, gpu1Value, gpu2Value byte) { + mockEntitiesResult := 
[]dcgm.FieldValue_v2{ + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID2, Value: [4096]byte{gpu2Value}}, + } + + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return(mockEntitiesResult, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID2, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + }, + want: func() (MetricsByCounter, byte, byte) { + mockClockOutput11 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP) + mockClockOutput12 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL) + + mockClockOutput21 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE) + mockClockOutput22 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL) + + return MetricsByCounter{ + mockDCGMExpClockEventsCounter: []Metric{ + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput11, true), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput12, true), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput21, true), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput22, true), + }, + }, byte(mockClockOutput11 + mockClockOutput12), byte(mockClockOutput21 + mockClockOutput22) + }, + wantErr: false, + }, + { + name: "extra values from GPUs that are not monitored", + collector: func() Collector { + 
counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, gpu1Value, gpu2Value byte) { + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID2, Value: [4096]byte{gpu2Value}}, + {EntityId: uint(2), Value: [4096]byte{gpu2Value}}, + } + + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return(mockEntitiesResult, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID2, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + }, + want: func() (MetricsByCounter, byte, byte) { + mockClockOutput11 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP) + mockClockOutput12 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL) + + mockClockOutput21 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE) + mockClockOutput22 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL) + + return MetricsByCounter{ + mockDCGMExpClockEventsCounter: []Metric{ + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput11, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "1", 
mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput12, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput21, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput22, false), + }, + }, byte(mockClockOutput11 + mockClockOutput12), byte(mockClockOutput21 + mockClockOutput22) + }, + wantErr: false, + }, + { + name: "missing values for a GPU that is monitored", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[3] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + mockGPUDeviceInfoTemp := testutils.MockGPUDeviceInfo(ctrl, 4, gpuInstanceInfos) + mockGPUDeviceInfoTemp.EXPECT().GOpts().Return(gOpts).AnyTimes() + + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfoTemp, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, gpu1Value, gpu2Value byte) { + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID2, Value: [4096]byte{gpu2Value}}, + } + + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return(mockEntitiesResult, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, 
gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID2, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, uint(2), + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU_I, uint(14), + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + }, + want: func() (MetricsByCounter, byte, byte) { + mockClockOutput11 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP) + mockClockOutput12 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL) + + mockClockOutput21 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE) + mockClockOutput22 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL) + + migClockEvent := clockEventMetricsCreator(mockDCGMExpClockEventsCounter, uint(3), "0", mockHostname, + mockFieldName, + mockLabelValue, invalidClockEventValue, false) + migClockEvent.MigProfile = testutils.MockGPUInstanceInfo2.ProfileName + migClockEvent.GPUInstanceID = fmt.Sprintf("%d", testutils.MockGPUInstanceInfo2.Info.NvmlInstanceId) + + return MetricsByCounter{ + mockDCGMExpClockEventsCounter: []Metric{ + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput11, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput12, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput21, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput22, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, uint(2), "0", mockHostname, + mockFieldName, + mockLabelValue, invalidClockEventValue, false), + migClockEvent, + }, + }, 
byte(mockClockOutput11 + mockClockOutput12), byte(mockClockOutput21 + mockClockOutput22) + }, + wantErr: false, + }, + { + name: "clock events collector with multiple clock events", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, gpu1Value, gpu2Value byte) { + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID2, Value: [4096]byte{gpu2Value}}, + {EntityId: gpuID2, Value: [4096]byte{gpu2Value}}, + } + + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return(mockEntitiesResult, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID2, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + }, + want: func() (MetricsByCounter, byte, byte) { + mockClockOutput11 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP) + mockClockOutput12 := uint64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL) + + mockClockOutput21 := uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE) + mockClockOutput22 := 
uint64(DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL) + + return MetricsByCounter{ + mockDCGMExpClockEventsCounter: []Metric{ + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "3", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput11, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID1, "3", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput12, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "2", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput21, false), + clockEventMetricsCreator(mockDCGMExpClockEventsCounter, gpuID2, "2", mockHostname, + mockFieldName, + mockLabelValue, mockClockOutput22, false), + }, + }, byte(mockClockOutput11 + mockClockOutput12), byte(mockClockOutput21 + mockClockOutput22) + }, + wantErr: false, + }, + { + name: "clock events collector with UpdateAllFields() error", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, _, _ byte) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(fmt.Errorf("some error")) + }, + want: func() (MetricsByCounter, byte, byte) { + return nil, 0, 0 + }, + wantErr: true, + }, + { + name: "clock events collector with GetValuesSince() error", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := 
devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, _, _ byte) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return([]dcgm.FieldValue_v2{}, time.Time{}, + fmt.Errorf("some error")) + }, + want: func() (MetricsByCounter, byte, byte) { + return nil, 0, 0 + }, + wantErr: true, + }, + { + name: "clock events collector with EntityGetLatestValues() error", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMExpClockEventsCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewClockEventsCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, _, _ byte) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return([]dcgm.FieldValue_v2{}, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, fmt.Errorf("some error")) + 
}, + want: func() (MetricsByCounter, byte, byte) { + return nil, 0, 0 + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + want, gpu1Value, gpu2Value := tt.want() + tt.conditions(mockDeviceWatcher, gpu1Value, gpu2Value) + c := tt.collector() + defer func() { + c.Cleanup() + assert.Equal(t, 1, cleanupCalled, "clean up function was not called") + cleanupCalled = 0 // reset to zero + }() + + got, err := c.GetMetrics() + + if !tt.wantErr { + assert.NoError(t, err, "GetMetrics() failed") + assert.NotEmpty(t, got, "GetMetrics() returned no metrics") + + wantMetrics := want[mockDCGMExpClockEventsCounter] + gotMetrics := got[mockDCGMExpClockEventsCounter] + + assert.Len(t, gotMetrics, len(wantMetrics), "GetMetrics() returned wrong number of metrics") + + sortClockEventMetrics(wantMetrics) + sortClockEventMetrics(gotMetrics) + + assert.Equalf(t, wantMetrics, gotMetrics, "GetMetrics()") + } else { + assert.Errorf(t, err, "GetMetrics() did not return expected error") + assert.Empty(t, got, "GetMetrics() returned unexpected metrics") + } + }) + } +} diff --git a/internal/pkg/collector/collector_factory.go b/internal/pkg/collector/collector_factory.go new file mode 100644 index 00000000..e26027d2 --- /dev/null +++ b/internal/pkg/collector/collector_factory.go @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package collector + +import ( + "fmt" + "log/slog" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" +) + +type Factory interface { + NewCollectors() []EntityCollectorTuple +} + +type collectorFactory struct { + counterSet *counters.CounterSet + deviceWatchListManager devicewatchlistmanager.Manager + hostname string + config *appconfig.Config +} + +func InitCollectorFactory( + counterSet *counters.CounterSet, + deviceWatchListManager devicewatchlistmanager.Manager, + hostname string, + config *appconfig.Config, +) Factory { + return &collectorFactory{ + counterSet: counterSet, + deviceWatchListManager: deviceWatchListManager, + hostname: hostname, + config: config, + } +} + +func (cf *collectorFactory) NewCollectors() []EntityCollectorTuple { + slog.Debug("Counters are being initialized.", + slog.String(logging.DumpKey, fmt.Sprintf("%+v", cf.counterSet.DCGMCounters))) + + entityCollectorTuples := make([]EntityCollectorTuple, 0) + entityTypes := []dcgm.Field_Entity_Group{ + dcgm.FE_GPU, + dcgm.FE_SWITCH, + dcgm.FE_LINK, + dcgm.FE_CPU, + dcgm.FE_CPU_CORE, + } + + for _, entityType := range entityTypes { + if len(cf.counterSet.DCGMCounters) > 0 { + entityWatchList, exists := cf.deviceWatchListManager.EntityWatchList(entityType) + if !exists || len(entityWatchList.DeviceFields()) == 0 { + continue + } + + if dcgmCollector, err := cf.enableDCGMCollector(entityWatchList); err != nil { + slog.Error(fmt.Sprintf("DCGM collector for entity type '%s' cannot be initialized; err: %v", + entityType.String(), err)) + os.Exit(1) + } else { + entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{ + entity: entityType, + collector: dcgmCollector, + }) + } + } + } + + if 
IsDCGMExpClockEventsCountEnabled(cf.counterSet.ExporterCounters) { + if newCollector, err := cf.enableExpCollector(counters.DCGMExpClockEventsCount); err != nil { + slog.Error(fmt.Sprintf("collector '%s' cannot be initialized; err: %v", counters.DCGMExpClockEventsCount, err)) + os.Exit(1) + } else { + entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{ + entity: dcgm.FE_GPU, + collector: newCollector, + }) + } + } + + if IsDCGMExpXIDErrorsCountEnabled(cf.counterSet.ExporterCounters) { + if newCollector, err := cf.enableExpCollector(counters.DCGMExpXIDErrorsCount); err != nil { + slog.Error(fmt.Sprintf("collector '%s' cannot be initialized; err: %v", counters.DCGMExpXIDErrorsCount, err)) + os.Exit(1) + } else { + entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{ + entity: dcgm.FE_GPU, + collector: newCollector, + }) + } + } + + if IsDCGMExpGPUHealthStatusEnabled(cf.counterSet.ExporterCounters) { + if newCollector, err := cf.enableExpCollector(counters.DCGMExpGPUHealthStatus); err != nil { + slog.Error(fmt.Sprintf("collector '%s' cannot be initialized; err: %v", counters.DCGMExpGPUHealthStatus, err)) + os.Exit(1) + } else { + entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{ + entity: dcgm.FE_GPU, + collector: newCollector, + }) + } + } + + return entityCollectorTuples +} + +func (cf *collectorFactory) enableDCGMCollector(entityWatchList devicewatchlistmanager.WatchList) (Collector, error, +) { + newCollector, err := NewDCGMCollector(cf.counterSet.DCGMCounters, cf.hostname, cf.config, + entityWatchList) + if err != nil { + return nil, err + } + + return newCollector, nil +} + +func (cf *collectorFactory) enableExpCollector(expCollectorName string) (Collector, error) { + entityType := dcgm.FE_GPU + + item, exists := cf.deviceWatchListManager.EntityWatchList(entityType) + if !exists { + return nil, fmt.Errorf("entity type '%s' does not exist", entityType.String()) + } + + var newCollector 
Collector + var err error + switch expCollectorName { + case counters.DCGMExpClockEventsCount: + newCollector, err = NewClockEventsCollector(cf.counterSet.ExporterCounters, cf.hostname, cf.config, + item) + case counters.DCGMExpXIDErrorsCount: + newCollector, err = NewXIDCollector(cf.counterSet.ExporterCounters, cf.hostname, cf.config, + item) + case counters.DCGMExpGPUHealthStatus: + newCollector, err = NewGPUHealthStatusCollector(cf.counterSet.ExporterCounters, + cf.hostname, + cf.config, + item, + ) + default: + err = fmt.Errorf("invalid collector '%s'", expCollectorName) + } + + if err != nil { + return nil, err + } + + slog.Info(fmt.Sprintf("collector '%s' initialized", expCollectorName)) + return newCollector, nil +} diff --git a/internal/pkg/collector/collector_factory_test.go b/internal/pkg/collector/collector_factory_test.go new file mode 100644 index 00000000..1c533d87 --- /dev/null +++ b/internal/pkg/collector/collector_factory_test.go @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package collector + +import ( + "errors" + "fmt" + "strings" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + + osmock "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" + osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + mockdevicewatchlistmanager "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" +) + +var deviceWatcher = devicewatcher.NewDeviceWatcher() + +var mockGPU = deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + GPUInstances: []deviceinfo.GPUInstanceInfo{}, +} + +func Test_collectorFactory_Register(t *testing.T) { + dcgmCounter := counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_GPU_TEMP, + FieldName: "DCGM_FI_DEV_GPU_TEMP", + PromType: "gauge", + Help: "", + } + + ctrl := gomock.NewController(t) + + mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDeviceInfo.EXPECT().InfoType().Return(dcgm.FE_NONE).AnyTimes() + mockDeviceInfo.EXPECT().GOpts().Return(appconfig.DeviceOptions{Flex: true}).AnyTimes() + mockDeviceInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDeviceInfo.EXPECT().GPU(uint(0)).Return(mockGPU).AnyTimes() + + defaultDeviceWatchList := *devicewatchlistmanager.NewWatchList(mockDeviceInfo, []dcgm.Short{42}, nil, + deviceWatcher, int64(1)) + + tests := []struct { + name string + cs *counters.CounterSet + getDeviceWatchListManager func() 
devicewatchlistmanager.Manager + hostname string + config *appconfig.Config + setupDCGMMock func(*mockdcgm.MockDCGM) + assert func(*testing.T, []EntityCollectorTuple) + wantsPanic bool + }{ + { + name: fmt.Sprintf("Collector enabled for the %s", dcgm.FE_GPU.String()), + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{dcgmCounter}, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager.WatchList{}, + false).AnyTimes() + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockGroupHandle := dcgm.GroupHandle{} + mockGroupHandle.SetHandle(uintptr(42)) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil).AnyTimes() + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, + mockGPU.DeviceInfo.GPU).Return(nil).AnyTimes() + + mockFieldHandle := dcgm.FieldHandle{} + mockFieldHandle.SetHandle(uintptr(43)) + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Eq([]dcgm.Short{42})).Return( + mockFieldHandle, nil).AnyTimes() + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(gomock.Eq(mockFieldHandle), + gomock.Eq(mockGroupHandle), + gomock.Any(), + gomock.Any(), + gomock.Any(), + ).Return(nil).AnyTimes() + }, + assert: func(t *testing.T, entityCollectorTuples []EntityCollectorTuple) { + require.Len(t, entityCollectorTuples, 1) + require.Equal(t, entityCollectorTuples[0].Entity(), dcgm.FE_GPU) + require.IsType(t, &DCGMCollector{}, entityCollectorTuples[0].Collector()) + }, + }, + { + name: fmt.Sprintf("Collector enabled for the %s but DCGM returns error", dcgm.FE_GPU.String()), + cs: &counters.CounterSet{ + DCGMCounters: 
[]counters.Counter{dcgmCounter}, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager.WatchList{}, + false).AnyTimes() + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockGroupHandle := dcgm.GroupHandle{} + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, errors.New("boom")).AnyTimes() + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_CLOCK_EVENTS_COUNT collector is enabled", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true).AnyTimes() + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: setupDCGMMockForDCGMExpMetrics([]dcgm.Short{112}), + assert: func(t *testing.T, entityCollectorTuples []EntityCollectorTuple) { + require.Len(t, entityCollectorTuples, 1) + require.Equal(t, entityCollectorTuples[0].Entity(), dcgm.FE_GPU) + require.IsType(t, &clockEventsCollector{}, entityCollectorTuples[0].Collector()) + }, + }, + { + name: "DCGM_EXP_CLOCK_EVENTS_COUNT collector can not be initialized", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager 
{ + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager. + WatchList{}, false).AnyTimes() + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + wantsPanic: true, + assert: func(t *testing.T, entityCollectorTuples []EntityCollectorTuple) { + require.Len(t, entityCollectorTuples, 0) + }, + }, + { + name: "DCGM_EXP_CLOCK_EVENTS_COUNT collector can not be created by DCGM", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager.WatchList{}, + false).AnyTimes() + return mockDeviceWatchListManager + }, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockGroupHandle := dcgm.GroupHandle{} + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, errors.New("boom")).AnyTimes() + }, + hostname: "testhost", + config: &appconfig.Config{}, + wantsPanic: true, + }, + { + name: "DCGM_EXP_XID_ERRORS_COUNT collector is enabled", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_XID_ERRORS_COUNT", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + 
mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager.WatchList{}, + false).AnyTimes() + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: setupDCGMMockForDCGMExpMetrics([]dcgm.Short{230}), + assert: func(t *testing.T, entityCollectorTuples []EntityCollectorTuple) { + require.Len(t, entityCollectorTuples, 1) + require.Equal(t, entityCollectorTuples[0].Entity(), dcgm.FE_GPU) + require.IsType(t, &xidCollector{}, entityCollectorTuples[0].Collector()) + }, + }, + { + name: "DCGM_EXP_XID_ERRORS_COUNT collector can not be initialized", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_XID_ERRORS_COUNT", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager. 
+ WatchList{}, false).AnyTimes() + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + wantsPanic: true, + assert: func(t *testing.T, entityCollectorTuples []EntityCollectorTuple) { + require.Len(t, entityCollectorTuples, 0) + }, + }, + { + name: "DCGM_EXP_XID_ERRORS_COUNT collector can not be created by DCGM", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_XID_ERRORS_COUNT", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager.WatchList{}, + false).AnyTimes() + return mockDeviceWatchListManager + }, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockGroupHandle := dcgm.GroupHandle{} + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, errors.New("boom")).AnyTimes() + }, + hostname: "testhost", + config: &appconfig.Config{}, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector is enabled", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{0}, nil).AnyTimes() + mockDCGM.EXPECT().HealthSet(gomock.Any(), 
gomock.Any()).Return(nil).AnyTimes() + mockDCGM.EXPECT().GetAllDeviceCount().Return(uint(1), nil).AnyTimes() + mockDCGM.EXPECT().GetDeviceInfo(gomock.Eq(uint(0))).Return(dcgm.Device{}, nil).AnyTimes() + mockDCGM.EXPECT().GetGpuInstanceHierarchy().Return(dcgm.MigHierarchy_v2{}, nil).AnyTimes() + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(dcgm.FieldHandle{}, nil) + mockDCGM.EXPECT().WatchFieldsWithGroupEx(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil).AnyTimes() + setupDCGMMockForDCGMExpMetrics([]dcgm.Short{230})(mockDCGM) + }, + assert: func(t *testing.T, entityCollectorTuples []EntityCollectorTuple) { + require.Len(t, entityCollectorTuples, 1) + require.Equal(t, entityCollectorTuples[0].Entity(), dcgm.FE_GPU) + require.IsType(t, &gpuHealthStatusCollector{}, entityCollectorTuples[0].Collector()) + }, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{}, errors.New("boom!")).AnyTimes() + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized when zero supported devices", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + 
mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{}, nil) + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized when entity group can not be created", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{0}, nil) + mockDCGM.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu_health_monitor_") + })).Return(dcgm.GroupHandle{}, errors.New("boom!")) + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized when entity can not be added to the group", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: 
&appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{0}, nil) + mockDCGM.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu_health_monitor_") + })).Return(dcgm.GroupHandle{}, nil) + mockDCGM.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Eq(uint(0))).Return(errors.New("boom!")) + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized when enable healthcheck returns an error", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{0}, nil) + mockDCGM.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu_health_monitor_") + })).Return(dcgm.GroupHandle{}, nil) + mockDCGM.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Eq(uint(0))).Return(nil) + mockDCGM.EXPECT().HealthSet(gomock.Any(), gomock.Eq(dcgm.DCGM_HEALTH_WATCH_ALL)).Return(errors.New("boom!")) + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized when deviceinfo.Initialize returns an error", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := 
mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{0}, nil) + mockDCGM.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu_health_monitor_") + })).Return(dcgm.GroupHandle{}, nil) + mockDCGM.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Eq(uint(0))).Return(nil) + mockDCGM.EXPECT().HealthSet(gomock.Any(), gomock.Eq(dcgm.DCGM_HEALTH_WATCH_ALL)).Return(nil) + mockDCGM.EXPECT().GetAllDeviceCount().Return(uint(0), errors.New("boom!")) + }, + wantsPanic: true, + }, + { + name: "DCGM_EXP_GPU_HEALTH_STATUS collector can not be initialized when device watch returns an error", + cs: &counters.CounterSet{ + DCGMCounters: []counters.Counter{}, + ExporterCounters: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + }, + getDeviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_GPU).Return(defaultDeviceWatchList, + true) + return mockDeviceWatchListManager + }, + hostname: "testhost", + config: &appconfig.Config{}, + setupDCGMMock: func(mockDCGM *mockdcgm.MockDCGM) { + mockDCGM.EXPECT().GetSupportedDevices().Return([]uint{0}, nil) + mockDCGM.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu_health_monitor_") + })).Return(dcgm.GroupHandle{}, nil) + mockDCGM.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Eq(uint(0))).Return(nil) + mockDCGM.EXPECT().HealthSet(gomock.Any(), gomock.Eq(dcgm.DCGM_HEALTH_WATCH_ALL)).Return(nil) + mockDCGM.EXPECT().GetAllDeviceCount().Return(uint(1), nil) + 
mockDCGM.EXPECT().GetDeviceInfo(gomock.Eq(uint(0))).Return(dcgm.Device{}, nil) + mockDCGM.EXPECT().GetGpuInstanceHierarchy().Return(dcgm.MigHierarchy_v2{}, nil) + mockDCGM.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu-collector-group") + })).Return(dcgm.GroupHandle{}, errors.New("boom!")) + }, + wantsPanic: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + + mOS := osmock.NewMockOS(ctrl) + mOS.EXPECT().Exit(gomock.Eq(1)).Do(func(code int) { + panic("os.Exit") + }).AnyTimes() + os = mOS + defer func() { + os = osinterface.RealOS{} + }() + + dcgmprovider.SetClient(mockDCGMProvider) + if tt.setupDCGMMock != nil { + tt.setupDCGMMock(mockDCGMProvider) + } + + if tt.wantsPanic { + require.PanicsWithValue(t, "os.Exit", func() { + InitCollectorFactory(tt.cs, tt.getDeviceWatchListManager(), tt.hostname, + tt.config).NewCollectors() + }) + return + } + entityCollectors := InitCollectorFactory(tt.cs, tt.getDeviceWatchListManager(), tt.hostname, + tt.config).NewCollectors() + if tt.assert != nil { + tt.assert(t, entityCollectors) + } + }) + } +} + +func setupDCGMMockForDCGMExpMetrics(fields []dcgm.Short) func(mockDCGM *mockdcgm.MockDCGM) { + return func(mockDCGM *mockdcgm.MockDCGM) { + mockGroupHandle := dcgm.GroupHandle{} + mockGroupHandle.SetHandle(uintptr(42)) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil).AnyTimes() + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, + mockGPU.DeviceInfo.GPU).Return(nil).AnyTimes() + + mockFieldHandle := dcgm.FieldHandle{} + mockFieldHandle.SetHandle(uintptr(43)) + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Eq(fields)).Return( + mockFieldHandle, nil).AnyTimes() + + 
mockDCGM.EXPECT().WatchFieldsWithGroupEx(gomock.Eq(mockFieldHandle), + gomock.Eq(mockGroupHandle), + gomock.Any(), + gomock.Any(), + gomock.Any(), + ).Return(nil).AnyTimes() + } +} diff --git a/internal/pkg/collector/const.go b/internal/pkg/collector/const.go new file mode 100644 index 00000000..b239510d --- /dev/null +++ b/internal/pkg/collector/const.go @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +const ( + windowSizeInMSLabel = "window_size_in_ms" + + skipDCGMValue = "SKIPPING DCGM VALUE" + FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" +) diff --git a/internal/pkg/collector/expcollector.go b/internal/pkg/collector/expcollector.go new file mode 100644 index 00000000..ebc2572c --- /dev/null +++ b/internal/pkg/collector/expcollector.go @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "fmt" + "log/slog" + "maps" + "time" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" +) + +type expCollector struct { + baseExpCollector + fieldValueParser func(val int64) []int64 // Function to parse the field value + labelFiller func(map[string]string, int64) // Function to fill labels + windowSize int // Window size +} + +func (c *expCollector) getMetrics() (MetricsByCounter, error) { + err := dcgmprovider.Client().UpdateAllFields() + if err != nil { + return nil, err + } + + mapEntityIDToValues := map[uint]map[int64]int{} + + window := time.Now().Add(-time.Duration(c.windowSize) * time.Millisecond) + + for _, group := range c.deviceWatchList.DeviceGroups() { + values, _, err := dcgmprovider.Client().GetValuesSince(group, c.deviceWatchList.DeviceFieldGroup(), window) + if err != nil { + return nil, err + } + + for _, val := range values { + if val.Status == 0 { + if _, exists := mapEntityIDToValues[val.EntityId]; !exists { + mapEntityIDToValues[val.EntityId] = map[int64]int{} + } + + for _, v := range c.fieldValueParser(val.Int64()) { + mapEntityIDToValues[val.EntityId][v] += 1 + } + } + } + } + + labels := map[string]string{} + labels[windowSizeInMSLabel] = fmt.Sprint(c.windowSize) + + monitoringInfo := devicemonitoring.GetMonitoredEntities(c.deviceWatchList.DeviceInfo()) + metrics := make(MetricsByCounter) + useOld := c.config.UseOldNamespace + uuid := "UUID" + if useOld { + uuid = "uuid" + } + for _, mi := range monitoringInfo { + if len(c.labelsCounters) > 0 && len(c.deviceWatchList.LabelDeviceFields()) > 0 { + err := c.getLabelsFromCounters(mi, 
labels) + if err != nil { + return nil, err + } + } + entityValues, exists := mapEntityIDToValues[mi.DeviceInfo.GPU] + if exists { + for entityValue, val := range entityValues { + + metricValueLabels := maps.Clone(labels) + c.labelFiller(metricValueLabels, entityValue) + + m := c.createMetric(metricValueLabels, mi, uuid, val) + + metrics[c.counter] = append(metrics[c.counter], m) + } + } else { + // Create metric with Zero value if group (mapEntityIDToValues) is empty + m := c.createMetric(labels, mi, uuid, 0) + metrics[c.counter] = append(metrics[c.counter], m) + } + } + + return metrics, nil +} + +// newExpCollector is a constructor for the expCollector +func newExpCollector( + labelsCounters []counters.Counter, + hostname string, + config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, +) (expCollector, error) { + collector := expCollector{ + baseExpCollector: baseExpCollector{ + deviceWatchList: deviceWatchList, + hostname: hostname, + config: config, + labelsCounters: labelsCounters, + }, + + fieldValueParser: func(val int64) []int64 { + return []int64{val} + }, + labelFiller: func(metricValueLabels map[string]string, entityValue int64) { + // This function is intentionally left blank + }, + } + + var err error + + collector.cleanups, err = collector.deviceWatchList.Watch() + if err != nil { + slog.Warn(fmt.Sprintf("Failed to watch metrics: %s", err)) + return expCollector{}, err + } + + return collector, nil +} diff --git a/pkg/dcgmexporter/gpu_collector.go b/internal/pkg/collector/gpu_collector.go similarity index 58% rename from pkg/dcgmexporter/gpu_collector.go rename to internal/pkg/collector/gpu_collector.go index e4cac498..8bed9226 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/internal/pkg/collector/gpu_collector.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,95 +14,78 @@ * limitations under the License. */ -package dcgmexporter +package collector import ( "errors" "fmt" + "log/slog" "strconv" "strings" "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" ) const unknownErr = "Unknown Error" -type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, - func(), error) +type DCGMCollector struct { + counters []counters.Counter + cleanups []func() + useOldNamespace bool + deviceWatchList devicewatchlistmanager.WatchList + hostname string + replaceBlanksInModelName bool +} func NewDCGMCollector( - c []Counter, + c []counters.Counter, hostname string, - config *Config, - fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, -) (*DCGMCollector, func(), error) { - if fieldEntityGroupTypeSystemInfo.isEmpty() { - return nil, func() {}, errors.New("fieldEntityGroupTypeSystemInfo is empty") + config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, +) (*DCGMCollector, error) { + if deviceWatchList.IsEmpty() { + return nil, errors.New("deviceWatchList is empty") } collector := &DCGMCollector{ - Counters: c, - DeviceFields: fieldEntityGroupTypeSystemInfo.DeviceFields, - SysInfo: fieldEntityGroupTypeSystemInfo.SystemInfo, - Hostname: hostname, + counters: c, + deviceWatchList: deviceWatchList, + hostname: hostname, } if config == nil { - logrus.Warn("Config is empty") - return collector, func() { collector.Cleanup() }, 
nil + slog.Warn("Config is empty") + return collector, nil } - collector.UseOldNamespace = config.UseOldNamespace - collector.ReplaceBlanksInModelName = config.ReplaceBlanksInModelName + collector.useOldNamespace = config.UseOldNamespace + collector.replaceBlanksInModelName = config.ReplaceBlanksInModelName - _, _, cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, - fieldEntityGroupTypeSystemInfo.SystemInfo, - int64(config.CollectInterval)*1000) - if err != nil { - logrus.Fatal("Failed to watch metrics: ", err) - } - - collector.Cleanups = cleanups - - return collector, func() { collector.Cleanup() }, nil -} - -func GetSystemInfo(config *Config, entityType dcgm.Field_Entity_Group) (*SystemInfo, error) { - sysInfo, err := InitializeSystemInfo(config.GPUDevices, - config.SwitchDevices, - config.CPUDevices, - config.UseFakeGPUs, entityType) + cleanups, err := deviceWatchList.Watch() if err != nil { return nil, err } - return &sysInfo, err -} -func GetHostname(config *Config) (string, error) { - hostname := "" - var err error - if !config.NoHostname { - if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { - hostname = nodeName - } else { - hostname, err = os.Hostname() - if err != nil { - return "", err - } - } - } - return hostname, nil + collector.cleanups = cleanups + + return collector, nil } func (c *DCGMCollector) Cleanup() { - for _, c := range c.Cleanups { + for _, c := range c.cleanups { c() } } func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error) { - monitoringInfo := GetMonitoredEntities(c.SysInfo) + monitoringInfo := devicemonitoring.GetMonitoredEntities(c.deviceWatchList.DeviceInfo()) metrics := make(MetricsByCounter) @@ -110,78 +93,70 @@ func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error) { var vals []dcgm.FieldValue_v1 var err error if mi.Entity.EntityGroupId == dcgm.FE_LINK { - vals, err = dcgm.LinkGetLatestValues(mi.Entity.EntityId, mi.ParentId, c.DeviceFields) + vals, err = 
dcgmprovider.Client().LinkGetLatestValues(mi.Entity.EntityId, mi.ParentId, + c.deviceWatchList.DeviceFields()) } else { - vals, err = dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.DeviceFields) + vals, err = dcgmprovider.Client().EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, + c.deviceWatchList.DeviceFields()) } if err != nil { if derr, ok := err.(*dcgm.DcgmError); ok { if derr.Code == dcgm.DCGM_ST_CONNECTION_NOT_VALID { - logrus.Fatal("Could not retrieve metrics: ", err) + slog.Error("Could not retrieve metrics: " + err.Error()) + os.Exit(1) } } return nil, err } // InstanceInfo will be nil for GPUs - if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK { - ToSwitchMetric(metrics, vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) - } else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE { - ToCPUMetric(metrics, vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) - } else { - ToMetric(metrics, + switch c.deviceWatchList.DeviceInfo().InfoType() { + case dcgm.FE_SWITCH, dcgm.FE_LINK: + toSwitchMetric(metrics, vals, c.counters, mi, c.useOldNamespace, c.hostname) + case dcgm.FE_CPU, dcgm.FE_CPU_CORE: + toCPUMetric(metrics, vals, c.counters, mi, c.useOldNamespace, c.hostname) + default: + toMetric(metrics, vals, - c.Counters, + c.counters, mi.DeviceInfo, mi.InstanceInfo, - c.UseOldNamespace, - c.Hostname, - c.ReplaceBlanksInModelName) + c.useOldNamespace, + c.hostname, + c.replaceBlanksInModelName) } } return metrics, nil } -func ShouldMonitorDeviceType(fields []dcgm.Short, entityType dcgm.Field_Entity_Group) bool { - if len(fields) == 0 { - return false - } - - if len(fields) == 1 && fields[0] == dcgm.DCGM_FI_DRIVER_VERSION { - return false - } - - return true -} - -func FindCounterField(c []Counter, fieldID uint) (Counter, error) { +func findCounterField(c []counters.Counter, fieldID uint) (counters.Counter, error) { for i := 0; i < len(c); i++ { if 
uint(c[i].FieldID) == fieldID { return c[i], nil } } - return Counter{}, fmt.Errorf("could not find counter corresponding to field ID '%d'", fieldID) + return counters.Counter{}, fmt.Errorf("could not find counter corresponding to field ID '%d'", fieldID) } -func ToSwitchMetric( +func toSwitchMetric( metrics MetricsByCounter, - values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, + values []dcgm.FieldValue_v1, c []counters.Counter, mi devicemonitoring.Info, useOld bool, hostname string, ) { labels := map[string]string{} for _, val := range values { - v := ToString(val) + v := toString(val) // Filter out counters with no value and ignored fields for this entity - counter, err := FindCounterField(c, val.FieldId) + counter, err := findCounterField(c, val.FieldId) if err != nil { continue } - if counter.PromType == "label" { + if counter.IsLabel() { labels[counter.FieldName] = v continue } @@ -190,7 +165,7 @@ func ToSwitchMetric( uuid = "uuid" } var m Metric - if v == SkipDCGMValue { + if v == skipDCGMValue { continue } else { m = Metric{ @@ -212,22 +187,22 @@ func ToSwitchMetric( } } -func ToCPUMetric( +func toCPUMetric( metrics MetricsByCounter, - values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, + values []dcgm.FieldValue_v1, c []counters.Counter, mi devicemonitoring.Info, useOld bool, hostname string, ) { labels := map[string]string{} for _, val := range values { - v := ToString(val) + v := toString(val) // Filter out counters with no value and ignored fields for this entity - counter, err := FindCounterField(c, val.FieldId) + counter, err := findCounterField(c, val.FieldId) if err != nil { continue } - if counter.PromType == "label" { + if counter.IsLabel() { labels[counter.FieldName] = v continue } @@ -236,7 +211,7 @@ func ToCPUMetric( uuid = "uuid" } var m Metric - if v == SkipDCGMValue { + if v == skipDCGMValue { continue } else { m = Metric{ @@ -258,12 +233,12 @@ func ToCPUMetric( } 
} -func ToMetric( +func toMetric( metrics MetricsByCounter, values []dcgm.FieldValue_v1, - c []Counter, + c []counters.Counter, d dcgm.Device, - instanceInfo *GPUInstanceInfo, + instanceInfo *deviceinfo.GPUInstanceInfo, useOld bool, hostname string, replaceBlanksInModelName bool, @@ -271,18 +246,18 @@ func ToMetric( labels := map[string]string{} for _, val := range values { - v := ToString(val) + v := toString(val) // Filter out counters with no value and ignored fields for this entity - if v == SkipDCGMValue { + if v == skipDCGMValue { continue } - counter, err := FindCounterField(c, val.FieldId) + counter, err := findCounterField(c, val.FieldId) if err != nil { continue } - if counter.PromType == "label" { + if counter.IsLabel() { labels[counter.FieldName] = v continue } @@ -342,52 +317,52 @@ func getGPUModel(d dcgm.Device, replaceBlanksInModelName bool) string { return gpuModel } -func ToString(value dcgm.FieldValue_v1) string { +func toString(value dcgm.FieldValue_v1) string { switch value.FieldType { case dcgm.DCGM_FT_INT64: switch v := value.Int64(); v { case dcgm.DCGM_FT_INT32_BLANK: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT32_NOT_FOUND: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT64_BLANK: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT64_NOT_FOUND: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: - return SkipDCGMValue + return skipDCGMValue default: return fmt.Sprintf("%d", value.Int64()) } case dcgm.DCGM_FT_DOUBLE: switch v := value.Float64(); v { case dcgm.DCGM_FT_FP64_BLANK: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_FP64_NOT_FOUND: - return SkipDCGMValue + return 
skipDCGMValue case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: - return SkipDCGMValue + return skipDCGMValue default: return fmt.Sprintf("%f", value.Float64()) } case dcgm.DCGM_FT_STRING: switch v := value.String(); v { case dcgm.DCGM_FT_STR_BLANK: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_STR_NOT_FOUND: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_STR_NOT_SUPPORTED: - return SkipDCGMValue + return skipDCGMValue case dcgm.DCGM_FT_STR_NOT_PERMISSIONED: - return SkipDCGMValue + return skipDCGMValue default: return v } diff --git a/internal/pkg/collector/gpu_collector_test.go b/internal/pkg/collector/gpu_collector_test.go new file mode 100644 index 00000000..cdb2ce39 --- /dev/null +++ b/internal/pkg/collector/gpu_collector_test.go @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package collector + +import ( + "fmt" + "reflect" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +func TestToMetric(t *testing.T) { + fieldValue := [4096]byte{} + fieldValue[0] = 42 + values := []dcgm.FieldValue_v1{ + { + FieldId: 150, + FieldType: dcgm.DCGM_FT_INT64, + Value: fieldValue, + }, + } + + c := []counters.Counter{ + { + FieldID: 150, + FieldName: "DCGM_FI_DEV_GPU_TEMP", + PromType: "gauge", + Help: "Temperature Help info", + }, + } + + d := dcgm.Device{ + UUID: "fake0", + Identifiers: dcgm.DeviceIdentifiers{ + Model: "NVIDIA T400 4GB", + }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, + } + + var instanceInfo *deviceinfo.GPUInstanceInfo = nil + + type testCase struct { + replaceBlanksInModelName bool + expectedGPUModelName string + } + + testCases := []testCase{ + { + replaceBlanksInModelName: true, + expectedGPUModelName: "NVIDIA-T400-4GB", + }, + { + replaceBlanksInModelName: false, + expectedGPUModelName: "NVIDIA T400 4GB", + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("When replaceBlanksInModelName is %t", tc.replaceBlanksInModelName), func(t *testing.T) { + metrics := make(map[counters.Counter][]Metric) + toMetric(metrics, values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName) + assert.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + assert.Equal(t, "42", metricValues[0].Value) + assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName) + + assert.Equal(t, d.UUID, metricValues[0].GPUUUID) + assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) + }) + } +} + +func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { + c := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, + FieldName: 
"DCGM_FI_DEV_GPU_TEMP", + PromType: "gauge", + Help: "Temperature Help info", + }, + } + + d := dcgm.Device{ + UUID: "fake0", + Identifiers: dcgm.DeviceIdentifiers{ + Model: "NVIDIA T400 4GB", + }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, + } + + var instanceInfo *deviceinfo.GPUInstanceInfo = nil + + type testCase struct { + name string + fieldValue byte + expectedErr string + } + + testCases := []testCase{ + { + name: "when DCGM_FI_DEV_XID_ERRORS has no error", + fieldValue: 0, + expectedErr: xidErrCodeToText[0], + }, + { + name: "when DCGM_FI_DEV_XID_ERRORS has known value", + fieldValue: 42, + expectedErr: xidErrCodeToText[42], + }, + { + name: "when DCGM_FI_DEV_XID_ERRORS has unknown value", + fieldValue: 255, + expectedErr: unknownErr, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fieldValue := [4096]byte{} + fieldValue[0] = tc.fieldValue + values := []dcgm.FieldValue_v1{ + { + FieldId: dcgm.DCGM_FI_DEV_XID_ERRORS, + FieldType: dcgm.DCGM_FT_INT64, + Value: fieldValue, + }, + } + + metrics := make(map[counters.Counter][]Metric) + toMetric(metrics, values, c, d, instanceInfo, false, "", false) + assert.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Value) + assert.Contains(t, metricValues[0].Attributes, "err_code") + assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Attributes["err_code"]) + assert.Contains(t, metricValues[0].Attributes, "err_msg") + assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"]) + + assert.Equal(t, d.UUID, metricValues[0].GPUUUID) + assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) + }) + } +} diff --git a/internal/pkg/collector/gpu_health_collector.go b/internal/pkg/collector/gpu_health_collector.go new file mode 100644 index 00000000..62ec8065 --- /dev/null +++ 
b/internal/pkg/collector/gpu_health_collector.go @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "errors" + "fmt" + "maps" + "slices" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/utils" +) + +var gpuHealthChecks = []dcgm.HealthSystem{ + dcgm.DCGM_HEALTH_WATCH_PCIE, + dcgm.DCGM_HEALTH_WATCH_NVLINK, + dcgm.DCGM_HEALTH_WATCH_PMU, + dcgm.DCGM_HEALTH_WATCH_MCU, + dcgm.DCGM_HEALTH_WATCH_MEM, + dcgm.DCGM_HEALTH_WATCH_SM, + dcgm.DCGM_HEALTH_WATCH_INFOROM, + dcgm.DCGM_HEALTH_WATCH_THERMAL, + dcgm.DCGM_HEALTH_WATCH_POWER, + dcgm.DCGM_HEALTH_WATCH_DRIVER, +} + +type gpuHealthStatusCollector struct { + baseExpCollector + groupID dcgm.GroupHandle + deviceInfoProvider deviceinfo.Provider +} + +func (c *gpuHealthStatusCollector) GetMetrics() (MetricsByCounter, error) { + // Read the GPU health status. 
+ gpuHealthStatus, err := dcgmprovider.Client().HealthCheck(c.groupID) + if err != nil { + return MetricsByCounter{}, err + } + + monitoringInfo := devicemonitoring.GetMonitoredEntities(c.deviceInfoProvider) + + // Get the GPU in the group + groupInfo, err := dcgmprovider.Client().GetGroupInfo(c.groupID) + if err != nil { + return MetricsByCounter{}, err + } + + groupEntityPairSet := make(map[dcgm.GroupEntityPair]struct{}) + + for _, entityPair := range groupInfo.EntityList { + groupEntityPairSet[entityPair] = struct{}{} + } + + // Find monitoring info for GPU in the group + monitoringInfoInGroup := make([]devicemonitoring.Info, 0) + + for _, info := range monitoringInfo { + if _, exists := groupEntityPairSet[info.Entity]; exists { + monitoringInfoInGroup = append(monitoringInfoInGroup, info) + } + } + + metrics := make(MetricsByCounter) + metrics[c.counter] = make([]Metric, 0) + + useOld := c.config.UseOldNamespace + uuid := "UUID" + if useOld { + uuid = "uuid" + } + + entityHealthSystemToIncident := map[dcgm.GroupEntityPair]map[dcgm.HealthSystem]dcgm.Incident{} + + for _, mi := range monitoringInfoInGroup { + entityHealthSystemToIncident[mi.Entity] = make(map[dcgm.HealthSystem]dcgm.Incident) + // Populate the table with default values + for _, healthSystem := range gpuHealthChecks { + entityHealthSystemToIncident[mi.Entity][healthSystem] = dcgm.Incident{ + System: healthSystem, + Health: dcgm.DCGM_HEALTH_RESULT_PASS, + Error: dcgm.DiagErrorDetail{}, + } + } + } + + // We assyme that each health check may produce only one incident per system + for _, incident := range gpuHealthStatus.Incidents { + entityHealthSystemToIncident[incident.EntityInfo][incident.System] = incident + } + + labels := map[string]string{} + + for _, mi := range monitoringInfoInGroup { + if len(c.labelsCounters) > 0 && len(c.deviceWatchList.LabelDeviceFields()) > 0 { + err := c.getLabelsFromCounters(mi, labels) + if err != nil { + return nil, err + } + } + for _, healthSystem := range 
gpuHealthChecks { + incident := entityHealthSystemToIncident[mi.Entity][healthSystem] + metricValueLabels := maps.Clone(labels) + metricValueLabels["health_watch"] = healthSystemWatchToString(incident.System) + metricValueLabels["health_error_code"] = healthCheckErrorToString(incident.Error.Code) + m := c.createMetric(metricValueLabels, mi, uuid, int(incident.Health)) + metrics[c.counter] = append(metrics[c.counter], m) + } + } + + return metrics, nil +} + +func (c *gpuHealthStatusCollector) Cleanup() { + for _, cleanup := range c.cleanups { + cleanup() + } +} + +func NewGPUHealthStatusCollector( + counterList counters.CounterList, + hostname string, + config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, +) (Collector, error) { + if !IsDCGMExpGPUHealthStatusEnabled(counterList) { + logrus.Error(counters.DCGMExpGPUHealthStatus + " collector is disabled") + return nil, fmt.Errorf(counters.DCGMExpGPUHealthStatus + " collector is disabled") + } + + supportedGPUs, err := dcgmprovider.Client().GetSupportedDevices() + if err != nil { + logrus.WithError(err).Error("Failed to get supported GPU devices") + return nil, err + } + + if len(supportedGPUs) == 0 { + logrus.Error("No supported GPU devices found") + return nil, errors.New("no supported GPU devices found") + } + + // Create Group + newGroupNumber, err := utils.RandUint64() + if err != nil { + logrus.WithError(err).Error("Failed to generate new group number") + return nil, err + } + + cleanups := []func(){} + + groupID, err := dcgmprovider.Client().CreateGroup(fmt.Sprintf("gpu_health_monitor_%d", newGroupNumber)) + if err != nil { + logrus.WithError(err).Error("Failed to create group") + return nil, err + } + + cleanups = append(cleanups, func() { + destroyErr := dcgmprovider.Client().DestroyGroup(groupID) + if destroyErr != nil { + logrus.WithFields(logrus.Fields{ + logging.GroupIDKey: groupID, + logrus.ErrorKey: destroyErr, + }).Warn("cannot destroy group") + } + }) + + for _, gpu := range 
supportedGPUs { + err = dcgmprovider.Client().AddEntityToGroup(groupID, dcgm.FE_GPU, gpu) + if err != nil { + logrus.WithError(err).WithField("gpu", gpu).Error("Failed to add GPU device to group") + return nil, err + } + } + + err = dcgmprovider.Client().HealthSet(groupID, dcgm.DCGM_HEALTH_WATCH_ALL) + if err != nil { + logrus.WithError(err).Error("Failed to set health watch") + return nil, err + } + + deviceInfoProvider, err := deviceinfo.Initialize(appconfig.DeviceOptions{ + MinorRange: []int{-1}, + MajorRange: []int{-1}, + }, + appconfig.DeviceOptions{}, + appconfig.DeviceOptions{}, + config.UseFakeGPUs, dcgm.FE_GPU) + if err != nil { + return nil, err + } + + if !deviceWatchList.IsEmpty() { + watchListCleanups, err := deviceWatchList.Watch() + if err != nil { + logrus.WithError(err).Error("Failed to watch metrics") + return nil, err + } + + cleanups = append(cleanups, watchListCleanups...) + } + + return &gpuHealthStatusCollector{ + baseExpCollector: baseExpCollector{ + counter: counterList[slices.IndexFunc(counterList, func(c counters.Counter) bool { + return c.FieldName == counters.DCGMExpGPUHealthStatus + })], + labelsCounters: counterList.LabelCounters(), + hostname: hostname, + config: config, + cleanups: cleanups, + deviceWatchList: deviceWatchList, + }, + groupID: groupID, + deviceInfoProvider: deviceInfoProvider, + }, nil +} + +func IsDCGMExpGPUHealthStatusEnabled(counterList counters.CounterList) bool { + return slices.ContainsFunc(counterList, func(c counters.Counter) bool { + return c.FieldName == counters.DCGMExpGPUHealthStatus + }) +} + +var healthSystemWatchToStringMap = map[dcgm.HealthSystem]string{ + dcgm.DCGM_HEALTH_WATCH_PCIE: "PCIE", + dcgm.DCGM_HEALTH_WATCH_NVLINK: "NVLINK", + dcgm.DCGM_HEALTH_WATCH_PMU: "PMU", + dcgm.DCGM_HEALTH_WATCH_MCU: "MCU", + dcgm.DCGM_HEALTH_WATCH_MEM: "MEM", + dcgm.DCGM_HEALTH_WATCH_SM: "SM", + dcgm.DCGM_HEALTH_WATCH_INFOROM: "INFOROM", + dcgm.DCGM_HEALTH_WATCH_THERMAL: "THERMAL", + dcgm.DCGM_HEALTH_WATCH_POWER: 
"POWER", + dcgm.DCGM_HEALTH_WATCH_DRIVER: "DRIVER", + dcgm.DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL: "NVSWITCH_NONFATAL", + dcgm.DCGM_HEALTH_WATCH_NVSWITCH_FATAL: "NVSWITCH_FATAL", +} + +func healthSystemWatchToString(heathSystem dcgm.HealthSystem) string { + name, ok := healthSystemWatchToStringMap[heathSystem] + if !ok { + return "" + } + return name +} + +var healthCheckErrorToStringMap = map[dcgm.HealthCheckErrorCode]string{ + dcgm.DCGM_FR_OK: "DCGM_FR_OK", + dcgm.DCGM_FR_UNKNOWN: "DCGM_FR_UNKNOWN", + dcgm.DCGM_FR_UNRECOGNIZED: "DCGM_FR_UNRECOGNIZED", + dcgm.DCGM_FR_PCI_REPLAY_RATE: "DCGM_FR_PCI_REPLAY_RATE", + dcgm.DCGM_FR_VOLATILE_DBE_DETECTED: "DCGM_FR_VOLATILE_DBE_DETECTED", + dcgm.DCGM_FR_VOLATILE_SBE_DETECTED: "DCGM_FR_VOLATILE_SBE_DETECTED", + dcgm.DCGM_FR_PENDING_PAGE_RETIREMENTS: "DCGM_FR_PENDING_PAGE_RETIREMENTS", + dcgm.DCGM_FR_RETIRED_PAGES_LIMIT: "DCGM_FR_RETIRED_PAGES_LIMIT", + dcgm.DCGM_FR_RETIRED_PAGES_DBE_LIMIT: "DCGM_FR_RETIRED_PAGES_DBE_LIMIT", + dcgm.DCGM_FR_CORRUPT_INFOROM: "DCGM_FR_CORRUPT_INFOROM", + dcgm.DCGM_FR_CLOCK_THROTTLE_THERMAL: "DCGM_FR_CLOCK_THROTTLE_THERMAL", + dcgm.DCGM_FR_POWER_UNREADABLE: "DCGM_FR_POWER_UNREADABLE", + dcgm.DCGM_FR_CLOCK_THROTTLE_POWER: "DCGM_FR_CLOCK_THROTTLE_POWER", + dcgm.DCGM_FR_NVLINK_ERROR_THRESHOLD: "DCGM_FR_NVLINK_ERROR_THRESHOLD", + dcgm.DCGM_FR_NVLINK_DOWN: "DCGM_FR_NVLINK_DOWN", + dcgm.DCGM_FR_NVSWITCH_FATAL_ERROR: "DCGM_FR_NVSWITCH_FATAL_ERROR", + dcgm.DCGM_FR_NVSWITCH_NON_FATAL_ERROR: "DCGM_FR_NVSWITCH_NON_FATAL_ERROR", + dcgm.DCGM_FR_NVSWITCH_DOWN: "DCGM_FR_NVSWITCH_DOWN", + dcgm.DCGM_FR_NO_ACCESS_TO_FILE: "DCGM_FR_NO_ACCESS_TO_FILE", + dcgm.DCGM_FR_NVML_API: "DCGM_FR_NVML_API", + dcgm.DCGM_FR_DEVICE_COUNT_MISMATCH: "DCGM_FR_DEVICE_COUNT_MISMATCH", + dcgm.DCGM_FR_BAD_PARAMETER: "DCGM_FR_BAD_PARAMETER", + dcgm.DCGM_FR_CANNOT_OPEN_LIB: "DCGM_FR_CANNOT_OPEN_LIB", + dcgm.DCGM_FR_DENYLISTED_DRIVER: "DCGM_FR_DENYLISTED_DRIVER", + dcgm.DCGM_FR_NVML_LIB_BAD: "DCGM_FR_NVML_LIB_BAD", + 
dcgm.DCGM_FR_GRAPHICS_PROCESSES: "DCGM_FR_GRAPHICS_PROCESSES", + dcgm.DCGM_FR_HOSTENGINE_CONN: "DCGM_FR_HOSTENGINE_CONN", + dcgm.DCGM_FR_FIELD_QUERY: "DCGM_FR_FIELD_QUERY", + dcgm.DCGM_FR_BAD_CUDA_ENV: "DCGM_FR_BAD_CUDA_ENV", + dcgm.DCGM_FR_PERSISTENCE_MODE: "DCGM_FR_PERSISTENCE_MODE", + dcgm.DCGM_FR_LOW_BANDWIDTH: "DCGM_FR_LOW_BANDWIDTH", + dcgm.DCGM_FR_HIGH_LATENCY: "DCGM_FR_HIGH_LATENCY", + dcgm.DCGM_FR_CANNOT_GET_FIELD_TAG: "DCGM_FR_CANNOT_GET_FIELD_TAG", + dcgm.DCGM_FR_FIELD_VIOLATION: "DCGM_FR_FIELD_VIOLATION", + dcgm.DCGM_FR_FIELD_THRESHOLD: "DCGM_FR_FIELD_THRESHOLD", + dcgm.DCGM_FR_FIELD_VIOLATION_DBL: "DCGM_FR_FIELD_VIOLATION_DBL", + dcgm.DCGM_FR_FIELD_THRESHOLD_DBL: "DCGM_FR_FIELD_THRESHOLD_DBL", + dcgm.DCGM_FR_UNSUPPORTED_FIELD_TYPE: "DCGM_FR_UNSUPPORTED_FIELD_TYPE", + dcgm.DCGM_FR_FIELD_THRESHOLD_TS: "DCGM_FR_FIELD_THRESHOLD_TS", + dcgm.DCGM_FR_FIELD_THRESHOLD_TS_DBL: "DCGM_FR_FIELD_THRESHOLD_TS_DBL", + dcgm.DCGM_FR_THERMAL_VIOLATIONS: "DCGM_FR_THERMAL_VIOLATIONS", + dcgm.DCGM_FR_THERMAL_VIOLATIONS_TS: "DCGM_FR_THERMAL_VIOLATIONS_TS", + dcgm.DCGM_FR_TEMP_VIOLATION: "DCGM_FR_TEMP_VIOLATION", + dcgm.DCGM_FR_THROTTLING_VIOLATION: "DCGM_FR_THROTTLING_VIOLATION", + dcgm.DCGM_FR_INTERNAL: "DCGM_FR_INTERNAL", + dcgm.DCGM_FR_PCIE_GENERATION: "DCGM_FR_PCIE_GENERATION", + dcgm.DCGM_FR_PCIE_WIDTH: "DCGM_FR_PCIE_WIDTH", + dcgm.DCGM_FR_ABORTED: "DCGM_FR_ABORTED", + dcgm.DCGM_FR_TEST_DISABLED: "DCGM_FR_TEST_DISABLED", + dcgm.DCGM_FR_CANNOT_GET_STAT: "DCGM_FR_CANNOT_GET_STAT", + dcgm.DCGM_FR_STRESS_LEVEL: "DCGM_FR_STRESS_LEVEL", + dcgm.DCGM_FR_CUDA_API: "DCGM_FR_CUDA_API", + dcgm.DCGM_FR_FAULTY_MEMORY: "DCGM_FR_FAULTY_MEMORY", + dcgm.DCGM_FR_CANNOT_SET_WATCHES: "DCGM_FR_CANNOT_SET_WATCHES", + dcgm.DCGM_FR_CUDA_UNBOUND: "DCGM_FR_CUDA_UNBOUND", + dcgm.DCGM_FR_ECC_DISABLED: "DCGM_FR_ECC_DISABLED", + dcgm.DCGM_FR_MEMORY_ALLOC: "DCGM_FR_MEMORY_ALLOC", + dcgm.DCGM_FR_CUDA_DBE: "DCGM_FR_CUDA_DBE", + dcgm.DCGM_FR_MEMORY_MISMATCH: "DCGM_FR_MEMORY_MISMATCH", + 
dcgm.DCGM_FR_CUDA_DEVICE: "DCGM_FR_CUDA_DEVICE", + dcgm.DCGM_FR_ECC_UNSUPPORTED: "DCGM_FR_ECC_UNSUPPORTED", + dcgm.DCGM_FR_ECC_PENDING: "DCGM_FR_ECC_PENDING", + dcgm.DCGM_FR_MEMORY_BANDWIDTH: "DCGM_FR_MEMORY_BANDWIDTH", + dcgm.DCGM_FR_TARGET_POWER: "DCGM_FR_TARGET_POWER", + dcgm.DCGM_FR_API_FAIL: "DCGM_FR_API_FAIL", + dcgm.DCGM_FR_API_FAIL_GPU: "DCGM_FR_API_FAIL_GPU", + dcgm.DCGM_FR_CUDA_CONTEXT: "DCGM_FR_CUDA_CONTEXT", + dcgm.DCGM_FR_DCGM_API: "DCGM_FR_DCGM_API", + dcgm.DCGM_FR_CONCURRENT_GPUS: "DCGM_FR_CONCURRENT_GPUS", + dcgm.DCGM_FR_TOO_MANY_ERRORS: "DCGM_FR_TOO_MANY_ERRORS", + dcgm.DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD: "DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD", + dcgm.DCGM_FR_NVLINK_ERROR_CRITICAL: "DCGM_FR_NVLINK_ERROR_CRITICAL", + dcgm.DCGM_FR_ENFORCED_POWER_LIMIT: "DCGM_FR_ENFORCED_POWER_LIMIT", + dcgm.DCGM_FR_MEMORY_ALLOC_HOST: "DCGM_FR_MEMORY_ALLOC_HOST", + dcgm.DCGM_FR_GPU_OP_MODE: "DCGM_FR_GPU_OP_MODE", + dcgm.DCGM_FR_NO_MEMORY_CLOCKS: "DCGM_FR_NO_MEMORY_CLOCKS", + dcgm.DCGM_FR_NO_GRAPHICS_CLOCKS: "DCGM_FR_NO_GRAPHICS_CLOCKS", + dcgm.DCGM_FR_HAD_TO_RESTORE_STATE: "DCGM_FR_HAD_TO_RESTORE_STATE", + dcgm.DCGM_FR_L1TAG_UNSUPPORTED: "DCGM_FR_L1TAG_UNSUPPORTED", + dcgm.DCGM_FR_L1TAG_MISCOMPARE: "DCGM_FR_L1TAG_MISCOMPARE", + dcgm.DCGM_FR_ROW_REMAP_FAILURE: "DCGM_FR_ROW_REMAP_FAILURE", + dcgm.DCGM_FR_UNCONTAINED_ERROR: "DCGM_FR_UNCONTAINED_ERROR", + dcgm.DCGM_FR_EMPTY_GPU_LIST: "DCGM_FR_EMPTY_GPU_LIST", + dcgm.DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS: "DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS", + dcgm.DCGM_FR_UNCORRECTABLE_ROW_REMAP: "DCGM_FR_UNCORRECTABLE_ROW_REMAP", + dcgm.DCGM_FR_PENDING_ROW_REMAP: "DCGM_FR_PENDING_ROW_REMAP", + dcgm.DCGM_FR_BROKEN_P2P_MEMORY_DEVICE: "DCGM_FR_BROKEN_P2P_MEMORY_DEVICE", + dcgm.DCGM_FR_BROKEN_P2P_WRITER_DEVICE: "DCGM_FR_BROKEN_P2P_WRITER_DEVICE", + dcgm.DCGM_FR_NVSWITCH_NVLINK_DOWN: "DCGM_FR_NVSWITCH_NVLINK_DOWN", + dcgm.DCGM_FR_EUD_BINARY_PERMISSIONS: "DCGM_FR_EUD_BINARY_PERMISSIONS", + dcgm.DCGM_FR_EUD_NON_ROOT_USER: 
"DCGM_FR_EUD_NON_ROOT_USER", + dcgm.DCGM_FR_EUD_SPAWN_FAILURE: "DCGM_FR_EUD_SPAWN_FAILURE", + dcgm.DCGM_FR_EUD_TIMEOUT: "DCGM_FR_EUD_TIMEOUT", + dcgm.DCGM_FR_EUD_ZOMBIE: "DCGM_FR_EUD_ZOMBIE", + dcgm.DCGM_FR_EUD_NON_ZERO_EXIT_CODE: "DCGM_FR_EUD_NON_ZERO_EXIT_CODE", + dcgm.DCGM_FR_EUD_TEST_FAILED: "DCGM_FR_EUD_TEST_FAILED", + dcgm.DCGM_FR_FILE_CREATE_PERMISSIONS: "DCGM_FR_FILE_CREATE_PERMISSIONS", + dcgm.DCGM_FR_PAUSE_RESUME_FAILED: "DCGM_FR_PAUSE_RESUME_FAILED", + dcgm.DCGM_FR_PCIE_H_REPLAY_VIOLATION: "DCGM_FR_PCIE_H_REPLAY_VIOLATION", + dcgm.DCGM_FR_GPU_EXPECTED_NVLINKS_UP: "DCGM_FR_GPU_EXPECTED_NVLINKS_UP", + dcgm.DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP: "DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP", + dcgm.DCGM_FR_XID_ERROR: "DCGM_FR_XID_ERROR", + dcgm.DCGM_FR_SBE_VIOLATION: "DCGM_FR_SBE_VIOLATION", + dcgm.DCGM_FR_DBE_VIOLATION: "DCGM_FR_DBE_VIOLATION", + dcgm.DCGM_FR_PCIE_REPLAY_VIOLATION: "DCGM_FR_PCIE_REPLAY_VIOLATION", + dcgm.DCGM_FR_SBE_THRESHOLD_VIOLATION: "DCGM_FR_SBE_THRESHOLD_VIOLATION", + dcgm.DCGM_FR_DBE_THRESHOLD_VIOLATION: "DCGM_FR_DBE_THRESHOLD_VIOLATION", + dcgm.DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION: "DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION", + dcgm.DCGM_FR_CUDA_FM_NOT_INITIALIZED: "DCGM_FR_CUDA_FM_NOT_INITIALIZED", + dcgm.DCGM_FR_SXID_ERROR: "DCGM_FR_SXID_ERROR", + dcgm.DCGM_FR_ERROR_SENTINEL: "DCGM_FR_ERROR_SENTINEL", +} + +func healthCheckErrorToString(err dcgm.HealthCheckErrorCode) string { + return healthCheckErrorToStringMap[err] +} diff --git a/internal/pkg/collector/gpu_health_collector_test.go b/internal/pkg/collector/gpu_health_collector_test.go new file mode 100644 index 00000000..0d44e12c --- /dev/null +++ b/internal/pkg/collector/gpu_health_collector_test.go @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "errors" + "reflect" + "strings" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" +) + +func TestNewGPUHealthStatusCollector(t *testing.T) { + type testCase struct { + name string + counterList counters.CounterList + setDCGMproviderState func(*mockdcgm.MockDCGM) + assertResult func(Collector, error) + } + + testCases := []testCase{ + { + name: "returns error when collector is disabled", + counterList: []counters.Counter{}, + assertResult: func(c Collector, err error) { + assert.Nil(t, c) + assert.Error(t, err) + }, + }, + { + name: "returns no errors, whe collector is enabled", + counterList: []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + }, + setDCGMproviderState: func(mockDCGMProvider *mockdcgm.MockDCGM) { + mockDCGMProvider.EXPECT().DestroyGroup(gomock.Any()).Return(errors.New("boom!")).Times(2) + mockDCGMProvider.EXPECT().FieldGroupDestroy(gomock.Any()).Return(errors.New("boom!")) + }, + assertResult: func(c Collector, err error) { + assert.NotNil(t, c) + 
assert.NoError(t, err) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Initialize the mock controller + ctrl := gomock.NewController(t) + + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + + dcgmprovider.SetClient(mockDCGMProvider) + if tc.setDCGMproviderState != nil { + tc.setDCGMproviderState(mockDCGMProvider) + } + setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProvider(t, mockDCGMProvider) + + // Create a new collector + collector, err := NewGPUHealthStatusCollector(tc.counterList, + "", + &appconfig.Config{}, + getDefaultDeviceWatchListForGPUHealthStatusCollectorMockDCGMProvider(ctrl), + ) + + tc.assertResult(collector, err) + if collector != nil { + // Cleanup the collector + assert.NotPanics(t, func() { + collector.Cleanup() + }) + } + }) + } +} + +func setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProvider(t *testing.T, mockDCGMProvider *mockdcgm.MockDCGM) { + t.Helper() + mockDCGMProvider.EXPECT().GetSupportedDevices().Return([]uint{0}, nil).AnyTimes() + mockDCGMProvider.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu_health_monitor_") + })).Return(dcgm.GroupHandle{}, nil).AnyTimes() + mockDCGMProvider.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Eq(uint(0))).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().HealthSet(gomock.Any(), gomock.Eq(dcgm.DCGM_HEALTH_WATCH_ALL)).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).AnyTimes() + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Eq(uint(0))).Return(dcgm.Device{}, nil).AnyTimes() + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(dcgm.MigHierarchy_v2{}, nil).AnyTimes() + mockDCGMProvider.EXPECT().CreateGroup(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu-collector-group") + })).Return(dcgm.GroupHandle{}, 
nil).AnyTimes() + mockDCGMProvider.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Eq(uint(0))).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().FieldGroupCreate(gomock.Cond(func(x any) bool { + return strings.HasPrefix(x.(string), "gpu-collector-fieldgroup") + }), gomock.Any()).Return(dcgm.FieldHandle{}, nil).AnyTimes() + mockDCGMProvider.EXPECT().WatchFieldsWithGroupEx(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().EntityGetLatestValues(gomock.Any(), gomock.Any(), gomock.Any()). + Return([]dcgm.FieldValue_v1{}, nil).AnyTimes() + + healthCheckResponse := dcgm.HealthResponse{ + OverallHealth: dcgm.DCGM_HEALTH_RESULT_FAIL, + Incidents: []dcgm.Incident{ + { + System: dcgm.DCGM_HEALTH_WATCH_THERMAL, + Health: dcgm.DCGM_HEALTH_RESULT_FAIL, + Error: dcgm.DiagErrorDetail{ + Message: "boom!", + Code: dcgm.DCGM_FR_THERMAL_VIOLATIONS, + }, + EntityInfo: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU, + EntityId: uint(0), + }, + }, + }, + } + + mockDCGMProvider.EXPECT().HealthCheck(gomock.Any()).Return(healthCheckResponse, nil).AnyTimes() + mockDCGMProvider.EXPECT().GetGroupInfo(gomock.Any()).Return(&dcgm.GroupInfo{ + EntityList: []dcgm.GroupEntityPair{ + {EntityId: uint(0), EntityGroupId: dcgm.FE_GPU}, + }, + }, nil).AnyTimes() +} + +func getDefaultDeviceWatchListForGPUHealthStatusCollectorMockDCGMProvider(ctrl *gomock.Controller) devicewatchlistmanager.WatchList { + mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDeviceInfo.EXPECT().InfoType().Return(dcgm.FE_NONE).AnyTimes() + mockDeviceInfo.EXPECT().GOpts().Return(appconfig.DeviceOptions{Flex: true}).AnyTimes() + mockDeviceInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockDeviceInfo.EXPECT().GPU(uint(0)).Return(mockGPU).AnyTimes() + + return *devicewatchlistmanager.NewWatchList(mockDeviceInfo, + []dcgm.Short{42}, + []dcgm.Short{524}, + deviceWatcher, + int64(1)) +} + +func 
TestGPUHealthStatusCollector_GetMetrics_ErrorHandling(t *testing.T) { + var counterList counters.CounterList = []counters.Counter{ + { + FieldName: "DCGM_EXP_GPU_HEALTH_STATUS", + }, + { + FieldName: "DCGM_FI_DRIVER_VERSION", + PromType: "label", + FieldID: dcgm.DCGM_FI_DEV_VGPU_DRIVER_VERSION, + }, + } + + type testCase struct { + name string + setDCGMproviderState func(*mockdcgm.MockDCGM) + asserResult func(MetricsByCounter, error) + } + + testCases := []testCase{ + { + name: "returns Metrics without errors", + asserResult: func(metrics MetricsByCounter, err error) { + require.NoError(t, err) + // We expect 1 metric: DCGM_EXP_GPU_HEALTH_STATUS + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + assert.Len(t, metricValues, len(gpuHealthChecks), "number of metric values doesn't match to number of healthchecks") + + var thermalViolationsFound bool + + for _, value := range metricValues { + healthWatch := value.Labels["health_watch"] + healthErrorCode := value.Labels["health_error_code"] + if healthWatch == "THERMAL" && healthErrorCode == "DCGM_FR_THERMAL_VIOLATIONS" { + assert.Equal(t, "20", value.Value) + thermalViolationsFound = true + } else { + assert.Equal(t, "0", value.Value) + } + } + assert.True(t, thermalViolationsFound, "expected DCGM_FR_THERMAL_VIOLATIONS error not found") + }, + }, + + { + name: "When HealthCheck returns error", + setDCGMproviderState: func(mockDCGMProvider *mockdcgm.MockDCGM) { + // Clear expectations for SomeMethod + mockDCGMProvider.EXPECT().HealthCheck(gomock.Any()).Return(dcgm.HealthResponse{}, + errors.New("boom!")) + }, + asserResult: func(metrics MetricsByCounter, err error) { + assert.Error(t, err) + assert.Empty(t, metrics) + }, + }, + { + name: "When GetGroupInfo returns error", + setDCGMproviderState: func(mockDCGMProvider *mockdcgm.MockDCGM) { + mockDCGMProvider.EXPECT().GetGroupInfo(gomock.Any()).Return(nil, 
errors.New("boom!")).AnyTimes() + }, + asserResult: func(metrics MetricsByCounter, err error) { + assert.Error(t, err) + assert.Empty(t, metrics) + }, + }, + { + name: "When EntityGetLatestValues returns error", + setDCGMproviderState: func(mockDCGMProvider *mockdcgm.MockDCGM) { + mockDCGMProvider.EXPECT().EntityGetLatestValues(gomock.Any(), gomock.Any(), gomock.Any()). + Return([]dcgm.FieldValue_v1{}, errors.New("boom!")).AnyTimes() + }, + asserResult: func(metrics MetricsByCounter, err error) { + assert.Error(t, err) + assert.Empty(t, metrics) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Initialize the mock controller + ctrl := gomock.NewController(t) + + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + + dcgmprovider.SetClient(mockDCGMProvider) + + // We need to set new expectations, and then set the default ones. + if tc.setDCGMproviderState != nil { + tc.setDCGMproviderState(mockDCGMProvider) + } + + setDefaultExpectationsForGPUHealthStatusCollectorMockDCGMProvider(t, mockDCGMProvider) + + // Create a new collector + collector, err := NewGPUHealthStatusCollector(counterList, + "", + &appconfig.Config{ + UseOldNamespace: true, + }, + getDefaultDeviceWatchListForGPUHealthStatusCollectorMockDCGMProvider(ctrl), + ) + + require.NoError(t, err) + + metrics, err := collector.GetMetrics() + + tc.asserResult(metrics, err) + + ctrl.Finish() // This will finish the current controller + }) + } +} + +func TestIsDCGMExpGPUHealthStatusEnabled(t *testing.T) { + tests := []struct { + name string + arg counters.CounterList + want bool + }{ + { + name: "empty", + arg: counters.CounterList{}, + want: false, + }, + { + name: "counter event count disabled", + arg: counters.CounterList{ + counters.Counter{ + FieldID: 1, + FieldName: "random1", + }, + counters.Counter{ + FieldID: 2, + FieldName: "random2", + }, + }, + want: false, + }, + { + 
name: "counter event count enabled", + arg: counters.CounterList{ + counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpGPUHealthStatus, + }, + counters.Counter{ + FieldID: 2, + FieldName: "random2", + }, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.want, IsDCGMExpGPUHealthStatusEnabled(tt.arg), "unexpected response") + }) + } +} + +func TestHealthSystemWatchToString(t *testing.T) { + type testCase struct { + name string + heathSystem dcgm.HealthSystem + expected string + } + + testCases := []testCase{ + { + name: "returns POWER when dcgm.DCGM_HEALTH_WATCH_POWER", + heathSystem: dcgm.DCGM_HEALTH_WATCH_POWER, + expected: "POWER", + }, + { + name: "returns empty string when dcgm.HealthSystem is unknown", + heathSystem: dcgm.HealthSystem(100500), + expected: "", + }, + } + + for _, tc := range testCases { + actual := healthSystemWatchToString(tc.heathSystem) + assert.Equal(t, tc.expected, actual) + } +} diff --git a/internal/pkg/collector/types.go b/internal/pkg/collector/types.go new file mode 100644 index 00000000..499b0863 --- /dev/null +++ b/internal/pkg/collector/types.go @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package collector + +import ( + "fmt" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" +) + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/collector/mock_collector.go -package=collector -copyright_file=../../../hack/header.txt . Collector + +// Collector interface +type Collector interface { + GetMetrics() (MetricsByCounter, error) + Cleanup() +} + +type EntityCollectorTuple struct { + entity dcgm.Field_Entity_Group + collector Collector +} + +func (e *EntityCollectorTuple) SetEntity(entity dcgm.Field_Entity_Group) { + e.entity = entity +} + +func (e *EntityCollectorTuple) Entity() dcgm.Field_Entity_Group { + return e.entity +} + +func (e *EntityCollectorTuple) SetCollector(collector Collector) { + e.collector = collector +} + +func (e *EntityCollectorTuple) Collector() Collector { + return e.collector +} + +type Metric struct { + Counter counters.Counter + Value string + + GPU string + GPUUUID string + GPUDevice string + GPUModelName string + GPUPCIBusID string + + UUID string + + MigProfile string + GPUInstanceID string + Hostname string + + Labels map[string]string + Attributes map[string]string +} + +func (m Metric) GetIDOfType(idType appconfig.KubernetesGPUIDType) (string, error) { + // For MIG devices, return the MIG profile instead of + if m.MigProfile != "" { + return fmt.Sprintf("%s-%s", m.GPU, m.GPUInstanceID), nil + } + switch idType { + case appconfig.GPUUID: + return m.GPUUUID, nil + case appconfig.DeviceName: + return m.GPUDevice, nil + } + return "", fmt.Errorf("unsupported KubernetesGPUIDType for MetricID '%s'", idType) +} + +// MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects +type MetricsByCounter map[counters.Counter][]Metric diff --git a/pkg/dcgmexporter/os.go b/internal/pkg/collector/variables.go similarity index 97% rename from 
pkg/dcgmexporter/os.go rename to internal/pkg/collector/variables.go index da351ca4..b88dd531 100644 --- a/pkg/dcgmexporter/os.go +++ b/internal/pkg/collector/variables.go @@ -14,7 +14,7 @@ * limitations under the License. */ -package dcgmexporter +package collector import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" diff --git a/internal/pkg/collector/xid_collector.go b/internal/pkg/collector/xid_collector.go new file mode 100644 index 00000000..e5271471 --- /dev/null +++ b/internal/pkg/collector/xid_collector.go @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package collector + +import ( + "fmt" + "log/slog" + "slices" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" +) + +type xidCollector struct { + expCollector +} + +func (c *xidCollector) GetMetrics() (MetricsByCounter, error) { + return c.expCollector.getMetrics() +} + +func NewXIDCollector( + counterList counters.CounterList, + hostname string, + config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, +) (Collector, error) { + if !IsDCGMExpXIDErrorsCountEnabled(counterList) { + slog.Error(counters.DCGMExpXIDErrorsCount + " collector is disabled") + return nil, fmt.Errorf(counters.DCGMExpXIDErrorsCount + " collector is disabled") + } + + collector := xidCollector{} + var err error + deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}) + + collector.expCollector, err = newExpCollector( + counterList.LabelCounters(), + hostname, + config, + deviceWatchList, + ) + if err != nil { + return nil, err + } + + collector.counter = counterList[slices.IndexFunc(counterList, func(c counters.Counter) bool { + return c.FieldName == counters.DCGMExpXIDErrorsCount + })] + + collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) { + metricValueLabels["xid"] = fmt.Sprint(entityValue) + } + + collector.windowSize = config.XIDCountWindowSize + + return &collector, nil +} + +func IsDCGMExpXIDErrorsCountEnabled(counterList counters.CounterList) bool { + return slices.ContainsFunc(counterList, func(c counters.Counter) bool { + return c.FieldName == counters.DCGMExpXIDErrorsCount + }) +} diff --git a/internal/pkg/collector/xid_collector_test.go b/internal/pkg/collector/xid_collector_test.go new file mode 100644 index 00000000..9bd436bd --- /dev/null +++ b/internal/pkg/collector/xid_collector_test.go @@ -0,0 +1,534 @@ +/* + * 
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package collector + +import ( + "fmt" + "slices" + "testing" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + mockdevicewatcher "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +func TestIsDCGMExpXIDErrorsCountEnabled(t *testing.T) { + tests := []struct { + name string + arg counters.CounterList + want bool + }{ + { + name: "empty", + arg: counters.CounterList{}, + want: false, + }, + { + name: "counter disabled", + arg: counters.CounterList{ + counters.Counter{ + FieldID: 1, + FieldName: "random1", + }, + counters.Counter{ + FieldID: 2, + FieldName: "random2", + }, + }, + want: false, + }, + { + name: "counter enabled", + arg: counters.CounterList{ + counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpXIDErrorsCount, + }, + counters.Counter{ + FieldID: 2, + FieldName: "random2", + }, + }, + want: 
true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.want, IsDCGMExpXIDErrorsCountEnabled(tt.arg), "unexpected response") + }) + } +} + +func TestNewXIDCollector(t *testing.T) { + ctrl := gomock.NewController(t) + mockDeviceWatcher := mockdevicewatcher.NewMockWatcher(ctrl) + + sampleDeviceInfo := &deviceinfo.Info{} + sampleDeviceFields := []dcgm.Short{42} + sampleCollectorInterval := int64(1) + sampleConfig := appconfig.Config{} + sampleHostname := "localhost" + var sampleCleanups []func() + + sampleDCGMExpXIDCounter := counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpXIDErrorsCount, + } + + sampleOtherCounter := counters.Counter{ + FieldID: 2, + FieldName: "random2", + } + + sampleLabelCounter := counters.Counter{ + FieldID: 3, + FieldName: "random2", + PromType: "label", + } + + type args struct { + counterList counters.CounterList + hostname string + config *appconfig.Config + deviceWatchList *devicewatchlistmanager.WatchList + } + tests := []struct { + name string + args args + conditions func(watcher *mockdevicewatcher.MockWatcher) + want func(string, *appconfig.Config, devicewatchlistmanager.WatchList) Collector + wantErr bool + }{ + { + name: "counter is disabled ", + args: args{ + counterList: counters.CounterList{}, + hostname: sampleHostname, + config: nil, + deviceWatchList: &devicewatchlistmanager.WatchList{}, + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) {}, + want: func( + _ string, _ *appconfig.Config, + _ devicewatchlistmanager.WatchList, + ) Collector { + return nil + }, + wantErr: true, + }, + { + name: "new XID collector watcher fails", + args: args{ + counterList: counters.CounterList{ + sampleDCGMExpXIDCounter, + sampleOtherCounter, + sampleLabelCounter, + }, + hostname: sampleHostname, + config: &sampleConfig, + deviceWatchList: devicewatchlistmanager.NewWatchList(sampleDeviceInfo, sampleDeviceFields, nil, + mockDeviceWatcher, sampleCollectorInterval), + }, + 
conditions: func(watcher *mockdevicewatcher.MockWatcher) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return(nil, + dcgm.FieldHandle{}, + sampleCleanups, fmt.Errorf("some error")) + }, + want: func( + _ string, _ *appconfig.Config, + _ devicewatchlistmanager.WatchList, + ) Collector { + return nil + }, + wantErr: true, + }, + { + name: "new XID collector ", + args: args{ + counterList: counters.CounterList{ + sampleDCGMExpXIDCounter, + sampleOtherCounter, + sampleLabelCounter, + }, + hostname: sampleHostname, + config: &sampleConfig, + deviceWatchList: devicewatchlistmanager.NewWatchList(sampleDeviceInfo, sampleDeviceFields, nil, + mockDeviceWatcher, sampleCollectorInterval), + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return(nil, + dcgm.FieldHandle{}, + sampleCleanups, nil) + }, + want: func( + hostname string, config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, + ) Collector { + deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}) + return &xidCollector{ + expCollector{ + baseExpCollector: baseExpCollector{ + deviceWatchList: deviceWatchList, + counter: sampleDCGMExpXIDCounter, + labelsCounters: []counters.Counter{sampleLabelCounter}, + hostname: hostname, + config: config, + cleanups: sampleCleanups, + }, + windowSize: config.XIDCountWindowSize, + }, + } + }, + wantErr: false, + }, + { + name: "new XID collector with no label counters", + args: args{ + counterList: counters.CounterList{ + sampleDCGMExpXIDCounter, + sampleOtherCounter, + }, + hostname: sampleHostname, + config: &sampleConfig, + deviceWatchList: devicewatchlistmanager.NewWatchList(sampleDeviceInfo, sampleDeviceFields, nil, + mockDeviceWatcher, sampleCollectorInterval), + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher) { + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + 
gomock.Any()).Return(nil, + dcgm.FieldHandle{}, + sampleCleanups, nil) + }, + want: func( + hostname string, config *appconfig.Config, + deviceWatchList devicewatchlistmanager.WatchList, + ) Collector { + deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}) + return &xidCollector{ + expCollector{ + baseExpCollector: baseExpCollector{ + deviceWatchList: deviceWatchList, + counter: sampleDCGMExpXIDCounter, + labelsCounters: nil, + hostname: hostname, + config: config, + cleanups: sampleCleanups, + }, + windowSize: config.XIDCountWindowSize, + }, + } + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.conditions(mockDeviceWatcher) + + got, err := NewXIDCollector(tt.args.counterList, tt.args.hostname, tt.args.config, + *tt.args.deviceWatchList) + want := tt.want(tt.args.hostname, tt.args.config, *tt.args.deviceWatchList) + + if !tt.wantErr { + assert.NoError(t, err, "unexpected error") + + wantAttrs := testutils.GetFields(&want.(*xidCollector).expCollector, testutils.Fields) + gotAttrs := testutils.GetFields(&got.(*xidCollector).expCollector, testutils.Fields) + assert.Equal(t, wantAttrs, gotAttrs, "unexpected result") + + gotFuncAttrs := testutils.GetFields(&got.(*xidCollector).expCollector, testutils.Functions) + for functionName, value := range gotFuncAttrs { + assert.NotNilf(t, value, "unexpected %s to be not nil", functionName) + } + } else { + assert.Error(t, err, "expected error") + assert.Equal(t, want, got, "unexpected result") + } + }) + } +} + +func sortXIDMetrics(metrics []Metric) { + slices.SortFunc(metrics, func(a, b Metric) int { + if a.GPU < b.GPU { + return -1 + } else if a.GPU == b.GPU { + if a.Labels["xid"] < b.Labels["xid"] { + return -1 + } + } + return 1 + }) +} + +func xidMetricsCreator( + counter counters.Counter, gpuID uint, value, hostname, mockFieldName, + mockFieldLabelValue string, mockXID uint64, +) Metric { + return Metric{ + Counter: counter, + Value: value, + GPU: 
fmt.Sprintf("%d", gpuID), + GPUUUID: "", + GPUDevice: fmt.Sprintf("nvidia%d", gpuID), + GPUModelName: "", + UUID: "UUID", + MigProfile: "", + GPUInstanceID: "", + Hostname: hostname, + Labels: map[string]string{ + windowSizeInMSLabel: "0", + mockFieldName: mockFieldLabelValue, + "xid": fmt.Sprint(mockXID), + }, + Attributes: map[string]string{}, + } +} + +func Test_xidCollector_GetMetrics(t *testing.T) { + /******* Mock DCGM *************/ + ctrl := gomock.NewController(t) + mockDCGM := mockdcgm.NewMockDCGM(ctrl) + mockDeviceWatcher := mockdevicewatcher.NewMockWatcher(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGM) + + /******** Mock Counters ************/ + mockDCGMXIDCounter := counters.Counter{ + FieldID: 1, + FieldName: counters.DCGMExpXIDErrorsCount, + } + + mockOtherCounter := counters.Counter{ + FieldID: 2, + FieldName: "random2", + } + + mockLabelDeviceField := dcgm.Short(3) + mockFieldName := "random3" + mockLabelValue := "this is mock label" + mockLabelCounter := counters.Counter{ + FieldID: mockLabelDeviceField, + FieldName: mockFieldName, + PromType: "label", + } + + /******** Mock Device Info *********/ + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + /******** Other Mock Inputs ************/ + gpuID1 := uint(0) + gpuID2 := uint(1) + + mockDeviceFields := []dcgm.Short{42} + mockCollectorInterval := int64(1) + mockConfig := appconfig.Config{} + mockHostname := "localhost" + var mockCleanups []func() + + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + mockLatestValues := []dcgm.FieldValue_v1{ + { + FieldId: 150, + 
FieldType: dcgm.DCGM_FT_INT64, + Value: [4096]byte{42}, + }, + { + FieldId: uint(mockLabelDeviceField), + FieldType: dcgm.DCGM_FT_STRING, + Value: testutils.StrToByteArray(mockLabelValue), + }, + { + FieldId: uint(mockLabelDeviceField), + FieldType: dcgm.DCGM_FT_STRING, + Value: testutils.StrToByteArray(dcgm.DCGM_FT_STR_NOT_FOUND), + }, + } + + tests := []struct { + name string + collector func() Collector + conditions func(*mockdevicewatcher.MockWatcher, byte, byte) + want func() (MetricsByCounter, byte, byte) + wantErr bool + }{ + { + name: "XID collector with single XID event", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMXIDCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewXIDCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, gpu1Value, gpu2Value byte) { + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: gpuID1, Value: [4096]byte{gpu1Value}}, + {EntityId: gpuID2, Value: [4096]byte{gpu2Value}}, + } + + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return(mockEntitiesResult, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID2, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + }, + want: func() (MetricsByCounter, byte, byte) { + mockXIDErr1 := uint64(42) + 
mockXIDErr2 := uint64(46) + + return MetricsByCounter{ + mockDCGMXIDCounter: []Metric{ + xidMetricsCreator(mockDCGMXIDCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockXIDErr1), + xidMetricsCreator(mockDCGMXIDCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockXIDErr2), + }, + }, byte(mockXIDErr1), byte(mockXIDErr2) + }, + wantErr: false, + }, + { + name: "xid collector with multiple events", + collector: func() Collector { + counterList := counters.CounterList{ + mockDCGMXIDCounter, + mockOtherCounter, + mockLabelCounter, + } + deviceWatchList := devicewatchlistmanager.NewWatchList(mockGPUDeviceInfo, mockDeviceFields, + []dcgm.Short{mockLabelDeviceField}, mockDeviceWatcher, mockCollectorInterval) + + collector, _ := NewXIDCollector(counterList, mockHostname, &mockConfig, *deviceWatchList) + return collector + }, + conditions: func(watcher *mockdevicewatcher.MockWatcher, xidErr1, xidErr2 byte) { + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: gpuID1, Value: [4096]byte{xidErr1}}, + {EntityId: gpuID1, Value: [4096]byte{xidErr1}}, + {EntityId: gpuID1, Value: [4096]byte{xidErr2}}, + {EntityId: gpuID2, Value: [4096]byte{xidErr1}}, + {EntityId: gpuID2, Value: [4096]byte{xidErr2}}, + {EntityId: gpuID2, Value: [4096]byte{xidErr2}}, + {EntityId: gpuID2, Value: [4096]byte{xidErr2}}, + } + + watcher.EXPECT().WatchDeviceFields(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.GroupHandle{mockGroupHandle1}, + mockFieldGroupHandle, + mockCleanups, nil) + + mockDCGM.EXPECT().UpdateAllFields().Return(nil) + mockDCGM.EXPECT().GetValuesSince(mockGroupHandle1, mockFieldGroupHandle, + gomock.AssignableToTypeOf(time.Time{})).Return(mockEntitiesResult, time.Time{}, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID1, + []dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + mockDCGM.EXPECT().EntityGetLatestValues(dcgm.FE_GPU, gpuID2, + 
[]dcgm.Short{mockLabelDeviceField}).Return(mockLatestValues, nil) + }, + want: func() (MetricsByCounter, byte, byte) { + mockXIDErr1 := uint64(42) + mockXIDErr2 := uint64(46) + + return MetricsByCounter{ + mockDCGMXIDCounter: []Metric{ + xidMetricsCreator(mockDCGMXIDCounter, gpuID1, "2", mockHostname, + mockFieldName, + mockLabelValue, mockXIDErr1), + xidMetricsCreator(mockDCGMXIDCounter, gpuID1, "1", mockHostname, + mockFieldName, + mockLabelValue, mockXIDErr2), + xidMetricsCreator(mockDCGMXIDCounter, gpuID2, "1", mockHostname, + mockFieldName, + mockLabelValue, mockXIDErr1), + xidMetricsCreator(mockDCGMXIDCounter, gpuID2, "3", mockHostname, + mockFieldName, + mockLabelValue, mockXIDErr2), + }, + }, byte(mockXIDErr1), byte(mockXIDErr2) + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + want, gpu1Value, gpu2Value := tt.want() + tt.conditions(mockDeviceWatcher, gpu1Value, gpu2Value) + c := tt.collector() + + got, err := c.GetMetrics() + + if !tt.wantErr { + assert.NoError(t, err, "GetMetrics() failed") + assert.NotEmpty(t, got) + + wantMetrics := want[mockDCGMXIDCounter] + gotMetrics := got[mockDCGMXIDCounter] + + assert.Len(t, gotMetrics, len(wantMetrics), "GetMetrics() returned wrong number of metrics") + + sortXIDMetrics(wantMetrics) + sortXIDMetrics(gotMetrics) + + assert.Equalf(t, wantMetrics, gotMetrics, "GetMetrics()") + } + }) + } +} diff --git a/pkg/dcgmexporter/xid_errors.go b/internal/pkg/collector/xid_errors.go similarity index 99% rename from pkg/dcgmexporter/xid_errors.go rename to internal/pkg/collector/xid_errors.go index 64be5363..b3e0cca4 100644 --- a/pkg/dcgmexporter/xid_errors.go +++ b/internal/pkg/collector/xid_errors.go @@ -14,7 +14,7 @@ * limitations under the License. 
const (
	// undefinedConfigMapData is the ConfigMapData value for which GetCounterSet
	// falls back to reading counters from the collectors file.
	undefinedConfigMapData = "none"

	// cpuFieldsStart and dcpFieldsStart bound the field ID range that
	// fieldIsSupported inspects further: IDs below dcpFieldsStart or at/above
	// cpuFieldsStart are always reported as supported.
	// NOTE(review): presumably the start of the CPU and DCP (profiling) field
	// ID ranges in DCGM — confirm against the DCGM field definitions.
	cpuFieldsStart = 1100
	dcpFieldsStart = 1000

	// Names of the exporter counters, as returned by ExporterCounter.String
	// and matched against Counter.FieldName by the collectors.
	DCGMExpClockEventsCount = "DCGM_EXP_CLOCK_EVENTS_COUNT"
	DCGMExpXIDErrorsCount   = "DCGM_EXP_XID_ERRORS_COUNT"
	DCGMExpGPUHealthStatus  = "DCGM_EXP_GPU_HEALTH_STATUS"
)
*/ -package dcgmexporter +package counters import ( "context" "encoding/csv" "fmt" + "log/slog" "strings" "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" -) -const ( - cpuFieldsStart = 1100 - dcpFieldsStart = 1000 + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" ) -func GetCounterSet(c *Config) (*CounterSet, error) { +func GetCounterSet(c *appconfig.Config) (*CounterSet, error) { var ( err error records [][]string @@ -48,27 +44,29 @@ func GetCounterSet(c *Config) (*CounterSet, error) { var client kubernetes.Interface client, err = getKubeClient() if err != nil { - logrus.Fatal(err) + slog.Error(err.Error()) + os.Exit(1) } records, err = readConfigMap(client, c) if err != nil { - logrus.Fatal(err) + slog.Error(err.Error()) + os.Exit(1) } } else { err = fmt.Errorf("no configmap data specified") } if err != nil || c.ConfigMapData == undefinedConfigMapData { - logrus.Infof("Falling back to metric file '%s'", c.CollectorsFile) + slog.Info(fmt.Sprintf("Falling back to metric file '%s'", c.CollectorsFile)) records, err = ReadCSVFile(c.CollectorsFile) if err != nil { - logrus.Errorf("Could not read metrics file '%s'; err: %v", c.CollectorsFile, err) + slog.Error(fmt.Sprintf("Could not read metrics file '%s'; err: %v", c.CollectorsFile, err)) return res, err } } - res, err = extractCounters(records, c) + res, err = ExtractCounters(records, c) if err != nil { return res, err } @@ -91,7 +89,7 @@ func ReadCSVFile(filename string) ([][]string, error) { return records, err } -func extractCounters(records [][]string, c *Config) (*CounterSet, error) { +func ExtractCounters(records [][]string, c *appconfig.Config) (*CounterSet, error) { res := CounterSet{} for i, record := range records { @@ -118,7 +116,13 @@ func extractCounters(records [][]string, c *Config) (*CounterSet, error) { if err != nil { return nil, 
fmt.Errorf("could not find DCGM field; err: %w", err) } else if expField != DCGMFIUnknown { - res.ExporterCounters = append(res.ExporterCounters, Counter{dcgm.Short(expField), record[0], record[1], record[2]}) + res.ExporterCounters = append(res.ExporterCounters, + Counter{ + FieldID: dcgm.Short(expField), + FieldName: record[0], + PromType: record[1], + Help: record[2], + }) continue } } @@ -129,7 +133,7 @@ func extractCounters(records [][]string, c *Config) (*CounterSet, error) { if !useOld { if !fieldIsSupported(uint(fieldID), c) { - logrus.Warnf("Skipping line %d ('%s'): metric not enabled", i, record[0]) + slog.Warn(fmt.Sprintf("Skipping line %d ('%s'): metric not enabled", i, record[0])) continue } @@ -137,10 +141,11 @@ func extractCounters(records [][]string, c *Config) (*CounterSet, error) { return nil, fmt.Errorf("could not find Prometheus metric type '%s'", record[1]) } - res.DCGMCounters = append(res.DCGMCounters, Counter{fieldID, record[0], record[1], record[2]}) + res.DCGMCounters = append(res.DCGMCounters, + Counter{FieldID: fieldID, FieldName: record[0], PromType: record[1], Help: record[2]}) } else { if !fieldIsSupported(uint(oldFieldID), c) { - logrus.Warnf("Skipping line %d ('%s'): metric not enabled", i, record[0]) + slog.Warn(fmt.Sprintf("Skipping line %d ('%s'): metric not enabled", i, record[0])) continue } @@ -148,14 +153,15 @@ func extractCounters(records [][]string, c *Config) (*CounterSet, error) { return nil, fmt.Errorf("could not find Prometheus metric type '%s'", record[1]) } - res.DCGMCounters = append(res.DCGMCounters, Counter{oldFieldID, record[0], record[1], record[2]}) + res.DCGMCounters = append(res.DCGMCounters, + Counter{FieldID: oldFieldID, FieldName: record[0], PromType: record[1], Help: record[2]}) } } return &res, nil } -func fieldIsSupported(fieldID uint, c *Config) bool { +func fieldIsSupported(fieldID uint, c *appconfig.Config) bool { if fieldID < dcpFieldsStart || fieldID >= cpuFieldsStart { return true } @@ -175,7 
+181,7 @@ func fieldIsSupported(fieldID uint, c *Config) bool { return false } -func readConfigMap(kubeClient kubernetes.Interface, c *Config) ([][]string, error) { +func readConfigMap(kubeClient kubernetes.Interface, c *appconfig.Config) ([][]string, error) { parts := strings.Split(c.ConfigMapData, ":") if len(parts) != 2 { return nil, fmt.Errorf("malformed configmap-data '%s'", c.ConfigMapData) diff --git a/pkg/dcgmexporter/parser_test.go b/internal/pkg/counters/counter_config_test.go similarity index 81% rename from pkg/dcgmexporter/parser_test.go rename to internal/pkg/counters/counter_config_test.go index 0f00e25b..ca852517 100644 --- a/pkg/dcgmexporter/parser_test.go +++ b/internal/pkg/counters/counter_config_test.go @@ -1,4 +1,20 @@ -package dcgmexporter +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package counters import ( "testing" @@ -7,6 +23,8 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" ) func TestEmptyConfigMap(t *testing.T) { @@ -19,7 +37,7 @@ func TestEmptyConfigMap(t *testing.T) { Data: map[string]string{"metrics": ""}, }) - c := Config{ + c := appconfig.Config{ ConfigMapData: "default:configmap1", } records, err := readConfigMap(clientset, &c) @@ -37,7 +55,7 @@ func TestValidConfigMap(t *testing.T) { Data: map[string]string{"metrics": "DCGM_FI_DEV_GPU_TEMP, gauge, temperature"}, }) - c := Config{ + c := appconfig.Config{ ConfigMapData: "default:configmap1", } records, err := readConfigMap(clientset, &c) @@ -55,7 +73,7 @@ func TestInvalidConfigMapData(t *testing.T) { Data: map[string]string{"bad": "DCGM_FI_DEV_GPU_TEMP, gauge, temperature"}, }) - c := Config{ + c := appconfig.Config{ ConfigMapData: "default:configmap1", } records, err := readConfigMap(clientset, &c) @@ -72,7 +90,7 @@ func TestInvalidConfigMapName(t *testing.T) { }, }) - c := Config{ + c := appconfig.Config{ ConfigMapData: "default:configmap1", } records, err := readConfigMap(clientset, &c) @@ -89,7 +107,7 @@ func TestInvalidConfigMapNamespace(t *testing.T) { }, }) - c := Config{ + c := appconfig.Config{ ConfigMapData: "default:configmap1", } records, err := readConfigMap(clientset, &c) @@ -142,7 +160,7 @@ func extractCountersHelper(t *testing.T, input string, valid bool) { t.Fatalf("Cannot close temp file: %v", err) } - c := Config{ + c := appconfig.Config{ ConfigMapData: undefinedConfigMapData, CollectorsFile: tmpFile.Name(), } diff --git a/pkg/dcgmexporter/exporter_metrics.go b/internal/pkg/counters/exporter_counters.go similarity index 82% rename from pkg/dcgmexporter/exporter_metrics.go rename to internal/pkg/counters/exporter_counters.go index ecf7ab7f..f4471083 100644 --- a/pkg/dcgmexporter/exporter_metrics.go +++ 
b/internal/pkg/counters/exporter_counters.go @@ -14,30 +14,28 @@ * limitations under the License. */ -package dcgmexporter +package counters import "fmt" -const ( - dcgmExpClockEventsCount = "DCGM_EXP_CLOCK_EVENTS_COUNT" - dcgmExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" -) - type ExporterCounter uint16 const ( DCGMFIUnknown ExporterCounter = 0 DCGMXIDErrorsCount ExporterCounter = iota + 9000 DCGMClockEventsCount ExporterCounter = iota + 9000 + DCGMGPUHealthStatus ExporterCounter = iota + 9000 ) // String method to convert the enum value to a string func (enm ExporterCounter) String() string { switch enm { case DCGMXIDErrorsCount: - return dcgmExpXIDErrorsCount + return DCGMExpXIDErrorsCount case DCGMClockEventsCount: - return dcgmExpClockEventsCount + return DCGMExpClockEventsCount + case DCGMGPUHealthStatus: + return DCGMExpGPUHealthStatus default: return "DCGM_FI_UNKNOWN" } @@ -47,13 +45,14 @@ func (enm ExporterCounter) String() string { var DCGMFields = map[string]ExporterCounter{ DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, DCGMClockEventsCount.String(): DCGMClockEventsCount, + DCGMGPUHealthStatus.String(): DCGMGPUHealthStatus, DCGMFIUnknown.String(): DCGMFIUnknown, } func IdentifyMetricType(s string) (ExporterCounter, error) { mv, ok := DCGMFields[s] if !ok { - return mv, fmt.Errorf("Unknown ExporterCounter field '%s'", s) + return mv, fmt.Errorf("unknown ExporterCounter field '%s'", s) } return mv, nil } diff --git a/pkg/dcgmexporter/exporter_metrics_test.go b/internal/pkg/counters/exporter_counters_test.go similarity index 95% rename from pkg/dcgmexporter/exporter_metrics_test.go rename to internal/pkg/counters/exporter_counters_test.go index 60710393..3e216ee0 100644 --- a/pkg/dcgmexporter/exporter_metrics_test.go +++ b/internal/pkg/counters/exporter_counters_test.go @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -14,7 +14,7 @@ * limitations under the License. */ -package dcgmexporter +package counters import ( "testing" diff --git a/internal/pkg/counters/types.go b/internal/pkg/counters/types.go new file mode 100644 index 00000000..2da015c8 --- /dev/null +++ b/internal/pkg/counters/types.go @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package counters + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" +) + +type Counter struct { + FieldID dcgm.Short + FieldName string + PromType string + Help string +} + +func (c Counter) IsLabel() bool { + return c.PromType == "label" +} + +type CounterList []Counter + +func (c CounterList) LabelCounters() CounterList { + var labelsCounters CounterList + for _, counter := range c { + if counter.IsLabel() { + labelsCounters = append(labelsCounters, counter) + } + } + + return labelsCounters +} + +type CounterSet struct { + DCGMCounters CounterList + ExporterCounters CounterList +} diff --git a/internal/pkg/counters/variables.go b/internal/pkg/counters/variables.go new file mode 100644 index 00000000..95bcbd6f --- /dev/null +++ b/internal/pkg/counters/variables.go @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package counters + +import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" + +var os osinterface.OS = osinterface.RealOS{} + +var promMetricType = map[string]bool{ + "gauge": true, + "counter": true, + "histogram": true, + "summary": true, + "label": true, +} diff --git a/internal/pkg/dcgmprovider/dcgm.go b/internal/pkg/dcgmprovider/dcgm.go new file mode 100644 index 00000000..175bd698 --- /dev/null +++ b/internal/pkg/dcgmprovider/dcgm.go @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmprovider + +import ( + "fmt" + "log/slog" + "os" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" +) + +var dcgmInterface DCGM + +// Initialize sets up the Singleton DCGM interface using the provided configuration. +func Initialize(config *appconfig.Config) { + dcgmInterface = newDCGMProvider(config) +} + +// reset clears the current DCGM interface instance. +func reset() { + dcgmInterface = nil +} + +// Client retrieves the current DCGM interface instance. +func Client() DCGM { + return dcgmInterface +} + +// SetClient sets the current DCGM interface instance to the provided one. +func SetClient(d DCGM) { + dcgmInterface = d +} + +// dcgmProvider implements DCGM Interface +type dcgmProvider struct { + shutdown func() + moduleCleanup func() +} + +// newDCGMProvider initializes a new DCGM provider based on the provided configuration +func newDCGMProvider(config *appconfig.Config) DCGM { + // Check if a DCGM client already exists and return it if so. + if Client() != nil { + slog.Info("DCGM already initialized") + return Client() + } + + client := dcgmProvider{} + + // Connect to a remote DCGM host engine if configured. 
+ if config.UseRemoteHE { + slog.Info("Attempting to connect to remote hostengine at " + config.RemoteHEInfo) + cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") + if err != nil { + cleanup() + slog.Error(err.Error()) + os.Exit(1) + } + client.shutdown = cleanup + } else { + if config.EnableDCGMLog { + os.Setenv("__DCGM_DBG_FILE", "-") + os.Setenv("__DCGM_DBG_LVL", config.DCGMLogLevel) + } + + // Initialize a local/embedded DCGM instance. + slog.Info("Attempting to initialize DCGM.") + cleanup, err := dcgm.Init(dcgm.Embedded) + if err != nil { + slog.Error(err.Error()) + os.Exit(1) + } + client.shutdown = cleanup + } + + // Initialize the DcgmFields module + if val := dcgm.FieldsInit(); val < 0 { + slog.Error(fmt.Sprintf("Failed to initialize DCGM Fields module; err: %d", val)) + os.Exit(1) + } else { + slog.Info("Initialized DCGM Fields module.") + } + + return client +} + +func (d dcgmProvider) AddEntityToGroup( + groupId dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, + entityId uint, +) error { + return dcgm.AddEntityToGroup(groupId, entityGroupId, entityId) +} + +func (d dcgmProvider) AddLinkEntityToGroup(groupId dcgm.GroupHandle, index uint, parentId uint) error { + return dcgm.AddLinkEntityToGroup(groupId, index, parentId) +} + +func (d dcgmProvider) CreateFakeEntities(entities []dcgm.MigHierarchyInfo) ([]uint, error) { + return dcgm.CreateFakeEntities(entities) +} + +func (d dcgmProvider) CreateGroup(groupName string) (dcgm.GroupHandle, error) { + return dcgm.CreateGroup(groupName) +} + +func (d dcgmProvider) DestroyGroup(groupId dcgm.GroupHandle) error { + return dcgm.DestroyGroup(groupId) +} + +func (d dcgmProvider) EntitiesGetLatestValues( + entities []dcgm.GroupEntityPair, fields []dcgm.Short, flags uint, +) ([]dcgm.FieldValue_v2, error) { + return dcgm.EntitiesGetLatestValues(entities, fields, flags) +} + +func (d dcgmProvider) EntityGetLatestValues( + entityGroup dcgm.Field_Entity_Group, entityId uint, fields []dcgm.Short, 
+) ([]dcgm.FieldValue_v1, + error, +) { + return dcgm.EntityGetLatestValues(entityGroup, entityId, fields) +} + +func (d dcgmProvider) Fv2_String(fv dcgm.FieldValue_v2) string { + return dcgm.Fv2_String(fv) +} + +func (d dcgmProvider) FieldGetById(fieldId dcgm.Short) dcgm.FieldMeta { + return dcgm.FieldGetById(fieldId) +} + +func (d dcgmProvider) FieldGroupCreate(fieldsGroupName string, fields []dcgm.Short) (dcgm.FieldHandle, error) { + return dcgm.FieldGroupCreate(fieldsGroupName, fields) +} + +func (d dcgmProvider) FieldGroupDestroy(fieldsGroup dcgm.FieldHandle) error { + return dcgm.FieldGroupDestroy(fieldsGroup) +} + +func (d dcgmProvider) GetAllDeviceCount() (uint, error) { + return dcgm.GetAllDeviceCount() +} + +func (d dcgmProvider) GetCpuHierarchy() (dcgm.CpuHierarchy_v1, error) { + return dcgm.GetCpuHierarchy() +} + +func (d dcgmProvider) GetDeviceInfo(gpuId uint) (dcgm.Device, error) { + return dcgm.GetDeviceInfo(gpuId) +} + +func (d dcgmProvider) GetEntityGroupEntities(entityGroup dcgm.Field_Entity_Group) ([]uint, error) { + return dcgm.GetEntityGroupEntities(entityGroup) +} + +func (d dcgmProvider) GetGpuInstanceHierarchy() (dcgm.MigHierarchy_v2, error) { + return dcgm.GetGpuInstanceHierarchy() +} + +func (d dcgmProvider) GetNvLinkLinkStatus() ([]dcgm.NvLinkStatus, error) { + return dcgm.GetNvLinkLinkStatus() +} + +func (d dcgmProvider) GetSupportedDevices() ([]uint, error) { + return dcgm.GetSupportedDevices() +} + +func (d dcgmProvider) GetSupportedMetricGroups(gpuId uint) ([]dcgm.MetricGroup, error) { + return dcgm.GetSupportedMetricGroups(gpuId) +} + +func (d dcgmProvider) GetValuesSince( + gpuGroup dcgm.GroupHandle, fieldGroup dcgm.FieldHandle, sinceTime time.Time, +) ([]dcgm.FieldValue_v2, time.Time, error) { + return dcgm.GetValuesSince(gpuGroup, fieldGroup, sinceTime) +} + +func (d dcgmProvider) GroupAllGPUs() dcgm.GroupHandle { + return dcgm.GroupAllGPUs() +} + +func (d dcgmProvider) InjectFieldValue( + gpu uint, fieldID uint, fieldType uint, 
status int, ts int64, value interface{}, +) error { + return dcgm.InjectFieldValue(gpu, fieldID, fieldType, status, ts, value) +} + +func (d dcgmProvider) LinkGetLatestValues(index uint, parentId uint, fields []dcgm.Short) ([]dcgm.FieldValue_v1, + error, +) { + return dcgm.LinkGetLatestValues(index, parentId, fields) +} + +func (d dcgmProvider) NewDefaultGroup(groupName string) (dcgm.GroupHandle, error) { + return dcgm.NewDefaultGroup(groupName) +} + +func (d dcgmProvider) UpdateAllFields() error { + return dcgm.UpdateAllFields() +} + +func (d dcgmProvider) WatchFieldsWithGroupEx( + fieldsGroup dcgm.FieldHandle, group dcgm.GroupHandle, updateFreq int64, maxKeepAge float64, + maxKeepSamples int32, +) error { + return dcgm.WatchFieldsWithGroupEx(fieldsGroup, group, updateFreq, maxKeepAge, maxKeepSamples) +} + +// Cleanup performs cleanup operations for the DCGM provider, including terminating modules and shutting down DCGM. +func (d dcgmProvider) Cleanup() { + // Terminates the DcgmFields module + slog.Info("Attempting to terminate DCGM Fields module.") + if val := dcgm.FieldsTerm(); val < 0 { + slog.Error(fmt.Sprintf("Failed to terminate DCGM Fields module; err: %d", val)) + } + + // Shuts down the DCGM instance. 
+ slog.Info("Attempting to terminate DCGM.") + d.shutdown() + + reset() +} + +func (d dcgmProvider) HealthSet(groupID dcgm.GroupHandle, systems dcgm.HealthSystem) error { + return dcgm.HealthSet(groupID, systems) +} + +func (d dcgmProvider) HealthGet(groupID dcgm.GroupHandle) (dcgm.HealthSystem, error) { + return dcgm.HealthGet(groupID) +} + +func (d dcgmProvider) HealthCheck(groupID dcgm.GroupHandle) (dcgm.HealthResponse, error) { + return dcgm.HealthCheck(groupID) +} + +func (d dcgmProvider) GetGroupInfo(groupID dcgm.GroupHandle) (*dcgm.GroupInfo, error) { + return dcgm.GetGroupInfo(groupID) +} diff --git a/internal/pkg/dcgmprovider/types.go b/internal/pkg/dcgmprovider/types.go new file mode 100644 index 00000000..18dea167 --- /dev/null +++ b/internal/pkg/dcgmprovider/types.go @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/dcgmprovider/mock_client.go -package=dcgmprovider -copyright_file=../../../hack/header.txt . 
DCGM + +package dcgmprovider + +import ( + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" +) + +type DCGM interface { + AddEntityToGroup(dcgm.GroupHandle, dcgm.Field_Entity_Group, uint) error + AddLinkEntityToGroup(dcgm.GroupHandle, uint, uint) error + CreateFakeEntities(entities []dcgm.MigHierarchyInfo) ([]uint, error) + CreateGroup(string) (dcgm.GroupHandle, error) + DestroyGroup(groupId dcgm.GroupHandle) error + EntitiesGetLatestValues([]dcgm.GroupEntityPair, []dcgm.Short, uint) ([]dcgm.FieldValue_v2, error) + EntityGetLatestValues(dcgm.Field_Entity_Group, uint, []dcgm.Short) ([]dcgm.FieldValue_v1, error) + Fv2_String(fv dcgm.FieldValue_v2) string + FieldGetById(dcgm.Short) dcgm.FieldMeta + FieldGroupCreate(string, []dcgm.Short) (dcgm.FieldHandle, error) + FieldGroupDestroy(dcgm.FieldHandle) error + GetAllDeviceCount() (uint, error) + GetCpuHierarchy() (dcgm.CpuHierarchy_v1, error) + GetDeviceInfo(uint) (dcgm.Device, error) + GetEntityGroupEntities(entityGroup dcgm.Field_Entity_Group) ([]uint, error) + GetGpuInstanceHierarchy() (dcgm.MigHierarchy_v2, error) + GetNvLinkLinkStatus() ([]dcgm.NvLinkStatus, error) + GetSupportedDevices() ([]uint, error) + GetSupportedMetricGroups(uint) ([]dcgm.MetricGroup, error) + GetValuesSince(dcgm.GroupHandle, dcgm.FieldHandle, time.Time) ([]dcgm.FieldValue_v2, time.Time, error) + GroupAllGPUs() dcgm.GroupHandle + InjectFieldValue(gpu uint, fieldID uint, fieldType uint, status int, ts int64, value interface{}) error + LinkGetLatestValues(uint, uint, []dcgm.Short) ([]dcgm.FieldValue_v1, error) + NewDefaultGroup(string) (dcgm.GroupHandle, error) + UpdateAllFields() error + WatchFieldsWithGroupEx(dcgm.FieldHandle, dcgm.GroupHandle, int64, float64, int32) error + Cleanup() + HealthSet(groupID dcgm.GroupHandle, systems dcgm.HealthSystem) error + HealthGet(groupID dcgm.GroupHandle) (dcgm.HealthSystem, error) + HealthCheck(groupID dcgm.GroupHandle) (dcgm.HealthResponse, error) + GetGroupInfo(groupID dcgm.GroupHandle) (*dcgm.GroupInfo, 
error) +} diff --git a/internal/pkg/deviceinfo/device_info.go b/internal/pkg/deviceinfo/device_info.go new file mode 100644 index 00000000..04e9d269 --- /dev/null +++ b/internal/pkg/deviceinfo/device_info.go @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package deviceinfo + +import ( + "fmt" + "log/slog" + "slices" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/bits-and-blooms/bitset" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" +) + +const deviceInitMessage = "System entities of type %s initialized" + +type Info struct { + gpuCount uint + gpus [dcgm.MAX_NUM_DEVICES]GPUInfo + switches []SwitchInfo + cpus []CPUInfo + gOpt appconfig.DeviceOptions + sOpt appconfig.DeviceOptions + cOpt appconfig.DeviceOptions + infoType dcgm.Field_Entity_Group +} + +func (s *Info) GPUCount() uint { + return s.gpuCount +} + +func (s *Info) GPUs() []GPUInfo { + return s.gpus[:] +} + +func (s *Info) GPU(i uint) GPUInfo { + return s.gpus[i] +} + +func (s *Info) Switches() []SwitchInfo { + return s.switches +} + +func (s *Info) Switch(i uint) SwitchInfo { + return s.switches[i] +} + +func (s *Info) CPUs() []CPUInfo { + return s.cpus +} + +func (s *Info) CPU(i uint) CPUInfo { + return s.cpus[i] +} + +func (s *Info) GOpts() appconfig.DeviceOptions { + return s.gOpt +} + +func (s *Info) SOpts() 
appconfig.DeviceOptions { + return s.sOpt +} + +func (s *Info) COpts() appconfig.DeviceOptions { + return s.cOpt +} + +func (s *Info) InfoType() dcgm.Field_Entity_Group { + return s.infoType +} + +func Initialize( + gOpt appconfig.DeviceOptions, sOpt appconfig.DeviceOptions, cOpt appconfig.DeviceOptions, useFakeGPUs bool, + entityType dcgm.Field_Entity_Group, +) (*Info, error) { + deviceInfo := &Info{} + var err error + + slog.Info(fmt.Sprintf("Initializing system entities of type '%s'", entityType.String())) + switch entityType { + case dcgm.FE_LINK: + deviceInfo.infoType = dcgm.FE_LINK + err = deviceInfo.initializeNvSwitchInfo(sOpt) + case dcgm.FE_SWITCH: + deviceInfo.infoType = dcgm.FE_SWITCH + err = deviceInfo.initializeNvSwitchInfo(sOpt) + case dcgm.FE_GPU: + deviceInfo.infoType = dcgm.FE_GPU + err = deviceInfo.initializeGPUInfo(gOpt, useFakeGPUs) + case dcgm.FE_CPU: + deviceInfo.infoType = dcgm.FE_CPU + err = deviceInfo.initializeCPUInfo(cOpt) + case dcgm.FE_CPU_CORE: + deviceInfo.infoType = dcgm.FE_CPU_CORE + err = deviceInfo.initializeCPUInfo(cOpt) + default: + err = fmt.Errorf("invalid entity type '%d'", entityType) + } + + return deviceInfo, err +} + +func (s *Info) initializeGPUInfo(gOpt appconfig.DeviceOptions, useFakeGPUs bool) error { + gpuCount, err := dcgmprovider.Client().GetAllDeviceCount() + if err != nil { + return err + } + s.gpuCount = gpuCount + + for i := uint(0); i < s.gpuCount; i++ { + // TODO (roarora): Use of array to store GPUs makes it harder to ignore GPUs (including GPU Instances) which + // should be filtered out based on `Major` attribute in Device Options. Fix it! 
+ + // Default mig enabled to false + s.gpus[i].MigEnabled = false + s.gpus[i].DeviceInfo, err = dcgmprovider.Client().GetDeviceInfo(i) + if err != nil { + if useFakeGPUs { + s.gpus[i].DeviceInfo.GPU = i + s.gpus[i].DeviceInfo.UUID = fmt.Sprintf("fake%d", i) + } else { + return err + } + } + } + + hierarchy, err := dcgmprovider.Client().GetGpuInstanceHierarchy() + if err != nil { + return err + } + + if hierarchy.Count > 0 { + var entities []dcgm.GroupEntityPair + + gpuID := uint(0) + instanceIndex := 0 + for i := uint(0); i < hierarchy.Count; i++ { + entityID := hierarchy.EntityList[i].Entity.EntityId + + if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU { + + // We are adding a GPU instance + gpuID = hierarchy.EntityList[i].Parent.EntityId + + instanceInfo := GPUInstanceInfo{ + Info: hierarchy.EntityList[i].Info, + ProfileName: "", + EntityId: entityID, + } + s.gpus[gpuID].MigEnabled = true + s.gpus[gpuID].GPUInstances = append(s.gpus[gpuID].GPUInstances, instanceInfo) + entities = append(entities, dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: entityID}) + instanceIndex = len(s.gpus[gpuID].GPUInstances) - 1 + } else if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU_I { + // TODO (roarora): Fix this implementation as it expects Instances and Compute Instances to be reported + // in a certain sequence if, that is not the case results are incorrect. 
+ + // Add the compute instance, gpuId is recorded previously + ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityID} + s.gpus[gpuID].GPUInstances[instanceIndex].ComputeInstances = append(s.gpus[gpuID].GPUInstances[instanceIndex].ComputeInstances, + ciInfo) + } + } + + err = s.populateMigProfileNames(entities) + if err != nil { + return err + } + } + + s.gOpt = gOpt + err = s.verifyDevicePresence() + if err == nil { + slog.Debug(fmt.Sprintf(deviceInitMessage, s.infoType)) + } + return err +} + +func (s *Info) initializeCPUInfo(cOpt appconfig.DeviceOptions) error { + hierarchy, err := dcgmprovider.Client().GetCpuHierarchy() + if err != nil { + return err + } + + if hierarchy.NumCpus <= 0 { + return fmt.Errorf("no cpus to monitor") + } + + for i := 0; i < int(hierarchy.NumCpus); i++ { + // monitor only the CPUs as per the device options input + if cOpt.Flex || s.shouldMonitor(cOpt.MajorRange, hierarchy.Cpus[i].CpuId) { + cores := getCoreArray(hierarchy.Cpus[i].OwnedCores) + + monitoredCores := make([]uint, 0) + for _, core := range cores { + // monitor only the CPU cores as per the device options input + if cOpt.Flex || s.shouldMonitor(cOpt.MinorRange, core) { + monitoredCores = append(monitoredCores, core) + } + } + + cpu := CPUInfo{ + hierarchy.Cpus[i].CpuId, + monitoredCores, + } + + s.cpus = append(s.cpus, cpu) + } + } + + s.cOpt = cOpt + + // ensures all the CPUs and Cores to monitor have been discovered + err = s.verifyCPUDevicePresence() + if err != nil { + return err + } + + // Ensure correct CPUs and Cores are monitored + slog.Debug(fmt.Sprintf(deviceInitMessage, s.infoType)) + return nil +} + +func (s *Info) initializeNvSwitchInfo(sOpt appconfig.DeviceOptions) error { + switches, err := dcgmprovider.Client().GetEntityGroupEntities(dcgm.FE_SWITCH) + if err != nil { + return err + } + + if len(switches) <= 0 { + return fmt.Errorf("no switches to monitor") + } + + links, err := dcgmprovider.Client().GetNvLinkLinkStatus() + if err != nil { + 
return err + } + + for i := 0; i < len(switches); i++ { + // monitor only the Switches as per the device options input + if sOpt.Flex || s.shouldMonitor(sOpt.MajorRange, switches[i]) { + + var matchingLinks []dcgm.NvLinkStatus + for _, link := range links { + // monitor only the NV Link as per the device options input + if sOpt.Flex || s.shouldMonitor(sOpt.MinorRange, link.Index) { + if link.ParentType == dcgm.FE_SWITCH && link.ParentId == switches[i] { + matchingLinks = append(matchingLinks, link) + } + } + } + + sw := SwitchInfo{ + switches[i], + matchingLinks, + } + + s.switches = append(s.switches, sw) + } + } + + s.sOpt = sOpt + err = s.verifySwitchDevicePresence() + if err == nil { + slog.Debug(fmt.Sprintf(deviceInitMessage, s.infoType)) + } + + return err +} + +func (s *Info) setGPUInstanceProfileName(entityId uint, profileName string) bool { + for i := uint(0); i < s.gpuCount; i++ { + for j := range s.gpus[i].GPUInstances { + if s.gpus[i].GPUInstances[j].EntityId == entityId { + s.gpus[i].GPUInstances[j].ProfileName = profileName + return true + } + } + } + + return false +} + +func (s *Info) setMigProfileNames(values []dcgm.FieldValue_v2) error { + var err error + var errFound bool + errStr := "cannot find match for entities:" + + for _, v := range values { + if !s.setGPUInstanceProfileName(v.EntityId, dcgmprovider.Client().Fv2_String(v)) { + errStr = fmt.Sprintf("%s group %d, id %d", errStr, v.EntityGroupId, v.EntityId) + errFound = true + } + } + + if errFound { + err = fmt.Errorf("%s", errStr) + } + + return err +} + +func (s *Info) populateMigProfileNames(entities []dcgm.GroupEntityPair) error { + if len(entities) == 0 { + // There are no entities to populate + return nil + } + + var fields []dcgm.Short + fields = append(fields, dcgm.DCGM_FI_DEV_NAME) + flags := dcgm.DCGM_FV_FLAG_LIVE_DATA + values, err := dcgmprovider.Client().EntitiesGetLatestValues(entities, fields, flags) + if err != nil { + return err + } + + return s.setMigProfileNames(values) +} 
+ +func (s *Info) gpuIDExists(gpuId int) bool { + for i := uint(0); i < s.gpuCount; i++ { + if s.gpus[i].DeviceInfo.GPU == uint(gpuId) { + return true + } + } + return false +} + +func (s *Info) gpuInstanceIDExists(gpuInstanceId int) bool { + for i := uint(0); i < s.gpuCount; i++ { + for _, instance := range s.gpus[i].GPUInstances { + if instance.EntityId == uint(gpuInstanceId) { + return true + } + } + } + return false +} + +func (s *Info) cpuIDExists(cpuId int) bool { + for _, cpu := range s.cpus { + if cpu.EntityId == uint(cpuId) { + return true + } + } + return false +} + +func (s *Info) cpuCoreIDExists(coreId int) bool { + for _, cpu := range s.cpus { + for _, core := range cpu.Cores { + if core == uint(coreId) { + return true + } + } + } + return false +} + +func (s *Info) switchIDExists(switchId int) bool { + for _, sw := range s.switches { + if sw.EntityId == uint(switchId) { + return true + } + } + return false +} + +func (s *Info) linkIDExists(linkId int) bool { + for _, sw := range s.switches { + for _, link := range sw.NvLinks { + if link.Index == uint(linkId) { + return true + } + } + } + return false +} + +func (s *Info) verifyDevicePresence() error { + if s.gOpt.Flex { + return nil + } + + if len(s.gOpt.MajorRange) > 0 && s.gOpt.MajorRange[0] != -1 { + // Verify we can find all the specified gpus + for _, gpuID := range s.gOpt.MajorRange { + if !s.gpuIDExists(gpuID) { + return fmt.Errorf("couldn't find requested GPU ID '%d'", gpuID) + } + } + } + + if len(s.gOpt.MinorRange) > 0 && s.gOpt.MinorRange[0] != -1 { + for _, gpuInstanceID := range s.gOpt.MinorRange { + if !s.gpuInstanceIDExists(gpuInstanceID) { + return fmt.Errorf("couldn't find requested GPU instance ID '%d'", gpuInstanceID) + } + } + } + + return nil +} + +func (s *Info) verifyCPUDevicePresence() error { + if s.cOpt.Flex { + return nil + } + + if len(s.cOpt.MajorRange) > 0 && s.cOpt.MajorRange[0] != -1 { + // Verify we can find all the specified CPUs + for _, cpuID := range 
s.cOpt.MajorRange { + if !s.cpuIDExists(cpuID) { + return fmt.Errorf("couldn't find requested CPU ID '%d'", cpuID) + } + } + } + + if len(s.cOpt.MinorRange) > 0 && s.cOpt.MinorRange[0] != -1 { + for _, coreID := range s.cOpt.MinorRange { + if !s.cpuCoreIDExists(coreID) { + return fmt.Errorf("couldn't find requested CPU core '%d'", coreID) + } + } + } + + return nil +} + +func (s *Info) shouldMonitor(monitoringRange []int, val uint) bool { + if len(monitoringRange) > 0 { + if monitoringRange[0] == -1 { + return true + } else { + return slices.Contains(monitoringRange, int(val)) + } + } + + return false +} + +func (s *Info) verifySwitchDevicePresence() error { + if s.sOpt.Flex { + return nil + } + + if len(s.sOpt.MajorRange) > 0 && s.sOpt.MajorRange[0] != -1 { + // Verify we can find all the specified switches + for _, swID := range s.sOpt.MajorRange { + if !s.switchIDExists(swID) { + return fmt.Errorf("couldn't find requested NvSwitch ID '%d'", swID) + } + } + } + + if len(s.sOpt.MinorRange) > 0 && s.sOpt.MinorRange[0] != -1 { + for _, linkID := range s.sOpt.MinorRange { + if !s.linkIDExists(linkID) { + return fmt.Errorf("couldn't find requested NvLink '%d'", linkID) + } + } + } + + return nil +} + +func (s *Info) IsCPUWatched(cpuID uint) bool { + if !slices.ContainsFunc(s.cpus, func(cpu CPUInfo) bool { + return cpu.EntityId == cpuID + }) { + return false + } + + if s.cOpt.Flex { + return true + } + + if len(s.cOpt.MajorRange) > 0 && s.cOpt.MajorRange[0] == -1 { + return true + } + + return slices.ContainsFunc(s.cOpt.MajorRange, func(cpu int) bool { + return uint(cpu) == cpuID + }) +} + +func (s *Info) IsCoreWatched(coreID uint, cpuID uint) bool { + if s.cOpt.Flex { + return true + } + + // Find a CPU + cpuIdx := slices.IndexFunc(s.cpus, func(cpu CPUInfo) bool { + return s.IsCPUWatched(cpu.EntityId) && cpu.EntityId == cpuID + }) + + if cpuIdx > -1 { + if len(s.cOpt.MinorRange) > 0 && s.cOpt.MinorRange[0] == -1 { + return true + } + + return 
slices.Contains(s.cOpt.MinorRange, int(coreID)) + } + + return false +} + +func (s *Info) IsSwitchWatched(switchID uint) bool { + if s.sOpt.Flex { + return true + } + + // When MajorRange contains -1 value, we do monitoring of all switches + if len(s.sOpt.MajorRange) > 0 && s.sOpt.MajorRange[0] == -1 { + return true + } + + return slices.Contains(s.sOpt.MajorRange, int(switchID)) +} + +func (s *Info) IsLinkWatched(linkIndex uint, switchID uint) bool { + if s.sOpt.Flex { + return true + } + + // Find a switch + switchIdx := slices.IndexFunc(s.switches, func(si SwitchInfo) bool { + return si.EntityId == switchID && s.IsSwitchWatched(si.EntityId) + }) + + if switchIdx > -1 { + // Switch exists and is watched + sw := s.switches[switchIdx] + + if len(s.sOpt.MinorRange) > 0 && s.sOpt.MinorRange[0] == -1 { + return true + } + + // The Link exists + if slices.ContainsFunc(sw.NvLinks, func(nls dcgm.NvLinkStatus) bool { + return nls.Index == linkIndex + }) { + // and the link index is in the Minor range + return slices.Contains(s.sOpt.MinorRange, int(linkIndex)) + } + } + + return false +} + +func getCoreArray(bitmask []uint64) []uint { + var cores []uint + bits := make([]uint64, dcgm.MAX_CPU_CORE_BITMASK_COUNT) + + for i := 0; i < len(bitmask); i++ { + bits[i] = bitmask[i] + } + + b := bitset.From(bits) + + for i := uint(0); i < dcgm.MAX_NUM_CPU_CORES; i++ { + if b.Test(i) { + cores = append(cores, i) + } + } + + return cores +} + +// Helper Functions + +func GetGPUInstanceIdentifier(deviceInfo Provider, gpuuuid string, gpuInstanceID uint) string { + for i := uint(0); i < deviceInfo.GPUCount(); i++ { + if deviceInfo.GPU(i).DeviceInfo.UUID == gpuuuid { + identifier := fmt.Sprintf("%d-%d", deviceInfo.GPU(i).DeviceInfo.GPU, gpuInstanceID) + return identifier + } + } + + return "" +} diff --git a/internal/pkg/deviceinfo/device_info_test.go b/internal/pkg/deviceinfo/device_info_test.go new file mode 100644 index 00000000..f947b671 --- /dev/null +++
b/internal/pkg/deviceinfo/device_info_test.go @@ -0,0 +1,2749 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package deviceinfo + +import ( + "fmt" + "slices" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" +) + +var fakeProfileName = "2fake.4gb" + +func SpoofGPUDeviceInfo() Info { + var deviceInfo Info + deviceInfo.gpuCount = 2 + deviceInfo.gpus[0].DeviceInfo.GPU = 0 + gi := GPUInstanceInfo{ + Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlProfileSlices: 3}, + ProfileName: fakeProfileName, + EntityId: 0, + } + deviceInfo.gpus[0].GPUInstances = append(deviceInfo.gpus[0].GPUInstances, gi) + gi2 := GPUInstanceInfo{ + Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlInstanceId: 1, NvmlProfileSlices: 3}, + ProfileName: fakeProfileName, + EntityId: 14, + } + deviceInfo.gpus[1].GPUInstances = append(deviceInfo.gpus[1].GPUInstances, gi2) + deviceInfo.gpus[1].DeviceInfo.GPU = 1 + + return deviceInfo +} + +func TestGetters(t *testing.T) { + fakeDevices := SpoofGPUDevices() + fakeDeviceInfo := [dcgm.MAX_NUM_DEVICES]GPUInfo{} + fakeDeviceInfo[0] = GPUInfo{ + DeviceInfo: fakeDevices[0], 
+ MigEnabled: false, + } + fakeDeviceInfo[1] = GPUInfo{ + DeviceInfo: fakeDevices[1], + MigEnabled: true, + } + + fakeSwitches := []SwitchInfo{ + { + EntityId: 0, + NvLinks: nil, + }, + { + EntityId: 1, + NvLinks: nil, + }, + } + + fakeCPUs := []CPUInfo{ + { + EntityId: 0, + Cores: nil, + }, + { + EntityId: 1, + Cores: nil, + }, + } + + fakeGOpts := appconfig.DeviceOptions{ + Flex: true, + } + + fakeSOpts := appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{1, 2, 3}, + } + + fakeCOpts := appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1}, + MinorRange: []int{1, 2, 3}, + } + + fakeInfoType := dcgm.FE_GPU + + deviceInfo := Info{ + gpuCount: uint(len(fakeDevices)), + gpus: fakeDeviceInfo, + switches: fakeSwitches, + cpus: fakeCPUs, + gOpt: fakeGOpts, + sOpt: fakeSOpts, + cOpt: fakeCOpts, + infoType: fakeInfoType, + } + + assert.Equal(t, uint(len(fakeDevices)), deviceInfo.GPUCount(), "GPU count mismatch") + assert.Equal(t, fakeDeviceInfo[:], deviceInfo.GPUs(), "GPUs mismatch") + assert.Equal(t, fakeDeviceInfo[0], deviceInfo.GPU(uint(0)), "GPU mismatch") + assert.Equal(t, fakeSwitches, deviceInfo.Switches(), "Switches mismatch") + assert.Equal(t, fakeSwitches[1], deviceInfo.Switch(uint(1)), "Switch mismatch") + assert.Equal(t, fakeCPUs, deviceInfo.CPUs(), "CPUs mismatch") + assert.Equal(t, fakeCPUs[1], deviceInfo.CPU(uint(1)), "CPU mismatch") + assert.Equal(t, fakeGOpts, deviceInfo.GOpts(), "GPUs options mismatch") + assert.Equal(t, fakeSOpts, deviceInfo.SOpts(), "Switches options mismatch") + assert.Equal(t, fakeCOpts, deviceInfo.COpts(), "CPUs options mismatch") + assert.Equal(t, fakeInfoType, deviceInfo.InfoType(), "InfoType mismatch") +} + +func TestInitialize(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGMProvider) + + fakeDevices := 
SpoofGPUDevices()
+	_, fakeGPUs, _, _ := SpoofMigHierarchy()
+
+	// Each case registers its mock expectations in mockCalls. Success cases
+	// also provide the expected Info and an assertions callback; the callback
+	// receives the subtest's *testing.T so that failures are reported against
+	// the subtest that produced them rather than the parent test.
+	tests := []struct {
+		name           string
+		gOpts          appconfig.DeviceOptions
+		sOpts          appconfig.DeviceOptions
+		cOpts          appconfig.DeviceOptions
+		entityType     dcgm.Field_Entity_Group
+		mockCalls      func()
+		expectedOutput func() *Info
+		assertions     func(t *testing.T, expected, actual *Info)
+		wantErr        bool
+	}{
+		{
+			name:       "Initialize GPUs",
+			gOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_GPU,
+			mockCalls: func() {
+				mockHierarchy := dcgm.MigHierarchy_v2{
+					Count: 1,
+				}
+				mockHierarchy.EntityList[0] = fakeGPUs[0]
+
+				mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil)
+				mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil)
+				mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil)
+			},
+			expectedOutput: func() *Info {
+				return &Info{
+					gpuCount: 0,
+					gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{
+						{
+							DeviceInfo: fakeDevices[0],
+						},
+					},
+					switches: nil,
+					cpus:     nil,
+					gOpt:     appconfig.DeviceOptions{Flex: true},
+					sOpt:     appconfig.DeviceOptions{},
+					cOpt:     appconfig.DeviceOptions{},
+					infoType: dcgm.FE_GPU,
+				}
+			},
+			assertions: func(t *testing.T, expected, actual *Info) {
+				assert.Equal(t, expected.gpus[0].DeviceInfo, actual.gpus[0].DeviceInfo,
+					"GPU device info mismatch")
+
+				assert.Equal(t, expected.gpus[0].MigEnabled, actual.gpus[0].MigEnabled,
+					"MIG info mismatch")
+
+				assert.Equal(t, len(expected.gpus[0].GPUInstances), len(actual.gpus[0].GPUInstances),
+					"GPU Instances length mismatch")
+
+				assert.Equal(t, expected.gOpt, actual.gOpt, "GPU options mismatch")
+
+				assert.Equal(t, expected.infoType, actual.infoType, "GPU info type mismatch")
+			},
+			wantErr: false,
+		},
+		{
+			name:       "Initialize GPUs error",
+			gOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_GPU,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(0), fmt.Errorf("some error"))
+			},
+			wantErr: true,
+		},
+		{
+			name:       "Initialize Switches",
+			sOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_SWITCH,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return([]uint{1}, nil)
+				mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return([]dcgm.NvLinkStatus{
+					{ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)},
+				}, nil)
+			},
+			expectedOutput: func() *Info {
+				return &Info{
+					gpuCount: 0,
+					gpus:     [dcgm.MAX_NUM_DEVICES]GPUInfo{},
+					switches: []SwitchInfo{
+						{
+							EntityId: uint(1),
+							NvLinks: []dcgm.NvLinkStatus{
+								{
+									ParentId:   uint(1),
+									ParentType: dcgm.FE_SWITCH,
+									Index:      uint(1),
+								},
+							},
+						},
+					},
+					cpus:     nil,
+					gOpt:     appconfig.DeviceOptions{},
+					sOpt:     appconfig.DeviceOptions{Flex: true},
+					cOpt:     appconfig.DeviceOptions{},
+					infoType: dcgm.FE_SWITCH,
+				}
+			},
+			assertions: func(t *testing.T, expected, actual *Info) {
+				// Length checks are fatal because the next assertions index [0].
+				require.Equal(t, len(expected.switches), len(actual.switches),
+					"Switches length mismatch")
+
+				assert.Equal(t, expected.switches[0].EntityId, actual.switches[0].EntityId,
+					"Switch Entity ID mismatch")
+
+				require.Equal(t, len(expected.switches[0].NvLinks), len(actual.switches[0].NvLinks),
+					"Switches NV link length mismatch")
+
+				assert.Equal(t, expected.switches[0].NvLinks[0].Index, actual.switches[0].NvLinks[0].Index,
+					"Switches NV link Index mismatch")
+
+				assert.Equal(t, expected.sOpt, actual.sOpt, "Switch options mismatch")
+
+				assert.Equal(t, expected.infoType, actual.infoType, "Switch info type mismatch")
+			},
+			wantErr: false,
+		},
+		{
+			name:       "Initialize Switches error",
+			sOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_SWITCH,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetEntityGroupEntities(dcgm.FE_SWITCH).Return([]uint{uint(0)},
+					fmt.Errorf("some error"))
+			},
+			wantErr: true,
+		},
+		{
+			name:       "Initialize NV Links",
+			sOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_LINK,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return([]uint{1}, nil)
+				mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return([]dcgm.NvLinkStatus{
+					{ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)},
+				}, nil)
+			},
+			expectedOutput: func() *Info {
+				return &Info{
+					gpuCount: 0,
+					gpus:     [dcgm.MAX_NUM_DEVICES]GPUInfo{},
+					switches: []SwitchInfo{
+						{
+							EntityId: uint(1),
+							NvLinks: []dcgm.NvLinkStatus{
+								{
+									ParentId:   uint(1),
+									ParentType: dcgm.FE_SWITCH,
+									Index:      uint(1),
+								},
+							},
+						},
+					},
+					cpus:     nil,
+					gOpt:     appconfig.DeviceOptions{},
+					sOpt:     appconfig.DeviceOptions{Flex: true},
+					cOpt:     appconfig.DeviceOptions{},
+					infoType: dcgm.FE_LINK,
+				}
+			},
+			assertions: func(t *testing.T, expected, actual *Info) {
+				// Length checks are fatal because the next assertions index [0].
+				require.Equal(t, len(expected.switches), len(actual.switches),
+					"Switches length mismatch")
+
+				assert.Equal(t, expected.switches[0].EntityId, actual.switches[0].EntityId,
+					"Switch Entity ID mismatch")
+
+				require.Equal(t, len(expected.switches[0].NvLinks), len(actual.switches[0].NvLinks),
+					"Switches NV link length mismatch")
+
+				assert.Equal(t, expected.switches[0].NvLinks[0].Index, actual.switches[0].NvLinks[0].Index,
+					"Switches NV link Index mismatch")
+
+				assert.Equal(t, expected.sOpt, actual.sOpt, "NV Link options mismatch")
+
+				assert.Equal(t, expected.infoType, actual.infoType, "NV Link info type mismatch")
+			},
+			wantErr: false,
+		},
+		{
+			name:       "Initialize NV Link error",
+			sOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_LINK,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetEntityGroupEntities(dcgm.FE_SWITCH).Return([]uint{uint(0)},
+					fmt.Errorf("some error"))
+			},
+			wantErr: true,
+		},
+		{
+			name:       "initialize CPUs",
+			cOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_CPU,
+			mockCalls: func() {
+				mockCPUHierarchy := dcgm.CpuHierarchy_v1{
+					NumCpus: 1,
+					Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{
+						{
+							// Non-zero so that a propagated ID cannot be confused
+							// with an unset EntityId.
+							// NOTE(review): assumes Initialize copies CpuId into
+							// CPUInfo.EntityId - confirm against device_info.go.
+							CpuId:      1,
+							OwnedCores: []uint64{1, 2, 8},
+						},
+					},
+				}
+				mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil)
+			},
+			expectedOutput: func() *Info {
+				return &Info{
+					gpuCount: 0,
+					gpus:     [dcgm.MAX_NUM_DEVICES]GPUInfo{},
+					switches: nil,
+					cpus: []CPUInfo{
+						{
+							EntityId: uint(1),
+							Cores:    []uint{0, 65, 131},
+						},
+					},
+					gOpt:     appconfig.DeviceOptions{},
+					sOpt:     appconfig.DeviceOptions{},
+					cOpt:     appconfig.DeviceOptions{Flex: true},
+					infoType: dcgm.FE_CPU,
+				}
+			},
+			assertions: func(t *testing.T, expected, actual *Info) {
+				// Fatal: the next assertions index cpus[0].
+				require.Equal(t, len(expected.cpus), len(actual.cpus),
+					"CPU length mismatch")
+
+				// Compare against the actual value; this used to compare
+				// expected with itself and could never fail.
+				assert.Equal(t, expected.cpus[0].EntityId, actual.cpus[0].EntityId,
+					"CPU Entity ID mismatch")
+
+				assert.Equal(t, len(expected.cpus[0].Cores), len(actual.cpus[0].Cores),
+					"CPU Core length mismatch")
+
+				assert.True(t, slices.Equal(expected.cpus[0].Cores, actual.cpus[0].Cores),
+					"CPU Cores mismatch")
+
+				assert.Equal(t, expected.cOpt, actual.cOpt, "CPU options mismatch")
+
+				assert.Equal(t, expected.infoType, actual.infoType, "CPU info type mismatch")
+			},
+			wantErr: false,
+		},
+		{
+			name:       "Initialize CPU error",
+			cOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_CPU,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(dcgm.CpuHierarchy_v1{}, fmt.Errorf("some error"))
+			},
+			wantErr: true,
+		},
+		{
+			name:       "Initialize CPU Cores",
+			cOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_CPU_CORE,
+			mockCalls: func() {
+				mockCPUHierarchy := dcgm.CpuHierarchy_v1{
+					NumCpus: 1,
+					Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{
+						{
+							// Non-zero so that a propagated ID cannot be confused
+							// with an unset EntityId.
+							// NOTE(review): assumes Initialize copies CpuId into
+							// CPUInfo.EntityId - confirm against device_info.go.
+							CpuId:      1,
+							OwnedCores: []uint64{1, 2, 8},
+						},
+					},
+				}
+				mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil)
+			},
+			expectedOutput: func() *Info {
+				return &Info{
+					gpuCount: 0,
+					gpus:     [dcgm.MAX_NUM_DEVICES]GPUInfo{},
+					switches: nil,
+					cpus: []CPUInfo{
+						{
+							EntityId: uint(1),
+							Cores:    []uint{0, 65, 131},
+						},
+					},
+					gOpt:     appconfig.DeviceOptions{},
+					sOpt:     appconfig.DeviceOptions{},
+					cOpt:     appconfig.DeviceOptions{Flex: true},
+					infoType: dcgm.FE_CPU_CORE,
+				}
+			},
+			assertions: func(t *testing.T, expected, actual *Info) {
+				// Fatal: the next assertions index cpus[0].
+				require.Equal(t, len(expected.cpus), len(actual.cpus),
+					"CPU length mismatch")
+
+				// Compare against the actual value; this used to compare
+				// expected with itself and could never fail.
+				assert.Equal(t, expected.cpus[0].EntityId, actual.cpus[0].EntityId,
+					"CPU Entity ID mismatch")
+
+				assert.Equal(t, len(expected.cpus[0].Cores), len(actual.cpus[0].Cores),
+					"CPU Core length mismatch")
+
+				assert.True(t, slices.Equal(expected.cpus[0].Cores, actual.cpus[0].Cores),
+					"CPU Cores mismatch")
+
+				assert.Equal(t, expected.cOpt, actual.cOpt, "CPU options mismatch")
+
+				assert.Equal(t, expected.infoType, actual.infoType, "CPU info type mismatch")
+			},
+			wantErr: false,
+		},
+		{
+			name:       "Initialize CPU Cores error",
+			cOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_CPU_CORE,
+			mockCalls: func() {
+				mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(dcgm.CpuHierarchy_v1{}, fmt.Errorf("some error"))
+			},
+			wantErr: true,
+		},
+		{
+			name:       "Initialize Invalid type error",
+			cOpts:      appconfig.DeviceOptions{Flex: true},
+			entityType: dcgm.FE_VGPU,
+			mockCalls:  func() {},
+			wantErr:    true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tt.mockCalls()
+
+			deviceInfo, err := Initialize(tt.gOpts, tt.sOpts, tt.cOpts, false, tt.entityType)
+			if tt.wantErr {
+				assert.Error(t, err, "Error expected")
+				return
+			}
+
+			// Abort the subtest on failure: the assertions callback
+			// dereferences deviceInfo.
+			require.NoError(t, err, "Error not expected")
+			require.NotNil(t, deviceInfo, "Expected output to be not nil")
+
+			tt.assertions(t, tt.expectedOutput(), deviceInfo)
+		})
+	}
+}
+
+func TestInitializeGPUInfo(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl)
+
+	realDCGM := dcgmprovider.Client()
+	defer func() {
+		dcgmprovider.SetClient(realDCGM)
+	}()
+	dcgmprovider.SetClient(mockDCGMProvider)
+
+	fakeDevices := SpoofGPUDevices()
+	fakeMigHierarchy, fakeGPUs, fakeGPUInstances, fakeGPUComputeInstances := SpoofMigHierarchy()
+
+	tests := []struct {
+		name           string
+		gOpts
appconfig.DeviceOptions + mockCalls func() + expectedOutput map[uint]GPUInfo + wantErr bool + }{ + { + name: "GPU with 0 Device Count", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(0), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(dcgm.MigHierarchy_v2{ + Count: 0, + }, nil) + }, + expectedOutput: map[uint]GPUInfo{}, + wantErr: false, + }, + { + name: "GPU with 0 Device Count with GetAllDeviceCount error", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(0), fmt.Errorf("some error")) + }, + expectedOutput: map[uint]GPUInfo{}, + wantErr: true, + }, + { + name: "GPU Count 1 with No Hierarchy", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil) + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + }, + }, + wantErr: false, + }, + { + name: "GPU count 2 GPU with No Hierarchy", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 2, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + mockHierarchy.EntityList[0] = fakeGPUs[1] + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil) + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: {DeviceInfo: fakeDevices[0]}, + 1: {DeviceInfo: fakeDevices[1]}, + }, + 
wantErr: false, + }, + { + name: "GPU Count 1 with No Hierarchy but GetDeviceInfo error", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], fmt.Errorf("some error")) + }, + expectedOutput: map[uint]GPUInfo{}, + wantErr: true, + }, + { + name: "GPU Count 1 with No Hierarchy but GetGpuInstanceHierarchy error", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(dcgm.MigHierarchy_v2{}, + fmt.Errorf("some error")) + }, + expectedOutput: map[uint]GPUInfo{}, + wantErr: true, + }, + { + name: "GPU Count 1 with Hierarchy", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 6, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + mockHierarchy.EntityList[1] = fakeGPUInstances[0] + mockHierarchy.EntityList[2] = fakeGPUComputeInstances[0] + mockHierarchy.EntityList[3] = fakeGPUComputeInstances[1] + mockHierarchy.EntityList[4] = fakeGPUInstances[1] + mockHierarchy.EntityList[5] = fakeGPUComputeInstances[2] + + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil) + 
mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + { + name: "GPU Count 2 with Hierarchy", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + 
mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + 1: { + DeviceInfo: fakeDevices[1], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[2].Entity.EntityId, + Info: fakeGPUInstances[2].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[3].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[3].Info, + }, + }, + ProfileName: "instance_profile_2", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + { + name: "GPU Count 2 with Hierarchy but EntitiesGetLatestValues error", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + 
mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(gomock.Any(), gomock.Any(), + gomock.Any()).Return([]dcgm.FieldValue_v2{}, fmt.Errorf("some error")) + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + wantErr: true, + }, + /* + // TODO (roarora): Today, a different sequence out of GetGpuInstanceHierarchy causes an error in exporter + { + name: "GPU Count 2 with Hierarchy Different MIG Hierarchy Sequence", + gOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 9, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + mockHierarchy.EntityList[1] = fakeGPUInstances[0] + mockHierarchy.EntityList[2] = fakeGPUInstances[1] + mockHierarchy.EntityList[3] = fakeGPUComputeInstances[0] + mockHierarchy.EntityList[4] = fakeGPUComputeInstances[1] + mockHierarchy.EntityList[5] = fakeGPUComputeInstances[2] + mockHierarchy.EntityList[6] = fakeGPUs[1] + mockHierarchy.EntityList[7] = fakeGPUInstances[2] + mockHierarchy.EntityList[8] = fakeGPUComputeInstances[3] + + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, 
gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + 1: { + DeviceInfo: fakeDevices[1], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[2].Entity.EntityId, + Info: fakeGPUInstances[2].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[3].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[3].Info, + }, + }, + ProfileName: "instance_profile_2", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + },*/ + { + name: "GPU Count 2 with Hierarchy and device options", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1}, + MinorRange: []int{1, 2, 3}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, 
EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + 1: { + DeviceInfo: fakeDevices[1], + GPUInstances: 
[]GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[2].Entity.EntityId, + Info: fakeGPUInstances[2].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[3].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[3].Info, + }, + }, + ProfileName: "instance_profile_2", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + /* + // TODO (roarora): Today, Specifying Major range does not remove extra GPUs + { + name: "GPU Count 2 with Hierarchy and device options with extra GPU discovery", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0}, + MinorRange: []int{1, 2}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: 
fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + // TODO (roarora): Today, Specifying Minor range does not remove extra GPU Instances + { + name: "GPU Count 2 with Hierarchy and device options with extra GPU Instance discovery", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1}, + MinorRange: []int{1, 3}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + 
mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + }, + MigEnabled: true, + }, + 1: { + DeviceInfo: fakeDevices[1], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[2].Entity.EntityId, + Info: fakeGPUInstances[2].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[3].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[3].Info, + }, + }, + ProfileName: "instance_profile_2", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + */ + { + name: "GPU Count 2 with Hierarchy and device options Major -1", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{1, 2, 3}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + 
mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + 1: { + DeviceInfo: fakeDevices[1], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[2].Entity.EntityId, + Info: fakeGPUInstances[2].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[3].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[3].Info, + }, + }, + ProfileName: "instance_profile_2", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + { + name: "GPU Count 2 with Hierarchy 
and device options Major -1 and Minor -1", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + expectedOutput: map[uint]GPUInfo{ + 0: { + DeviceInfo: fakeDevices[0], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[0].Entity.EntityId, + Info: fakeGPUInstances[0].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[0].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[0].Info, + }, + { + EntityId: fakeGPUComputeInstances[1].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[1].Info, + }, + }, + ProfileName: "instance_profile_0", + }, + { + EntityId: fakeGPUInstances[1].Entity.EntityId, + Info: fakeGPUInstances[1].Info, + ComputeInstances: 
[]ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[2].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[2].Info, + }, + }, + ProfileName: "instance_profile_1", + }, + }, + MigEnabled: true, + }, + 1: { + DeviceInfo: fakeDevices[1], + GPUInstances: []GPUInstanceInfo{ + { + EntityId: fakeGPUInstances[2].Entity.EntityId, + Info: fakeGPUInstances[2].Info, + ComputeInstances: []ComputeInstanceInfo{ + { + EntityId: fakeGPUComputeInstances[3].Entity.EntityId, + InstanceInfo: fakeGPUComputeInstances[3].Info, + }, + }, + ProfileName: "instance_profile_2", + }, + }, + MigEnabled: true, + }, + }, + wantErr: false, + }, + { + name: "GPU Count 2 with Hierarchy and missing GPU", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1, 2}, + MinorRange: []int{1, 2, 3}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + 
mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + wantErr: true, + }, + { + name: "GPU Count 2 with Hierarchy and missing GPU Instances", + gOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1}, + MinorRange: []int{1, 2, 3, 4}, + }, + mockCalls: func() { + mockEntitiesInput := []dcgm.GroupEntityPair{ + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[0].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[1].Entity.EntityId}, + {EntityGroupId: dcgm.FE_GPU_I, EntityId: fakeGPUInstances[2].Entity.EntityId}, + } + + mockEntitiesResult := []dcgm.FieldValue_v2{ + {EntityId: mockEntitiesInput[0].EntityId}, + {EntityId: mockEntitiesInput[1].EntityId}, + {EntityId: mockEntitiesInput[2].EntityId}, + } + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(len(fakeDevices)), nil) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(fakeMigHierarchy, nil) + mockDCGMProvider.EXPECT().EntitiesGetLatestValues(mockEntitiesInput, gomock.Any(), + gomock.Any()).Return(mockEntitiesResult, nil) + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[0]).Return("instance_profile_0") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[1]).Return("instance_profile_1") + mockDCGMProvider.EXPECT().Fv2_String(mockEntitiesResult[2]).Return("instance_profile_2") + + for i := 0; i < len(fakeDevices); i++ { + mockDCGMProvider.EXPECT().GetDeviceInfo(uint(i)).Return(fakeDevices[i], nil) + } + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.mockCalls() + + if !tt.wantErr { + deviceInfo := Info{} + err := deviceInfo.initializeGPUInfo(tt.gOpts, false) + assert.NoError(t, err, "Error not expected") + assert.Equal(t, len(tt.expectedOutput), int(deviceInfo.gpuCount), "GPU length mismatch") + + for i := 0; i < int(deviceInfo.gpuCount); i++ { + actualGPU := deviceInfo.gpus[i] + expectedGPU := tt.expectedOutput[actualGPU.DeviceInfo.GPU] + 
+ assert.Equal(t, expectedGPU.DeviceInfo, actualGPU.DeviceInfo, + "GPU device info mismatch") + + assert.Equal(t, expectedGPU.MigEnabled, actualGPU.MigEnabled, + "MIG info mismatch") + + assert.Equal(t, len(expectedGPU.GPUInstances), len(actualGPU.GPUInstances), + "GPU Instances length mismatch") + + // Ensure each GPU Instance and Computer matches + for _, expectedInstance := range expectedGPU.GPUInstances { + instanceExist := slices.ContainsFunc(actualGPU.GPUInstances, + func(actualInstance GPUInstanceInfo) bool { + return expectedInstance.Info == actualInstance.Info && + expectedInstance.EntityId == actualInstance.EntityId && + slices.Equal(expectedInstance.ComputeInstances, actualInstance.ComputeInstances) + }) + + assert.True(t, instanceExist, "Expected instance %+v not found", expectedInstance) + } + } + } else { + deviceInfo := Info{} + err := deviceInfo.initializeGPUInfo(tt.gOpts, false) + assert.Error(t, err, "Error expected") + } + }) + } +} + +func TestInitializeCPUInfo(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGMProvider) + + tests := []struct { + name string + cOpts appconfig.DeviceOptions + mockCalls func() + expectedCPUCoreOutput map[uint][]int + wantErr bool + }{ + { + name: "CPU Hierarchy with 0 CPUs", + cOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 0, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + wantErr: true, + }, + { + name: "CPU Hierarchy with 1 CPU", + cOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 1, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{1, 2, 8}, + }, + }, + } + 
mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {0, 65, 131}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with 1 CPUs but GetCpuHierarchy error", + cOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 1, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{1, 2, 8}, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, fmt.Errorf("some error")) + }, + wantErr: true, + }, + { + name: "CPU Hierarchy with 2 CPUs", + cOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 2, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{1, 2, 8}, + }, + { + CpuId: 1, + OwnedCores: []uint64{8, 16, 32}, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {0, 65, 131}, 1: {3, 68, 133}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with multiple CPUs and device options", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1, 2, 3, 4}, + MinorRange: []int{1, 2, 4, 8, 16, 32, 64, 128, 256}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, + OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1, 0x0}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {1, 2, 4}, 1: {8, 16, 32}, 2: {64, 128}, 3: {256}, 4: {}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with multiple CPUs and 
device options with extra CPU discovery", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1, 2}, + MinorRange: []int{1, 2, 4, 8, 16, 32, 64, 128}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, + OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {1, 2, 4}, 1: {8, 16, 32}, 2: {64, 128}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with multiple CPUs and device options with extra CPU core discovery", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1, 2}, + MinorRange: []int{1, 2, 4, 8, 16, 32, 64}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, + OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1, 0x1}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {1, 2, 4}, 1: {8, 16, 32}, 2: {64}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with multiple CPUs and device options Major -1", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{1, 2, 4, 8, 16, 32, 64, 128, 256}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, 
+ OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1, 0x0}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {1, 2, 4}, 1: {8, 16, 32}, 2: {64, 128}, 3: {256}, 4: {}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with multiple CPUs and device options Major -1 and Minor -1", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, + OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1, 0x0}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + expectedCPUCoreOutput: map[uint][]int{0: {1, 2, 4}, 1: {8, 16, 32}, 2: {64, 128}, 3: {256}, 4: {}}, + wantErr: false, + }, + { + name: "CPU Hierarchy with multiple CPUs and missing CPU", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1, 2, 3, 4, 5}, + MinorRange: []int{1, 2, 4, 8, 16, 32, 64, 128, 256}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, + OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1, 0x0}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + wantErr: true, + }, + { + name: "CPU Hierarchy with multiple CPUs and 
missing CPU cores", + cOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0, 1, 2, 3, 4}, + MinorRange: []int{1, 2, 4, 8, 16, 32, 64, 128, 256, 1024}, + }, + mockCalls: func() { + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + NumCpus: 5, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0b10110}, + }, + { + CpuId: 1, + OwnedCores: []uint64{0x100010100}, + }, + { + CpuId: 2, + OwnedCores: []uint64{0x0, 0x1, 0x1, 0x0}, + }, + { + CpuId: 3, + OwnedCores: []uint64{0x0, 0x0, 0x0, 0x0, 0x1}, + }, + { + CpuId: 4, + }, + }, + } + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil) + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.mockCalls() + + if !tt.wantErr { + + deviceInfo := Info{} + err := deviceInfo.initializeCPUInfo(tt.cOpts) + assert.NoError(t, err, "Error not expected") + assert.Equal(t, len(tt.expectedCPUCoreOutput), len(deviceInfo.cpus), "CPU length mismatch") + + for _, cpu := range deviceInfo.cpus { + assert.Equal(t, len(tt.expectedCPUCoreOutput[cpu.EntityId]), len(cpu.Cores), "Core length mismatch") + + for _, core := range cpu.Cores { + assert.True(t, slices.Contains(tt.expectedCPUCoreOutput[cpu.EntityId], int(core)), + "Core mismatch") + } + } + } else { + deviceInfo := Info{} + err := deviceInfo.initializeCPUInfo(tt.cOpts) + assert.Error(t, err, "Error expected") + } + }) + } +} + +func TestInitializeNvSwitchInfo(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGMProvider) + + tests := []struct { + name string + sOpts appconfig.DeviceOptions + switchOutput []uint + linkStatusOutput []dcgm.NvLinkStatus + mockCalls func([]uint, []dcgm.NvLinkStatus) + expectedSwitchToLinkMap map[uint][]uint + wantErr bool + }{ + { + name: "Zero Switches", + sOpts: 
appconfig.DeviceOptions{ + Flex: true, + }, + switchOutput: []uint{}, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + }, + wantErr: true, + }, + { + name: "Single switch Single Link", + sOpts: appconfig.DeviceOptions{ + Flex: true, + }, + switchOutput: []uint{1}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1}}, + wantErr: false, + }, + { + name: "Single switch Multiple Links", + sOpts: appconfig.DeviceOptions{ + Flex: true, + }, + switchOutput: []uint{1}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2, 3}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links", + sOpts: appconfig.DeviceOptions{ + Flex: true, + }, + switchOutput: []uint{1, 2, 3}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput 
[]dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2}, 2: {3}, 3: {}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links with device options", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1, 2, 3, 4, 5}, + MinorRange: []int{1, 2, 3, 4, 5, 6, 7, 8, 9}, + }, + switchOutput: []uint{1, 2, 3, 4, 5}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2, 3}, 2: {4, 5, 6}, 3: {7, 8}, 4: {9}, 5: {}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links with device options with extra Switch discovery", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1, 2, 3}, + MinorRange: []int{1, 2, 3, 4, 5, 6, 7, 8}, + }, + switchOutput: []uint{1, 2, 3, 4, 5}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: 
uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2, 3}, 2: {4, 5, 6}, 3: {7, 8}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links with device options with extra Link discovery", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1, 2, 3}, + MinorRange: []int{1, 2, 3, 4, 5, 6, 7}, + }, + switchOutput: []uint{1, 2, 3, 4}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, 
nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2, 3}, 2: {4, 5, 6}, 3: {7}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links and device options Major -1", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{1, 2, 3, 4, 5, 6, 7, 8, 9}, + }, + switchOutput: []uint{1, 2, 3, 4, 5}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2, 3}, 2: {4, 5, 6}, 3: {7, 8}, 4: {9}, 5: {}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links and device options Major empty", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{}, + MinorRange: []int{-1}, + }, + switchOutput: []uint{1, 2, 3, 4, 5}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), 
ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links and device options Major -1 and Minor -1", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + switchOutput: []uint{1, 2, 3, 4, 5}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + expectedSwitchToLinkMap: map[uint][]uint{1: {1, 2, 3}, 2: {4, 5, 6}, 3: {7, 8}, 4: {9}, 5: {}}, + wantErr: false, + }, + { + name: "Multiple switch Multiple Links with missing switches", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1, 2, 3, 4, 5, 6}, + 
MinorRange: []int{1, 2, 3, 4, 5, 6, 7, 8, 9}, + }, + switchOutput: []uint{1, 2, 3, 4}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + wantErr: true, + }, + { + name: "Multiple switch Multiple Links with missing links", + sOpts: appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1, 2, 3, 4}, + MinorRange: []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, + }, + switchOutput: []uint{1, 2, 3, 4}, + linkStatusOutput: []dcgm.NvLinkStatus{ + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(1)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(2)}, + {ParentId: uint(1), ParentType: dcgm.FE_SWITCH, Index: uint(3)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(4)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(5)}, + {ParentId: uint(2), ParentType: dcgm.FE_SWITCH, Index: uint(6)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(7)}, + {ParentId: uint(3), ParentType: dcgm.FE_SWITCH, Index: uint(8)}, + {ParentId: uint(4), ParentType: dcgm.FE_SWITCH, Index: uint(9)}, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + 
mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, nil) + }, + wantErr: true, + }, + { + name: "Error GetEntityGroupEntities Response", + sOpts: appconfig.DeviceOptions{ + Flex: true, + }, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, fmt.Errorf("some error")) + }, + wantErr: true, + }, + { + name: "Error GetNvLinkLinkStatus Response", + sOpts: appconfig.DeviceOptions{ + Flex: true, + }, + switchOutput: []uint{1}, + mockCalls: func(switchOutput []uint, linkStatusOutput []dcgm.NvLinkStatus) { + mockDCGMProvider.EXPECT().GetEntityGroupEntities(gomock.Any()).Return( + switchOutput, nil) + mockDCGMProvider.EXPECT().GetNvLinkLinkStatus().Return(linkStatusOutput, fmt.Errorf("some error")) + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.mockCalls(tt.switchOutput, tt.linkStatusOutput) + + if !tt.wantErr { + deviceInfo := Info{} + err := deviceInfo.initializeNvSwitchInfo(tt.sOpts) + assert.NoError(t, err, "Error not expected") + assert.Equal(t, len(tt.expectedSwitchToLinkMap), len(deviceInfo.switches), "Switch length mismatch") + + for _, swInfo := range deviceInfo.switches { + assert.Equal(t, len(tt.expectedSwitchToLinkMap[swInfo.EntityId]), len(swInfo.NvLinks), + "NV Link length mismatch") + + for _, nvLink := range swInfo.NvLinks { + assert.True(t, slices.Contains(tt.expectedSwitchToLinkMap[swInfo.EntityId], nvLink.Index), + "NV Link Index mismatch") + } + } + } else { + deviceInfo := Info{} + err := deviceInfo.initializeNvSwitchInfo(tt.sOpts) + assert.Error(t, err, "Error expected") + } + }) + } +} + +func TestVerifyDevicePresence(t *testing.T) { + deviceInfo := SpoofGPUDeviceInfo() + deviceInfo.gOpt.Flex = true + err := deviceInfo.verifyDevicePresence() + require.Equal(t, err, 
nil, "Expected to have no error, but found %s", err) + + deviceInfo.gOpt.Flex = false + deviceInfo.gOpt.MajorRange = append(deviceInfo.gOpt.MajorRange, -1) + deviceInfo.gOpt.MinorRange = append(deviceInfo.gOpt.MinorRange, -1) + err = deviceInfo.verifyDevicePresence() + require.Equal(t, err, nil, "Expected to have no error, but found %s", err) + + deviceInfo.gOpt.MinorRange[0] = 10 // this GPU instance doesn't exist + err = deviceInfo.verifyDevicePresence() + require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU instance, but none found") + + deviceInfo.gOpt.MajorRange[0] = 10 // this GPU doesn't exist + deviceInfo.gOpt.MinorRange[0] = -1 + err = deviceInfo.verifyDevicePresence() + require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU, but none found") + + // Add gpus and instances that exist + deviceInfo.gOpt.MajorRange[0] = 0 + deviceInfo.gOpt.MajorRange = append(deviceInfo.gOpt.MajorRange, 1) + deviceInfo.gOpt.MinorRange[0] = 0 + deviceInfo.gOpt.MinorRange = append(deviceInfo.gOpt.MinorRange, 14) + err = deviceInfo.verifyDevicePresence() + require.Equal(t, err, nil, "Expected to have no error, but found %s", err) +} + +func TestIsSwitchWatched(t *testing.T) { + tests := []struct { + name string + switchID uint + deviceInfo Info + want bool + }{ + { + name: "Monitor all devices", + switchID: 1, + deviceInfo: Info{ + sOpt: appconfig.DeviceOptions{ + Flex: true, + }, + }, + want: true, + }, + { + name: "MajorRange empty", + switchID: 2, + deviceInfo: Info{ + sOpt: appconfig.DeviceOptions{ + MajorRange: []int{}, + }, + }, + want: false, + }, + { + name: "MajorRange contains -1 to watch all devices", + switchID: 3, + deviceInfo: Info{ + sOpt: appconfig.DeviceOptions{ + MajorRange: []int{-1}, + }, + }, + want: true, + }, + { + name: "SwitchID in MajorRange", + switchID: 4, + deviceInfo: Info{ + sOpt: appconfig.DeviceOptions{ + MajorRange: []int{3, 4, 5}, + }, + }, + want: true, + }, + { + name: "SwitchID not in 
MajorRange", + switchID: 5, + deviceInfo: Info{ + sOpt: appconfig.DeviceOptions{ + MajorRange: []int{3, 4, 6}, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.deviceInfo.IsSwitchWatched(tt.switchID) + assert.Equal(t, tt.want, got) + }) + } +}
+
+// TestIsLinkWatched checks Info.IsLinkWatched for a link index and switch ID under
+// different switch device options (sOpt) and switch inventories.
+func TestIsLinkWatched(t *testing.T) {
+	// singleLinkSwitch returns an inventory of one switch (entity ID 1) that exposes
+	// exactly one NvLink with the given index.
+	singleLinkSwitch := func(linkIndex uint) []SwitchInfo {
+		return []SwitchInfo{
+			{
+				EntityId: 1,
+				NvLinks:  []dcgm.NvLinkStatus{{Index: linkIndex}},
+			},
+		}
+	}
+
+	cases := []struct {
+		name       string
+		linkIndex  uint
+		switchID   uint
+		deviceInfo Info
+		want       bool
+	}{
+		{
+			name:       "Monitor all devices",
+			linkIndex:  1,
+			deviceInfo: Info{sOpt: appconfig.DeviceOptions{Flex: true}},
+			want:       true,
+		},
+		{
+			name:       "No watched devices",
+			linkIndex:  1,
+			deviceInfo: Info{},
+			want:       false,
+		},
+		{
+			// switchID is left at its zero value here, as in the other cases that
+			// do not set it.
+			name:      "Watched link with empty MinorRange",
+			linkIndex: 2,
+			deviceInfo: Info{
+				sOpt:     appconfig.DeviceOptions{MajorRange: []int{-1}},
+				switches: singleLinkSwitch(2),
+			},
+			want: false,
+		},
+		{
+			name:      "MinorRange contains -1 to watch all links",
+			switchID:  1,
+			linkIndex: 3,
+			deviceInfo: Info{
+				sOpt: appconfig.DeviceOptions{
+					MajorRange: []int{-1},
+					MinorRange: []int{-1},
+				},
+				switches: singleLinkSwitch(3),
+			},
+			want: true,
+		},
+		{
+			name:      "The link not in the watched switch",
+			switchID:  1,
+			linkIndex: 4,
+			deviceInfo: Info{
+				sOpt: appconfig.DeviceOptions{
+					MajorRange: []int{-1},
+					MinorRange: []int{1, 2, 3},
+				},
+				switches: singleLinkSwitch(4),
+			},
+			want: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			assert.Equal(t, tc.want, tc.deviceInfo.IsLinkWatched(tc.linkIndex, tc.switchID))
+		})
+	}
+}
+
+func TestIsCPUWatched(t *testing.T) { + tests := []struct { + name string + cpuID uint + deviceInfo Info + want bool + }{ + { + name: "Monitor all devices", + cpuID: 1, + deviceInfo: Info{ + cOpt:
appconfig.DeviceOptions{Flex: true}, + cpus: []CPUInfo{ + { + EntityId: 1, + }, + }, + }, + want: true, + }, + { + name: "MajorRange Contains -1", + cpuID: 2, + deviceInfo: Info{ + cOpt: appconfig.DeviceOptions{MajorRange: []int{-1}}, + cpus: []CPUInfo{ + { + EntityId: 2, + }, + }, + }, + want: true, + }, + { + name: "CPU ID in MajorRange", + cpuID: 3, + deviceInfo: Info{ + cOpt: appconfig.DeviceOptions{MajorRange: []int{1, 2, 3}}, + cpus: []CPUInfo{ + { + EntityId: 3, + }, + }, + }, + want: true, + }, + { + name: "CPU ID Not in MajorRange", + cpuID: 4, + deviceInfo: Info{ + cOpt: appconfig.DeviceOptions{MajorRange: []int{1, 2, 3}}, + cpus: []CPUInfo{ + { + EntityId: 4, + }, + }, + }, + want: false, + }, + { + name: "MajorRange Empty", + cpuID: 5, + deviceInfo: Info{ + cOpt: appconfig.DeviceOptions{MajorRange: []int{}}, + cpus: []CPUInfo{ + { + EntityId: 5, + }, + }, + }, + want: false, + }, + { + name: "CPU not found", + cpuID: 6, + deviceInfo: Info{ + cOpt: appconfig.DeviceOptions{MajorRange: []int{}}, + cpus: []CPUInfo{ + { + EntityId: 5, + }, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, tt.deviceInfo.IsCPUWatched(tt.cpuID)) + }) + } +}
+
+// TestIsCoreWatched checks Info.IsCoreWatched for a core ID on a given CPU under different
+// CPU device options (cOpt).
+func TestIsCoreWatched(t *testing.T) {
+	// singleCPUInfo returns an Info that holds one CPU (entity ID 1), sets MajorRange
+	// to -1 and uses the given values as MinorRange. A fresh value is built per call
+	// so no case shares slices with another.
+	singleCPUInfo := func(minorRange ...int) Info {
+		return Info{
+			cOpt: appconfig.DeviceOptions{
+				MinorRange: minorRange,
+				MajorRange: []int{-1},
+			},
+			cpus: []CPUInfo{{EntityId: 1}},
+		}
+	}
+
+	cases := []struct {
+		name       string
+		coreID     uint
+		cpuID      uint
+		deviceInfo Info
+		want       bool
+	}{
+		{
+			name:       "Monitor all devices",
+			coreID:     1,
+			cpuID:      1,
+			deviceInfo: Info{cOpt: appconfig.DeviceOptions{Flex: true}},
+			want:       true,
+		},
+		{
+			name:       "Core in MinorRange",
+			coreID:     2,
+			cpuID:      1,
+			deviceInfo: singleCPUInfo(1, 2, 3),
+			want:       true,
+		},
+		{
+			name:       "Core Not in MinorRange",
+			coreID:     4,
+			cpuID:      1,
+			deviceInfo: singleCPUInfo(1, 2, 3),
+			want:       false,
+		},
+		{
+			name:       "MinorRange Contains -1",
+			coreID:     5,
+			cpuID:      1,
+			deviceInfo: singleCPUInfo(-1),
+			want:       true,
+		},
+		{
+			// CPU 2 is not part of the inventory, which only holds entity ID 1.
+			name:       "CPU Not Found",
+			coreID:     1,
+			cpuID:      2,
+			deviceInfo: singleCPUInfo(1, 2, 3),
+			want:       false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.deviceInfo.IsCoreWatched(tc.coreID, tc.cpuID)
+			assert.Equal(t, tc.want, got)
+		})
+	}
+}
+
+func TestSetMigProfileNames(t *testing.T) { + config := &appconfig.Config{ + UseRemoteHE: false, + } + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + tests := []struct { + name string + deviceInfo Info + values []dcgm.FieldValue_v2 + valid bool + }{ + { + name: "MIG profile found", + deviceInfo: Info{ + gpuCount: 1, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 1}, + }, + }, + }, + }, + values: []dcgm.FieldValue_v2{ + { + EntityId: 1, + FieldType: dcgm.DCGM_FT_STRING, + StringValue: &fakeProfileName, + }, + }, + valid: true, + }, + { + name: "Multiple MIG gpus", + deviceInfo: Info{ + gpuCount: 3, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 1}, + }, + }, + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 2}, + }, + }, + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 3}, + }, + }, + }, + }, + values: []dcgm.FieldValue_v2{ + { + EntityId: 2, + FieldType: dcgm.DCGM_FT_STRING, + StringValue: &fakeProfileName, + }, + }, + valid: true, + }, + { + name: "Multiple MIG gpus and Values", + deviceInfo: Info{ + gpuCount: 3, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 1}, + }, + }, + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 2}, + }, + }, + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 3}, +
}, + }, + }, + }, + values: []dcgm.FieldValue_v2{ + { + EntityId: 2, + FieldType: dcgm.DCGM_FT_STRING, + StringValue: &fakeProfileName, + }, + { + EntityId: 3, + FieldType: dcgm.DCGM_FT_STRING, + StringValue: &fakeProfileName, + }, + }, + valid: true, + }, + { + name: "MIG profile not found", + deviceInfo: Info{ + gpuCount: 1, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 1}, + }, + }, + }, + }, + values: []dcgm.FieldValue_v2{ + { + EntityId: 2, + FieldType: dcgm.DCGM_FT_STRING, + StringValue: &fakeProfileName, + }, + }, + valid: false, + }, + { + name: "MIG profile not string type", + deviceInfo: Info{ + gpuCount: 1, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + GPUInstances: []GPUInstanceInfo{ + {EntityId: 1}, + }, + }, + }, + }, + values: []dcgm.FieldValue_v2{ + { + EntityId: 1, + FieldType: dcgm.DCGM_FT_BINARY, + StringValue: &fakeProfileName, + Value: [4096]byte{'1', '2', '3'}, + }, + }, + valid: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.valid { + assert.NoError(t, tt.deviceInfo.setMigProfileNames(tt.values), "Expected no error.") + } else { + assert.Error(t, tt.deviceInfo.setMigProfileNames(tt.values), "Expected an error.") + } + }) + } +} + +func Test_getCoreArray(t *testing.T) { + tests := []struct { + name string + bitmask []uint64 + want []uint + }{ + { + name: "Empty bitmask", + bitmask: []uint64{}, + want: []uint{}, + }, + { + name: "Single value - single core", + bitmask: []uint64{1}, + want: []uint{0}, + }, + { + name: "Multiple values - multiple cores", + bitmask: []uint64{1, 2, 8}, + want: []uint{0, 65, 131}, + }, + { + name: "Single uint64 value - multiple cores", + bitmask: []uint64{0b1101}, + want: []uint{0, 2, 3}, + }, + { + name: "Multiple uint64 values - multiple cores", + bitmask: []uint64{0b1101, 0b0111}, + want: []uint{0, 2, 3, 64, 65, 66}, + }, + { + name: "Large bitmask", + bitmask: []uint64{0b1101, 0b1010, 0b1111000011110000}, + want: []uint{0, 
2, 3, 65, 67, 132, 133, 134, 135, 140, 141, 142, 143}, + }, + { + name: "Overflow uint64 values", + bitmask: []uint64{ + 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, + 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, 0b0001, + }, + want: []uint{0, 64, 128, 192, 256, 320, 384, 548, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if len(tt.bitmask) > 16 { + assert.Panics(t, func() { getCoreArray(tt.bitmask) }, "Expected getCoreArray to panic") + } else { + result := getCoreArray(tt.bitmask) + assert.True(t, slices.Equal(tt.want, result), "getCoreArray results not equal", tt.want, result) + } + }) + } +} + +func TestGetGPUInstanceIdentifier(t *testing.T) { + fakeDevices := SpoofGPUDevices() + gpuInstanceID := 3 + + type args struct { + deviceInfo Provider + gpuuuid string + gpuInstanceID uint + } + tests := []struct { + name string + args args + expectedOutput string + }{ + { + name: "GPU UUID found", + args: args{ + deviceInfo: &Info{ + gpuCount: 2, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + DeviceInfo: fakeDevices[0], + }, + { + DeviceInfo: fakeDevices[1], + }, + }, + }, + gpuuuid: fakeDevices[1].UUID, + gpuInstanceID: uint(gpuInstanceID), + }, + expectedOutput: fmt.Sprintf("%d-%d", fakeDevices[1].GPU, gpuInstanceID), + }, + { + name: "GPU UUID not found", + args: args{ + deviceInfo: &Info{ + gpuCount: 2, + gpus: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + DeviceInfo: fakeDevices[0], + }, + { + DeviceInfo: fakeDevices[1], + }, + }, + }, + gpuuuid: "random", + }, + expectedOutput: "", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.expectedOutput, GetGPUInstanceIdentifier(tt.args.deviceInfo, tt.args.gpuuuid, + tt.args.gpuInstanceID), "GPU Instance Identifier mismatch") + }) + } +} diff --git a/internal/pkg/deviceinfo/testutils.go b/internal/pkg/deviceinfo/testutils.go new file mode 100644 index 
00000000..4dad7540 --- /dev/null +++ b/internal/pkg/deviceinfo/testutils.go @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package deviceinfo + +import ( + "math" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" +) + +func SpoofGPUDevices() []dcgm.Device { + sampleDevices := []dcgm.Device{ + { + GPU: 0, + UUID: "000000000000", + Identifiers: dcgm.DeviceIdentifiers{ + Model: "NVIDIA T400 4GB", + }, + }, + { + GPU: 1, + UUID: "11111111111", + Identifiers: dcgm.DeviceIdentifiers{ + Model: "NVIDIA A100 40GB", + }, + }, + } + + return sampleDevices +} + +func SpoofMigHierarchy() (dcgm.MigHierarchy_v2, []dcgm.MigHierarchyInfo_v2, []dcgm.MigHierarchyInfo_v2, + []dcgm.MigHierarchyInfo_v2, +) { + sampleMigHierarchy := dcgm.MigHierarchy_v2{ + Version: 2, + Count: 9, + } + + // First GPU + sampleGPU1 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_NONE, EntityId: math.MaxUint}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU1", + NvmlGpuIndex: 0, + NvmlInstanceId: math.MaxUint, + NvmlComputeInstanceId: math.MaxUint, + NvmlMigProfileId: math.MaxUint, + NvmlProfileSlices: 0, + }, + } + + // Second GPU + sampleGPU2 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 1}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_NONE, 
EntityId: math.MaxUint}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU2", + NvmlGpuIndex: 1, + NvmlInstanceId: math.MaxUint, + NvmlComputeInstanceId: math.MaxUint, + NvmlMigProfileId: math.MaxUint, + NvmlProfileSlices: 0, + }, + } + + // First GPU Instance in GPU1 + sampleGPU1Instance1 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 1}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU1_I1", + NvmlGpuIndex: 0, + NvmlInstanceId: 0, + NvmlComputeInstanceId: math.MaxUint, + NvmlMigProfileId: 1, + NvmlProfileSlices: 4, + }, + } + + // Second GPU Instance in GPU1 + sampleGPU1Instance2 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 2}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 0}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU1_I2", + NvmlGpuIndex: 0, + NvmlInstanceId: 1, + NvmlComputeInstanceId: math.MaxUint, + NvmlMigProfileId: 2, + NvmlProfileSlices: 2, + }, + } + + // First Compute Instance in the First GPU Instance in GPU1 + sampleGPU1Instance1CI1 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_CI, EntityId: 1}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 1}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU1_I1_CI1", + NvmlGpuIndex: 0, + NvmlInstanceId: 0, + NvmlComputeInstanceId: 0, + NvmlMigProfileId: 3, + NvmlProfileSlices: 1, + }, + } + + // Second Compute Instance in the First GPU Instance in GPU1 + sampleGPU1Instance1CI2 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_CI, EntityId: 2}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 1}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU1_I1_CI2", + NvmlGpuIndex: 0, + NvmlInstanceId: 0, + NvmlComputeInstanceId: 1, + NvmlMigProfileId: 4, + NvmlProfileSlices: 1, + }, + } + + // First 
Compute Instance in the Second GPU Instance in GPU1 + sampleGPU1Instance2CI1 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_CI, EntityId: 3}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 2}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU1_I2_CI1", + NvmlGpuIndex: 0, + NvmlInstanceId: 1, + NvmlComputeInstanceId: 2, + NvmlMigProfileId: 5, + NvmlProfileSlices: 1, + }, + } + + // First GPU Instance in GPU2 + sampleGPU2Instance1 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 3}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: 1}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU2_I1", + NvmlGpuIndex: 1, + NvmlInstanceId: 0, + NvmlComputeInstanceId: math.MaxUint, + NvmlMigProfileId: 6, + NvmlProfileSlices: 4, + }, + } + + // First Compute Instance in the First GPU Instance in GPU2 + sampleGPU2Instance1CI1 := dcgm.MigHierarchyInfo_v2{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_CI, EntityId: 4}, + Parent: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 3}, + Info: dcgm.MigEntityInfo{ + GpuUuid: "FAKE_GPU2_I1_CI1", + NvmlGpuIndex: 1, + NvmlInstanceId: 0, + NvmlComputeInstanceId: 0, + NvmlMigProfileId: 7, + NvmlProfileSlices: 1, + }, + } + + sampleMigHierarchy.EntityList[0] = sampleGPU1 + sampleMigHierarchy.EntityList[1] = sampleGPU1Instance1 + sampleMigHierarchy.EntityList[2] = sampleGPU1Instance1CI1 + sampleMigHierarchy.EntityList[3] = sampleGPU1Instance1CI2 + sampleMigHierarchy.EntityList[4] = sampleGPU1Instance2 + sampleMigHierarchy.EntityList[5] = sampleGPU1Instance2CI1 + sampleMigHierarchy.EntityList[6] = sampleGPU2 + sampleMigHierarchy.EntityList[7] = sampleGPU2Instance1 + sampleMigHierarchy.EntityList[8] = sampleGPU2Instance1CI1 + + return sampleMigHierarchy, []dcgm.MigHierarchyInfo_v2{sampleGPU1, sampleGPU2}, + []dcgm.MigHierarchyInfo_v2{sampleGPU1Instance1, sampleGPU1Instance2, sampleGPU2Instance1}, 
+ []dcgm.MigHierarchyInfo_v2{ + sampleGPU1Instance1CI1, sampleGPU1Instance1CI2, sampleGPU1Instance2CI1, + sampleGPU2Instance1CI1, + } +} diff --git a/internal/pkg/deviceinfo/types.go b/internal/pkg/deviceinfo/types.go new file mode 100644 index 00000000..af52d27e --- /dev/null +++ b/internal/pkg/deviceinfo/types.go @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/deviceinfo/mock_device_info.go -package=deviceinfo -copyright_file=../../../hack/header.txt . 
Provider + +package deviceinfo + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" +) + +type Provider interface { + GPUCount() uint + GPUs() []GPUInfo + GPU(i uint) GPUInfo + Switches() []SwitchInfo + Switch(i uint) SwitchInfo + CPUs() []CPUInfo + CPU(i uint) CPUInfo + GOpts() appconfig.DeviceOptions + SOpts() appconfig.DeviceOptions + COpts() appconfig.DeviceOptions + InfoType() dcgm.Field_Entity_Group + IsCPUWatched(cpuID uint) bool + IsCoreWatched(coreID uint, cpuID uint) bool + IsSwitchWatched(switchID uint) bool + IsLinkWatched(linkIndex uint, switchID uint) bool +} + +type GPUInfo struct { + DeviceInfo dcgm.Device + GPUInstances []GPUInstanceInfo + MigEnabled bool +} + +type GPUInstanceInfo struct { + Info dcgm.MigEntityInfo + ProfileName string + EntityId uint + ComputeInstances []ComputeInstanceInfo +} + +type ComputeInstanceInfo struct { + InstanceInfo dcgm.MigEntityInfo + ProfileName string + EntityId uint +} + +type CPUInfo struct { + EntityId uint + Cores []uint +} + +type SwitchInfo struct { + EntityId uint + NvLinks []dcgm.NvLinkStatus +} diff --git a/internal/pkg/devicemonitoring/const.go b/internal/pkg/devicemonitoring/const.go new file mode 100644 index 00000000..847ec613 --- /dev/null +++ b/internal/pkg/devicemonitoring/const.go @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package devicemonitoring + +const ( + PARENT_ID_IGNORED = 0 +) diff --git a/internal/pkg/devicemonitoring/device_monitoring.go b/internal/pkg/devicemonitoring/device_monitoring.go new file mode 100644 index 00000000..f062d404 --- /dev/null +++ b/internal/pkg/devicemonitoring/device_monitoring.go @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devicemonitoring + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +func GetMonitoredEntities(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + switch deviceInfo.InfoType() { + case dcgm.FE_SWITCH: + monitoring = monitorAllSwitches(deviceInfo) + case dcgm.FE_LINK: + monitoring = monitorAllLinks(deviceInfo) + case dcgm.FE_CPU: + monitoring = monitorAllCPUs(deviceInfo) + case dcgm.FE_CPU_CORE: + monitoring = monitorAllCPUCores(deviceInfo) + default: + if deviceInfo.GOpts().Flex { + monitoring = monitorAllGPUInstances(deviceInfo, true) + } else { + monitoring = handleGPUOptions(deviceInfo) + } + } + + return monitoring +} + +func handleGPUOptions(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + // Current logic: + // if MajorRange -1, MinorRange -1: Monitor all GPUs and GPU Instances + // if MajorRange -1, MinorRange : Monitor all GPU and specific GPU Instances + // if MajorRange , MinorRange -1: Monitor 
specific GPU and all GPU Instances + // if MajorRange , MinorRange : Monitor specific GPUs and specific GPU Instances + if len(deviceInfo.GOpts().MajorRange) > 0 && deviceInfo.GOpts().MajorRange[0] == -1 { + monitoring = monitorAllGPUs(deviceInfo) + } else { + for _, gpuID := range deviceInfo.GOpts().MajorRange { + // We've already verified that everything in the options list exists + monitoring = append(monitoring, *monitorGPU(deviceInfo, gpuID)) + } + } + + if len(deviceInfo.GOpts().MinorRange) > 0 && deviceInfo.GOpts().MinorRange[0] == -1 { + monitoring = append(monitoring, monitorAllGPUInstances(deviceInfo, false)...) + } else { + for _, gpuInstanceID := range deviceInfo.GOpts().MinorRange { + // We've already verified that everything in the options list exists + monitoring = append(monitoring, *monitorGPUInstance(deviceInfo, gpuInstanceID)) + } + } + + return monitoring +} + +func monitorAllGPUs(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + for i := uint(0); i < deviceInfo.GPUCount(); i++ { + mi := Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: deviceInfo.GPU(i).DeviceInfo.GPU}, + deviceInfo.GPU(i).DeviceInfo, + nil, + PARENT_ID_IGNORED, + } + monitoring = append(monitoring, mi) + } + + return monitoring +} + +func monitorAllGPUInstances(deviceInfo deviceinfo.Provider, addFlexibly bool) []Info { + var monitoring []Info + + for i := uint(0); i < deviceInfo.GPUCount(); i++ { + // If the GPU Instance count is 0, addFlexibly allows adding GPU to the monitoring list. 
+ if addFlexibly && len(deviceInfo.GPU(i).GPUInstances) == 0 { + mi := Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: deviceInfo.GPU(i).DeviceInfo.GPU}, + deviceInfo.GPU(i).DeviceInfo, + nil, + PARENT_ID_IGNORED, + } + monitoring = append(monitoring, mi) + } else { + for j := 0; j < len(deviceInfo.GPU(i).GPUInstances); j++ { + mi := Info{ + dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: deviceInfo.GPU(i).GPUInstances[j].EntityId, + }, + deviceInfo.GPU(i).DeviceInfo, + &deviceInfo.GPU(i).GPUInstances[j], + PARENT_ID_IGNORED, + } + monitoring = append(monitoring, mi) + } + } + } + + return monitoring +} + +func monitorAllCPUs(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + for _, cpu := range deviceInfo.CPUs() { + if !deviceInfo.IsCPUWatched(cpu.EntityId) { + continue + } + + mi := Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: cpu.EntityId}, + dcgm.Device{}, + nil, + PARENT_ID_IGNORED, + } + monitoring = append(monitoring, mi) + } + + return monitoring +} + +func monitorAllCPUCores(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + for _, cpu := range deviceInfo.CPUs() { + if !deviceInfo.IsCPUWatched(cpu.EntityId) { + continue + } + + for _, core := range cpu.Cores { + if !deviceInfo.IsCoreWatched(core, cpu.EntityId) { + continue + } + + mi := Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: core}, + dcgm.Device{}, + nil, + cpu.EntityId, + } + monitoring = append(monitoring, mi) + } + } + + return monitoring +} + +func monitorAllSwitches(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + for _, sw := range deviceInfo.Switches() { + if !deviceInfo.IsSwitchWatched(sw.EntityId) { + continue + } + + mi := Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: sw.EntityId}, + dcgm.Device{}, + nil, + PARENT_ID_IGNORED, + } + monitoring = append(monitoring, mi) + } + + return monitoring +} + +func 
monitorAllLinks(deviceInfo deviceinfo.Provider) []Info { + var monitoring []Info + + for _, sw := range deviceInfo.Switches() { + if !deviceInfo.IsSwitchWatched(sw.EntityId) { + continue + } + + for _, link := range sw.NvLinks { + if link.State != dcgm.LS_UP { + continue + } + + if !deviceInfo.IsLinkWatched(link.Index, sw.EntityId) { + continue + } + + mi := Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: link.Index}, + dcgm.Device{}, + nil, + link.ParentId, + } + monitoring = append(monitoring, mi) + } + } + + return monitoring +} + +func monitorGPU(deviceInfo deviceinfo.Provider, gpuID int) *Info { + for i := uint(0); i < deviceInfo.GPUCount(); i++ { + if deviceInfo.GPU(i).DeviceInfo.GPU == uint(gpuID) { + return &Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: deviceInfo.GPU(i).DeviceInfo.GPU}, + deviceInfo.GPU(i).DeviceInfo, + nil, + PARENT_ID_IGNORED, + } + } + } + + return nil +} + +func monitorGPUInstance(deviceInfo deviceinfo.Provider, gpuInstanceID int) *Info { + for i := uint(0); i < deviceInfo.GPUCount(); i++ { + for _, instance := range deviceInfo.GPU(i).GPUInstances { + if instance.EntityId == uint(gpuInstanceID) { + return &Info{ + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: uint(gpuInstanceID)}, + deviceInfo.GPU(i).DeviceInfo, + &instance, + PARENT_ID_IGNORED, + } + } + } + } + + return nil +} diff --git a/internal/pkg/devicemonitoring/device_monitoring_test.go b/internal/pkg/devicemonitoring/device_monitoring_test.go new file mode 100644 index 00000000..91444243 --- /dev/null +++ b/internal/pkg/devicemonitoring/device_monitoring_test.go @@ -0,0 +1,1610 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devicemonitoring + +import ( + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +func TestGetMonitoredEntities(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + want []Info + }{ + { + name: "GPU Count 2, Flex = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, Flex = false, Major -1, Minor -1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = 
[]deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, Flex = false, Major -1, Minor 14", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{-1}, + MinorRange: []int{14}, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + 
mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, Flex = false, Major 1, Minor -1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1}, + MinorRange: []int{-1}, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: 
testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, Flex = false, Major 0, Minor 14", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{0}, + MinorRange: []int{14}, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, Flex = false, Minor -1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: false, + MinorRange: []int{-1}, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: 
dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, GPU Instance Count 1 each, Flex = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, GPU Instance Count 2 and 0, Flex = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{ + testutils.MockGPUInstanceInfo1, + testutils.MockGPUInstanceInfo2, + } + + ctrl := 
gomock.NewController(t) + + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "Switch Count 2, Watched 1", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: false, 1: true} + return testutils.MockSwitchDeviceInfo(ctrl, 2, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "Switch Count 5, Link Count 4, Switch Watched = true, Link Watched = true, link-up = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := 
map[testutils.WatchedEntityKey]bool{ + {0, 0}: true, + {0, 1}: true, + {1, 0}: true, + {1, 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + }, + }, + { + name: "CPU Count 3, watched = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: false, 1: true, 2: false} + return testutils.MockCPUDeviceInfo(ctrl, 3, nil, watchedCPUs, nil, dcgm.FE_CPU) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "CPU Count 2, Core Count 4, CPU Watched = true, Core Watched = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{0, 1} + cpuToCores[1] = []uint{0, 1} + + watchedCPUs := map[uint]bool{0: true, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {0, 0}: true, + {0, 1}: false, + {1, 0}: false, + {1, 1}: true, + } + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, dcgm.FE_CPU_CORE) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + }, + }, + } + for _, tt := range tests { + deviceInfo := tt.mockFunc() + t.Run(tt.name, 
func(t *testing.T) { + got := GetMonitoredEntities(deviceInfo) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func Test_monitorAllGPUs(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + want []Info + }{ + { + name: "GPU Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 0, nil) + }, + want: nil, + }, + { + name: "GPU Count 1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + deviceInfo := tt.mockFunc() + got := monitorAllGPUs(deviceInfo) + assert.Equalf(t, tt.want, got, 
"Unexpected Output") + }) + } +} + +func Test_monitorAllGPUInstances(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + addFlexibly bool + want []Info + }{ + { + name: "GPU Count 0, addFlexibly true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 0, nil) + }, + addFlexibly: true, + want: nil, + }, + { + name: "GPU Count 0, addFlexibly false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 0, nil) + }, + addFlexibly: false, + want: nil, + }, + { + name: "GPU Count 1, addFlexibly true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, nil) + }, + addFlexibly: true, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 1, addFlexibly false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, nil) + }, + addFlexibly: false, + want: nil, + }, + { + name: "GPU Count 2, addFlexibly true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, nil) + }, + addFlexibly: true, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, addFlexibly false", + mockFunc: func() 
*mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, nil) + }, + addFlexibly: false, + want: nil, + }, + { + name: "GPU Count 1, GPU Instance Count 1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + addFlexibly: true, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 1, GPU Instance Count 2", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{ + testutils.MockGPUInstanceInfo1, + testutils.MockGPUInstanceInfo2, + } + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + addFlexibly: true, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, GPU Instance Count 1 each", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = 
[]deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + }, + addFlexibly: true, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "GPU Count 2, GPU Instance Count 2 and 0, addFlexibly true", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{ + testutils.MockGPUInstanceInfo1, + testutils.MockGPUInstanceInfo2, + } + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + }, + addFlexibly: true, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: 
"GPU Count 2, GPU Instance Count 2 and 0, addFlexibly false", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{ + testutils.MockGPUInstanceInfo1, + testutils.MockGPUInstanceInfo2, + } + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + }, + addFlexibly: false, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + deviceInfo := tt.mockFunc() + got := monitorAllGPUInstances(deviceInfo, tt.addFlexibly) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func Test_monitorAllSwitches(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + want []Info + }{ + { + name: "Switch Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockSwitchDeviceInfo(ctrl, 0, nil, nil, nil, dcgm.FE_SWITCH) + }, + want: nil, + }, + { + name: "Switch Count 1, watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: true} + return testutils.MockSwitchDeviceInfo(ctrl, 1, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + 
ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "Switch Count 1, watched = false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: false} + return testutils.MockSwitchDeviceInfo(ctrl, 1, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: nil, + }, + { + name: "Switch Count 2, watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: true, 1: true} + return testutils.MockSwitchDeviceInfo(ctrl, 2, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "Switch Count 2, watched = false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: false, 1: false} + return testutils.MockSwitchDeviceInfo(ctrl, 2, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: nil, + }, + { + name: "Switch Count 3, watched = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: false, 1: true, 2: false} + return testutils.MockSwitchDeviceInfo(ctrl, 3, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + } + for _, tt := range tests { + deviceInfo := tt.mockFunc() + t.Run(tt.name, func(t *testing.T) { + got := monitorAllSwitches(deviceInfo) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func 
Test_monitorAllLinks(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + want []Info + }{ + { + name: "Switch Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockSwitchDeviceInfo(ctrl, 0, nil, nil, nil, dcgm.FE_SWITCH) + }, + want: nil, + }, + { + name: "Switch Count 2, Link Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedSwitches := map[uint]bool{0: true, 1: true} + return testutils.MockSwitchDeviceInfo(ctrl, 2, nil, watchedSwitches, nil, dcgm.FE_SWITCH) + }, + want: nil, + }, + { + name: "Switch Count 1, Link Count 1, Switch Watched = true, Link Watched = true, Link Up = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {0, 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 1, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + }, + }, + { + name: "Switch Count 1, Link Count 1, Switch Watched = false, Link Watched = true, Link Up = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: false} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {0, 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 1, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: nil, + }, + { + name: "Switch Count 1, Link Count 
1, Switch Watched = true, Link Watched = false, Link Up = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {0, 1}: false, + } + return testutils.MockSwitchDeviceInfo(ctrl, 1, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: nil, + }, + { + name: "Switch Count 1, Link Count 1, Switch Watched = true, Link Watched = true, Link Up = false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1} + + watchedSwitches := map[uint]bool{0: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {0, 0}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 1, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: nil, + }, + { + name: "Switch Count 2, Link Count 2, Switch Watched = true, Link Watched = true, Link Up = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {0, 1}: true, + {1, 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 2, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: 
uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + }, + }, + { + name: "Switch Count 5, Link Count 4, Switch Watched = true, Link Watched = mix, Link Up = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {0, 0}: true, + {0, 1}: false, + {1, 0}: true, + {1, 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + }, + }, + } + for _, tt := range tests { + deviceInfo := tt.mockFunc() + t.Run(tt.name, func(t *testing.T) { + got := monitorAllLinks(deviceInfo) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func Test_monitorAllCPUs(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + want []Info + }{ + { + name: "CPU Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockCPUDeviceInfo(ctrl, 0, nil, nil, nil, dcgm.FE_CPU) + }, + want: nil, + }, + { + name: "CPU Count 1, watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: true} + return testutils.MockCPUDeviceInfo(ctrl, 1, nil, watchedCPUs, nil, dcgm.FE_CPU) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, 
+ }, + }, + { + name: "CPU Count 1, watched = false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: false} + return testutils.MockCPUDeviceInfo(ctrl, 1, nil, watchedCPUs, nil, dcgm.FE_CPU) + }, + want: nil, + }, + { + name: "CPU Count 2, watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: true, 1: true} + return testutils.MockCPUDeviceInfo(ctrl, 2, nil, watchedCPUs, nil, dcgm.FE_CPU) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + { + name: "CPU Count 2, watched = false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: false, 1: false} + return testutils.MockCPUDeviceInfo(ctrl, 2, nil, watchedCPUs, nil, dcgm.FE_CPU) + }, + want: nil, + }, + { + name: "Switch Count 3, watched = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: false, 1: true, 2: false} + return testutils.MockCPUDeviceInfo(ctrl, 3, nil, watchedCPUs, nil, dcgm.FE_CPU) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + }, + } + for _, tt := range tests { + deviceInfo := tt.mockFunc() + t.Run(tt.name, func(t *testing.T) { + got := monitorAllCPUs(deviceInfo) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func Test_monitorAllCPUCores(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider 
+ want []Info + }{ + { + name: "CPU Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockCPUDeviceInfo(ctrl, 0, nil, nil, nil, dcgm.FE_CPU_CORE) + }, + want: nil, + }, + { + name: "CPU Count 2, Core Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + watchedCPUs := map[uint]bool{0: true, 1: true} + return testutils.MockCPUDeviceInfo(ctrl, 2, nil, watchedCPUs, nil, dcgm.FE_CPU_CORE) + }, + want: nil, + }, + { + name: "CPU Count 1, Core Count 1, CPU Watched = true, Core Watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{1} + + watchedCPUs := map[uint]bool{0: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {0, 1}: true, + } + return testutils.MockCPUDeviceInfo(ctrl, 1, cpuToCores, watchedCPUs, watchedCores, dcgm.FE_CPU_CORE) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + }, + }, + { + name: "CPU Count 1, Core Count 1, CPU Watched = false, Core Watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{1} + + watchedCPUs := map[uint]bool{0: false} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {0, 1}: true, + } + return testutils.MockCPUDeviceInfo(ctrl, 1, cpuToCores, watchedCPUs, watchedCores, dcgm.FE_CPU_CORE) + }, + want: nil, + }, + { + name: "CPU Count 1, Core Count 1, CPU Watched = true, Core Watched = false", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{1} + + watchedCPUs := map[uint]bool{0: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {0, 1}: false, + } + return 
testutils.MockCPUDeviceInfo(ctrl, 1, cpuToCores, watchedCPUs, watchedCores, dcgm.FE_CPU_CORE) + }, + want: nil, + }, + { + name: "CPU Count 2, Core Count 4, CPU Watched = true, Core Watched = true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{0, 1} + cpuToCores[1] = []uint{0, 1} + + watchedCPUs := map[uint]bool{0: true, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {0, 0}: true, + {0, 1}: true, + {1, 0}: true, + {1, 1}: true, + } + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, dcgm.FE_CPU_CORE) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + }, + }, + { + name: "CPU Count 2, Core Count 4, CPU Watched = true, Core Watched = mix", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{0, 1} + cpuToCores[1] = []uint{0, 1} + + watchedCPUs := map[uint]bool{0: true, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {0, 0}: true, + {0, 1}: false, + {1, 0}: false, + {1, 1}: true, + } + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, dcgm.FE_CPU_CORE) + }, + want: []Info{ + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(0)}, + DeviceInfo: 
dcgm.Device{}, + InstanceInfo: nil, + ParentId: 0, + }, + { + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{}, + InstanceInfo: nil, + ParentId: 1, + }, + }, + }, + } + for _, tt := range tests { + deviceInfo := tt.mockFunc() + t.Run(tt.name, func(t *testing.T) { + got := monitorAllCPUCores(deviceInfo) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func Test_monitorGPU(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + gpuID int + want *Info + }{ + { + name: "GPU Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 0, nil) + }, + gpuID: 0, + want: nil, + }, + { + name: "GPU Count 1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + gpuID: 0, + want: &Info{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(0)}, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + { + name: "GPU Count 1, gpuID mismatch", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + gpuID: 1000, + want: nil, + }, + { + name: "GPU Count 2, one GPU ID match", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = 
[]deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + }, + gpuID: 1, + want: &Info{ + Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: uint(1)}, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: nil, + ParentId: PARENT_ID_IGNORED, + }, + }, + } + + for _, tt := range tests { + deviceInfo := tt.mockFunc() + t.Run(tt.name, func(t *testing.T) { + got := monitorGPU(deviceInfo, tt.gpuID) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} + +func Test_monitorGPUInstance(t *testing.T) { + tests := []struct { + name string + mockFunc func() *mockdeviceinfo.MockProvider + gpuInstanceID int + want *Info + }{ + { + name: "GPU Count 0, addFlexibly true", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 0, nil) + }, + gpuInstanceID: 0, + want: nil, + }, + { + name: "GPU Count 1, GPU Instance Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, nil) + }, + gpuInstanceID: 0, + want: nil, + }, + { + name: "GPU Count 2, GPU Instance Count 0", + mockFunc: func() *mockdeviceinfo.MockProvider { + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, nil) + }, + gpuInstanceID: 0, + want: nil, + }, + { + name: "GPU Count 1, GPU Instance Count 1", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + gpuInstanceID: 0, + want: &Info{ + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo1.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + 
InstanceInfo: &testutils.MockGPUInstanceInfo1, + ParentId: PARENT_ID_IGNORED, + }, + }, + { + name: "GPU Count 1, GPU Instance Count 1, GPU Instance ID mismatch", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + gpuInstanceID: 1000, + want: nil, + }, + { + name: "GPU Count 1, GPU Instance Count 2, one match", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{ + testutils.MockGPUInstanceInfo1, + testutils.MockGPUInstanceInfo2, + } + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 1, gpuInstanceInfos) + }, + gpuInstanceID: 14, + want: &Info{ + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(0), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + { + name: "GPU Count 2, GPU Instance Count 1 each, one match", + mockFunc: func() *mockdeviceinfo.MockProvider { + gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo) + gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1} + gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2} + + ctrl := gomock.NewController(t) + return testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos) + }, + gpuInstanceID: 14, + want: &Info{ + Entity: dcgm.GroupEntityPair{ + EntityGroupId: dcgm.FE_GPU_I, + EntityId: testutils.MockGPUInstanceInfo2.EntityId, + }, + DeviceInfo: dcgm.Device{ + GPU: uint(1), + }, + InstanceInfo: &testutils.MockGPUInstanceInfo2, + ParentId: PARENT_ID_IGNORED, + }, + }, + } + for _, tt := range tests { + 
deviceInfo := tt.mockFunc() + t.Run(tt.name, func(t *testing.T) { + got := monitorGPUInstance(deviceInfo, tt.gpuInstanceID) + assert.Equalf(t, tt.want, got, "Unexpected Output") + }) + } +} diff --git a/pkg/dcgmexporter/test_utils.go b/internal/pkg/devicemonitoring/types.go similarity index 60% rename from pkg/dcgmexporter/test_utils.go rename to internal/pkg/devicemonitoring/types.go index 6c13aeaf..eb94c8dc 100644 --- a/pkg/dcgmexporter/test_utils.go +++ b/internal/pkg/devicemonitoring/types.go @@ -14,29 +14,17 @@ * limitations under the License. */ -package dcgmexporter +package devicemonitoring import ( - "testing" - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/stretchr/testify/assert" -) - -func setupTest(t *testing.T) func(t *testing.T) { - cleanup, err := dcgm.Init(dcgm.Embedded) - assert.NoError(t, err) - return func(t *testing.T) { - defer cleanup() - } -} + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) -func runOnlyWithLiveGPUs(t *testing.T) { - t.Helper() - gpus, err := dcgm.GetSupportedDevices() - assert.NoError(t, err) - if len(gpus) < 1 { - t.Skip("Skipping test that requires live GPUs. None were found") - } +type Info struct { + Entity dcgm.GroupEntityPair + DeviceInfo dcgm.Device + InstanceInfo *deviceinfo.GPUInstanceInfo + ParentId uint } diff --git a/internal/pkg/devicewatcher/const.go b/internal/pkg/devicewatcher/const.go new file mode 100644 index 00000000..a6a1d994 --- /dev/null +++ b/internal/pkg/devicewatcher/const.go @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package devicewatcher
+
+const (
+	DCGM_ST_NOT_CONFIGURED = "Setting not configured"
+
+	maxKeepAge     = 600.0 // How long to keep data for this field in seconds
+	maxKeepSamples = 0     // Maximum number of samples to keep. 0=no limit
+)
diff --git a/internal/pkg/devicewatcher/device_watcher.go b/internal/pkg/devicewatcher/device_watcher.go
new file mode 100644
index 00000000..56d78b07
--- /dev/null
+++ b/internal/pkg/devicewatcher/device_watcher.go
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package devicewatcher
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"strings"
+
+	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring"
+	. "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/utils"
+)
+
+// DeviceWatcher selects the DCGM fields that apply to an entity type and creates
+// the DCGM groups and field watches for them. It holds no state.
+type DeviceWatcher struct{}
+
+// NewDeviceWatcher returns a new DeviceWatcher.
+func NewDeviceWatcher() *DeviceWatcher {
+	return &DeviceWatcher{}
+}
+
+// GetDeviceFields returns, in their original order, the field IDs of the counters
+// whose DCGM field entity level applies to entityType, as decided by
+// shouldIncludeField. The result is nil when no counter qualifies.
+func (d *DeviceWatcher) GetDeviceFields(counters []counters.Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
+	var deviceFields []dcgm.Short
+	for _, counter := range counters {
+		fieldMeta := dcgmprovider.Client().FieldGetById(counter.FieldID)
+
+		if shouldIncludeField(entityType, fieldMeta.EntityLevel) {
+			deviceFields = append(deviceFields, counter.FieldID)
+		}
+	}
+
+	return deviceFields
+}
+
+// shouldIncludeField reports whether a field defined at fieldLevel is collected
+// for entityType. That is the case when the two are equal, when the field has no
+// entity level (dcgm.FE_NONE), or when fieldLevel is one of the levels grouped
+// under the entity type here: FE_GPU_I, FE_GPU_CI and FE_VGPU under FE_GPU,
+// FE_CPU_CORE under FE_CPU, and FE_LINK under FE_SWITCH.
+func shouldIncludeField(entityType, fieldLevel dcgm.Field_Entity_Group) bool {
+	if fieldLevel == entityType || fieldLevel == dcgm.FE_NONE {
+		return true
+	}
+
+	switch entityType {
+	case dcgm.FE_GPU:
+		return fieldLevel == dcgm.FE_GPU_CI || fieldLevel == dcgm.FE_GPU_I || fieldLevel == dcgm.FE_VGPU
+	case dcgm.FE_CPU:
+		return fieldLevel == dcgm.FE_CPU_CORE
+	case dcgm.FE_SWITCH:
+		return fieldLevel == dcgm.FE_LINK
+	default:
+		return false
+	}
+}
+
+// WatchDeviceFields creates the DCGM groups for the entities described by
+// deviceInfo, creates a field group for deviceFields, and starts watching that
+// field group on every group with update frequency updateFreqInUsec.
+//
+// It returns the group handles, the field group handle and the cleanup functions
+// that release them. When no group was created it returns nil groups, a zero
+// field handle and a nil error without creating a field group. On failure the
+// cleanups collected so far are passed to utils.CleanupOnError, whose result is
+// returned with the error.
+func (d *DeviceWatcher) WatchDeviceFields(
+	deviceFields []dcgm.Short, deviceInfo deviceinfo.Provider, updateFreqInUsec int64,
+) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error) {
+	var err error
+	var cleanups []func()
+	var groups []dcgm.GroupHandle
+
+	switch deviceInfo.InfoType() {
+	case dcgm.FE_LINK:
+		// This handles NV link case only.
+		groups, cleanups, err = d.createNVLinkGroups(deviceInfo)
+	case dcgm.FE_CPU_CORE:
+		// This handles CPU Core case only.
+		groups, cleanups, err = d.createCPUCoreGroups(deviceInfo)
+	default:
+		// This handles GPUs (including GPU Instances), CPUs and Switches cases.
+		groups, cleanups, err = d.createGroups(deviceInfo)
+	}
+	if err != nil {
+		return nil, dcgm.FieldHandle{}, utils.CleanupOnError(cleanups), err
+	}
+	if len(groups) == 0 {
+		// Nothing to watch, so no field group is created.
+		return nil, dcgm.FieldHandle{}, cleanups, nil
+	}
+
+	fieldGroup, cleanup, fieldGroupErr := newFieldGroup(deviceFields)
+	if fieldGroupErr != nil {
+		return nil, dcgm.FieldHandle{}, utils.CleanupOnError(cleanups), fieldGroupErr
+	}
+	cleanups = append(cleanups, cleanup)
+
+	for _, group := range groups {
+		if err = watchFieldGroup(group, fieldGroup, updateFreqInUsec); err != nil {
+			return nil, dcgm.FieldHandle{}, utils.CleanupOnError(cleanups), err
+		}
+	}
+
+	return groups, fieldGroup, cleanups, nil
+}
+
+// createGroups adapts createGenericGroup to the slice-returning shape used by
+// WatchDeviceFields. With nothing to monitor both slices are empty, not nil.
+func (d *DeviceWatcher) createGroups(deviceInfo deviceinfo.Provider) ([]dcgm.GroupHandle, []func(),
+	error,
+) {
+	group, cleanup, err := d.createGenericGroup(deviceInfo)
+	if err != nil {
+		return []dcgm.GroupHandle{}, []func(){cleanup}, err
+	}
+	if group == nil {
+		return []dcgm.GroupHandle{}, []func(){}, nil
+	}
+
+	return []dcgm.GroupHandle{*group}, []func(){cleanup}, nil
+}
+
+// createGenericGroup creates one DCGM group containing every entity returned by
+// devicemonitoring.GetMonitoredEntities. With no such entities it returns a nil
+// group and the doNothing cleanup. If adding an entity fails, the group handle and
+// its cleanup are still returned along with the error.
+func (d *DeviceWatcher) createGenericGroup(deviceInfo deviceinfo.Provider) (*dcgm.GroupHandle, func(),
+	error,
+) {
+	monitoringInfo := devicemonitoring.GetMonitoredEntities(deviceInfo)
+	if len(monitoringInfo) == 0 {
+		return nil, doNothing, nil
+	}
+
+	groupID, cleanup, err := createGroup()
+	if err != nil {
+		return nil, cleanup, err
+	}
+
+	for _, mi := range monitoringInfo {
+		err := dcgmprovider.Client().AddEntityToGroup(groupID, mi.Entity.EntityGroupId, mi.Entity.EntityId)
+		if err != nil {
+			return &groupID, cleanup, err
+		}
+	}
+
+	return &groupID, cleanup, nil
+}
+
+// createCPUCoreGroups creates DCGM groups holding the watched cores of each
+// watched CPU. A new group is started for the first watched core of a CPU and
+// again after every dcgm.DCGM_GROUP_MAX_ENTITIES cores added for that CPU. The
+// returned cleanups cover every group created, including when an error occurs.
+func (d *DeviceWatcher) createCPUCoreGroups(deviceInfo deviceinfo.Provider) ([]dcgm.GroupHandle, []func(),
+	error,
+) {
+	var groups []dcgm.GroupHandle
+	var cleanups []func()
+	var err error
+
+	for _, cpu := range deviceInfo.CPUs() {
+		if !deviceInfo.IsCPUWatched(cpu.EntityId) {
+			continue
+		}
+
+		var groupCoreCount int
+		var groupID dcgm.GroupHandle
+		for _, core := range cpu.Cores {
+			if !deviceInfo.IsCoreWatched(core, cpu.EntityId) {
+				continue
+			}
+
+			// Create per-cpu core groups or after max number of CPU cores have been added to current group
+			if groupCoreCount%dcgm.DCGM_GROUP_MAX_ENTITIES == 0 {
+				var cleanup func()
+
+				groupID, cleanup, err = createGroup()
+				if err != nil {
+					return nil, cleanups, err
+				}
+
+				cleanups = append(cleanups, cleanup)
+				groups = append(groups, groupID)
+			}
+
+			groupCoreCount++
+
+			err = dcgmprovider.Client().AddEntityToGroup(groupID, dcgm.FE_CPU_CORE, core)
+			if err != nil {
+				return groups, cleanups, err
+			}
+		}
+	}
+
+	return groups, cleanups, nil
+}
+
+// createNVLinkGroups creates one DCGM group per watched switch that has at least
+// one link which is both up (dcgm.LS_UP) and watched, and adds those links to it.
+// The returned cleanups cover every group created, including when an error occurs.
+func (d *DeviceWatcher) createNVLinkGroups(deviceInfo deviceinfo.Provider) ([]dcgm.GroupHandle, []func(),
+	error,
+) {
+	var groups []dcgm.GroupHandle
+	var cleanups []func()
+	var err error
+
+	/* Create per-switch link groups */
+	for _, sw := range deviceInfo.Switches() {
+		if !deviceInfo.IsSwitchWatched(sw.EntityId) {
+			continue
+		}
+
+		var groupLinkCount int
+		var groupID dcgm.GroupHandle
+		for _, link := range sw.NvLinks {
+			if link.State != dcgm.LS_UP {
+				continue
+			}
+
+			if !deviceInfo.IsLinkWatched(link.Index, sw.EntityId) {
+				continue
+			}
+
+			// Create per-switch link groups
+			if groupLinkCount == 0 {
+				var cleanup func()
+
+				groupID, cleanup, err = createGroup()
+				if err != nil {
+					return nil, cleanups, err
+				}
+
+				cleanups = append(cleanups, cleanup)
+				groups = append(groups, groupID)
+			}
+
+			groupLinkCount++
+
+			err = dcgmprovider.Client().AddLinkEntityToGroup(groupID, link.Index, link.ParentId)
+			if err != nil {
+				return groups, cleanups, err
+			}
+		}
+	}
+
+	return groups, cleanups, nil
+}
+
+// createGroup creates a DCGM group with a randomly numbered name and returns its
+// handle and a cleanup that destroys it. The cleanup logs a warning when the
+// destroy fails, unless the error text contains DCGM_ST_NOT_CONFIGURED. On
+// failure a zero handle and the doNothing cleanup are returned.
+func createGroup() (dcgm.GroupHandle, func(), error) {
+	newGroupNumber, err := utils.RandUint64()
+	if err != nil {
+		return dcgm.GroupHandle{}, doNothing, err
+	}
+
+	groupID, err := dcgmprovider.Client().CreateGroup(fmt.Sprintf("gpu-collector-group-%d", newGroupNumber))
+	if err != nil {
+		return dcgm.GroupHandle{}, doNothing, err
+	}
+
+	cleanup := func() {
+		destroyErr := dcgmprovider.Client().DestroyGroup(groupID)
+		if destroyErr != nil && !strings.Contains(destroyErr.Error(), DCGM_ST_NOT_CONFIGURED) {
+			slog.LogAttrs(context.Background(), slog.LevelWarn, "cannot destroy group",
+				slog.Any(GroupIDKey, groupID),
+				slog.String(ErrorKey, destroyErr.Error()),
+			)
+		}
+	}
+	return groupID, cleanup, nil
+}
+
+// newFieldGroup creates a DCGM field group with a randomly numbered name for
+// deviceFields and returns its handle and a cleanup that destroys it, logging a
+// warning on failure. On error a zero handle and doNothing are returned.
+func newFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) {
+	newFieldGroupNumber, err := utils.RandUint64()
+	if err != nil {
+		return dcgm.FieldHandle{}, doNothing, err
+	}
+
+	name := fmt.Sprintf("gpu-collector-fieldgroup-%d", newFieldGroupNumber)
+	fieldGroup, err := dcgmprovider.Client().FieldGroupCreate(name, deviceFields)
+	if err != nil {
+		return dcgm.FieldHandle{}, doNothing, err
+	}
+
+	cleanup := func() {
+		err := dcgmprovider.Client().FieldGroupDestroy(fieldGroup)
+		if err != nil {
+			slog.Warn("Cannot destroy field group.",
+				slog.String(ErrorKey, err.Error()),
+			)
+		}
+	}
+
+	return fieldGroup, cleanup, nil
+}
+
+// watchFieldGroup starts watching the fields of field on the entities of group,
+// passing updateFreq, maxKeepAge and maxKeepSamples through to DCGM.
+func watchFieldGroup(
+	group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64,
+) error {
+	return dcgmprovider.Client().WatchFieldsWithGroupEx(field, group, updateFreq, maxKeepAge, maxKeepSamples)
+}
diff --git a/internal/pkg/devicewatcher/device_watcher_test.go b/internal/pkg/devicewatcher/device_watcher_test.go
new file mode 100644
index 00000000..123d914e
--- /dev/null
+++ b/internal/pkg/devicewatcher/device_watcher_test.go
@@ -0,0 +1,1951 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devicewatcher + +import ( + "crypto/rand" + "fmt" + "slices" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +func TestDeviceWatcher_WatchDeviceFields(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGM := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGM) + + tests := []struct { + name string + mockDeviceInfoFunc func() *mockdeviceinfo.MockProvider + mockDCGMFunc func([]dcgm.GroupHandle, dcgm.FieldHandle) + expectGroupIDs func() []dcgm.GroupHandle + expectFieldGroupID func() dcgm.FieldHandle + wantErr bool + }{ + { + name: "Watch Switch Links", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + 
watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(1), uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[0], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[1], gomock.Any(), 
+ gomock.Any(), gomock.Any()).Return(nil) + }, + wantErr: false, + }, + { + name: "Watch Switch Links when No Switches watched", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + watchedSwitches := map[uint]bool{0: false, 1: false} + return testutils.MockSwitchDeviceInfo(ctrl, 5, nil, watchedSwitches, nil, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return nil + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) {}, + wantErr: false, + }, + { + name: "Watch Switch Links but got AddLinkEntityToGroup Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + 
mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(1), + uint(1)).Return(fmt.Errorf("some error")) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(fmt.Errorf("some other error")) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + }, + wantErr: true, + }, + { + name: "Watch Switch Links but got FieldGroupCreate Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + 
mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(1), uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, + fmt.Errorf("some error")) + }, + wantErr: true, + }, + { + name: "Watch Switch Links but got WatchFieldsWithGroupEx Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + 
mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(1), uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[0], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[1], gomock.Any(), + gomock.Any(), gomock.Any()).Return(fmt.Errorf("some error")) + }, + wantErr: true, + }, + { + name: "Watch GPUs", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle := dcgm.GroupHandle{} + mockGroupHandle.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, 
+ mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_GPU, uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_GPU, uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[0], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + }, + wantErr: false, + }, + { + name: "Watch GPUs when No GPUs or GPU Instances to monitor", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 0, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return []dcgm.GroupHandle{} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(_ []dcgm.GroupHandle, _ dcgm.FieldHandle) {}, + wantErr: false, + }, + { + name: "Watch GPUs but got AddEntityToGroup Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle := dcgm.GroupHandle{} + mockGroupHandle.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockGroupHandles 
[]dcgm.GroupHandle, _ dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_GPU, uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_GPU, + uint(1)).Return(fmt.Errorf("some error")) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(fmt.Errorf("some other error")) + }, + wantErr: true, + }, + { + name: "Watch CPU Cores", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{0, 1} + cpuToCores[1] = []uint{0, 1} + + watchedCPUs := map[uint]bool{0: true, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, + dcgm.FE_CPU_CORE) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE, 
uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE, uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[0], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[1], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + }, + wantErr: false, + }, + { + name: "No CPU cores to watch", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + watchedCPUs := map[uint]bool{0: false, 1: false} + mockGPUDeviceInfo := testutils.MockCPUDeviceInfo(ctrl, 2, nil, watchedCPUs, nil, + dcgm.FE_CPU_CORE) + + return mockGPUDeviceInfo + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return nil + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) {}, + wantErr: false, + }, + { + name: "Watch CPU cores when Create Group Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + cpuToCores := make(map[int][]uint) + cpuToCores[0] = []uint{0, 1} + cpuToCores[1] = []uint{0, 1} + + watchedCPUs := map[uint]bool{0: true, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, + dcgm.FE_CPU_CORE) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + 
mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], fmt.Errorf("random error")) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + }, + wantErr: true, + }, + { + name: "Watch CPUs", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + cpuToCores := make(map[int][]uint) + + watchedCPUs := map[uint]bool{0: true, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, + dcgm.FE_CPU) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle1} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU, uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU, uint(1)).Return(nil) + 
mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[0], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + }, + wantErr: false, + }, + { + name: "Watch CPUs when CPUs to monitor", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + cpuToCores := make(map[int][]uint) + + watchedCPUs := map[uint]bool{0: false, 1: false} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, + dcgm.FE_CPU) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return []dcgm.GroupHandle{} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(_ []dcgm.GroupHandle, _ dcgm.FieldHandle) {}, + wantErr: false, + }, + { + name: "Watch CPUs but got AddEntityToGroup Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + cpuToCores := make(map[int][]uint) + + watchedCPUs := map[uint]bool{0: false, 1: true} + watchedCores := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + + return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores, + dcgm.FE_CPU) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle := dcgm.GroupHandle{} + mockGroupHandle.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, _ 
dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU, + uint(1)).Return(fmt.Errorf("some error")) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(fmt.Errorf("some other error")) + }, + wantErr: true, + }, + { + name: "Watch Switches", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_SWITCH) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle1} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_SWITCH, uint(0)).Return(nil) + mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_SWITCH, uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + 
mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + mockDCGM.EXPECT().WatchFieldsWithGroupEx(mockFieldGroupHandle, mockGroupHandles[0], gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil) + }, + wantErr: false, + }, + { + name: "Watch CPUs when no switches available", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + return testutils.MockSwitchDeviceInfo(ctrl, 0, nil, nil, nil, + dcgm.FE_SWITCH) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return []dcgm.GroupHandle{} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(_ []dcgm.GroupHandle, _ dcgm.FieldHandle) {}, + wantErr: false, + }, + { + name: "Watch CPUs when Create Group error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_SWITCH) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle1} + }, + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle, mockFieldGroupHandle dcgm.FieldHandle) { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], fmt.Errorf("random error")) + }, + wantErr: 
true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			mockDeviceInfo := tt.mockDeviceInfoFunc()
			mockGroupIDs := tt.expectGroupIDs()
			mockFieldGroupIDs := tt.expectFieldGroupID()
			tt.mockDCGMFunc(mockGroupIDs, mockFieldGroupIDs)

			d := NewDeviceWatcher()
			inputFields := []dcgm.Short{1, 2, 3, 4}
			_, _, gotFuncs, err := d.WatchDeviceFields(inputFields, mockDeviceInfo, 1000000)
			// Ensure DestroyGroup functions gets called
			for _, gotFunc := range gotFuncs {
				gotFunc()
			}

			if !tt.wantErr {
				assert.Nil(t, err, "expected no error")
			} else {
				assert.NotNil(t, err, "expected no error.")
				assert.Nil(t, gotFuncs, "expected cleanup functions to be nil")
			}
		})
	}
}

// TestDeviceWatcher_createGenericGroup is a table-driven test of
// DeviceWatcher.createGenericGroup.
//
// Each case supplies a mocked device-info provider, the group handle the
// mocked DCGM client should hand out (nil when no group is expected), and the
// DCGM calls that must be observed. The cleanup function returned by
// createGenericGroup is always invoked so that the DestroyGroup expectation
// registered by a case is consumed.
func TestDeviceWatcher_createGenericGroup(t *testing.T) {
	ctrl := gomock.NewController(t)
	mockDCGM := mockdcgm.NewMockDCGM(ctrl)

	// Install the mock as the package-level DCGM client and put the previous
	// client back when the test function returns.
	realDCGM := dcgmprovider.Client()
	defer func() {
		dcgmprovider.SetClient(realDCGM)
	}()
	dcgmprovider.SetClient(mockDCGM)

	// handleOne yields a pointer to a fresh group handle whose value is 1.
	handleOne := func() *dcgm.GroupHandle {
		h := dcgm.GroupHandle{}
		h.SetHandle(uintptr(1))

		return &h
	}

	// noHandle is used by cases where no group handle is expected back.
	noHandle := func() *dcgm.GroupHandle {
		return nil
	}

	// twoFlexGPUs builds the provider shared by most GPU cases: it is created
	// with MockGPUDeviceInfo(ctrl, 2, nil) and reports Flex device options.
	twoFlexGPUs := func() *mockdeviceinfo.MockProvider {
		info := testutils.MockGPUDeviceInfo(ctrl, 2, nil)
		info.EXPECT().GOpts().Return(appconfig.DeviceOptions{Flex: true}).AnyTimes()

		return info
	}

	// failingRandReader swaps crypto/rand's Reader for one that always errors
	// and returns the function that restores the original reader.
	failingRandReader := func(_ dcgm.GroupHandle) func() {
		// Simulate a failure in rand.Reader using mock rand.Reader
		originalReader := rand.Reader
		rand.Reader = &testutils.MockReader{Err: fmt.Errorf("mock error")}

		return func() {
			rand.Reader = originalReader
		}
	}

	tests := []struct {
		name               string
		mockDeviceInfoFunc func() *mockdeviceinfo.MockProvider
		mockDCGMFunc       func(dcgm.GroupHandle) func()
		expectGroupID      func() *dcgm.GroupHandle
		wantErr            bool
	}{
		{
			name:               "Create Group for GPUs",
			mockDeviceInfoFunc: twoFlexGPUs,
			expectGroupID:      handleOne,
			mockDCGMFunc: func(mockGroupHandle dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandle).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			name: "Create Group for GPU Instances",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				gpuInstanceInfos := make(map[int][]deviceinfo.GPUInstanceInfo)
				gpuInstanceInfos[0] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo1}
				gpuInstanceInfos[1] = []deviceinfo.GPUInstanceInfo{testutils.MockGPUInstanceInfo2}

				info := testutils.MockGPUDeviceInfo(ctrl, 2, gpuInstanceInfos)
				info.EXPECT().GOpts().Return(appconfig.DeviceOptions{Flex: true}).AnyTimes()

				return info
			},
			expectGroupID: handleOne,
			mockDCGMFunc: func(mockGroupHandle dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU_I,
					testutils.MockGPUInstanceInfo1.EntityId).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU_I,
					testutils.MockGPUInstanceInfo2.EntityId).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandle).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			name: "Create Group for CPUs",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				cpuToCores := make(map[int][]uint)

				watchedCPUs := map[uint]bool{0: true, 1: true}
				watchedCores := map[testutils.WatchedEntityKey]bool{
					{ParentID: 0, ChildID: 0}: true,
					{ParentID: 0, ChildID: 1}: true,
					{ParentID: 1, ChildID: 0}: true,
					{ParentID: 1, ChildID: 1}: true,
				}

				return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores,
					dcgm.FE_CPU)
			},
			expectGroupID: handleOne,
			mockDCGMFunc: func(mockGroupHandle dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_CPU, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_CPU, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandle).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			name: "Create Group for Switches",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				switchToNvLinks := make(map[int][]dcgm.NvLinkStatus)
				switchToNvLinks[0] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2}
				switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2}

				watchedSwitches := map[uint]bool{0: true, 1: true}
				watchedLinks := map[testutils.WatchedEntityKey]bool{
					{ParentID: 0, ChildID: 0}: true,
					{ParentID: 0, ChildID: 1}: true,
					{ParentID: 1, ChildID: 0}: true,
					{ParentID: 1, ChildID: 1}: true,
				}
				return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks,
					dcgm.FE_SWITCH)
			},
			expectGroupID: handleOne,
			mockDCGMFunc: func(mockGroupHandle dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_SWITCH, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_SWITCH, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandle).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			// No DCGM expectations are registered, so any DCGM call fails the test.
			name: "No GPUs or GPU Instances to monitor",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				info := testutils.MockGPUDeviceInfo(ctrl, 0, nil)
				info.EXPECT().GOpts().Return(appconfig.DeviceOptions{Flex: true}).AnyTimes()

				return info
			},
			expectGroupID: noHandle,
			mockDCGMFunc: func(_ dcgm.GroupHandle) func() {
				return doNothing
			},
			wantErr: false,
		},
		{
			name:               "Random Unit Error",
			mockDeviceInfoFunc: twoFlexGPUs,
			expectGroupID:      noHandle,
			mockDCGMFunc:       failingRandReader,
			wantErr:            true,
		},
		{
			name:               "Create Group Error",
			mockDeviceInfoFunc: twoFlexGPUs,
			expectGroupID:      noHandle,
			mockDCGMFunc: func(_ dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(dcgm.GroupHandle{}, fmt.Errorf("random error"))

				return doNothing
			},
			wantErr: true,
		},
		{
			// The second AddEntityToGroup fails and DestroyGroup is set to fail too.
			name:               "AddEntityToGroup Error",
			mockDeviceInfoFunc: twoFlexGPUs,
			expectGroupID:      handleOne,
			mockDCGMFunc: func(mockGroupHandle dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU,
					uint(1)).Return(fmt.Errorf("some error"))
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandle).Return(fmt.Errorf("some other error"))

				return doNothing
			},
			wantErr: true,
		},
		{
			// A failing DestroyGroup must not turn group creation into an error.
			name:               "DestroyGroup Error",
			mockDeviceInfoFunc: twoFlexGPUs,
			expectGroupID:      handleOne,
			mockDCGMFunc: func(mockGroupHandle dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandle, dcgm.FE_GPU, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandle).Return(fmt.Errorf("some error"))

				return doNothing
			},
			wantErr: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			mockDeviceInfo := tt.mockDeviceInfoFunc()
			mockGroupID := tt.expectGroupID()

			// The mock setup takes the handle by value; use the zero handle
			// when the case expects no group.
			inputGroupID := dcgm.GroupHandle{}
			if mockGroupID != nil {
				inputGroupID = *mockGroupID
			}

			restore := tt.mockDCGMFunc(inputGroupID)
			defer restore()

			d := &DeviceWatcher{}
			gotGroupID, gotFunc, err := d.createGenericGroup(mockDeviceInfo)
			gotFunc() // Ensure DestroyGroup function gets called

			if tt.wantErr {
				assert.NotNil(t, err, "expected an error.")
				return
			}

			assert.Nil(t, err, "expected no error")
			assert.Equal(t, mockGroupID, gotGroupID, "expected group IDs to be the same.")
		})
	}
}

// TestDeviceWatcher_createCPUCoreGroups is a table-driven test of
// DeviceWatcher.createCPUCoreGroups.
//
// Each case supplies a mocked CPU device-info provider, the group handles the
// mocked DCGM client should hand out (nil when no groups are expected), and
// the DCGM calls that must be observed. Every cleanup function returned by
// createCPUCoreGroups is invoked so that registered DestroyGroup expectations
// are consumed.
func TestDeviceWatcher_createCPUCoreGroups(t *testing.T) {
	ctrl := gomock.NewController(t)
	mockDCGM := mockdcgm.NewMockDCGM(ctrl)

	// Install the mock as the package-level DCGM client and put the previous
	// client back when the test function returns.
	realDCGM := dcgmprovider.Client()
	defer func() {
		dcgmprovider.SetClient(realDCGM)
	}()
	dcgmprovider.SetClient(mockDCGM)

	// groupHandles returns a factory producing one handle per given value, in
	// order. It is only used with at least one value; cases that expect no
	// groups use noHandles so the expected slice is nil rather than empty.
	groupHandles := func(ids ...uintptr) func() []dcgm.GroupHandle {
		return func() []dcgm.GroupHandle {
			handles := make([]dcgm.GroupHandle, 0, len(ids))
			for _, id := range ids {
				h := dcgm.GroupHandle{}
				h.SetHandle(id)
				handles = append(handles, h)
			}

			return handles
		}
	}

	noHandles := func() []dcgm.GroupHandle {
		return nil
	}

	// coresInfo returns a factory for a provider with CPUs 0 and 1 (both
	// watched), each mapped to cores 0 and 1, and the given per-core watch map.
	coresInfo := func(watchedCores map[testutils.WatchedEntityKey]bool) func() *mockdeviceinfo.MockProvider {
		return func() *mockdeviceinfo.MockProvider {
			cpuToCores := make(map[int][]uint)
			cpuToCores[0] = []uint{0, 1}
			cpuToCores[1] = []uint{0, 1}

			watchedCPUs := map[uint]bool{0: true, 1: true}

			return testutils.MockCPUDeviceInfo(ctrl, 2, cpuToCores, watchedCPUs, watchedCores,
				dcgm.FE_CPU_CORE)
		}
	}

	// allCoresWatched builds a fresh watch map in which every core is watched.
	allCoresWatched := func() map[testutils.WatchedEntityKey]bool {
		return map[testutils.WatchedEntityKey]bool{
			{ParentID: 0, ChildID: 0}: true,
			{ParentID: 0, ChildID: 1}: true,
			{ParentID: 1, ChildID: 0}: true,
			{ParentID: 1, ChildID: 1}: true,
		}
	}

	// noDCGMCalls registers no expectations, so any DCGM call fails the test.
	noDCGMCalls := func(_ []dcgm.GroupHandle) func() {
		return doNothing
	}

	// failingRandReader swaps crypto/rand's Reader for one that always errors
	// and returns the function that restores the original reader.
	failingRandReader := func(_ []dcgm.GroupHandle) func() {
		// Simulate a failure in rand.Reader using mock rand.Reader
		originalReader := rand.Reader
		rand.Reader = &testutils.MockReader{Err: fmt.Errorf("mock error")}

		return func() {
			rand.Reader = originalReader
		}
	}

	tests := []struct {
		name               string
		mockDeviceInfoFunc func() *mockdeviceinfo.MockProvider
		mockDCGMFunc       func(mockGroupHandles []dcgm.GroupHandle) func()
		expectGroupIDs     func() []dcgm.GroupHandle
		wantErr            bool
	}{
		{
			name:               "Create Group for CPU Cores",
			mockDeviceInfoFunc: coresInfo(allCoresWatched()),
			expectGroupIDs:     groupHandles(1, 2),
			mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil)
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			name: "No CPU watched",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				watchedCPUs := map[uint]bool{0: false, 1: false}
				mockCPUDeviceInfo := testutils.MockCPUDeviceInfo(ctrl, 2, nil, watchedCPUs, nil,
					dcgm.FE_CPU_CORE)

				return mockCPUDeviceInfo
			},
			expectGroupIDs: noHandles,
			mockDCGMFunc:   noDCGMCalls,
			wantErr:        false,
		},
		{
			name: "Only CPUs watched",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				watchedCPUs := map[uint]bool{0: true, 1: true}
				return testutils.MockCPUDeviceInfo(ctrl, 2, nil, watchedCPUs, nil,
					dcgm.FE_CPU_CORE)
			},
			expectGroupIDs: noHandles,
			mockDCGMFunc:   noDCGMCalls,
			wantErr:        false,
		},
		{
			name: "Only 1 Core watched",
			mockDeviceInfoFunc: coresInfo(map[testutils.WatchedEntityKey]bool{
				{ParentID: 0, ChildID: 0}: false,
				{ParentID: 0, ChildID: 1}: true,
				{ParentID: 1, ChildID: 0}: false,
				{ParentID: 1, ChildID: 1}: false,
			}),
			expectGroupIDs: groupHandles(1),
			mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			name: "One Core Each watched",
			mockDeviceInfoFunc: coresInfo(map[testutils.WatchedEntityKey]bool{
				{ParentID: 0, ChildID: 0}: false,
				{ParentID: 0, ChildID: 1}: true,
				{ParentID: 1, ChildID: 0}: true,
				{ParentID: 1, ChildID: 1}: false,
			}),
			expectGroupIDs: groupHandles(1, 2),
			mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil)
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE, uint(0)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil)

				return doNothing
			},
			wantErr: false,
		},
		{
			name:               "Random Unit Error",
			mockDeviceInfoFunc: coresInfo(allCoresWatched()),
			expectGroupIDs:     groupHandles(1, 2),
			mockDCGMFunc:       failingRandReader,
			wantErr:            true,
		},
		{
			// The second CreateGroup fails after the first group was populated.
			name:               "Create Group Error",
			mockDeviceInfoFunc: coresInfo(allCoresWatched()),
			expectGroupIDs:     groupHandles(1, 2),
			mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil)
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], fmt.Errorf("random error"))
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil)
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil)
				return doNothing
			},
			wantErr: true,
		},
		{
			// The last AddEntityToGroup fails; both groups must still be destroyed,
			// and the first DestroyGroup is set to fail as well.
			name:               "AddEntityToGroup Error",
			mockDeviceInfoFunc: coresInfo(allCoresWatched()),
			expectGroupIDs:     groupHandles(1, 2),
			mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() {
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil)
				mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[0], dcgm.FE_CPU_CORE, uint(1)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE, uint(0)).Return(nil)
				mockDCGM.EXPECT().AddEntityToGroup(mockGroupHandles[1], dcgm.FE_CPU_CORE,
					uint(1)).Return(fmt.Errorf("some error"))
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(fmt.Errorf("some other error"))
				mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil)
				return doNothing
			},
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			mockDeviceInfo := tt.mockDeviceInfoFunc()
			mockGroupIDs := tt.expectGroupIDs()
			restore := tt.mockDCGMFunc(mockGroupIDs)
			defer restore()

			d := &DeviceWatcher{}
			gotGroupIDs, gotFuncs, err := d.createCPUCoreGroups(mockDeviceInfo)
			// Ensure DestroyGroup functions gets called
			for _, gotFunc := range gotFuncs {
				gotFunc()
			}

			if tt.wantErr {
				assert.NotNil(t, err, "expected an error.")
				return
			}

			assert.Nil(t, err, "expected no error")
			assert.Equal(t, mockGroupIDs, gotGroupIDs, "expected group IDs to be the same.")
		})
	}
}

func TestDeviceWatcher_createNVLinkGroups(t *testing.T) {
	ctrl := gomock.NewController(t)
	mockDCGM := mockdcgm.NewMockDCGM(ctrl)

	realDCGM := dcgmprovider.Client()
	defer func() {
		dcgmprovider.SetClient(realDCGM)
	}()
	dcgmprovider.SetClient(mockDCGM)

	tests := []struct {
		name               string
		mockDeviceInfoFunc func() *mockdeviceinfo.MockProvider
		mockDCGMFunc       func(mockGroupHandles []dcgm.GroupHandle) func()
		expectGroupIDs     func() []dcgm.GroupHandle
		wantErr            bool
	}{
		{
			name: "Create Group for Switch Links",
			mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider {
				mockLink1 := testutils.MockNVLinkVal1
				mockLink1.State = 3

				switchToNvLinks := make(map[int][]dcgm.NvLinkStatus)
				switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2}
				switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2}

				watchedSwitches := map[uint]bool{0: true, 1: true}
				watchedLinks := map[testutils.WatchedEntityKey]bool{
					{ParentID: 0, ChildID: 0}: true,
					{ParentID: 0, ChildID: 1}: true,
					{ParentID: 1, ChildID: 0}: true,
					{ParentID: 1, ChildID: 1}: true,
				}
				return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks,
					dcgm.FE_LINK)
			},
			expectGroupIDs: func() []dcgm.GroupHandle {
				mockGroupHandle1 := dcgm.GroupHandle{}
				mockGroupHandle1.SetHandle(uintptr(1))

				mockGroupHandle2 := dcgm.GroupHandle{}
				mockGroupHandle2.SetHandle(uintptr(2))

				return
[]dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(1), uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + + return doNothing + }, + wantErr: false, + }, + { + name: "No Switches watched", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + watchedSwitches := map[uint]bool{0: false, 1: false} + return testutils.MockSwitchDeviceInfo(ctrl, 5, nil, watchedSwitches, nil, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return nil + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + return doNothing + }, + wantErr: false, + }, + { + name: "Only Switches watched", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + watchedSwitches := map[uint]bool{0: true, 1: true} + return testutils.MockSwitchDeviceInfo(ctrl, 5, nil, watchedSwitches, nil, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return nil + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + return doNothing + }, + wantErr: false, + }, + { + name: "Only 1 NV Link watched", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + 
switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: false, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: false, + {ParentID: 1, ChildID: 1}: false, + } + + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle1} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + + return doNothing + }, + wantErr: false, + }, + { + name: "One NV Link Each watched", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: false, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: false, + } + + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return 
[]dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + + return doNothing + }, + wantErr: false, + }, + { + name: "One NV Link Each watched but one link down", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: false, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: false, + } + + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + return []dcgm.GroupHandle{mockGroupHandle1} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + + return doNothing + }, + wantErr: false, + }, + { + name: "One NV Link 
Each watched but all watched NV links down", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + mockLink2 := testutils.MockNVLinkVal2 + mockLink2.State = 2 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, mockLink2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{testutils.MockNVLinkVal1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: false, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: false, + } + + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + return nil + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + return doNothing + }, + wantErr: false, + }, + { + name: "Random Unit Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return 
[]dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + mockDCGMFunc: func(_ []dcgm.GroupHandle) func() { + // Simulate a failure in rand.Reader using mock rand.Reader + mockReader := &testutils.MockReader{Err: fmt.Errorf("mock error")} + + originalReader := rand.Reader + rand.Reader = mockReader + return func() { + rand.Reader = originalReader + } + }, + wantErr: true, + }, + { + name: "Create Group Error", + mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], fmt.Errorf("random error")) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(nil) + return doNothing + }, + wantErr: true, + }, + { + name: "AddLinkEntityToGroup Error", + 
mockDeviceInfoFunc: func() *mockdeviceinfo.MockProvider { + mockLink1 := testutils.MockNVLinkVal1 + mockLink1.State = 3 + + switchToNvLinks := make(map[int][]dcgm.NvLinkStatus) + switchToNvLinks[0] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + switchToNvLinks[1] = []dcgm.NvLinkStatus{mockLink1, testutils.MockNVLinkVal2} + + watchedSwitches := map[uint]bool{0: true, 1: true} + watchedLinks := map[testutils.WatchedEntityKey]bool{ + {ParentID: 0, ChildID: 0}: true, + {ParentID: 0, ChildID: 1}: true, + {ParentID: 1, ChildID: 0}: true, + {ParentID: 1, ChildID: 1}: true, + } + return testutils.MockSwitchDeviceInfo(ctrl, 5, switchToNvLinks, watchedSwitches, watchedLinks, + dcgm.FE_LINK) + }, + expectGroupIDs: func() []dcgm.GroupHandle { + mockGroupHandle1 := dcgm.GroupHandle{} + mockGroupHandle1.SetHandle(uintptr(1)) + + mockGroupHandle2 := dcgm.GroupHandle{} + mockGroupHandle2.SetHandle(uintptr(2)) + + return []dcgm.GroupHandle{mockGroupHandle1, mockGroupHandle2} + }, + mockDCGMFunc: func(mockGroupHandles []dcgm.GroupHandle) func() { + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[0], nil) + mockDCGM.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandles[1], nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(0), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[0], uint(1), uint(0)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(0), uint(1)).Return(nil) + mockDCGM.EXPECT().AddLinkEntityToGroup(mockGroupHandles[1], uint(1), + uint(1)).Return(fmt.Errorf("some error")) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[0]).Return(fmt.Errorf("some other error")) + mockDCGM.EXPECT().DestroyGroup(mockGroupHandles[1]).Return(nil) + return doNothing + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockDeviceInfo := tt.mockDeviceInfoFunc() + mockGroupIDs := tt.expectGroupIDs() + f := 
tt.mockDCGMFunc(mockGroupIDs) + defer f() + + d := &DeviceWatcher{} + gotGroupIDs, gotFuncs, err := d.createNVLinkGroups(mockDeviceInfo) + // Ensure DestroyGroup functions get called + for _, gotFunc := range gotFuncs { + gotFunc() + } + + if !tt.wantErr { + assert.Nil(t, err, "expected no error") + assert.Equal(t, mockGroupIDs, gotGroupIDs, "expected group IDs to be the same.") + } else { + assert.NotNil(t, err, "expected error.") + } + }) + } +} + +func Test_newFieldGroup(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGM := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGM) + + tests := []struct { + name string + mockDCGMFunc func(dcgm.FieldHandle) func() + expectFieldGroupID func() dcgm.FieldHandle + wantErr bool + }{ + { + name: "Create Group for Switch Links", + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockFieldGroupHandle dcgm.FieldHandle) func() { + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(nil) + + return doNothing + }, + wantErr: false, + }, + { + name: "Random Unit Error", + expectFieldGroupID: func() dcgm.FieldHandle { + return dcgm.FieldHandle{} + }, + mockDCGMFunc: func(mockFieldGroupHandle dcgm.FieldHandle) func() { + // Simulate a failure in rand.Reader using mock rand.Reader + mockReader := &testutils.MockReader{Err: fmt.Errorf("mock error")} + + originalReader := rand.Reader + rand.Reader = mockReader + return func() { + rand.Reader = originalReader + } + }, + wantErr: true, + }, + { + name: "Field Group Create Error", + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + 
return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockFieldGroupHandle dcgm.FieldHandle) func() { + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, + fmt.Errorf("random error")) + + return doNothing + }, + wantErr: true, + }, + { + name: "Field Group Destroy Error", + expectFieldGroupID: func() dcgm.FieldHandle { + mockFieldGroupHandle := dcgm.FieldHandle{} + mockFieldGroupHandle.SetHandle(uintptr(1)) + + return mockFieldGroupHandle + }, + mockDCGMFunc: func(mockFieldGroupHandle dcgm.FieldHandle) func() { + mockDCGM.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldGroupHandle, nil) + mockDCGM.EXPECT().FieldGroupDestroy(mockFieldGroupHandle).Return(fmt.Errorf("some other error")) + + return doNothing + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockFieldGroupIDs := tt.expectFieldGroupID() + f := tt.mockDCGMFunc(mockFieldGroupIDs) + defer f() + + input := []dcgm.Short{1, 2, 3, 4} + gotFieldGroupIDs, gotFunc, err := newFieldGroup(input) + gotFunc() // Ensure the destroy function gets called + + if !tt.wantErr { + assert.Nil(t, err, "expected no error") + assert.Equal(t, mockFieldGroupIDs, gotFieldGroupIDs, "expected field group IDs to be the same.") + } else { + assert.NotNil(t, err, "expected error.") + } + }) + } +} + +func TestDeviceWatcher_GetDeviceFields(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGM := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGM) + + type args struct { + counterList []counters.Counter + entityType dcgm.Field_Entity_Group + } + tests := []struct { + name string + args args + mockDCGMFunc func([]dcgm.Short) + want func() []dcgm.Short + }{ + { + name: "GPU, GPU Instance and VGPU Counters", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_GPU, + }, + mockDCGMFunc: 
func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return append(testutils.SampleGPUFieldIDs, testutils.SampleDriverVersionCounter.FieldID) + }, + }, + { + name: "GPU Instance Counters", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_GPU_I, + }, + mockDCGMFunc: func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return []dcgm.Short{ + testutils.SampleGPUPowerUsageCounter.FieldID, + testutils.SampleDriverVersionCounter.FieldID, + } + }, + }, + { + name: "VGPU Counters", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_VGPU, + }, + mockDCGMFunc: func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return []dcgm.Short{ + testutils.SampleVGPULicenseStatusCounter.FieldID, + testutils.SampleDriverVersionCounter.FieldID, + } + }, + }, + { + name: "CPU and CPU Core Counters", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_CPU, + }, + mockDCGMFunc: func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return []dcgm.Short{ + testutils.SampleCPUUtilTotalCounter.FieldID, + testutils.SampleDriverVersionCounter.FieldID, + } + }, + }, + { + name: "Switch and NV Link Counters", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_SWITCH, + }, + mockDCGMFunc: func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + 
mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return []dcgm.Short{ + testutils.SampleSwitchCurrentTempCounter.FieldID, + testutils.SampleSwitchLinkFlitErrorsCounter.FieldID, + testutils.SampleDriverVersionCounter.FieldID, + } + }, + }, + { + name: "NV Link Counters", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_LINK, + }, + mockDCGMFunc: func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return []dcgm.Short{ + testutils.SampleSwitchLinkFlitErrorsCounter.FieldID, + testutils.SampleDriverVersionCounter.FieldID, + } + }, + }, + { + name: "Invalid Entity Type", + args: args{ + counterList: testutils.SampleCounters, + entityType: dcgm.FE_COUNT, + }, + mockDCGMFunc: func(fieldIDs []dcgm.Short) { + for _, fieldID := range fieldIDs { + mockDCGM.EXPECT().FieldGetById(fieldID).Return(testutils.SampleFieldIDToFieldMeta[fieldID]) + } + }, + want: func() []dcgm.Short { + return []dcgm.Short{ + testutils.SampleDriverVersionCounter.FieldID, + } + }, + }, + { + name: "No Counters", + args: args{ + counterList: []counters.Counter{}, + entityType: dcgm.FE_GPU, + }, + mockDCGMFunc: func(_ []dcgm.Short) {}, + want: func() []dcgm.Short { + return nil + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.mockDCGMFunc(testutils.SampleAllFieldIDs) + + d := &DeviceWatcher{} + want := tt.want() + got := d.GetDeviceFields(tt.args.counterList, tt.args.entityType) + + slices.Sort(want) + slices.Sort(got) + assert.Equal(t, want, got, "Device fields mismatch") + }) + } +} diff --git a/internal/pkg/devicewatcher/types.go b/internal/pkg/devicewatcher/types.go new file mode 100644 index 00000000..53e0a205 --- /dev/null +++ b/internal/pkg/devicewatcher/types.go @@ -0,0 +1,31 @@ +/* + * Copyright (c) 
2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/devicewatcher/mock_device_watcher.go -package=devicewatcher -copyright_file=../../../hack/header.txt . Watcher + +package devicewatcher + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +type Watcher interface { + GetDeviceFields([]counters.Counter, dcgm.Field_Entity_Group) []dcgm.Short + WatchDeviceFields([]dcgm.Short, deviceinfo.Provider, int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error) +} diff --git a/internal/pkg/devicewatcher/variables.go b/internal/pkg/devicewatcher/variables.go new file mode 100644 index 00000000..d209d996 --- /dev/null +++ b/internal/pkg/devicewatcher/variables.go @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devicewatcher + +var doNothing = func() { + // This function is intentionally left blank +} diff --git a/internal/pkg/devicewatchlistmanager/device_watchlist_manager.go b/internal/pkg/devicewatchlistmanager/device_watchlist_manager.go new file mode 100644 index 00000000..00a9f6c9 --- /dev/null +++ b/internal/pkg/devicewatchlistmanager/device_watchlist_manager.go @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package devicewatchlistmanager + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" +) + +// DeviceTypesToWatch lists the supported entity group types. +var DeviceTypesToWatch = []dcgm.Field_Entity_Group{ + dcgm.FE_GPU, + dcgm.FE_SWITCH, + dcgm.FE_LINK, + dcgm.FE_CPU, + dcgm.FE_CPU_CORE, +} + +type WatchList struct { + deviceInfo deviceinfo.Provider + deviceFields []dcgm.Short + deviceGroups []dcgm.GroupHandle + deviceFieldGroup dcgm.FieldHandle + labelDeviceFields []dcgm.Short + watcher devicewatcher.Watcher + collectInterval int64 +} + +func NewWatchList( + deviceInfo deviceinfo.Provider, deviceFields, labelDeviceFields []dcgm.Short, + watcher devicewatcher.Watcher, collectInterval int64, +) *WatchList { + return &WatchList{ + deviceInfo: deviceInfo, + deviceFields: deviceFields, + labelDeviceFields: labelDeviceFields, + watcher: watcher, + collectInterval: collectInterval, + } +} + +func (d *WatchList) DeviceInfo() deviceinfo.Provider { + return d.deviceInfo +} + +func (d *WatchList) DeviceFields() []dcgm.Short { + return d.deviceFields +} + +func (d *WatchList) SetDeviceFields(deviceFields []dcgm.Short) { + d.deviceFields = deviceFields +} + +func (d *WatchList) LabelDeviceFields() []dcgm.Short { + return d.labelDeviceFields +} + +func (d *WatchList) IsEmpty() bool { + return len(d.deviceFields) == 0 +} + +func (d *WatchList) Watch() ([]func(), error) { + var cleanups []func() + var err error + + d.deviceGroups, d.deviceFieldGroup, cleanups, err = d.watcher.WatchDeviceFields(d.deviceFields, d.deviceInfo, + d.collectInterval*1000) + return cleanups, err +} + +func (d *WatchList) DeviceGroups() []dcgm.GroupHandle { + return d.deviceGroups +} + +func (d *WatchList) DeviceFieldGroup() dcgm.FieldHandle { + return d.deviceFieldGroup 
+} + +// WatchListManager manages multiple entities and their corresponding WatchLists, counters to watch +// and device options. +type WatchListManager struct { + entityWatchLists map[dcgm.Field_Entity_Group]WatchList + counters counters.CounterList + gOpts appconfig.DeviceOptions + sOpts appconfig.DeviceOptions + cOpts appconfig.DeviceOptions + useFakeGPUs bool +} + +// NewWatchListManager creates a new instance of the WatchListManager. +func NewWatchListManager( + counters counters.CounterList, config *appconfig.Config, +) *WatchListManager { + return &WatchListManager{ + entityWatchLists: make(map[dcgm.Field_Entity_Group]WatchList), + counters: counters, + gOpts: config.GPUDeviceOptions, + sOpts: config.SwitchDeviceOptions, + cOpts: config.CPUDeviceOptions, + useFakeGPUs: config.UseFakeGPUs, + } +} + +// CreateEntityWatchList identifies an entity's device fields and label fields to monitor, +// and loads its device information. +func (e *WatchListManager) CreateEntityWatchList( + entityType dcgm.Field_Entity_Group, watcher devicewatcher.Watcher, collectInterval int64, +) error { + deviceFields := watcher.GetDeviceFields(e.counters, entityType) + + labelDeviceFields := watcher.GetDeviceFields(e.counters.LabelCounters(), entityType) + + deviceInfo, err := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, entityType) + if err != nil { + return err + } + + e.entityWatchLists[entityType] = *NewWatchList( + deviceInfo, + deviceFields, + labelDeviceFields, + watcher, + collectInterval) + + return err +} + +// EntityWatchList returns a given entity's WatchList and true if such WatchList exists; otherwise +// an empty WatchList and false. 
+func (e *WatchListManager) EntityWatchList(deviceType dcgm.Field_Entity_Group) (WatchList, bool) { + entityWatchList, exists := e.entityWatchLists[deviceType] + return entityWatchList, exists +} diff --git a/internal/pkg/devicewatchlistmanager/device_watchlist_manager_test.go b/internal/pkg/devicewatchlistmanager/device_watchlist_manager_test.go new file mode 100644 index 00000000..88ee975b --- /dev/null +++ b/internal/pkg/devicewatchlistmanager/device_watchlist_manager_test.go @@ -0,0 +1,780 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package devicewatchlistmanager + +import ( + "fmt" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockdcgm "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + mockdevicewatcher "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +var ( + deviceOptionFalse = appconfig.DeviceOptions{ + Flex: false, + MajorRange: nil, + MinorRange: nil, + } + + deviceOptionTrue = appconfig.DeviceOptions{ + Flex: true, + MajorRange: nil, + MinorRange: nil, + } + + deviceOptionOther = appconfig.DeviceOptions{ + Flex: false, + MajorRange: []int{1}, + MinorRange: []int{-1}, + } + + mockDeviceInfoFunc = func(ctrl *gomock.Controller) *mockdeviceinfo.MockProvider { + gOpts := appconfig.DeviceOptions{ + Flex: true, + } + + mockGPUDeviceInfo := testutils.MockGPUDeviceInfo(ctrl, 2, nil) + mockGPUDeviceInfo.EXPECT().GOpts().Return(gOpts).AnyTimes() + + return mockGPUDeviceInfo + } +) + +func TestNewWatchList(t *testing.T) { + ctrl := gomock.NewController(t) + + type args struct { + deviceInfo deviceinfo.Provider + deviceFields []dcgm.Short + labelDeviceFields []dcgm.Short + newDeviceFields []dcgm.Short + collectInterval int64 + } + tests := []struct { + name string + args args + wantEmpty bool + wantWatchErr bool + }{ + { + name: "New Watch List", + args: args{ + deviceInfo: mockDeviceInfoFunc(ctrl), + deviceFields: []dcgm.Short{1, 2, 3, 4}, + labelDeviceFields: []dcgm.Short{100, 101}, + collectInterval: int64(1), + }, + wantEmpty: false, + wantWatchErr: false, + }, + { + name: "Empty Device Fields", + 
args: args{ + deviceInfo: mockDeviceInfoFunc(ctrl), + deviceFields: nil, + labelDeviceFields: []dcgm.Short{100, 101}, + collectInterval: int64(1), + }, + wantEmpty: true, + wantWatchErr: false, + }, + { + name: "SetDevice Fields", + args: args{ + deviceInfo: mockDeviceInfoFunc(ctrl), + deviceFields: []dcgm.Short{1, 2, 3, 4}, + labelDeviceFields: []dcgm.Short{100, 101}, + newDeviceFields: []dcgm.Short{1000}, + collectInterval: int64(1), + }, + wantEmpty: false, + wantWatchErr: false, + }, + { + name: "Watch Error", + args: args{ + deviceInfo: mockDeviceInfoFunc(ctrl), + deviceFields: nil, + labelDeviceFields: []dcgm.Short{100, 101}, + collectInterval: int64(1), + }, + wantEmpty: true, + wantWatchErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockDeviceWatcher := mockdevicewatcher.NewMockWatcher(ctrl) + + var err error + if tt.wantWatchErr { + err = fmt.Errorf("some error") + } + + mockDeviceWatcher.EXPECT().WatchDeviceFields(tt.args.deviceFields, tt.args.deviceInfo, + tt.args.collectInterval*1000).Return([]dcgm.GroupHandle{}, dcgm.FieldHandle{}, []func(){}, err) + + got := NewWatchList(tt.args.deviceInfo, tt.args.deviceFields, tt.args.labelDeviceFields, mockDeviceWatcher, + tt.args.collectInterval) + + assert.Equal(t, tt.args.deviceInfo, got.DeviceInfo(), "Unexpected DeviceInfo() output.") + assert.Equal(t, tt.args.deviceFields, got.DeviceFields(), "Unexpected DeviceFields() output.") + assert.Equal(t, tt.args.labelDeviceFields, got.LabelDeviceFields(), + "Unexpected LabelDeviceFields() output.") + assert.Equal(t, tt.wantEmpty, got.IsEmpty(), "Unexpected IsEmpty() output.") + + _, err = got.Watch() + if !tt.wantWatchErr { + assert.Nil(t, err, "expected no error") + } else { + assert.NotNil(t, err, "expected error") + } + + if tt.args.newDeviceFields != nil { + got.SetDeviceFields(tt.args.newDeviceFields) + assert.Equal(t, tt.args.newDeviceFields, got.DeviceFields(), + "Unexpected DeviceFields() output after 
SetDeviceFields().") + assert.NotEqual(t, tt.args.deviceFields, got.DeviceFields(), + "Unexpected DeviceFields() output after SetDeviceFields().") + } + }) + } +} + +func TestNewWatchListManager(t *testing.T) { + type args struct { + counters counters.CounterList + config *appconfig.Config + } + tests := []struct { + name string + args args + want *WatchListManager + }{ + { + name: "New Watch List Manager", + args: args{ + counters: testutils.SampleCounters, + config: &appconfig.Config{ + GPUDeviceOptions: deviceOptionFalse, + SwitchDeviceOptions: deviceOptionTrue, + CPUDeviceOptions: deviceOptionOther, + UseFakeGPUs: false, + }, + }, + want: &WatchListManager{ + entityWatchLists: make(map[dcgm.Field_Entity_Group]WatchList), + counters: testutils.SampleCounters, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.want, NewWatchListManager(tt.args.counters, tt.args.config), + "Unexpected NewWatchListManager output") + }) + } +} + +func TestWatchListManager_CreateEntityWatchList(t *testing.T) { + ctrl := gomock.NewController(t) + mockDCGMProvider := mockdcgm.NewMockDCGM(ctrl) + + realDCGM := dcgmprovider.Client() + defer func() { + dcgmprovider.SetClient(realDCGM) + }() + dcgmprovider.SetClient(mockDCGMProvider) + + type fields struct { + entityWatchLists map[dcgm.Field_Entity_Group]WatchList + entityWatchListsCount int + counters counters.CounterList + gOpts appconfig.DeviceOptions + sOpts appconfig.DeviceOptions + cOpts appconfig.DeviceOptions + useFakeGPUs bool + } + type args struct { + entityType dcgm.Field_Entity_Group + watcher *mockdevicewatcher.MockWatcher + collectInterval int64 + } + tests := []struct { + name string + fields fields + args args + deviceFields []dcgm.Short + mockFunc func( + *mockdevicewatcher.MockWatcher, counters.CounterList, counters.CounterList, + dcgm.Field_Entity_Group, 
[]dcgm.Short, []dcgm.Short, + ) + wantFunc func( + *WatchListManager, dcgm.Field_Entity_Group, []dcgm.Short, []dcgm.Short, + *mockdevicewatcher.MockWatcher, int64, + ) map[dcgm.Field_Entity_Group]WatchList + wantErr bool + }{ + { + name: "Create GPU WatchList", + fields: fields{ + entityWatchLists: make(map[dcgm.Field_Entity_Group]WatchList), + entityWatchListsCount: 1, + counters: testutils.SampleCounters, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: testutils.SampleGPUFieldIDs, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(labelDeviceFields) + + fakeDevices := deviceinfo.SpoofGPUDevices() + _, fakeGPUs, _, _ := deviceinfo.SpoofMigHierarchy() + + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + // Times 2 because the wantFunc is also calling the same method + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).Times(2) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil).Times(2) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil).Times(2) + }, + wantFunc: func( + e *WatchListManager, entityType dcgm.Field_Entity_Group, deviceFields, + labelDeviceFields []dcgm.Short, watcher *mockdevicewatcher.MockWatcher, collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + watchList := make(map[dcgm.Field_Entity_Group]WatchList) + + mockDeviceInfo, _ := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, entityType) + 
watchList[entityType] = *NewWatchList(mockDeviceInfo, deviceFields, labelDeviceFields, watcher, + collectInterval) + + return watchList + }, + }, + { + name: "Override existing GPU WatchList", + fields: fields{ + entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{ + dcgm.FE_GPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + }, + entityWatchListsCount: 1, + counters: testutils.SampleCounters, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: testutils.SampleGPUFieldIDs, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(labelDeviceFields) + + fakeDevices := deviceinfo.SpoofGPUDevices() + _, fakeGPUs, _, _ := deviceinfo.SpoofMigHierarchy() + + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + // Times 2 because the wantFunc is also calling the same method + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).Times(2) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil).Times(2) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil).Times(2) + }, + wantFunc: func( + e *WatchListManager, entityType dcgm.Field_Entity_Group, deviceFields, + labelDeviceFields []dcgm.Short, watcher *mockdevicewatcher.MockWatcher, collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + watchList := make(map[dcgm.Field_Entity_Group]WatchList) + 
+ mockDeviceInfo, _ := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, entityType) + watchList[entityType] = *NewWatchList(mockDeviceInfo, deviceFields, labelDeviceFields, watcher, + collectInterval) + + return watchList + }, + }, + { + name: "Multiple Type WatchList", + fields: fields{ + entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{ + dcgm.FE_GPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + dcgm.FE_CPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{11, 21, 31}, + labelDeviceFields: []dcgm.Short{110, 210, 310}, + watcher: nil, + collectInterval: 10000, + }, + }, + entityWatchListsCount: 2, + counters: testutils.SampleCounters, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: testutils.SampleGPUFieldIDs, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(labelDeviceFields) + + fakeDevices := deviceinfo.SpoofGPUDevices() + _, fakeGPUs, _, _ := deviceinfo.SpoofMigHierarchy() + + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + // Times 2 because the wantFunc is also calling the same method + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).Times(2) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil).Times(2) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil).Times(2) + }, + 
wantFunc: func( + e *WatchListManager, entityType dcgm.Field_Entity_Group, deviceFields, + labelDeviceFields []dcgm.Short, watcher *mockdevicewatcher.MockWatcher, collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + watchList := make(map[dcgm.Field_Entity_Group]WatchList) + for entity, existingWatchList := range e.entityWatchLists { + watchList[entity] = existingWatchList + } + + mockDeviceInfo, _ := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, entityType) + watchList[entityType] = *NewWatchList(mockDeviceInfo, deviceFields, labelDeviceFields, watcher, + collectInterval) + + return watchList + }, + }, + { + name: "Multiple Type WatchList and different type", + fields: fields{ + entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{ + dcgm.FE_SWITCH: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + dcgm.FE_CPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{11, 21, 31}, + labelDeviceFields: []dcgm.Short{110, 210, 310}, + watcher: nil, + collectInterval: 10000, + }, + }, + entityWatchListsCount: 3, + counters: testutils.SampleCounters, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: testutils.SampleGPUFieldIDs, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(labelDeviceFields) + + fakeDevices := deviceinfo.SpoofGPUDevices() + _, fakeGPUs, _, _ := deviceinfo.SpoofMigHierarchy() + + mockHierarchy := 
dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + // Times 2 because the wantFunc is also calling the same method + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).Times(2) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil).Times(2) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil).Times(2) + }, + wantFunc: func( + e *WatchListManager, entityType dcgm.Field_Entity_Group, deviceFields, + labelDeviceFields []dcgm.Short, watcher *mockdevicewatcher.MockWatcher, collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + watchList := make(map[dcgm.Field_Entity_Group]WatchList) + for entity, existingWatchList := range e.entityWatchLists { + watchList[entity] = existingWatchList + } + + mockDeviceInfo, _ := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, entityType) + watchList[entityType] = *NewWatchList(mockDeviceInfo, deviceFields, labelDeviceFields, watcher, + collectInterval) + + return watchList + }, + }, + { + name: "Device Info initialize error", + fields: fields{ + entityWatchLists: make(map[dcgm.Field_Entity_Group]WatchList), + counters: testutils.SampleCounters, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: testutils.SampleGPUFieldIDs, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(labelDeviceFields) + + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(0), fmt.Errorf("some error")) + }, + wantFunc: func( + e 
*WatchListManager, entityType dcgm.Field_Entity_Group, deviceFields, + labelDeviceFields []dcgm.Short, watcher *mockdevicewatcher.MockWatcher, collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + return nil + }, + wantErr: true, + }, + { + name: "No GPU WatchList", + fields: fields{ + entityWatchLists: make(map[dcgm.Field_Entity_Group]WatchList), + entityWatchListsCount: 1, + counters: []counters.Counter{}, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: []dcgm.Short{}, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields).Times(1) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(deviceFields).Times(1) + + fakeDevices := deviceinfo.SpoofGPUDevices() + _, fakeGPUs, _, _ := deviceinfo.SpoofMigHierarchy() + + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + // Times 2 because the wantFunc is also calling the same method + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).Times(2) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil).Times(2) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil).Times(2) + }, + wantFunc: func( + e *WatchListManager, + entityType dcgm.Field_Entity_Group, + deviceFields, + labelDeviceFields []dcgm.Short, + watcher *mockdevicewatcher.MockWatcher, + collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + watchList := make(map[dcgm.Field_Entity_Group]WatchList) + + mockDeviceInfo, _ := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, 
entityType) + watchList[entityType] = *NewWatchList(mockDeviceInfo, deviceFields, []dcgm.Short{}, watcher, + collectInterval) + + return watchList + }, + wantErr: false, + }, + { + name: "Only Driver Version to Watch", + fields: fields{ + entityWatchLists: make(map[dcgm.Field_Entity_Group]WatchList), + entityWatchListsCount: 1, + counters: []counters.Counter{}, + gOpts: deviceOptionFalse, + sOpts: deviceOptionTrue, + cOpts: deviceOptionOther, + useFakeGPUs: false, + }, + args: args{ + entityType: dcgm.FE_GPU, + watcher: mockdevicewatcher.NewMockWatcher(ctrl), + collectInterval: 1, + }, + deviceFields: []dcgm.Short{testutils.SampleDriverVersionCounter.FieldID}, + mockFunc: func( + watcher *mockdevicewatcher.MockWatcher, + counters, labelCounters counters.CounterList, + entityType dcgm.Field_Entity_Group, + deviceFields, labelDeviceFields []dcgm.Short, + ) { + watcher.EXPECT().GetDeviceFields(counters, entityType).Return(deviceFields).Times(1) + watcher.EXPECT().GetDeviceFields(labelCounters, entityType).Return(deviceFields).Times(1) + + fakeDevices := deviceinfo.SpoofGPUDevices() + _, fakeGPUs, _, _ := deviceinfo.SpoofMigHierarchy() + + mockHierarchy := dcgm.MigHierarchy_v2{ + Count: 1, + } + mockHierarchy.EntityList[0] = fakeGPUs[0] + + // Times 2 because the wantFunc is also calling the same method + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).Times(2) + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(fakeDevices[0], nil).Times(2) + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockHierarchy, nil).Times(2) + }, + wantFunc: func( + e *WatchListManager, + entityType dcgm.Field_Entity_Group, + deviceFields, + labelDeviceFields []dcgm.Short, + watcher *mockdevicewatcher.MockWatcher, + collectInterval int64, + ) map[dcgm.Field_Entity_Group]WatchList { + watchList := make(map[dcgm.Field_Entity_Group]WatchList) + + mockDeviceInfo, _ := deviceinfo.Initialize(e.gOpts, e.sOpts, e.cOpts, e.useFakeGPUs, entityType) + 
watchList[entityType] = *NewWatchList(mockDeviceInfo, deviceFields, labelDeviceFields, watcher, + collectInterval) + + return watchList + }, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + e := &WatchListManager{ + entityWatchLists: tt.fields.entityWatchLists, + counters: tt.fields.counters, + gOpts: tt.fields.gOpts, + sOpts: tt.fields.sOpts, + cOpts: tt.fields.cOpts, + useFakeGPUs: tt.fields.useFakeGPUs, + } + + tt.mockFunc( + tt.args.watcher, + tt.fields.counters, + tt.fields.counters.LabelCounters(), + tt.args.entityType, + tt.deviceFields, + []dcgm.Short{testutils.SampleDriverVersionCounter.FieldID}, + ) + + want := tt.wantFunc( + e, + tt.args.entityType, + tt.deviceFields, + []dcgm.Short{testutils.SampleDriverVersionCounter.FieldID}, + tt.args.watcher, + tt.args.collectInterval, + ) + + err := e.CreateEntityWatchList(tt.args.entityType, tt.args.watcher, tt.args.collectInterval) + got := e.entityWatchLists + gotEntityWatchList, exist := e.EntityWatchList(tt.args.entityType) + + if !tt.wantErr { + assert.Nil(t, err, "expected no error") + wantEntityWatchList := want[tt.args.entityType] + + assert.True(t, exist, "expected entity to exist") + assert.Equal(t, want, got, "expected output to be equal") + assert.Equal(t, tt.fields.entityWatchListsCount, len(got), + "expected entityWatchLists count to be equal") + assert.Equal(t, wantEntityWatchList, gotEntityWatchList, "expected entity results to be equal") + } else { + assert.NotNil(t, err, "expected an error.") + assert.Equal(t, 0, len(got), "expected output to be zero") + assert.False(t, exist, "expected entity to not exist") + } + }) + } +} + +func TestWatchListManager_EntityWatchList(t *testing.T) { + tests := []struct { + name string + deviceType dcgm.Field_Entity_Group + entityWatchLists map[dcgm.Field_Entity_Group]WatchList + wantWatchList WatchList + wantExist bool + override bool + }{ + { + name: "Get GPU WatchList", + deviceType: dcgm.FE_GPU, + 
entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{ + dcgm.FE_GPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + }, + wantWatchList: WatchList{ + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + wantExist: true, + }, + { + name: "Get latest GPU WatchList", + deviceType: dcgm.FE_GPU, + entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{ + dcgm.FE_GPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + }, + wantWatchList: WatchList{ + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{101, 201, 301}, + labelDeviceFields: []dcgm.Short{1001, 2001, 3001}, + watcher: nil, + collectInterval: 10000, + }, + wantExist: true, + override: true, + }, + { + name: "Empty WatchList", + deviceType: dcgm.FE_GPU, + entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{}, + wantWatchList: WatchList{}, + wantExist: false, + }, + { + name: "Get GPU WatchList when only CPU Entity exist", + deviceType: dcgm.FE_GPU, + entityWatchLists: map[dcgm.Field_Entity_Group]WatchList{ + dcgm.FE_CPU: { + deviceInfo: &deviceinfo.Info{}, + deviceFields: []dcgm.Short{10, 20, 30}, + labelDeviceFields: []dcgm.Short{100, 200, 300}, + watcher: nil, + collectInterval: 10000, + }, + }, + wantWatchList: WatchList{}, + wantExist: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + e := &WatchListManager{ + entityWatchLists: tt.entityWatchLists, + } + + if tt.override { + e.entityWatchLists[tt.deviceType] = tt.wantWatchList + } + + gotEntityWatchList, exist := e.EntityWatchList(tt.deviceType) + assert.Equal(t, tt.wantExist, exist, "expected entity exist value to be equal") + 
assert.Equal(t, tt.wantWatchList, gotEntityWatchList, "expected output to be equal") + }) + } +} diff --git a/internal/pkg/devicewatchlistmanager/types.go b/internal/pkg/devicewatchlistmanager/types.go new file mode 100644 index 00000000..e856fd81 --- /dev/null +++ b/internal/pkg/devicewatchlistmanager/types.go @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go -package=devicewatchlistmanager -copyright_file=../../../hack/header.txt . Manager + +package devicewatchlistmanager + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" +) + +type Manager interface { + CreateEntityWatchList(dcgm.Field_Entity_Group, devicewatcher.Watcher, int64) error + EntityWatchList(dcgm.Field_Entity_Group) (WatchList, bool) +} diff --git a/internal/pkg/elf/README.md b/internal/pkg/elf/README.md new file mode 100644 index 00000000..fa1e8378 --- /dev/null +++ b/internal/pkg/elf/README.md @@ -0,0 +1,3 @@ +# ELF - wrapper package for the standard library debug/elf package + +This package wraps debug/elf functions so that they can be mocked for testing purposes. 
diff --git a/internal/pkg/elf/elf.go b/internal/pkg/elf/elf.go new file mode 100644 index 00000000..a547943b --- /dev/null +++ b/internal/pkg/elf/elf.go @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package elf + +import ( + "debug/elf" +) + +var _ ELF = (*RealELF)(nil) + +type RealELF struct{} + +func (r RealELF) Open(name string) (*elf.File, error) { + return elf.Open(name) +} diff --git a/internal/pkg/elf/types.go b/internal/pkg/elf/types.go new file mode 100644 index 00000000..f55cf498 --- /dev/null +++ b/internal/pkg/elf/types.go @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package elf + +import "debug/elf" + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/elf/mock_elf.go -package=elf -copyright_file=../../../hack/header.txt . ELF +type ELF interface { + Open(name string) (*elf.File, error) +} diff --git a/internal/pkg/exec/README.md b/internal/pkg/exec/README.md new file mode 100644 index 00000000..85171b19 --- /dev/null +++ b/internal/pkg/exec/README.md @@ -0,0 +1,3 @@ +# Exec - wrapper package for the standard library os/exec package + +This package wraps os/exec functions so that they can be mocked for testing purposes. diff --git a/internal/pkg/exec/exec.go b/internal/pkg/exec/exec.go new file mode 100644 index 00000000..1af40c62 --- /dev/null +++ b/internal/pkg/exec/exec.go @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package exec + +import "os/exec" + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/exec/mock_exec.go -package=exec -copyright_file=../../../hack/header.txt . Exec +type Exec interface { + Command(name string, arg ...string) Cmd +} + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/exec/mock_cmd.go -package=exec -copyright_file=../../../hack/header.txt . 
Cmd +type Cmd interface { + Output() ([]byte, error) +} + +var ( + _ Exec = (*RealExec)(nil) + _ Cmd = (*RealCmd)(nil) +) + +type RealExec struct{} + +func (r RealExec) Command(name string, arg ...string) Cmd { + return &RealCmd{cmd: exec.Command(name, arg...)} +} + +type RealCmd struct { + cmd *exec.Cmd +} + +func (r *RealCmd) Output() ([]byte, error) { + return r.cmd.Output() +} diff --git a/internal/pkg/hostname/hostname.go b/internal/pkg/hostname/hostname.go new file mode 100644 index 00000000..c714929d --- /dev/null +++ b/internal/pkg/hostname/hostname.go @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package hostname + +import ( + "net" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" +) + +var os osinterface.OS = osinterface.RealOS{} + +// GetHostname returns the host part of config.RemoteHEInfo when config.UseRemoteHE is set; otherwise the NODE_NAME environment variable or, if it is empty, the OS hostname. 
+func GetHostname(config *appconfig.Config) (string, error) { + if config.UseRemoteHE { + return parseRemoteHostname(config) + } + return getLocalHostname() +} + +func parseRemoteHostname(config *appconfig.Config) (string, error) { + // Extract the hostname or IP address part from the appconfig.RemoteHEInfo + // This handles inputs like "localhost:5555", "example.com:5555", or "192.168.1.1:5555" + host, _, err := net.SplitHostPort(config.RemoteHEInfo) + if err != nil { + // If there's an error, it might be because there's no port in the appconfig.RemoteHEInfo + // In that case, use the appconfig.RemoteHEInfo as is + host = config.RemoteHEInfo + } + return host, nil +} + +func getLocalHostname() (string, error) { + if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { + return nodeName, nil + } + hostname, err := os.Hostname() + if err != nil { + return "", err + } + return hostname, nil +} diff --git a/internal/pkg/hostname/hostname_test.go b/internal/pkg/hostname/hostname_test.go new file mode 100644 index 00000000..78c9afc8 --- /dev/null +++ b/internal/pkg/hostname/hostname_test.go @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package hostname + +import ( + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + osmock "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" +) + +func TestGetHostname(t *testing.T) { + tests := []struct { + name string + config *appconfig.Config + hook func() func() + want string + wantErr assert.ErrorAssertionFunc + }{ + { + name: "When os.Hostname() return hostname", + config: &appconfig.Config{UseRemoteHE: false}, + hook: func() func() { + ctrl := gomock.NewController(t) + m := osmock.NewMockOS(ctrl) + m.EXPECT().Getenv(gomock.Eq("NODE_NAME")) + m.EXPECT().Hostname().Return("test-hostname", nil).AnyTimes() + os = m + return func() { + os = osinterface.RealOS{} + } + }, + want: "test-hostname", + }, + { + name: "When GetHostname uses the NODE_NAME env variable", + config: &appconfig.Config{UseRemoteHE: false}, + hook: func() func() { + ctrl := gomock.NewController(t) + m := osmock.NewMockOS(ctrl) + m.EXPECT().Getenv(gomock.Eq("NODE_NAME")).Return("test-hostname") + os = m + return func() { + os = osinterface.RealOS{} + } + }, + want: "test-hostname", + }, + { + name: "When os.Hostname() return error", + config: &appconfig.Config{UseRemoteHE: false}, + hook: func() func() { + ctrl := gomock.NewController(t) + m := osmock.NewMockOS(ctrl) + m.EXPECT().Getenv(gomock.Eq("NODE_NAME")) + m.EXPECT().Hostname().Return("", errors.New("Boom!")).AnyTimes() + os = m + return func() { + os = osinterface.RealOS{} + } + }, + want: "", + }, + { + name: "When os.Hostname() return error", + config: &appconfig.Config{UseRemoteHE: false}, + hook: func() func() { + ctrl := gomock.NewController(t) + m := osmock.NewMockOS(ctrl) + m.EXPECT().Getenv(gomock.Eq("NODE_NAME")) + m.EXPECT().Hostname().Return("", errors.New("Boom!")).AnyTimes() + os = m + return func() { + os = osinterface.RealOS{} + } + }, + 
want: "", + wantErr: assert.Error, + }, + { + name: "When appconfig.UseRemoteHE is true and remote hostname is name", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "example.com:5555", + }, + want: "example.com", + }, + { + name: "When appconfig.UseRemoteHE is true and hostname is IP address", + config: &appconfig.Config{ + UseRemoteHE: true, + RemoteHEInfo: "127.0.0.1", + }, + want: "127.0.0.1", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.hook != nil { + cleanup := tt.hook() + defer cleanup() + } + got, err := GetHostname(tt.config) + if tt.wantErr != nil && !tt.wantErr(t, err, fmt.Sprintf("GetHostname(%v)", tt.config)) { + return + } + assert.Equalf(t, tt.want, got, "GetHostname(%v)", tt.config) + }) + } +} diff --git a/internal/pkg/integration_test/collector_test.go b/internal/pkg/integration_test/collector_test.go new file mode 100644 index 00000000..357bbdf9 --- /dev/null +++ b/internal/pkg/integration_test/collector_test.go @@ -0,0 +1,1117 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package integration_test + +import ( + "bytes" + "fmt" + "reflect" + "slices" + "strconv" + "testing" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + io_prometheus_client "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + "google.golang.org/grpc" + "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" + "k8s.io/utils/ptr" + + mockdcgmprovider "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/rendermetrics" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +var deviceWatcher = devicewatcher.NewDeviceWatcher() + +var expectedGPUMetrics = map[string]bool{ + testutils.SampleGPUTempCounter.FieldName: true, + testutils.SampleGPUTotalEnergyCounter.FieldName: true, + testutils.SampleGPUPowerUsageCounter.FieldName: true, + testutils.SampleVGPULicenseStatusCounter.FieldName: true, +} + +var expectedCPUMetrics = map[string]bool{ + testutils.SampleCPUUtilTotalCounter.FieldName: true, +} + +func setupTest() func() { + config := &appconfig.Config{ + UseRemoteHE: false, + } + + dcgmprovider.Initialize(config) + + return func() { + defer dcgmprovider.Client().Cleanup() + } +} + +func runOnlyWithLiveGPUs(t *testing.T) { + t.Helper() + gpus, err := dcgmprovider.Client().GetSupportedDevices() + assert.NoError(t, err) + if len(gpus) < 1 { + t.Skip("Skipping test that requires live GPUs. 
None were found") + } +} + +func mockDCGM(ctrl *gomock.Controller) *mockdcgmprovider.MockDCGM { + // Mock results outputs + mockDevice := dcgm.Device{ + GPU: 0, + UUID: "fake1", + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, + } + + mockMigHierarchy := dcgm.MigHierarchy_v2{ + Count: 0, + } + + mockCPUHierarchy := dcgm.CpuHierarchy_v1{ + Version: 0, + NumCpus: 1, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{ + { + CpuId: 0, + OwnedCores: []uint64{0, 18446744073709551360, 65535}, + }, + }, + } + + mockGroupHandle := dcgm.GroupHandle{} + mockGroupHandle.SetHandle(1) + + mockFieldHandle := dcgm.FieldHandle{} + mockFieldHandle.SetHandle(1) + + mockDCGMProvider := mockdcgmprovider.NewMockDCGM(ctrl) + mockDCGMProvider.EXPECT().GetAllDeviceCount().Return(uint(1), nil).AnyTimes() + mockDCGMProvider.EXPECT().AddEntityToGroup(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().GetGpuInstanceHierarchy().Return(mockMigHierarchy, nil).AnyTimes() + mockDCGMProvider.EXPECT().GetCpuHierarchy().Return(mockCPUHierarchy, nil).AnyTimes() + mockDCGMProvider.EXPECT().CreateGroup(gomock.Any()).Return(mockGroupHandle, nil).AnyTimes() + mockDCGMProvider.EXPECT().DestroyGroup(gomock.Any()).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().FieldGroupCreate(gomock.Any(), gomock.Any()).Return(mockFieldHandle, nil).AnyTimes() + mockDCGMProvider.EXPECT().FieldGroupDestroy(gomock.Any()).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().WatchFieldsWithGroupEx(gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + mockDCGMProvider.EXPECT().GetDeviceInfo(gomock.Any()).Return(mockDevice, nil).AnyTimes() + + return mockDCGMProvider +} + +func TestClockEventsCollector_NewClocksThrottleReasonsCollector(t *testing.T) { + config := &appconfig.Config{ + UseRemoteHE: false, + GPUDeviceOptions: appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + } + + 
dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + allCounters := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + }, + } + + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(allCounters, config) + err := deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + item, _ := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + + t.Run("Should Return Error When DCGM_EXP_CLOCK_EVENTS_COUNT is not present", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 0) + require.Len(t, cc.DCGMCounters, 1) + clockEventCollector, err := collector.NewClockEventsCollector(cc.DCGMCounters, "", config, item) + require.Error(t, err) + require.Nil(t, clockEventCollector) + }) + + t.Run("Should Return Error When Counter Param Is Empty", func(t *testing.T) { + counterList := make([]counters.Counter, 0) + clockEventCollector, err := collector.NewClockEventsCollector(counterList, "", config, item) + require.Error(t, err) + require.Nil(t, clockEventCollector) + }) + + t.Run("Should Not Return Error When DCGM_EXP_CLOCK_EVENTS_COUNT Present More Than Once", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + } + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + clockEventCollector, err := collector.NewClockEventsCollector(cc.ExporterCounters, "", config, item) + require.NoError(t, err) + 
require.NotNil(t, clockEventCollector) + }) +} + +func TestClockEventsCollector_Gather(t *testing.T) { + teardownTest := setupTest() + defer teardownTest() + runOnlyWithLiveGPUs(t) + testutils.RequireLinux(t) + + hostname := "local-test" + config := &appconfig.Config{ + GPUDeviceOptions: appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 1) + + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters.LabelCounters()...) + + // Create fake GPU + numGPUs, err := dcgmprovider.Client().GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgmprovider.Client().CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + type clockEventsCountExpectation map[string]string + expectations := map[string]clockEventsCountExpectation{} + + for i, gpuID := range gpuIDs { + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(collector.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|collector.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL), + ) + require.NoError(t, err) + + err = dcgmprovider.Client().InjectFieldValue(gpuID, + 
dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(collector.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|collector.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL), + ) + require.NoError(t, err) + + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(collector.DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE), + ) + require.NoError(t, err) + + expectations[fmt.Sprint(gpuID)] = clockEventsCountExpectation{ + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "2", + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "2", + collector.DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", + } + } + + // Create a fake K8S to emulate work on K8S environment + tmpDir, cleanup := testutils.CreateTmpDir(t) + defer cleanup() + socketPath := tmpDir + "/kubelet.sock" + server := grpc.NewServer() + + gpuIDsAsString := make([]string, len(gpuIDs)) + + for i, g := range gpuIDs { + gpuIDsAsString[i] = fmt.Sprint(g) + } + + v1alpha1.RegisterPodResourcesListerServer(server, + testutils.NewMockPodResourcesServer(appconfig.NvidiaResourceName, gpuIDsAsString)) + // Tell that the app is running on K8S + config.Kubernetes = true + config.PodResourcesKubeletSocket = socketPath + + allCounters := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + }, + } + + allCounters = append(allCounters, cc.ExporterCounters.LabelCounters()...) 
+ + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(allCounters, config) + err = deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + item, _ := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + + clockEventCollector, err := collector.NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) + require.NoError(t, err) + + defer func() { + clockEventCollector.Cleanup() + }() + + metrics, err := clockEventCollector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + + for i := len(metricValues) - 1; i >= 0; i-- { // walk backwards: removing element i must not skip the element that shifts into its place + gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64) + if err == nil { + if !slices.Contains(gpuIDs, uint(gpuID)) { + metricValues = append(metricValues[:i], metricValues[i+1:]...) 
+ } + } + } + + // We expect 9 records, because we have 3 fake GPU and each GPU experienced 3 CLOCK_EVENTS + require.Len(t, metricValues, 9) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.ClockEventsCountWindowSize), val.Labels["window_size_in_ms"]) + expected, exists := expectations[val.GPU] + require.True(t, exists) + actualReason, exists := val.Labels["clock_event"] + require.True(t, exists) + expectedVal, exists := expected[actualReason] + require.True(t, exists) + require.Equal(t, expectedVal, val.Value) + } +} + +func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) { + teardownTest := setupTest() + defer teardownTest() + runOnlyWithLiveGPUs(t) + + hostname := "local-test" + config := &appconfig.Config{ + GPUDeviceOptions: appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 1) + + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + + // Create fake GPU + numGPUs, err := dcgmprovider.Client().GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgmprovider.Client().CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + type clockThrottleReasonExpectation map[string]string + 
expectations := map[string]clockThrottleReasonExpectation{} + + require.Len(t, gpuIDs, 1) + gpuID := gpuIDs[0] + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(), + int64(collector.DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE| + collector.DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING| + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP| + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN| + collector.DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST| + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL| + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL| + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE| + collector.DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS), + ) + + require.NoError(t, err) + + expectations[fmt.Sprint(gpuID)] = clockThrottleReasonExpectation{ + collector.DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE.String(): "1", + collector.DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS.String(): "1", + } + + allCounters := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + }, + } + + allCounters = append(allCounters, cc.ExporterCounters.LabelCounters()...) 
+ + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(allCounters, config) + + err = deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + item, _ := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + + clockEventCollector, err := collector.NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) + require.NoError(t, err) + + defer func() { + clockEventCollector.Cleanup() + }() + + metrics, err := clockEventCollector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + + metricValues = getFakeGPUMetrics(metricValues, gpuIDs) + + // Expected 9 metric values, because we injected 9 reasons + require.Len(t, metricValues, 9) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.ClockEventsCountWindowSize), val.Labels["window_size_in_ms"]) + expected, exists := expectations[val.GPU] + require.True(t, exists) + actualReason, exists := val.Labels["clock_event"] + require.True(t, exists) + expectedVal, exists := expected[actualReason] + require.True(t, exists) + require.Equal(t, expectedVal, val.Value) + } +} + +func TestClockEventsCollector_Gather_AllTheThings_WhenNoLabels(t *testing.T) { + teardownTest := setupTest() + defer teardownTest() + runOnlyWithLiveGPUs(t) + + hostname := "local-test" + config := &appconfig.Config{ + GPUDeviceOptions: appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + } + + cc, err := counters.ExtractCounters(records, config) + 
require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 0) + + // Create fake GPU + numGPUs, err := dcgmprovider.Client().GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgmprovider.Client().CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + gpuID := gpuIDs[0] + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(), + int64(collector.DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE| + collector.DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING| + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP| + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN| + collector.DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST| + collector.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL| + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL| + collector.DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE| + collector.DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS), + ) + + require.NoError(t, err) + + allCounters := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + }, + } + + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(allCounters, config) + + err = deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + item, _ := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + + clockEventCollector, err := collector.NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) + require.NoError(t, err) + + defer func() { + clockEventCollector.Cleanup() + }() + + metrics, err := clockEventCollector.GetMetrics() + require.NoError(t, err) + 
require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + // Exclude the real GPU from the test + metricValues = getFakeGPUMetrics(metricValues, gpuIDs) + // Expected 9 metric values, because we injected 9 reasons + require.Len(t, metricValues, 9) +} + +func getFakeGPUMetrics(metricValues []collector.Metric, gpuIDs []uint) []collector.Metric { + for i := 0; i < len(metricValues); i++ { + gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64) + if err == nil { + if !slices.Contains(gpuIDs, uint(gpuID)) { + metricValues = append(metricValues[:i], metricValues[i+1:]...) + } + } + } + return metricValues +} + +func TestXIDCollector_Gather_Encode(t *testing.T) { + teardownTest := setupTest() + defer teardownTest() + runOnlyWithLiveGPUs(t) + + hostname := "local-test" + config := &appconfig.Config{ + GPUDeviceOptions: appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + XIDCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + { + "DCGM_EXP_XID_ERRORS_COUNT", + "gauge", + "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + }, + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 1) + + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + + // Get a number of hardware GPUs + hardwareGPUs, err := dcgmprovider.Client().GetAllDeviceCount() + require.NoError(t, err) + + if hardwareGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + 
entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + // Create fake GPU + fakeGPUIDs, err := dcgmprovider.Client().CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, fakeGPUIDs) + + for i, gpuID := range fakeGPUIDs { + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(42), + ) + require.NoError(t, err) + + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(42), + ) + require.NoError(t, err) + + err = dcgmprovider.Client().InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(46), + ) + require.NoError(t, err) + + } + + allCounters := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + }, + } + + allCounters = append(allCounters, cc.ExporterCounters.LabelCounters()...) 
+ + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(allCounters, config) + err = deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + item, exists := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + require.True(t, exists) + + xidCollector, err := collector.NewXIDCollector(cc.ExporterCounters, hostname, config, item) + require.NoError(t, err) + + defer func() { + xidCollector.Cleanup() + }() + + metrics, err := xidCollector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + + fakeGPUIDMap := map[string]struct{}{} + for _, fakeGPUID := range fakeGPUIDs { + fakeGPUIDMap[fmt.Sprint(fakeGPUID)] = struct{}{} + } + + conditionFakeGPUOnly := func(m collector.Metric) bool { + _, exists := fakeGPUIDMap[m.GPU] + return exists + } + + // We want to filter out physical GPU and keep fake only + metricValues = filterMetrics(metricValues, conditionFakeGPUOnly) + require.Len(t, metricValues, len(fakeGPUIDs)*2) + + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) + } + + // We inject new error + err = dcgmprovider.Client().InjectFieldValue(fakeGPUIDs[0], + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().UnixMicro(), + int64(19), + ) + require.NoError(t, err) + + // Wait for 1 second + time.Sleep(1 * time.Second) + + metrics, err = xidCollector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + + // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT + require.Len(t, metrics, 1) + // We get metric value with the last index + metricValues = 
metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] + // We want to filter out physical GPU and keep fake only + metricValues = filterMetrics(metricValues, conditionFakeGPUOnly) + // We update metrics with slice, that doesn't contain physical GPU + metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] = metricValues + + // We have 3 fake GPU and each GPU experienced 3 XID errors: 42, 46, 19 to GPU0 + require.Len(t, metricValues, 1+(len(fakeGPUIDs)*2)) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) + } + + // Now we check the metric rendering + var b bytes.Buffer + err = rendermetrics.RenderGroup(&b, dcgm.FE_GPU, metrics) + require.NoError(t, err) + require.NotEmpty(t, b) + + var parser expfmt.TextParser + mf, err := parser.TextToMetricFamilies(&b) + require.NoError(t, err) + require.NotEmpty(t, mf) + require.Len(t, mf, 1) + metricFamily := mf[reflect.ValueOf(mf).MapKeys()[0].Interface().(string)] + require.NotNil(t, metricFamily.Name) + assert.Equal(t, "DCGM_EXP_XID_ERRORS_COUNT", *metricFamily.Name) + assert.Equal(t, "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + *metricFamily.Help) + assert.Equal(t, io_prometheus_client.MetricType_GAUGE, *metricFamily.Type) + // We have 3 fake GPU and each GPU, except the one experienced XID errors: 42, 46, 19 + require.Len(t, metricFamily.Metric, 1+(len(fakeGPUIDs)*2)) + for _, mv := range metricFamily.Metric { + require.NotNil(t, mv.Gauge.Value) + if *(mv.Gauge.Value) == 0 { + // We don't inject XID errors into the hardware GPU, so we do not expect XID label + assert.Len(t, mv.Label, 7) + assert.False(t, slices.ContainsFunc(mv.Label, func(lp *io_prometheus_client.LabelPair) bool { + return ptr.Deref(lp.Name, "") == "xid" + })) + continue + } + assert.Len(t, mv.Label, 9) + assert.Equal(t, "gpu", 
*mv.Label[0].Name) + assert.Equal(t, "UUID", *mv.Label[1].Name) + assert.Equal(t, "pci_bus_id", *mv.Label[2].Name) + assert.NotEmpty(t, *mv.Label[2].Value) + assert.Equal(t, "device", *mv.Label[3].Name) + assert.Equal(t, "modelName", *mv.Label[4].Name) + assert.Equal(t, "Hostname", *mv.Label[5].Name) + assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[6].Name) + assert.Equal(t, "window_size_in_ms", *mv.Label[7].Name) + assert.Equal(t, "xid", *mv.Label[8].Name) + assert.NotEmpty(t, *mv.Label[8].Value) + } +} + +func TestXIDCollector_NewXIDCollector(t *testing.T) { + config := &appconfig.Config{ + UseRemoteHE: false, + GPUDeviceOptions: appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + } + + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + allCounters := []counters.Counter{ + { + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, + }, + } + + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(allCounters, config) + err := deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + item, _ := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + + t.Run("Should Return Error When DCGM_EXP_XID_ERRORS_COUNT is not present", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 0) + require.Len(t, cc.DCGMCounters, 1) + + xidCollector, err := collector.NewXIDCollector(cc.DCGMCounters, "", config, item) + require.Error(t, err) + require.Nil(t, xidCollector) + }) + + t.Run("Should Return Error When Counters Param Is Empty", func(t *testing.T) { + emptyCounters := make([]counters.Counter, 0) + xidCollector, err := collector.NewXIDCollector(emptyCounters, "", config, item) + require.Error(t, err) + require.Nil(t, xidCollector) + }) + + 
t.Run("Should Not Return Error When DCGM_EXP_XID_ERRORS_COUNT Present More Than Once", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + { + "DCGM_EXP_XID_ERRORS_COUNT", + "gauge", + "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + }, + { + "DCGM_EXP_XID_ERRORS_COUNT", + "gauge", + "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + }, + { + "DCGM_EXP_XID_ERRORS_COUNT", + "gauge", + "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + }, + } + cc, err := counters.ExtractCounters(records, config) + require.NoError(t, err) + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + xidCollector, err := collector.NewXIDCollector(cc.ExporterCounters, "", config, item) + require.NoError(t, err) + require.NotNil(t, xidCollector) + }) +} + +func filterMetrics(metricValues []collector.Metric, condition func(metric collector.Metric) bool) []collector.Metric { + var result []collector.Metric + for _, metricValue := range metricValues { + if condition(metricValue) { + result = append(result, metricValue) + } + } + return result +} + +func TestDCGMCollector(t *testing.T) { + config := &appconfig.Config{ + UseRemoteHE: false, + } + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + dcgmCollector := testDCGMGPUCollector(t, testutils.SampleCounters) + dcgmCollector.Cleanup() + + dcgmCollector = testDCGMCPUCollector(t, testutils.SampleCounters) + dcgmCollector.Cleanup() +} + +func testDCGMGPUCollector(t *testing.T, counters []counters.Counter) *collector.DCGMCollector { + dOpt := appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + } + config := appconfig.Config{ + GPUDeviceOptions: dOpt, + NoHostname: false, + UseOldNamespace: false, + 
UseFakeGPUs: false, + CollectInterval: 1, + } + + // Store actual dcgm provider + realDCGMProvider := dcgmprovider.Client() + defer dcgmprovider.SetClient(realDCGMProvider) + + ctrl := gomock.NewController(t) + mockDCGMProvider := mockDCGM(ctrl) + + // Calls where actual API calls and results are desirable + mockDCGMProvider.EXPECT().FieldGetById(gomock.Any()). + DoAndReturn(func(fieldID dcgm.Short) dcgm.FieldMeta { + return realDCGMProvider.FieldGetById(fieldID) + }).AnyTimes() + + mockDCGMProvider.EXPECT().EntityGetLatestValues(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func(entityGroup dcgm.Field_Entity_Group, entityId uint, fields []dcgm.Short) ([]dcgm.FieldValue_v1, + error, + ) { + return realDCGMProvider.EntityGetLatestValues(entityGroup, entityId, fields) + }).AnyTimes() + + // Set mock DCGM provider + dcgmprovider.SetClient(mockDCGMProvider) + + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(counters, &config) + + err := deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + gpuItem, exists := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + require.True(t, exists) + + g, err := collector.NewDCGMCollector(counters, "", &config, gpuItem) + require.NoError(t, err) + + /* Test for error when no switches are available to monitor. */ + switchItem, exists := deviceWatchListManager.EntityWatchList(dcgm.FE_SWITCH) + assert.False(t, exists, "dcgm.FE_SWITCH should not be available") + + _, err = collector.NewDCGMCollector(counters, "", &config, switchItem) + require.Error(t, err, "NewDCGMCollector should return error") + + /* Test for error when no cpus are available to monitor. 
*/ + cpuItem, exist := deviceWatchListManager.EntityWatchList(dcgm.FE_CPU) + require.False(t, exist, "dcgm.FE_CPU should not be available") + + _, err = collector.NewDCGMCollector(counters, "", &config, cpuItem) + require.Error(t, err, "NewDCGMCollector should return error") + + out, err := g.GetMetrics() + require.NoError(t, err) + require.Greater(t, len(out), 0, "Check that you have a GPU on this node") + require.Len(t, out, len(expectedGPUMetrics), + fmt.Sprintf("Expected: %+v \nGot: %+v", expectedGPUMetrics, out)) + + seenMetrics := map[string]bool{} + for _, metrics := range out { + for _, metric := range metrics { + seenMetrics[metric.Counter.FieldName] = true + require.NotEmpty(t, metric.GPU) + require.NotEmpty(t, metric.GPUUUID) + require.NotEmpty(t, metric.GPUPCIBusID) + require.NotEmpty(t, metric.Value) + require.NotEqual(t, metric.Value, collector.FailedToConvert) + } + } + require.Equal(t, seenMetrics, expectedGPUMetrics) + + return g +} + +func testDCGMCPUCollector(t *testing.T, counters []counters.Counter) *collector.DCGMCollector { + dOpt := appconfig.DeviceOptions{Flex: true, MajorRange: []int{-1}, MinorRange: []int{-1}} + config := appconfig.Config{ + CPUDeviceOptions: dOpt, + NoHostname: false, + UseOldNamespace: false, + UseFakeGPUs: false, + } + + realDCGMProvider := dcgmprovider.Client() + defer dcgmprovider.SetClient(realDCGMProvider) + + ctrl := gomock.NewController(t) + mockDCGMProvider := mockDCGM(ctrl) + + // Calls where actual API calls and results are desirable + mockDCGMProvider.EXPECT().FieldGetById(gomock.Any()). + DoAndReturn(func(fieldID dcgm.Short) dcgm.FieldMeta { + return realDCGMProvider.FieldGetById(fieldID) + }).AnyTimes() + + mockDCGMProvider.EXPECT().EntityGetLatestValues(gomock.Any(), gomock.Any(), gomock.Any()). 
+ DoAndReturn(func(entityGroup dcgm.Field_Entity_Group, entityId uint, fields []dcgm.Short) ([]dcgm.FieldValue_v1, + error, + ) { + return realDCGMProvider.EntityGetLatestValues(entityGroup, entityId, fields) + }).AnyTimes() + + dcgmprovider.SetClient(mockDCGMProvider) + + /* Test that only cpu metrics are collected for cpu entities. */ + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(counters, &config) + err := deviceWatchListManager.CreateEntityWatchList(dcgm.FE_CPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + err = deviceWatchListManager.CreateEntityWatchList(dcgm.FE_CPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + cpuItem, cpuItemExist := deviceWatchListManager.EntityWatchList(dcgm.FE_CPU) + require.True(t, cpuItemExist) + + c, err := collector.NewDCGMCollector(counters, "", &config, cpuItem) + require.NoError(t, err) + + out, err := c.GetMetrics() + require.NoError(t, err) + require.Greater(t, len(out), 0, "Check that the fake CPU has been registered") + + for _, dev := range out { + seenMetrics := map[string]bool{} + for _, metric := range dev { + seenMetrics[metric.Counter.FieldName] = true + require.NotEmpty(t, metric.GPU) + + require.NotEmpty(t, metric.Value) + require.NotEqual(t, metric.Value, collector.FailedToConvert) + } + require.Equal(t, seenMetrics, expectedCPUMetrics) + } + + return c +} + +func TestGPUCollector_GetMetrics(t *testing.T) { + teardownTest := setupTest() + defer teardownTest() + + runOnlyWithLiveGPUs(t) + // Create fake GPU + numGPUs, err := dcgmprovider.Client().GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: 
dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgmprovider.Client().CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + numGPUs, err = dcgmprovider.Client().GetAllDeviceCount() + require.NoError(t, err) + + intputCounters := []counters.Counter{ + { + FieldID: 100, + FieldName: "DCGM_FI_DEV_SM_CLOCK", + PromType: "gauge", + Help: "SM clock frequency (in MHz).", + }, + } + + dOpt := appconfig.DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + } + config := appconfig.Config{ + GPUDeviceOptions: dOpt, + NoHostname: false, + UseOldNamespace: false, + UseFakeGPUs: false, + } + + deviceWatchListManager := devicewatchlistmanager.NewWatchListManager(intputCounters, &config) + err = deviceWatchListManager.CreateEntityWatchList(dcgm.FE_GPU, deviceWatcher, + int64(config.CollectInterval)) + require.NoError(t, err) + + gpuItem, exists := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU) + require.True(t, exists) + + c, err := collector.NewDCGMCollector(intputCounters, "", &config, gpuItem) + require.NoError(t, err) + + defer c.Cleanup() + + out, err := c.GetMetrics() + require.NoError(t, err) + require.Len(t, out, 1) + + values := out[intputCounters[0]] + + require.Equal(t, numGPUs, uint(len(values))) +} diff --git a/internal/pkg/integration_test/transformation_test.go b/internal/pkg/integration_test/transformation_test.go new file mode 100644 index 00000000..1c93a432 --- /dev/null +++ b/internal/pkg/integration_test/transformation_test.go @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package integration_test + +import ( + "fmt" + "reflect" + "testing" + + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/transformation" +) + +const ( + // Note standard resource attributes + podAttribute = "pod" + namespaceAttribute = "namespace" + containerAttribute = "container" +) + +func TestProcessPodMapper(t *testing.T) { + testutils.RequireLinux(t) + + tmpDir, cleanup := testutils.CreateTmpDir(t) + defer cleanup() + + config := &appconfig.Config{ + UseRemoteHE: false, + } + + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + c := testDCGMGPUCollector(t, testutils.SampleCounters) + defer c.Cleanup() + + out, err := c.GetMetrics() + require.NoError(t, err) + + original := out + + arbirtaryMetric := out[reflect.ValueOf(out).MapKeys()[0].Interface().(counters.Counter)] + + socketPath := tmpDir + "/kubelet.sock" + server := grpc.NewServer() + gpus := getGPUUUIDs(arbirtaryMetric) + v1alpha1.RegisterPodResourcesListerServer(server, + testutils.NewMockPodResourcesServer(appconfig.NvidiaResourceName, gpus)) + + cleanup = testutils.StartMockServer(t, 
server, socketPath) + defer cleanup() + + podMapper := transformation.NewPodMapper(&appconfig.Config{ + KubernetesGPUIdType: appconfig.GPUUID, + PodResourcesKubeletSocket: socketPath, + }) + require.NoError(t, err) + var deviceInfo deviceinfo.Provider + err = podMapper.Process(out, deviceInfo) + require.NoError(t, err) + + require.Len(t, out, len(original)) + for _, metrics := range out { + for _, metric := range metrics { + require.Contains(t, metric.Attributes, podAttribute) + require.Contains(t, metric.Attributes, namespaceAttribute) + require.Contains(t, metric.Attributes, containerAttribute) + require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%s", metric.GPU)) + require.Equal(t, metric.Attributes[namespaceAttribute], "default") + require.Equal(t, metric.Attributes[containerAttribute], "default") + } + } +} + +func getGPUUUIDs(metrics []collector.Metric) []string { + gpus := make([]string, len(metrics)) + for i, dev := range metrics { + gpus[i] = dev.GPUUUID + } + + return gpus +} diff --git a/internal/pkg/logging/const.go b/internal/pkg/logging/const.go new file mode 100644 index 00000000..95a0a482 --- /dev/null +++ b/internal/pkg/logging/const.go @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// Constants for logging fields.
//
// Each constant is the name of a field attached to a log record, so the same
// spelling is used wherever that field is written.
//
// NOTE(review): StackTrace is the only name without the "Key" suffix used by
// its siblings; renaming it would change the exported API, so it is left as is.
const (
	GroupIDKey          = "groupID"
	DumpKey             = "dump"
	StackTrace          = "stacktrace"
	FieldEntityGroupKey = "fieldEntityGroup"
	MetricsKey          = "metrics"
	DeviceInfoKey       = "deviceInfo"
	ErrorKey            = "error"
)
- if len(keyvals)%2 != 0 { - keyvals = append(keyvals, "MISSING") - } - - fields := logrus.Fields{} - for i := 0; i < len(keyvals); i += 2 { - key, ok := keyvals[i].(string) - if !ok { - // If the key is not la string, use la default key - key = "missing_key" - } - fields[key] = keyvals[i+1] - } - - // The go-kit/log uses msg field to keep log message, we don't want to use message as field in the logrus. - msg, exists := fields["msg"] - if exists { - delete(fields, "msg") - } - - // The go-kit/log uses level fields to keep log level. We need to convert this field into logrus value. - lvl, exists := fields["level"] - if !exists { - fields["level"] = level.InfoValue() - } - delete(fields, "level") - parsedLvl, err := logrus.ParseLevel(fmt.Sprint(lvl)) - if err != nil { - parsedLvl = logrus.InfoLevel - } - - la.Logger.WithFields(fields).Log(parsedLvl, msg) - - return nil -} diff --git a/internal/pkg/logging/logger_adapter_test.go b/internal/pkg/logging/logger_adapter_test.go deleted file mode 100644 index b4eb38bd..00000000 --- a/internal/pkg/logging/logger_adapter_test.go +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package logging - -import ( - "testing" - - "github.com/go-kit/log/level" - "github.com/sirupsen/logrus" - "github.com/sirupsen/logrus/hooks/test" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestLogrusAdapter_Log(t *testing.T) { - type testCase struct { - name string - keyvals []interface{} - assert func(*testing.T, *logrus.Entry) - } - - //"msg", "Listening on", "address" - testCases := []testCase{ - { - name: "Success", - keyvals: []interface{}{ - "level", - level.InfoValue, - "msg", - "Listening on", - "address", - "127.0.0.0.1:8080", - }, - assert: func(t *testing.T, entry *logrus.Entry) { - t.Helper() - require.NotNil(t, entry) - assert.Equal(t, "Listening on", entry.Message) - require.Contains(t, entry.Data, "address") - assert.Equal(t, "127.0.0.0.1:8080", entry.Data["address"]) - }, - }, - { - name: "When no Level", - keyvals: []interface{}{ - "msg", - "Listening on", - "address", - "127.0.0.0.1:8080", - }, - assert: func(t *testing.T, entry *logrus.Entry) { - t.Helper() - require.NotNil(t, entry) - assert.Equal(t, "Listening on", entry.Message) - require.Contains(t, entry.Data, "address") - assert.Equal(t, "127.0.0.0.1:8080", entry.Data["address"]) - }, - }, - { - name: "When key is not string", - keyvals: []interface{}{ - "msg", - "Listening on", - 42, - "127.0.0.0.1:8080", - }, - assert: func(t *testing.T, entry *logrus.Entry) { - t.Helper() - require.NotNil(t, entry) - assert.Equal(t, "Listening on", entry.Message) - require.Contains(t, entry.Data, "missing_key") - assert.Equal(t, "127.0.0.0.1:8080", entry.Data["missing_key"]) - }, - }, - { - name: "When value is missing", - keyvals: []interface{}{ - "msg", - "Listening on", - "address", - }, - assert: func(t *testing.T, entry *logrus.Entry) { - t.Helper() - require.NotNil(t, entry) - assert.Equal(t, "Listening on", entry.Message) - require.Contains(t, entry.Data, "address") - assert.Equal(t, "MISSING", entry.Data["address"]) - }, - }, - } - - for _, 
tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - logrusLogger, logHook := test.NewNullLogger() - logger := NewLogrusAdapter(logrusLogger) - err := logger.Log(tc.keyvals...) - require.NoError(t, err) - tc.assert(t, logHook.LastEntry()) - }) - } -} diff --git a/internal/pkg/nvmlprovider/provider.go b/internal/pkg/nvmlprovider/provider.go index 32678933..786ba936 100644 --- a/internal/pkg/nvmlprovider/provider.go +++ b/internal/pkg/nvmlprovider/provider.go @@ -19,96 +19,151 @@ package nvmlprovider import ( "errors" "fmt" + "log/slog" "strconv" "strings" - "sync" "github.com/NVIDIA/go-nvml/pkg/nvml" - "github.com/sirupsen/logrus" ) -var nvmlOnce *sync.Once = new(sync.Once) - type MIGDeviceInfo struct { ParentUUID string GPUInstanceID int ComputeInstanceID int } +var nvmlInterface NVML + +// Initialize sets up the Singleton NVML interface. +func Initialize() { + nvmlInterface = newNVMLProvider() +} + +// reset clears the current NVML interface instance. +func reset() { + nvmlInterface = nil +} + +// Client retrieves the current NVML interface instance. +func Client() NVML { + return nvmlInterface +} + +// SetClient sets the current NVML interface instance to the provided one. +func SetClient(n NVML) { + nvmlInterface = n +} + +// nvmlProvider implements NVML Interface +type nvmlProvider struct { + initialized bool +} + +func newNVMLProvider() NVML { + // Check if a NVML client already exists and return it if so. 
+ if Client() != nil && Client().(nvmlProvider).initialized { + slog.Info("NVML already initialized.") + return Client() + } + + slog.Info("Attempting to initialize NVML library.") + ret := nvml.Init() + if ret != nvml.SUCCESS { + err := errors.New(nvml.ErrorString(ret)) + slog.Error(fmt.Sprintf("Cannot init NVML library; err: %v", err)) + return nvmlProvider{initialized: false} + } + + return nvmlProvider{initialized: true} +} + +func (n nvmlProvider) preCheck() error { + if !n.initialized { + return fmt.Errorf("NVML library not initialized") + } + + return nil +} + // GetMIGDeviceInfoByID returns information about MIG DEVICE by ID -func GetMIGDeviceInfoByID(uuid string) (*MIGDeviceInfo, error) { - var err error - - nvmlOnce.Do(func() { - ret := nvml.Init() - if ret != nvml.SUCCESS { - err = errors.New(nvml.ErrorString(ret)) - logrus.Error("Can not init NVML library.") - } - }) - if err != nil { +func (n nvmlProvider) GetMIGDeviceInfoByID(uuid string) (*MIGDeviceInfo, error) { + if err := n.preCheck(); err != nil { + slog.Error(fmt.Sprintf("failed to get MIG Device Info; err: %v", err)) return nil, err } - // 1. With drivers >= R470 (470.42.01+), each MIG device is assigned a GPU UUID starting - // with MIG-. - device, ret := nvml.DeviceGetHandleByUUID(uuid) if ret == nvml.SUCCESS { - parentDevice, ret := device.GetDeviceHandleFromMigDeviceHandle() - if ret != nvml.SUCCESS { - return nil, errors.New(nvml.ErrorString(ret)) - } - - parentUUID, ret := parentDevice.GetUUID() - if ret != nvml.SUCCESS { - return nil, errors.New(nvml.ErrorString(ret)) - } - - gi, ret := device.GetGpuInstanceId() - if ret != nvml.SUCCESS { - return nil, errors.New(nvml.ErrorString(ret)) - } - - ci, ret := device.GetComputeInstanceId() - if ret != nvml.SUCCESS { - return nil, errors.New(nvml.ErrorString(ret)) - } - - return &MIGDeviceInfo{ - ParentUUID: parentUUID, - GPUInstanceID: gi, - ComputeInstanceID: ci, - }, nil + return getMIGDeviceInfoForNewDriver(device) } - // 2. 
With drivers < R470 (e.g. R450 and R460), each MIG device is enumerated by - // specifying the CI and the corresponding parent GI. The format follows this - // convention: MIG-//. + return getMIGDeviceInfoForOldDriver(uuid) +} + +// getMIGDeviceInfoForNewDriver identifies MIG Device Information for drivers >= R470 (470.42.01+), +// each MIG device is assigned a GPU UUID starting with MIG-. +func getMIGDeviceInfoForNewDriver(device nvml.Device) (*MIGDeviceInfo, error) { + parentDevice, ret := device.GetDeviceHandleFromMigDeviceHandle() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + parentUUID, ret := parentDevice.GetUUID() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + gi, ret := device.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + ci, ret := device.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + return &MIGDeviceInfo{ + ParentUUID: parentUUID, + GPUInstanceID: gi, + ComputeInstanceID: ci, + }, nil +} + +// getMIGDeviceInfoForOldDriver identifies MIG Device Information for drivers < R470 (e.g. R450 and R460), +// each MIG device is enumerated by specifying the CI and the corresponding parent GI. The format follows this +// convention: MIG-//. 
+func getMIGDeviceInfoForOldDriver(uuid string) (*MIGDeviceInfo, error) { tokens := strings.SplitN(uuid, "-", 2) if len(tokens) != 2 || tokens[0] != "MIG" { - return nil, fmt.Errorf("unable to parse UUID '%s' as MIG device", uuid) + return nil, fmt.Errorf("unable to parse '%s' as MIG device UUID", uuid) } - tokens = strings.SplitN(tokens[1], "/", 3) - if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") { - return nil, fmt.Errorf("unable to parse UUID '%s' as MIG device", uuid) + gpuTokens := strings.SplitN(tokens[1], "/", 3) + if len(gpuTokens) != 3 || !strings.HasPrefix(gpuTokens[0], "GPU-") { + return nil, fmt.Errorf("invalid MIG device UUID '%s'", uuid) } - gi, err := strconv.Atoi(tokens[1]) + gi, err := strconv.Atoi(gpuTokens[1]) if err != nil { - return nil, fmt.Errorf("unable to parse UUID '%s' as MIG device", uuid) + return nil, fmt.Errorf("invalid GPU instance ID '%s' for MIG device '%s'", gpuTokens[1], uuid) } - ci, err := strconv.Atoi(tokens[2]) + ci, err := strconv.Atoi(gpuTokens[2]) if err != nil { - return nil, fmt.Errorf("unable to parse UUID '%s' as MIG device", uuid) + return nil, fmt.Errorf("invalid Compute instance ID '%s' for MIG device '%s'", gpuTokens[2], uuid) } return &MIGDeviceInfo{ - ParentUUID: tokens[0], + ParentUUID: gpuTokens[0], GPUInstanceID: gi, ComputeInstanceID: ci, }, nil } + +// Cleanup performs cleanup operations for the NVML provider +func (n nvmlProvider) Cleanup() { + if err := n.preCheck(); err == nil { + reset() + } +} diff --git a/internal/pkg/nvmlprovider/provider_test.go b/internal/pkg/nvmlprovider/provider_test.go index 0b63a7f4..fde0a76d 100644 --- a/internal/pkg/nvmlprovider/provider_test.go +++ b/internal/pkg/nvmlprovider/provider_test.go @@ -22,21 +22,34 @@ import ( "github.com/stretchr/testify/assert" ) +func TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized(t *testing.T) { + validMIGUUID := "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5" + newNvmlProvider := nvmlProvider{} + + deviceInfo, err := 
newNvmlProvider.GetMIGDeviceInfoByID(validMIGUUID) + assert.Error(t, err, "uuid: %v, Device Info: %+v", validMIGUUID, deviceInfo) +} + func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) { + Initialize() + assert.NotNil(t, Client(), "expected NVML Client to be not nil") + assert.True(t, Client().(nvmlProvider).initialized, "expected Client to be initialized") + defer Client().Cleanup() + tests := []struct { - name string - uuid string - expectedGPU string - expectedGi int - expectedCi int - expectedError bool + name string + uuid string + expectedMIGInfo *MIGDeviceInfo + expectedError bool }{ { - name: "Successfull Parsing", - uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", - expectedGPU: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5", - expectedGi: 1, - expectedCi: 5, + name: "Successful Parsing", + uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", + expectedMIGInfo: &MIGDeviceInfo{ + ParentUUID: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5", + GPUInstanceID: 1, + ComputeInstanceID: 5, + }, }, { name: "Fail, Missing MIG at the beginning of UUID", @@ -62,41 +75,41 @@ func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - deviceInfo, err := GetMIGDeviceInfoByID(tc.uuid) - if tc.expectedError && err != nil { - return - } - if tc.expectedError && err == nil { - t.Fatalf("Expected an error, but didn't get one: uuid: %v, (gpu: %v, gi: %v, ci: %v)", - tc.uuid, - deviceInfo.ParentUUID, - deviceInfo.GPUInstanceID, - deviceInfo.ComputeInstanceID) - } - if !tc.expectedError && err != nil { - t.Fatalf("Unexpected error: %v, uuid: %v, (gpu: %v, gi: %v, ci: %v)", - err, - tc.uuid, - deviceInfo.ParentUUID, - deviceInfo.GPUInstanceID, - deviceInfo.ComputeInstanceID) + deviceInfo, err := Client().GetMIGDeviceInfoByID(tc.uuid) + if tc.expectedError { + assert.Error(t, err, "uuid: %v, Device Info: %+v", tc.uuid, deviceInfo) + } else { + assert.Nil(t, err, "err: %v, 
uuid: %v", err, tc.uuid) + assert.Equal(t, tc.expectedMIGInfo, deviceInfo, "MIG uuid '%v' parsed incorrectly", tc.uuid) } + }) + } +} - assert.Equal(t, tc.expectedGPU, deviceInfo.ParentUUID, "MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", - tc.uuid, - deviceInfo.ParentUUID, - deviceInfo.GPUInstanceID, - deviceInfo.ComputeInstanceID) - assert.Equal(t, tc.expectedGi, deviceInfo.GPUInstanceID, "MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", - tc.uuid, - deviceInfo.ParentUUID, - deviceInfo.GPUInstanceID, - deviceInfo.ComputeInstanceID) - assert.Equal(t, tc.expectedCi, deviceInfo.ComputeInstanceID, "MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", - tc.uuid, - deviceInfo.ParentUUID, - deviceInfo.GPUInstanceID, - deviceInfo.ComputeInstanceID) +func Test_newNVMLProvider(t *testing.T) { + tests := []struct { + name string + preRunFunc func() NVML + }{ + { + name: "NVML not initialized", + preRunFunc: func() NVML { + return nvmlProvider{initialized: true} + }, + }, + { + name: "NVML already initialized", + preRunFunc: func() NVML { + Initialize() + return Client() + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + want := tt.preRunFunc() + defer reset() + assert.Equalf(t, want, newNVMLProvider(), "Unexpected Output") }) } } diff --git a/internal/pkg/nvmlprovider/types.go b/internal/pkg/nvmlprovider/types.go new file mode 100644 index 00000000..507b7afd --- /dev/null +++ b/internal/pkg/nvmlprovider/types.go @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/nvmlprovider/mock_client.go -package=nvmlprovider -copyright_file=../../../hack/header.txt . NVML + +package nvmlprovider + +type NVML interface { + GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error) + Cleanup() +} diff --git a/internal/pkg/os/os.go b/internal/pkg/os/os.go index 891d3893..2e676fce 100644 --- a/internal/pkg/os/os.go +++ b/internal/pkg/os/os.go @@ -18,9 +18,9 @@ package os import "os" -//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/os.go -package=os -copyright_file=../../../hack/header.txt . OS -//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry -//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/mock_os.go -package=os -copyright_file=../../../hack/header.txt . 
OS +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/mock_dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/mock_file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo type OS interface { CreateTemp(dir, pattern string) (*os.File, error) Getenv(key string) string @@ -33,6 +33,7 @@ type OS interface { Stat(name string) (os.FileInfo, error) TempDir() string ReadDir(name string) ([]os.DirEntry, error) + Exit(code int) } type RealOS struct{} @@ -80,3 +81,5 @@ func (RealOS) Remove(name string) error { func (RealOS) ReadDir(name string) ([]os.DirEntry, error) { return os.ReadDir(name) } + +func (RealOS) Exit(code int) { os.Exit(code) } diff --git a/internal/pkg/prerequisites/dcgmlib_rule.go b/internal/pkg/prerequisites/dcgmlib_rule.go new file mode 100644 index 00000000..5bd8ce0f --- /dev/null +++ b/internal/pkg/prerequisites/dcgmlib_rule.go @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package prerequisites + +import ( + debugelf "debug/elf" + "fmt" + "log/slog" + "strings" +) + +const ( + libdcgmco = "libdcgm.so.4" + procSelfExe = "/proc/self/exe" + ldconfig = "ldconfig" + ldconfigParam = "-p" +) + +type dcgmLibExistsRule struct{} + +// Validate checks if libdcgm.so.4 exists and matches with the machine architecture. +func (c dcgmLibExistsRule) Validate() error { + // On Ubuntu, ldconfig is a wrapper around ldconfig.real + ldconfigPath := fmt.Sprintf("/sbin/%s.real", ldconfig) + if _, err := os.Stat(ldconfigPath); err != nil { + ldconfigPath = "/sbin/" + ldconfig + } + // Get list of shared libraries. See: man ldconfig + out, err := exec.Command(ldconfigPath, ldconfigParam).Output() + if err != nil { + return err + } + + for _, match := range rxLDCacheEntry.FindAllSubmatch(out, -1) { + libName := strings.TrimSpace(string(match[1])) + if libName == libdcgmco { + libPath := strings.TrimSpace(string(match[2])) + selfMachine, err := c.readELF(procSelfExe) + if err != nil { + return err + } + libMachine, err := c.readELF(libPath) + if err != nil { + // When datacenter-gpu-manager uninstalled, the ldconfig -p may return that the libdcgm.so.4 is present, + // but the library file was removed. 
+ slog.Error(err.Error()) + return errLibdcgmNotFound + } + + if selfMachine != libMachine { + return fmt.Errorf("the %s library architecture mismatch with the system; wanted: %s, received: %s", + libdcgmco, selfMachine, libMachine) + } + + return nil + } + } + + return errLibdcgmNotFound +} + +func (c dcgmLibExistsRule) readELF(name string) (debugelf.Machine, error) { + elfFile, err := elf.Open(name) + if err != nil { + return 0, fmt.Errorf("could not open %s: %v", name, err) + } + if err := elfFile.Close(); err != nil { + slog.Warn(fmt.Sprintf("could not close ELF: %v", err)) + } + + return elfFile.Machine, nil +} diff --git a/internal/pkg/prerequisites/dcgmlib_rule_test.go b/internal/pkg/prerequisites/dcgmlib_rule_test.go new file mode 100644 index 00000000..d96f0c83 --- /dev/null +++ b/internal/pkg/prerequisites/dcgmlib_rule_test.go @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package prerequisites + +import ( + "errors" + "testing" + + "go.uber.org/mock/gomock" + + debugelf "debug/elf" + + "github.com/stretchr/testify/require" + + mockelf "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/elf" + mockexec "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/exec" +) + +func Test_dcgmLibExistsRule_Validate(t *testing.T) { + ldconfigPath := "/sbin/ldconfig.real" + + type testCase struct { + Name string + ExecMockExpectations func(*gomock.Controller, *mockexec.MockExec) + ELFMockExpectations func(*gomock.Controller, *mockelf.MockELF) + AssertErr func(err error) + } + + testCases := []testCase{ + { + Name: "no error", + ExecMockExpectations: func(ctrl *gomock.Controller, mockExec *mockexec.MockExec) { + output := `1211 libs found in cache '/etc/ld.so.cache' + libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 + Cache generated by: ldconfig (Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) + mockExec.EXPECT().Command(gomock.Eq(ldconfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + }, + ELFMockExpectations: func(c *gomock.Controller, mockELF *mockelf.MockELF) { + self := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(self, nil) + + libdcgm := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/lib/x86_64-linux-gnu/libdcgm.so.4")).AnyTimes().Return(libdcgm, nil) + }, + AssertErr: func(err error) { + require.NoError(t, err) + }, + }, + { + Name: "returns error when library is not found", + ExecMockExpectations: func(ctrl *gomock.Controller, mockExec *mockexec.MockExec) { + output := `1211 libs found in cache '/etc/ld.so.cache' + libcuda.so (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so + Cache generated by: ldconfig 
(Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) + mockExec.EXPECT().Command(gomock.Eq(ldconfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + }, + ELFMockExpectations: func(c *gomock.Controller, mockELF *mockelf.MockELF) { + self := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(self, nil) + + libdcgm := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/lib/x86_64-linux-gnu/libdcgm.so.4")).AnyTimes().Return(libdcgm, nil) + }, + AssertErr: func(err error) { + require.Error(t, err) + require.ErrorContains(t, err, "the libdcgm.so.4 library was not found. Install Data Center GPU Manager (DCGM).") + }, + }, + { + Name: "returns error when can not execute command", + ExecMockExpectations: func(ctrl *gomock.Controller, mockExec *mockexec.MockExec) { + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte{}, errors.New("boom!")) + mockExec.EXPECT().Command(gomock.Eq(ldconfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + }, + AssertErr: func(err error) { + require.Error(t, err) + }, + }, + { + Name: "error when can not open /proc/self/exe", + ExecMockExpectations: func(ctrl *gomock.Controller, mockExec *mockexec.MockExec) { + output := `1211 libs found in cache '/etc/ld.so.cache' + libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 + Cache generated by: ldconfig (Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) + mockExec.EXPECT().Command(gomock.Eq(ldconfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + }, + ELFMockExpectations: func(c *gomock.Controller, mockELF *mockelf.MockELF) { + 
mockELF.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(nil, errors.New("boom!")) + }, + AssertErr: func(err error) { + require.Error(t, err) + }, + }, + { + Name: "returns error when library architecture missmatch", + ExecMockExpectations: func(ctrl *gomock.Controller, mockExec *mockexec.MockExec) { + output := `1211 libs found in cache '/etc/ld.so.cache' + libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 + Cache generated by: ldconfig (Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) + mockExec.EXPECT().Command(gomock.Eq(ldconfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + }, + ELFMockExpectations: func(c *gomock.Controller, mockELF *mockelf.MockELF) { + self := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(self, nil) + + libdcgm := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_AARCH64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/lib/x86_64-linux-gnu/libdcgm.so.4")).AnyTimes().Return(libdcgm, nil) + }, + AssertErr: func(err error) { + require.Error(t, err) + require.ErrorContains(t, err, + "the libdcgm.so.4 library architecture mismatch with the system; wanted: EM_X86_64, received: EM_AARCH64") + }, + }, + { + Name: "returns error when library file can not be open", + ExecMockExpectations: func(ctrl *gomock.Controller, mockExec *mockexec.MockExec) { + output := `1211 libs found in cache '/etc/ld.so.cache' + libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 + Cache generated by: ldconfig (Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) + mockExec.EXPECT().Command(gomock.Eq(ldconfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + }, + 
ELFMockExpectations: func(c *gomock.Controller, mockELF *mockelf.MockELF) { + self := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + mockELF.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(self, nil) + + mockELF.EXPECT().Open(gomock.Eq("/lib/x86_64-linux-gnu/libdcgm.so.4")).AnyTimes().Return(nil, errors.New("boom!")) + }, + AssertErr: func(err error) { + require.Error(t, err) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + ctrl := gomock.NewController(t) + + executor := mockexec.NewMockExec(ctrl) + + if tc.ExecMockExpectations != nil { + tc.ExecMockExpectations(ctrl, executor) + } + exec = executor + + elfreader := mockelf.NewMockELF(ctrl) + + if tc.ELFMockExpectations != nil { + tc.ELFMockExpectations(ctrl, elfreader) + } + elf = elfreader + + err := dcgmLibExistsRule{}.Validate() + tc.AssertErr(err) + }) + } +} diff --git a/internal/pkg/prerequisites/types.go b/internal/pkg/prerequisites/types.go new file mode 100644 index 00000000..c5e39156 --- /dev/null +++ b/internal/pkg/prerequisites/types.go @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package prerequisites + +type rule interface { + Validate() error +} diff --git a/internal/pkg/prerequisites/validation.go b/internal/pkg/prerequisites/validation.go new file mode 100644 index 00000000..d5e42aab --- /dev/null +++ b/internal/pkg/prerequisites/validation.go @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package prerequisites + +var rules = []rule{ + dcgmLibExistsRule{}, +} + +func Validate() error { + for _, rule := range rules { + err := rule.Validate() + if err != nil { + return err + } + } + + return nil +} diff --git a/internal/pkg/prerequisites/validation_test.go b/internal/pkg/prerequisites/validation_test.go new file mode 100644 index 00000000..21e85be9 --- /dev/null +++ b/internal/pkg/prerequisites/validation_test.go @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package prerequisites + +import ( + debugelf "debug/elf" + "testing" + + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + + mockelf "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/elf" + mockexec "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/exec" + mockos "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" + + realos "os" +) + +func TestValidate(t *testing.T) { + type testCase struct { + Name string + OSMockExpectations func(*gomock.Controller, *mockos.MockOS) + LDConfigPath string + } + + tests := []testCase{ + { + Name: "Ubuntu-based system with /sbin/ldconfig.real", + OSMockExpectations: func(ctrl *gomock.Controller, mo *mockos.MockOS) { + mfi := mockos.NewMockFileInfo(ctrl) + mo.EXPECT().Stat("/sbin/ldconfig.real").Return(mfi, nil) + }, + LDConfigPath: "/sbin/ldconfig.real", + }, + { + Name: "Linux system without /sbin/ldconfig.real", + OSMockExpectations: func(ctrl *gomock.Controller, mo *mockos.MockOS) { + mo.EXPECT().Stat("/sbin/ldconfig.real").Return(nil, &realos.PathError{}) + }, + LDConfigPath: "/sbin/ldconfig", + }, + } + + for _, tc := range tests { + + ctrl := gomock.NewController(t) + + osinstance := mockos.NewMockOS(ctrl) + tc.OSMockExpectations(ctrl, osinstance) + + os = osinstance + + executor := mockexec.NewMockExec(ctrl) + + output := `1211 libs found in cache '/etc/ld.so.cache' + libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 + Cache generated by: ldconfig (Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` + cmd := mockexec.NewMockCmd(ctrl) + cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) + executor.EXPECT().Command(gomock.Eq(tc.LDConfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) + + exec = executor + + elfreader := mockelf.NewMockELF(ctrl) + + self := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + 
elfreader.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(self, nil) + + libdcgm := &debugelf.File{ + FileHeader: debugelf.FileHeader{ + Machine: debugelf.EM_X86_64, + }, + } + elfreader.EXPECT().Open(gomock.Eq("/lib/x86_64-linux-gnu/libdcgm.so.4")).AnyTimes().Return(libdcgm, nil) + + elf = elfreader + + err := Validate() + require.NoError(t, err) + } +} diff --git a/internal/pkg/prerequisites/variables.go b/internal/pkg/prerequisites/variables.go new file mode 100644 index 00000000..32c6c452 --- /dev/null +++ b/internal/pkg/prerequisites/variables.go @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package prerequisites + +import ( + "fmt" + "regexp" + + elfinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/elf" + execinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/exec" + osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" +) + +var ( + os osinterface.OS = osinterface.RealOS{} + + exec execinterface.Exec = execinterface.RealExec{} + + elf elfinterface.ELF = elfinterface.RealELF{} + + // rxLDCacheEntry matches the following library strings: + // libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 + // ld-linux.so.2 (ELF) => /lib/ld-linux.so.2 + // ld-linux-x86-64.so.2 (libc6,x86-64) => /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 + rxLDCacheEntry = regexp.MustCompile(`(?m)^(.*)\s*\(.*\)\s*=>\s*(.*)$`) + + errLibdcgmNotFound = fmt.Errorf("the %s library was not found. Install Data Center GPU Manager (DCGM).", libdcgmco) +) diff --git a/internal/pkg/registry/registry.go b/internal/pkg/registry/registry.go new file mode 100644 index 00000000..40065cc5 --- /dev/null +++ b/internal/pkg/registry/registry.go @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package registry
+
+import (
+	"sync"
+
+	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+
+	"golang.org/x/sync/errgroup"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/collector"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+)
+
+// groupCounterTuple is the composite (entity group, counter) key under which
+// Gather accumulates metrics before regrouping them by entity group.
+type groupCounterTuple struct {
+	Group   dcgm.Field_Entity_Group
+	Counter counters.Counter
+}
+
+// Registry holds the collectors registered for each DCGM entity group and
+// gathers their metrics on demand.
+type Registry struct {
+	// collectorGroups lists the registered collectors of every entity group.
+	collectorGroups map[dcgm.Field_Entity_Group][]collector.Collector
+	// collectorGroupsSeen records every registered tuple so that Register can
+	// ignore duplicates.
+	collectorGroupsSeen map[collector.EntityCollectorTuple]struct{}
+	// mtx guards collectorGroups and collectorGroupsSeen.
+	mtx sync.RWMutex
+}
+
+// NewRegistry creates an empty registry.
+func NewRegistry() *Registry {
+	return &Registry{
+		collectorGroups:     map[dcgm.Field_Entity_Group][]collector.Collector{},
+		collectorGroupsSeen: map[collector.EntityCollectorTuple]struct{}{},
+	}
+}
+
+// Register adds the collector carried by entityCollectorTuples to the list of
+// its entity group. Registering an identical tuple again is a no-op.
+func (r *Registry) Register(entityCollectorTuples collector.EntityCollectorTuple) {
+	// Gather iterates the same maps under mtx, so writes must hold it too.
+	r.mtx.Lock()
+	defer r.mtx.Unlock()
+
+	if _, exists := r.collectorGroupsSeen[entityCollectorTuples]; exists {
+		return
+	}
+
+	entity := entityCollectorTuples.Entity()
+	r.collectorGroups[entity] = append(r.collectorGroups[entity], entityCollectorTuples.Collector())
+	r.collectorGroupsSeen[entityCollectorTuples] = struct{}{}
+}
+
+// Gather calls GetMetrics on every registered collector concurrently and
+// returns the results grouped by entity group and counter.
+//
+// Metrics reported for the same (group, counter) pair by different collectors
+// are concatenated. If any collector fails, Gather returns a nil result
+// together with the first error reported by the errgroup.
+func (r *Registry) Gather() (MetricsByCounterGroup, error) {
+	r.mtx.Lock()
+	defer r.mtx.Unlock()
+
+	var (
+		g errgroup.Group
+
+		// collectedMtx makes the read-append-write on collected atomic. The
+		// previous sync.Map LoadOrStore followed by Store was not, so two
+		// collectors emitting the same counter could drop each other's metrics.
+		collectedMtx sync.Mutex
+		collected    = map[groupCounterTuple][]collector.Metric{}
+	)
+
+	for group, collectors := range r.collectorGroups {
+		for _, c := range collectors {
+			c := c // creates new c, see https://golang.org/doc/faq#closures_and_goroutines
+			group := group
+			g.Go(func() error {
+				metrics, err := c.GetMetrics()
+				if err != nil {
+					return err
+				}
+
+				collectedMtx.Lock()
+				defer collectedMtx.Unlock()
+
+				for counter, metricVals := range metrics {
+					key := groupCounterTuple{Group: group, Counter: counter}
+					collected[key] = append(collected[key], metricVals...)
+				}
+
+				return nil
+			})
+		}
+	}
+
+	if err := g.Wait(); err != nil {
+		return nil, err
+	}
+
+	// Regroup the flat (group, counter) accumulator into the nested result.
+	output := MetricsByCounterGroup{}
+	for tuple, metricVals := range collected {
+		if _, exists := output[tuple.Group]; !exists {
+			output[tuple.Group] = map[counters.Counter][]collector.Metric{}
+		}
+		output[tuple.Group][tuple.Counter] = metricVals
+	}
+
+	return output, nil
+}
+
+// Cleanup calls Cleanup on every registered collector.
+//
+// NOTE(review): unlike Register and Gather this does not take mtx; confirm
+// that callers never run it concurrently with them.
+func (r *Registry) Cleanup() {
+	for _, collectors := range r.collectorGroups {
+		for _, c := range collectors {
+			c.Cleanup()
+		}
+	}
+}
diff --git a/internal/pkg/registry/registry_test.go b/internal/pkg/registry/registry_test.go
new file mode 100644
index 00000000..9844e934
--- /dev/null
+++ b/internal/pkg/registry/registry_test.go
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package registry
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+	"github.com/stretchr/testify/mock"
+	"github.com/stretchr/testify/require"
+
+	collectorpkg "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector"
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
+)
+
+// mockCollector is a testify-based stand-in for a collector.
+type mockCollector struct {
+	mock.Mock
+}
+
+// GetMetrics returns whatever the test configured through On("GetMetrics").
+func (m *mockCollector) GetMetrics() (collectorpkg.MetricsByCounter, error) {
+	args := m.Called()
+	return args.Get(0).(collectorpkg.MetricsByCounter), args.Error(1)
+}
+
+// Cleanup records the call so the expectation set with On("Cleanup") is met.
+func (m *mockCollector) Cleanup() {
+	m.Called()
+}
+
+// TestRegistry_Gather checks that Gather groups metrics by entity group on
+// success and returns an error with no metrics when the collector fails.
+func TestRegistry_Gather(t *testing.T) {
+	collector := new(mockCollector)
+
+	metrics := collectorpkg.MetricsByCounter{}
+	counterA := counters.Counter{
+		FieldID:   155,
+		FieldName: "DCGM_FI_DEV_POWER_USAGE",
+		PromType:  "gauge",
+	}
+
+	metrics[counterA] = append(metrics[counterA], collectorpkg.Metric{
+		GPU:        "0",
+		Counter:    counterA,
+		Attributes: map[string]string{},
+	})
+
+	counterB := counters.Counter{
+		FieldName: "DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT",
+		PromType:  "gauge",
+	}
+
+	metrics[counterB] = append(metrics[counterB], collectorpkg.Metric{
+		GPU:        "0",
+		Counter:    counterB,
+		Value:      "42",
+		Attributes: map[string]string{},
+	})
+
+	type test struct {
+		name           string
+		collectorState func() *mock.Call
+		// assert receives the subtest's *testing.T so that failures are
+		// reported on, and FailNow is called from, the subtest that produced
+		// them rather than the parent test.
+		assert func(*testing.T, MetricsByCounterGroup, error)
+	}
+
+	tests := []test{
+		{
+			name: "When collector return no errors",
+			collectorState: func() *mock.Call {
+				cs := collector.On("GetMetrics").Return(metrics, nil)
+				cs.On("Cleanup").Return()
+				return cs
+			},
+			assert: func(t *testing.T, mbcg MetricsByCounterGroup, err error) {
+				require.NoError(t, err)
+				require.Contains(t, mbcg, dcgm.FE_GPU)
+				require.Len(t, mbcg, 1)
+				require.Len(t, mbcg[dcgm.FE_GPU], 2)
+			},
+		},
+		{
+			name: "When collector return errors",
+			collectorState: func() *mock.Call {
+				cs := collector.On("GetMetrics").Return(collectorpkg.MetricsByCounter{}, errors.New("Boom!"))
+				cs.On("Cleanup").Return()
+				return cs
+			},
+			assert: func(t *testing.T, mbcg MetricsByCounterGroup, err error) {
+				require.Error(t, err)
+				require.Len(t, mbcg, 0)
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			reg := NewRegistry()
+			newEntityCollectorTuple := collectorpkg.EntityCollectorTuple{}
+			newEntityCollectorTuple.SetEntity(dcgm.FE_GPU)
+			newEntityCollectorTuple.SetCollector(collector)
+			reg.Register(newEntityCollectorTuple)
+			mockCall := tc.collectorState()
+			got, err := reg.Gather()
+			tc.assert(t, got, err)
+			// Drop the GetMetrics expectation so the next case can install its own.
+			mockCall.Unset()
+			reg.Cleanup()
+		})
+	}
+}
+
+// TestRegistry_Register_Accepts_Duplicates_ registers two identical tuples and
+// checks that the registry ends up with a single group and a single seen entry.
+func TestRegistry_Register_Accepts_Duplicates_(t *testing.T) {
+	reg := NewRegistry()
+	collector := new(mockCollector)
+
+	newEntityCollectorTuple1 := collectorpkg.EntityCollectorTuple{}
+	newEntityCollectorTuple1.SetEntity(dcgm.FE_GPU)
+	newEntityCollectorTuple1.SetCollector(collector)
+
+	newEntityCollectorTuple2 := collectorpkg.EntityCollectorTuple{}
+	newEntityCollectorTuple2.SetEntity(dcgm.FE_GPU)
+	newEntityCollectorTuple2.SetCollector(collector)
+
+	reg.Register(newEntityCollectorTuple1)
+	reg.Register(newEntityCollectorTuple2)
+	assert.Len(t, reg.collectorGroups, 1)
+	assert.Len(t, reg.collectorGroupsSeen, 1)
+}
diff --git a/internal/pkg/registry/types.go b/internal/pkg/registry/types.go
new file mode 100644
index 00000000..d4faf7d9
--- /dev/null
+++ b/internal/pkg/registry/types.go
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package registry
+
+import (
+	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+
+	"github.com/NVIDIA/dcgm-exporter/internal/pkg/collector"
+)
+
+// MetricsByCounterGroup maps a DCGM entity group to the metrics gathered for
+// that group, which are themselves organised by counter.
+type MetricsByCounterGroup map[dcgm.Field_Entity_Group]collector.MetricsByCounter
diff --git a/internal/pkg/rendermetrics/render_metrics.go b/internal/pkg/rendermetrics/render_metrics.go
new file mode 100644
index 00000000..f99af1d8
--- /dev/null
+++ b/internal/pkg/rendermetrics/render_metrics.go
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package rendermetrics + +import ( + "fmt" + "io" + "sync" + "text/template" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" +) + +/* +* The goal here is to get to the following format: +* ``` +* # HELP FIELD_ID HELP_MSG +* # TYPE FIELD_ID PROM_TYPE +* FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE +* FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE +* ... +* ``` + */ + +var ( + gpuMetricsFormat = ` +{{- range $counter, $metrics := . -}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +{{- range $k, $v := $metric.Attributes -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} + +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + + switchMetricsFormat = ` +{{- range $counter, $metrics := . -}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + + linkMetricsFormat = ` +{{- range $counter, $metrics := . 
-}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + + cpuMetricsFormat = ` +{{- range $counter, $metrics := . -}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{cpu="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + + cpuCoreMetricsFormat = ` +{{- range $counter, $metrics := . -}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{cpucore="{{ $metric.GPU }}",cpu="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +} {{ $metric.Value -}} +{{- end }} +{{ end }}` +) + +var getGPUMetricsTemplate = sync.OnceValue(func() *template.Template { + return template.Must(template.New("gpuMetricsFormat").Parse(gpuMetricsFormat)) +}) + +var getSwitchMetricsTemplate = sync.OnceValue(func() *template.Template { + return template.Must(template.New("switchMetricsFormat").Parse(switchMetricsFormat)) +}) + +var getLinkMetricsTemplate = sync.OnceValue(func() *template.Template { + return template.Must(template.New("linkMetricsFormat").Parse(linkMetricsFormat)) +}) + +var getCPUMetricsTemplate = sync.OnceValue(func() *template.Template { + return 
template.Must(template.New("cpuMetricsFormat").Parse(cpuMetricsFormat)) +}) + +var getCPUCoreMetricsTemplate = sync.OnceValue(func() *template.Template { + return template.Must(template.New("cpuMetricsFormat").Parse(cpuCoreMetricsFormat)) +}) + +func RenderGroup(w io.Writer, group dcgm.Field_Entity_Group, metrics collector.MetricsByCounter) error { + var tmpl *template.Template + + switch group { + case dcgm.FE_GPU: + tmpl = getGPUMetricsTemplate() + case dcgm.FE_SWITCH: + tmpl = getSwitchMetricsTemplate() + case dcgm.FE_LINK: + tmpl = getLinkMetricsTemplate() + case dcgm.FE_CPU: + tmpl = getCPUMetricsTemplate() + case dcgm.FE_CPU_CORE: + tmpl = getCPUCoreMetricsTemplate() + default: + return fmt.Errorf("unexpected group: %s", group.String()) + } + return tmpl.Execute(w, metrics) +} diff --git a/internal/pkg/rendermetrics/render_metrics_test.go b/internal/pkg/rendermetrics/render_metrics_test.go new file mode 100644 index 00000000..548dbb76 --- /dev/null +++ b/internal/pkg/rendermetrics/render_metrics_test.go @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package rendermetrics + +import ( + "bytes" + "fmt" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" +) + +func getMetricsByCounterWithTestMetric() collector.MetricsByCounter { + metrics := collector.MetricsByCounter{} + counter := getTestMetric() + + metrics[counter] = append(metrics[counter], collector.Metric{ + GPU: "0", + GPUDevice: "nvidia0", + GPUModelName: "NVIDIA T400 4GB", + Hostname: "testhost", + UUID: "UUID", + GPUUUID: "GPU-00000000-0000-0000-0000-000000000000", + Counter: counter, + Value: "42", + Attributes: map[string]string{}, + }) + return metrics +} + +func getTestMetric() counters.Counter { + counter := counters.Counter{ + FieldID: 2000, + FieldName: "TEST_METRIC", + PromType: "gauge", + } + return counter +} + +func Test_render(t *testing.T) { + metrics := getMetricsByCounterWithTestMetric() + + tests := []struct { + name string + group dcgm.Field_Entity_Group + metrics collector.MetricsByCounter + want string + wantErr assert.ErrorAssertionFunc + }{ + { + name: fmt.Sprintf("Render %s", dcgm.FE_GPU.String()), + group: dcgm.FE_GPU, + metrics: metrics, + want: `# HELP TEST_METRIC +# TYPE TEST_METRIC gauge +TEST_METRIC{gpu="0",UUID="GPU-00000000-0000-0000-0000-000000000000",pci_bus_id="",device="nvidia0",modelName="NVIDIA T400 4GB",Hostname="testhost"} 42 +`, + }, + { + name: fmt.Sprintf("Render %s", dcgm.FE_SWITCH.String()), + group: dcgm.FE_SWITCH, + metrics: metrics, + want: `# HELP TEST_METRIC +# TYPE TEST_METRIC gauge +TEST_METRIC{nvswitch="0",Hostname="testhost"} 42 +`, + }, + { + name: fmt.Sprintf("Render %s", dcgm.FE_LINK.String()), + group: dcgm.FE_LINK, + metrics: metrics, + want: `# HELP TEST_METRIC +# TYPE TEST_METRIC gauge +TEST_METRIC{nvlink="0",nvswitch="nvidia0",Hostname="testhost"} 42 +`, + }, + { + name: fmt.Sprintf("Render %s", dcgm.FE_CPU.String()), + group: 
dcgm.FE_CPU, + metrics: metrics, + want: `# HELP TEST_METRIC +# TYPE TEST_METRIC gauge +TEST_METRIC{cpu="0",Hostname="testhost"} 42 +`, + }, + { + name: fmt.Sprintf("Render %s", dcgm.FE_CPU_CORE.String()), + group: dcgm.FE_CPU_CORE, + metrics: metrics, + want: `# HELP TEST_METRIC +# TYPE TEST_METRIC gauge +TEST_METRIC{cpucore="0",cpu="nvidia0",Hostname="testhost"} 42 +`, + }, + { + name: "Render unknown group", + group: 42, + metrics: metrics, + want: ``, + wantErr: assert.Error, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + w := &bytes.Buffer{} + err := RenderGroup(w, tt.group, tt.metrics) + if tt.wantErr != nil && + !tt.wantErr(t, err, fmt.Sprintf("RenderGroup(w, %v, %v)", tt.group, tt.metrics)) { + return + } + assert.Equalf(t, tt.want, w.String(), "RenderGroup(w, %v, %v)", tt.group, tt.metrics) + }) + } +} diff --git a/internal/pkg/server/server.go b/internal/pkg/server/server.go new file mode 100644 index 00000000..9fe1c32f --- /dev/null +++ b/internal/pkg/server/server.go @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package server + +import ( + "bytes" + "context" + "io" + "log/slog" + "net/http" + "os" + "sync" + "time" + + "github.com/gorilla/mux" + "github.com/prometheus/exporter-toolkit/web" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/registry" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/rendermetrics" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/transformation" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/utils" +) + +const internalServerError = "internal server error" + +func NewMetricsServer( + c *appconfig.Config, + metrics chan string, + deviceWatchListManager devicewatchlistmanager.Manager, + registry *registry.Registry, +) (*MetricsServer, func(), error) { + router := mux.NewRouter() + serverv1 := &MetricsServer{ + server: &http.Server{ + Addr: c.Address, + Handler: router, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + }, + webConfig: &web.FlagConfig{ + WebListenAddresses: &[]string{c.Address}, + WebSystemdSocket: &c.WebSystemdSocket, + WebConfigFile: &c.WebConfigFile, + }, + metricsChan: metrics, + metrics: "", + registry: registry, + config: c, + transformations: transformation.GetTransformations(c), + deviceWatchListManager: deviceWatchListManager, + } + + router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-Content-Type-Options", "nosniff") + w.WriteHeader(http.StatusOK) + _, err := w.Write([]byte(` + GPU Exporter + +

GPU Exporter

+

Metrics

+ + `)) + if err != nil { + slog.Error("Failed to write response.", slog.String(logging.ErrorKey, err.Error())) + http.Error(w, internalServerError, http.StatusInternalServerError) + return + } + }) + + router.HandleFunc("/health", serverv1.Health) + router.HandleFunc("/metrics", serverv1.Metrics) + + return serverv1, func() {}, nil +} + +func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { + defer wg.Done() + + var httpwg sync.WaitGroup + httpwg.Add(1) + go func() { + defer httpwg.Done() + slog.Info("Starting webserver") + if err := web.ListenAndServe(s.server, s.webConfig, slog.Default()); err != nil && err != http.ErrServerClosed { + slog.Error("Failed to Listen and Server HTTP server.", slog.String(logging.ErrorKey, err.Error())) + os.Exit(1) + } + }() + + httpwg.Add(1) + go func() { + defer httpwg.Done() + for { + select { + case <-stop: + return + } + } + }() + + <-stop + if err := s.server.Shutdown(context.Background()); err != nil { + slog.Error("Failed to shutdown HTTP server.", slog.String(logging.ErrorKey, err.Error())) + s.fatal() + } + + if err := utils.WaitWithTimeout(&httpwg, 3*time.Second); err != nil { + slog.Error("Failed waiting for HTTP server to shutdown.", slog.String(logging.ErrorKey, err.Error())) + s.fatal() + } +} + +func (s *MetricsServer) fatal() { + os.Exit(1) +} + +func (s *MetricsServer) Metrics(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("X-Content-Type-Options", "nosniff") + metricGroups, err := s.registry.Gather() + if err != nil { + slog.Error("Failed to gather metrics from collectors", slog.String(logging.ErrorKey, err.Error())) + http.Error(w, internalServerError, http.StatusInternalServerError) + return + } + var buf bytes.Buffer + err = s.render(&buf, metricGroups) + if err != nil { + http.Error(w, internalServerError, http.StatusInternalServerError) + return + } + _, err = w.Write(buf.Bytes()) + if err != nil { + slog.Error("Failed to write response.", slog.String(logging.ErrorKey, 
err.Error()))
+		http.Error(w, "failed to write response", http.StatusInternalServerError)
+		return
+	}
+}
+
+// render applies every configured transformation to the metrics of each
+// entity group and writes the rendered group to w.
+//
+// Groups for which the device watch list manager reports no watch list are
+// skipped. The first transformation or rendering error is logged and returned;
+// anything already written to w stays written.
+func (s *MetricsServer) render(w io.Writer, metricGroups registry.MetricsByCounterGroup) error {
+	for group, metrics := range metricGroups {
+		deviceWatchList, exists := s.deviceWatchListManager.EntityWatchList(group)
+		if !exists {
+			continue
+		}
+
+		// The loop variable is named transform so it does not shadow the
+		// imported transformation package.
+		for _, transform := range s.transformations {
+			if err := transform.Process(metrics, deviceWatchList.DeviceInfo()); err != nil {
+				slog.LogAttrs(context.Background(), slog.LevelError, "Failed to apply transformations on metrics",
+					slog.String(logging.ErrorKey, err.Error()),
+					slog.String(logging.FieldEntityGroupKey, group.String()),
+					slog.Any(logging.MetricsKey, metrics),
+					// Call DeviceInfo so the value is logged, not the method itself.
+					slog.Any(logging.DeviceInfoKey, deviceWatchList.DeviceInfo()),
+				)
+				return err
+			}
+		}
+
+		if err := rendermetrics.RenderGroup(w, group, metrics); err != nil {
+			slog.LogAttrs(context.Background(), slog.LevelError, "Failed to renderGroup metrics",
+				slog.String(logging.ErrorKey, err.Error()),
+				slog.String(logging.FieldEntityGroupKey, group.String()),
+				slog.Any(logging.MetricsKey, metrics),
+				slog.Any(logging.DeviceInfoKey, deviceWatchList.DeviceInfo()),
+			)
+			return err
+		}
+	}
+	return nil
+}
+
+// Health is the handler for the health endpoint. It answers with the implicit
+// 200 status and the body "OK"; if the body cannot be written, the failure is
+// logged and an error response is attempted.
+func (s *MetricsServer) Health(w http.ResponseWriter, _ *http.Request) {
+	w.Header().Set("X-Content-Type-Options", "nosniff")
+	// The body used to be "KO", which contradicted the successful status code.
+	_, err := w.Write([]byte("OK"))
+	if err != nil {
+		slog.Error("Failed to write response.", slog.String(logging.ErrorKey, err.Error()))
+		http.Error(w, "failed to write response", http.StatusInternalServerError)
+	}
+}
diff --git a/internal/pkg/server/server_test.go b/internal/pkg/server/server_test.go
new file mode 100644
index 00000000..186efab1
--- /dev/null
+++ b/internal/pkg/server/server_test.go
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package server + +import ( + "errors" + "net" + "net/http" + "net/http/httptest" + "strings" + "syscall" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + mockcollectorpkg "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/collector" + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + mockdevicewatchlistmanager "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/devicewatchlistmanager" + mocktransformation "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/transformation" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/registry" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/transformation" +) + +const expectedResponse = `# HELP TEST_METRIC +# TYPE TEST_METRIC gauge +TEST_METRIC{gpu="0",UUID="GPU-00000000-0000-0000-0000-000000000000",pci_bus_id="",device="nvidia0",modelName="NVIDIA T400 4GB",Hostname="testhost"} 42 +` + +var deviceWatcher = devicewatcher.NewDeviceWatcher() + +func getMetricsByCounterWithTestMetric() collector.MetricsByCounter { + metrics := collector.MetricsByCounter{} + 
counter := getTestMetric() + + metrics[counter] = append(metrics[counter], collector.Metric{ + GPU: "0", + GPUDevice: "nvidia0", + GPUModelName: "NVIDIA T400 4GB", + Hostname: "testhost", + UUID: "UUID", + GPUUUID: "GPU-00000000-0000-0000-0000-000000000000", + Counter: counter, + Value: "42", + Attributes: map[string]string{}, + }) + return metrics +} + +func getTestMetric() counters.Counter { + counter := counters.Counter{ + FieldID: 2000, + FieldName: "TEST_METRIC", + PromType: "gauge", + } + return counter +} + +func TestMetrics(t *testing.T) { + ctrl := gomock.NewController(t) + + metrics := getMetricsByCounterWithTestMetric() + + tests := []struct { + name string + group dcgm.Field_Entity_Group + collector func() collector.Collector + transformer func() transformation.Transform + assert func(*testing.T, *httptest.ResponseRecorder) + }{ + { + name: "Returns 200", + group: dcgm.FE_GPU, + collector: func() collector.Collector { + mockCollector := mockcollectorpkg.NewMockCollector(ctrl) + mockCollector.EXPECT().GetMetrics().Return(metrics, nil).AnyTimes() + return mockCollector + }, + transformer: func() transformation.Transform { + mockTransformation := mocktransformation.NewMockTransform(ctrl) + mockTransformation.EXPECT().Process(gomock.Any(), gomock.Any()) + return mockTransformation + }, + assert: func(t *testing.T, recorder *httptest.ResponseRecorder) { + assert.Equal(t, http.StatusOK, recorder.Code) + assert.Equal(t, expectedResponse, recorder.Body.String()) + }, + }, + { + name: "Returns 500 when Collector return error", + group: dcgm.FE_GPU, + collector: func() collector.Collector { + mockCollector := mockcollectorpkg.NewMockCollector(ctrl) + mockCollector.EXPECT().GetMetrics().Return(nil, errors.New("boom")).AnyTimes() + return mockCollector + }, + transformer: func() transformation.Transform { + return mocktransformation.NewMockTransform(ctrl) + }, + assert: func(t *testing.T, recorder *httptest.ResponseRecorder) { + assert.Equal(t, 
http.StatusInternalServerError, recorder.Code) + assert.Equal(t, internalServerError, strings.TrimSpace(recorder.Body.String())) + }, + }, + { + name: "Returns 500 when Transformer returns error", + group: dcgm.FE_GPU, + collector: func() collector.Collector { + mockCollector := mockcollectorpkg.NewMockCollector(ctrl) + mockCollector.EXPECT().GetMetrics().Return(metrics, nil).AnyTimes() + return mockCollector + }, + transformer: func() transformation.Transform { + mockTransformation := mocktransformation.NewMockTransform(ctrl) + mockTransformation.EXPECT().Process(gomock.Any(), gomock.Any()).Return(errors.New("boom")).AnyTimes() + return mockTransformation + }, + assert: func(t *testing.T, recorder *httptest.ResponseRecorder) { + assert.Equal(t, http.StatusInternalServerError, recorder.Code) + assert.Equal(t, internalServerError, strings.TrimSpace(recorder.Body.String())) + }, + }, + { + name: "Returns 500 when group is unknown", + group: dcgm.FE_NONE, + collector: func() collector.Collector { + mockCollector := mockcollectorpkg.NewMockCollector(ctrl) + mockCollector.EXPECT().GetMetrics().Return(metrics, nil).AnyTimes() + return mockCollector + }, + transformer: func() transformation.Transform { + mockTransformation := mocktransformation.NewMockTransform(ctrl) + mockTransformation.EXPECT().Process(gomock.Any(), gomock.Any()) + return mockTransformation + }, + assert: func(t *testing.T, recorder *httptest.ResponseRecorder) { + assert.Equal(t, http.StatusInternalServerError, recorder.Code) + assert.Equal(t, internalServerError, strings.TrimSpace(recorder.Body.String())) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + reg := registry.NewRegistry() + entityCollectorTuple := collector.EntityCollectorTuple{} + entityCollectorTuple.SetEntity(tt.group) + entityCollectorTuple.SetCollector(tt.collector()) + reg.Register(entityCollectorTuple) + + mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) + 
mockDeviceInfo.EXPECT().InfoType().Return(tt.group).AnyTimes() + mockDeviceInfo.EXPECT().GOpts().Return(appconfig.DeviceOptions{}).AnyTimes() + + defaultDeviceWatchList := *devicewatchlistmanager.NewWatchList( + mockDeviceInfo, + []dcgm.Short{42}, + nil, + deviceWatcher, + 1, + ) + + metricServer := &MetricsServer{ + registry: reg, + deviceWatchListManager: func(group dcgm.Field_Entity_Group) devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(group).Return(defaultDeviceWatchList, + true).AnyTimes() + return mockDeviceWatchListManager + }(tt.group), + transformations: []transformation.Transform{ + tt.transformer(), + }, + } + + recorder := httptest.NewRecorder() + metricServer.Metrics(recorder, nil) + if tt.assert != nil { + tt.assert(t, recorder) + } + }) + } +} + +// mockResponseWriter is a custom writer that simulates a network operation error. +type mockResponseWriter struct { + httptest.ResponseRecorder +} + +func (m *mockResponseWriter) Write([]byte) (int, error) { + // Simulate a network operation error. 
+ return 0, &net.OpError{ + Op: "write", + Net: "tcp", + Source: nil, + Addr: nil, + Err: syscall.EPIPE, + } +} + +func TestMetricsReturnsErrorWhenClientClosedConnection(t *testing.T) { + ctrl := gomock.NewController(t) + + metrics := getMetricsByCounterWithTestMetric() + + mockCollector := mockcollectorpkg.NewMockCollector(ctrl) + mockCollector.EXPECT().GetMetrics().Return(metrics, nil).AnyTimes() + + reg := registry.NewRegistry() + entityCollectorTuple := collector.EntityCollectorTuple{} + entityCollectorTuple.SetEntity(dcgm.FE_GPU) + entityCollectorTuple.SetCollector(mockCollector) + reg.Register(entityCollectorTuple) + + mockDeviceInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockDeviceInfo.EXPECT().InfoType().Return(dcgm.FE_CPU).AnyTimes() + mockDeviceInfo.EXPECT().GOpts().Return(appconfig.DeviceOptions{}).AnyTimes() + + defaultDeviceWatchList := *devicewatchlistmanager.NewWatchList( + mockDeviceInfo, + []dcgm.Short{42}, + nil, + deviceWatcher, + 1, + ) + + metricServer := &MetricsServer{ + registry: reg, + deviceWatchListManager: func() devicewatchlistmanager.Manager { + mockDeviceWatchListManager := mockdevicewatchlistmanager.NewMockManager(ctrl) + mockDeviceWatchListManager.EXPECT().EntityWatchList(dcgm.FE_CPU).Return(defaultDeviceWatchList, + true).AnyTimes() + mockDeviceWatchListManager.EXPECT().EntityWatchList(gomock.Any()).Return(devicewatchlistmanager.WatchList{}, + false).AnyTimes() + return mockDeviceWatchListManager + }(), + transformations: []transformation.Transform{}, + } + recorder := &mockResponseWriter{} + metricServer.Metrics(recorder, nil) + assert.Equal(t, http.StatusInternalServerError, recorder.Code) + assert.Nil(t, recorder.Body) +} + +func TestHealthReturnsOK(t *testing.T) { + metricServer := &MetricsServer{} + recorder := httptest.NewRecorder() + metricServer.Health(recorder, nil) + assert.Equal(t, http.StatusOK, recorder.Code) +} + +func TestHealthReturnsOKWhenWriteReturnsError(t *testing.T) { + metricServer := &MetricsServer{} + 
recorder := &mockResponseWriter{} + metricServer.Health(recorder, nil) + assert.Equal(t, http.StatusInternalServerError, recorder.Code) +} diff --git a/internal/pkg/server/types.go b/internal/pkg/server/types.go new file mode 100644 index 00000000..0c355992 --- /dev/null +++ b/internal/pkg/server/types.go @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package server + +import ( + "net/http" + "sync" + + "github.com/prometheus/exporter-toolkit/web" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/registry" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/transformation" +) + +type MetricsServer struct { + sync.Mutex + + server *http.Server + webConfig *web.FlagConfig + metrics string + metricsChan chan string + registry *registry.Registry + config *appconfig.Config + transformations []transformation.Transform + deviceWatchListManager devicewatchlistmanager.Manager +} diff --git a/pkg/stdout/capture.go b/internal/pkg/stdout/capture.go similarity index 84% rename from pkg/stdout/capture.go rename to internal/pkg/stdout/capture.go index d1854911..4817f0de 100644 --- a/pkg/stdout/capture.go +++ b/internal/pkg/stdout/capture.go @@ -19,13 +19,12 @@ package stdout import ( "bufio" "context" + "log/slog" "os" "syscall" - - 
"github.com/sirupsen/logrus" ) -// Capture go and C stdout and stderr and writes to logrus.StandardLogger +// Capture go and C stdout and stderr and writes to std output func Capture(ctx context.Context, inner func() error) error { stdout, err := syscall.Dup(syscall.Stdout) if err != nil { @@ -63,13 +62,13 @@ func Capture(ctx context.Context, inner func() error) error { logEntry := scanner.Text() parsedLogEntry := parseOutputEntry(logEntry) if parsedLogEntry.IsRawString { - _, err := logrus.StandardLogger().Out.Write([]byte(parsedLogEntry.Message + "\n")) + _, err := os.Stdout.Write([]byte(parsedLogEntry.Message + "\n")) if err != nil { return } continue } - logrus.WithField("dcgm_level", parsedLogEntry.Level).Info(parsedLogEntry.Message) + slog.LogAttrs(ctx, slog.LevelInfo, parsedLogEntry.Message, slog.String("dcgm_level", parsedLogEntry.Level)) } }() diff --git a/pkg/stdout/capture_test.go b/internal/pkg/stdout/capture_test.go similarity index 76% rename from pkg/stdout/capture_test.go rename to internal/pkg/stdout/capture_test.go index 3c1d9a0a..89a2a888 100644 --- a/pkg/stdout/capture_test.go +++ b/internal/pkg/stdout/capture_test.go @@ -20,11 +20,10 @@ import ( "bytes" "context" "fmt" + "os" "strings" "testing" - "time" - "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) @@ -61,19 +60,36 @@ func TestCapture(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) + // Create a buffer to capture stdout output + var buf bytes.Buffer + + // Save the original stdout + stdout := os.Stdout + + // Create a pipe to redirect stdout + r, w, err := os.Pipe() + assert.NoError(t, err) - buf := &bytes.Buffer{} - logrus.SetOutput(buf) + os.Stdout = w // Redirect stdout to the write end of the pipe - err := Capture(ctx, func() error { + ctx, cancel := context.WithCancel(context.Background()) + err = Capture(ctx, func() error { fmt.Println(tc.logMessage) return nil }) 
assert.NoError(t, err) - time.Sleep(1 * time.Millisecond) - tc.assert(t, buf.String()) + + // Close the write end of the pipe to allow reading all data + _ = w.Close() + os.Stdout = stdout // Restore original stdout + + // Read from the pipe directly into the buffer + _, err = buf.ReadFrom(r) + assert.NoError(t, err) + if tc.assert != nil { + tc.assert(t, buf.String()) + } cancel() }) } diff --git a/pkg/stdout/capture_test_wrapper.go b/internal/pkg/stdout/capture_test_wrapper.go similarity index 56% rename from pkg/stdout/capture_test_wrapper.go rename to internal/pkg/stdout/capture_test_wrapper.go index 2d9b645c..8b8b76fa 100644 --- a/pkg/stdout/capture_test_wrapper.go +++ b/internal/pkg/stdout/capture_test_wrapper.go @@ -24,34 +24,52 @@ void printBoom() { } */ import "C" + import ( "bytes" "context" + "os" "strings" "testing" - "time" - "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func testCaptureWithCGO(t *testing.T) { t.Helper() + // Create a buffer to capture stdout output + var buf bytes.Buffer - ctx, cancel := context.WithCancel(context.Background()) + // Save the original stdout + stdout := os.Stdout + + // Create a pipe to redirect stdout + r, w, err := os.Pipe() + assert.NoError(t, err) + + os.Stdout = w // Redirect stdout to the write end of the pipe - buf := &bytes.Buffer{} - logrus.SetOutput(buf) + ctx, cancel := context.WithCancel(context.Background()) - err := Capture(ctx, func() error { + err = Capture(ctx, func() error { C.printBoom() return nil }) assert.NoError(t, err) - - time.Sleep(10 * time.Millisecond) + // It takes some time before CGO flushes its output to stdout, + // so we need to wait until we start to receive the data. + // Create a temporary buffer to detect data + var tempBuf [1]byte + // Read from the pipe to ensure data is available + _, err = r.Read(tempBuf[:]) // Block until data is written + assert.NoError(t, err) + buf.Write(tempBuf[:]) // Start capturing the data + // Close
the write end of the pipe to allow reading all data + _ = w.Close() + _, err = buf.ReadFrom(r) // Read the remaining data + assert.NoError(t, err) require.Equal(t, "Boom", strings.TrimSpace(buf.String())) - + os.Stdout = stdout // Restore original stdout cancel() } diff --git a/pkg/stdout/stdoutprocessor.go b/internal/pkg/stdout/stdoutprocessor.go similarity index 100% rename from pkg/stdout/stdoutprocessor.go rename to internal/pkg/stdout/stdoutprocessor.go diff --git a/internal/pkg/testutils/const.go b/internal/pkg/testutils/const.go new file mode 100644 index 00000000..54ef7fd3 --- /dev/null +++ b/internal/pkg/testutils/const.go @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package testutils + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +var fakeProfileName = "2fake.4gb" + +var ( + MockGPUInstanceInfo1 = deviceinfo.GPUInstanceInfo{ + Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlProfileSlices: 3}, + ProfileName: fakeProfileName, + EntityId: 0, + } + + MockGPUInstanceInfo2 = deviceinfo.GPUInstanceInfo{ + Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlInstanceId: 1, NvmlProfileSlices: 3}, + ProfileName: fakeProfileName, + EntityId: 14, + } + + MockNVLinkVal1 = dcgm.NvLinkStatus{ + State: 2, + Index: 0, + } + + MockNVLinkVal2 = dcgm.NvLinkStatus{ + State: 3, + Index: 1, + } +) diff --git a/internal/pkg/testutils/test_utils.go b/internal/pkg/testutils/test_utils.go new file mode 100644 index 00000000..782ac7b8 --- /dev/null +++ b/internal/pkg/testutils/test_utils.go @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package testutils + +import ( + "context" + "fmt" + "net" + "reflect" + "runtime" + "testing" + "time" + "unsafe" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + "google.golang.org/grpc" + "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" + + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +// MockReader is an io.Reader test double (e.g. a stand-in for rand.Reader) whose Read always returns 0 bytes and the configured Err +type MockReader struct { + Err error +} + +func (r *MockReader) Read(_ []byte) (n int, err error) { + return 0, r.Err +} + +// RequireLinux skips the calling test unless it is being executed on a Linux platform +func RequireLinux(t *testing.T) { + t.Helper() + if runtime.GOOS != "linux" { + t.Skipf("Test is not supported on %q", runtime.GOOS) + } +} + +func MockGPUDeviceInfo( + ctrl *gomock.Controller, gpuCount int, gpuToGpuInstanceInfos map[int][]deviceinfo.GPUInstanceInfo, +) *mockdeviceinfo.MockProvider { + mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) + + mockGPUs := make([]deviceinfo.GPUInfo, 0) + + for i := range gpuCount { + gpuInfo := deviceinfo.GPUInfo{} + gpuInfo.DeviceInfo.GPU = uint(i) + + if gpuInstanceInfos, exist := gpuToGpuInstanceInfos[i]; exist { + gpuInfo.GPUInstances = gpuInstanceInfos + } + + mockGPUs = append(mockGPUs, gpuInfo) + mockSystemInfo.EXPECT().GPU(uint(i)).Return(gpuInfo).AnyTimes() + } + + mockSystemInfo.EXPECT().GPUCount().Return(uint(gpuCount)).AnyTimes() + mockSystemInfo.EXPECT().GPUs().Return(mockGPUs).AnyTimes() + mockSystemInfo.EXPECT().InfoType().Return(dcgm.FE_NONE).AnyTimes() + + return mockSystemInfo +} + +func MockCPUDeviceInfo( + ctrl *gomock.Controller, cpuCount int, cpuToCores map[int][]uint, watchedCPUs map[uint]bool, + watchedCores map[WatchedEntityKey]bool, infoType dcgm.Field_Entity_Group, +) *mockdeviceinfo.MockProvider { + mockSystemInfo :=
mockdeviceinfo.NewMockProvider(ctrl) + + mockCPUs := make([]deviceinfo.CPUInfo, 0) + + for i := range cpuCount { + cpuInfo := deviceinfo.CPUInfo{} + cpuInfo.EntityId = uint(i) + + if cores, exist := cpuToCores[i]; exist { + cpuInfo.Cores = []uint{} + + for _, core := range cores { + cpuInfo.Cores = append(cpuInfo.Cores, core) + + mockSystemInfo.EXPECT().IsCoreWatched(core, + uint(i)).Return(watchedCores[WatchedEntityKey{uint(i), core}]).AnyTimes() + } + } + + mockSystemInfo.EXPECT().IsCPUWatched(cpuInfo.EntityId).Return(watchedCPUs[cpuInfo.EntityId]).AnyTimes() + mockSystemInfo.EXPECT().CPU(uint(i)).Return(cpuInfo).AnyTimes() + + mockCPUs = append(mockCPUs, cpuInfo) + } + + mockSystemInfo.EXPECT().CPUs().Return(mockCPUs).AnyTimes() + mockSystemInfo.EXPECT().InfoType().Return(infoType).AnyTimes() + + return mockSystemInfo +} + +func MockSwitchDeviceInfo( + ctrl *gomock.Controller, switchCount int, switchToNvLinks map[int][]dcgm.NvLinkStatus, + watchedSwitches map[uint]bool, watchedLinks map[WatchedEntityKey]bool, infoType dcgm.Field_Entity_Group, +) *mockdeviceinfo.MockProvider { + mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) + + mockSwitches := make([]deviceinfo.SwitchInfo, 0) + + for i := range switchCount { + switchInfo := deviceinfo.SwitchInfo{} + switchInfo.EntityId = uint(i) + + if nvLinks, exist := switchToNvLinks[i]; exist { + switchInfo.NvLinks = []dcgm.NvLinkStatus{} + + for _, nvLink := range nvLinks { + nvLink.ParentId = uint(i) + nvLink.ParentType = dcgm.FE_SWITCH + switchInfo.NvLinks = append(switchInfo.NvLinks, nvLink) + + mockSystemInfo.EXPECT().IsLinkWatched(nvLink.Index, + uint(i)).Return(watchedLinks[WatchedEntityKey{uint(i), nvLink.Index}]).AnyTimes() + } + } + + mockSystemInfo.EXPECT().IsSwitchWatched(switchInfo.EntityId).Return(watchedSwitches[switchInfo.EntityId]).AnyTimes() + mockSystemInfo.EXPECT().Switch(uint(i)).Return(switchInfo).AnyTimes() + + mockSwitches = append(mockSwitches, switchInfo) + } + + 
mockSystemInfo.EXPECT().Switches().Return(mockSwitches).AnyTimes() + mockSystemInfo.EXPECT().InfoType().Return(infoType).AnyTimes() + + return mockSystemInfo +} + +// GetStructPrivateFieldValue returns private field value +func GetStructPrivateFieldValue[T any](t *testing.T, v any, fieldName string) T { + t.Helper() + var result T + value := reflect.ValueOf(v) + if value.Kind() == reflect.Ptr { + value = value.Elem() + } + + if value.Kind() != reflect.Struct { + t.Errorf("The type %s is not stuct", value.Type()) + return result + } + + fieldVal := value.FieldByName(fieldName) + + if !fieldVal.IsValid() { + t.Errorf("The field %s is invalid for the %s type", fieldName, value.Type()) + return result + } + + fieldPtr := unsafe.Pointer(fieldVal.UnsafeAddr()) + + // Cast the field pointer to a pointer of the correct type + realPtr := (*T)(fieldPtr) + + return *realPtr +} + +func CreateTmpDir(t *testing.T) (string, func()) { + path, err := os.MkdirTemp("", "dcgm-exporter") + require.NoError(t, err) + + return path, func() { + require.NoError(t, os.RemoveAll(path)) + } +} + +type MockPodResourcesServer struct { + resourceName string + gpus []string +} + +func NewMockPodResourcesServer(resourceName string, gpus []string) *MockPodResourcesServer { + return &MockPodResourcesServer{ + resourceName: resourceName, + gpus: gpus, + } +} + +func (s *MockPodResourcesServer) List( + ctx context.Context, req *v1alpha1.ListPodResourcesRequest, +) (*v1alpha1.ListPodResourcesResponse, error) { + podResources := make([]*v1alpha1.PodResources, len(s.gpus)) + + for i, gpu := range s.gpus { + podResources[i] = &v1alpha1.PodResources{ + Name: fmt.Sprintf("gpu-pod-%d", i), + Namespace: "default", + Containers: []*v1alpha1.ContainerResources{ + { + Name: "default", + Devices: []*v1alpha1.ContainerDevices{ + { + ResourceName: s.resourceName, + DeviceIds: []string{gpu}, + }, + }, + }, + }, + } + } + + return &v1alpha1.ListPodResourcesResponse{ + PodResources: podResources, + }, nil +} + +func 
StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { + l, err := net.Listen("unix", socket) + require.NoError(t, err) + + stopped := make(chan interface{}) + + go func() { + err := server.Serve(l) + assert.NoError(t, err) + close(stopped) + }() + + return func() { + server.Stop() + select { + case <-stopped: + return + case <-time.After(1 * time.Second): + t.Fatal("Failed waiting for gRPC server to stop.") + } + } +} + +type FieldType int + +const ( + Fields FieldType = iota + Functions + All +) + +// GetFields returns a map of fields of a struct, including unexported fields, based on the specified field type. +func GetFields(input interface{}, fieldType FieldType) map[string]interface{} { + result := make(map[string]interface{}) + val := reflect.ValueOf(input) + + if val.Kind() == reflect.Ptr { + val = val.Elem() + } + + if val.Kind() != reflect.Struct { + return result + } + + typ := val.Type() + + for i := 0; i < val.NumField(); i++ { + field := val.Field(i) + fieldTyp := typ.Field(i) + + // Determine if the field should be included based on the specified field type + includeField := false + switch fieldType { + case Fields: + includeField = field.Kind() != reflect.Func + case Functions: + includeField = field.Kind() == reflect.Func + case All: + includeField = true + } + + if !includeField { + continue + } + + // Access unexported fields + if !field.CanInterface() { + field = reflect.NewAt(field.Type(), unsafe.Pointer(field.UnsafeAddr())).Elem() + } + + result[fieldTyp.Name] = field.Interface() + } + + return result +} + +func StrToByteArray(str string) [4096]byte { + var byteArray [4096]byte + copy(byteArray[:], str) + return byteArray +} diff --git a/internal/pkg/testutils/testutils.go b/internal/pkg/testutils/testutils.go deleted file mode 100644 index 8ed485dd..00000000 --- a/internal/pkg/testutils/testutils.go +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package testutils - -import ( - "reflect" - "runtime" - "testing" - "unsafe" -) - -// RequireLinux checks if -func RequireLinux(t *testing.T) { - t.Helper() - if runtime.GOOS != "linux" { - t.Skipf("Test is not supported on %q", runtime.GOOS) - } -} - -// GetStructPrivateFieldValue returns private field value -func GetStructPrivateFieldValue[T any](t *testing.T, v any, fieldName string) T { - t.Helper() - var result T - value := reflect.ValueOf(v) - if value.Kind() == reflect.Ptr { - value = value.Elem() - } - - if value.Kind() != reflect.Struct { - t.Errorf("The type %s is not stuct", value.Type()) - return result - } - - fieldVal := value.FieldByName(fieldName) - - if !fieldVal.IsValid() { - t.Errorf("The field %s is invalid for the %s type", fieldName, value.Type()) - return result - } - - fieldPtr := unsafe.Pointer(fieldVal.UnsafeAddr()) - - // Cast the field pointer to a pointer of the correct type - realPtr := (*T)(fieldPtr) - - return *realPtr -} diff --git a/internal/pkg/testutils/types.go b/internal/pkg/testutils/types.go new file mode 100644 index 00000000..d3c0be1d --- /dev/null +++ b/internal/pkg/testutils/types.go @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package testutils + +type WatchedEntityKey struct { // WatchedEntityKey keys a watched-state map by a (parent, child) entity ID pair. + ParentID uint // ID of the parent entity (the mock helpers pass the CPU or switch index) + ChildID uint // ID of the child entity (the mock helpers pass the core ID or NvLink index) +} diff --git a/internal/pkg/testutils/variables.go b/internal/pkg/testutils/variables.go new file mode 100644 index 00000000..95106fa0 --- /dev/null +++ b/internal/pkg/testutils/variables.go @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package testutils + +import ( + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" +) + +var os osinterface.OS = osinterface.RealOS{} + +var ( + SampleGPUTempCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_GPU_TEMP, + FieldName: "DCGM_FI_DEV_GPU_TEMP", + PromType: "gauge", + Help: "Temperature Help info", + } + + SampleGPUTotalEnergyCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, + FieldName: "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", + PromType: "gauge", + Help: "Energy help info", + } + + SampleGPUPowerUsageCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_POWER_USAGE, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + Help: "Power help info", + } + + SampleVGPULicenseStatusCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_VGPU_LICENSE_STATUS, + FieldName: "DCGM_FI_DEV_VGPU_LICENSE_STATUS", + PromType: "gauge", + Help: "vgpu license status", + } + + SampleDriverVersionCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DRIVER_VERSION, + FieldName: "DCGM_FI_DRIVER_VERSION", + PromType: "label", + Help: "Driver version", + } + + SampleSwitchCurrentTempCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, + FieldName: "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", + PromType: "gauge", + Help: "switch temperature", + } + + SampleSwitchLinkFlitErrorsCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, + FieldName: "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", + PromType: "gauge", + Help: "per-link flit errors", + } + + SampleCPUUtilTotalCounter = counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_CPU_UTIL_TOTAL, + FieldName: "DCGM_FI_DEV_CPU_UTIL_TOTAL", + PromType: "gauge", + Help: "Total CPU utilization", + } + + SampleCounters = []counters.Counter{ + SampleGPUTempCounter, + SampleGPUTotalEnergyCounter, + SampleGPUPowerUsageCounter, + 
SampleDriverVersionCounter, + /* test that switch and link metrics are filtered out automatically when devices are not detected */ + SampleSwitchCurrentTempCounter, + SampleSwitchLinkFlitErrorsCounter, + /* test that vgpu metrics are not filtered out */ + SampleVGPULicenseStatusCounter, + /* test that cpu and cpu core metrics are filtered out automatically when devices are not detected */ + SampleCPUUtilTotalCounter, + } + + SampleAllFieldIDs = []dcgm.Short{ + SampleGPUTempCounter.FieldID, SampleGPUTotalEnergyCounter.FieldID, + SampleGPUPowerUsageCounter.FieldID, SampleVGPULicenseStatusCounter.FieldID, + SampleDriverVersionCounter.FieldID, SampleSwitchCurrentTempCounter.FieldID, + SampleSwitchLinkFlitErrorsCounter.FieldID, SampleCPUUtilTotalCounter.FieldID, + } + + SampleGPUFieldIDs = []dcgm.Short{ + SampleGPUTempCounter.FieldID, SampleGPUTotalEnergyCounter.FieldID, + SampleGPUPowerUsageCounter.FieldID, SampleVGPULicenseStatusCounter.FieldID, + } + + SampleFieldIDToFieldMeta = map[dcgm.Short]dcgm.FieldMeta{ + SampleGPUTempCounter.FieldID: { + FieldId: SampleGPUTempCounter.FieldID, + EntityLevel: dcgm.FE_GPU, + }, + SampleGPUTotalEnergyCounter.FieldID: { + FieldId: SampleGPUTotalEnergyCounter.FieldID, + EntityLevel: dcgm.FE_GPU, + }, + SampleGPUPowerUsageCounter.FieldID: { + FieldId: SampleGPUPowerUsageCounter.FieldID, + EntityLevel: dcgm.FE_GPU_I, + }, + SampleVGPULicenseStatusCounter.FieldID: { + FieldId: SampleVGPULicenseStatusCounter.FieldID, + EntityLevel: dcgm.FE_VGPU, + }, + SampleDriverVersionCounter.FieldID: { + FieldId: SampleDriverVersionCounter.FieldID, + EntityLevel: dcgm.FE_NONE, + }, + SampleSwitchCurrentTempCounter.FieldID: { + FieldId: SampleSwitchCurrentTempCounter.FieldID, + EntityLevel: dcgm.FE_SWITCH, + }, + SampleSwitchLinkFlitErrorsCounter.FieldID: { + FieldId: SampleSwitchLinkFlitErrorsCounter.FieldID, + EntityLevel: dcgm.FE_LINK, + }, + SampleCPUUtilTotalCounter.FieldID: { + FieldId: SampleCPUUtilTotalCounter.FieldID, + EntityLevel: 
dcgm.FE_CPU_CORE, + }, + } +) diff --git a/internal/pkg/transformation/const.go b/internal/pkg/transformation/const.go new file mode 100644 index 00000000..3461918d --- /dev/null +++ b/internal/pkg/transformation/const.go @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +const ( + // Note standard resource attributes + podAttribute = "pod" + namespaceAttribute = "namespace" + containerAttribute = "container" + + hpcJobAttribute = "hpc_job" + + oldPodAttribute = "pod_name" + oldNamespaceAttribute = "pod_namespace" + oldContainerAttribute = "container_name" +) diff --git a/pkg/dcgmexporter/hpc.go b/internal/pkg/transformation/hpc.go similarity index 62% rename from pkg/dcgmexporter/hpc.go rename to internal/pkg/transformation/hpc.go index e360b096..08b6bea7 100644 --- a/pkg/dcgmexporter/hpc.go +++ b/internal/pkg/transformation/hpc.go @@ -14,23 +14,29 @@ * limitations under the License. 
*/ -package dcgmexporter +package transformation import ( "bufio" + "fmt" + "log/slog" sysOS "os" "path" "strconv" - "github.com/sirupsen/logrus" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/utils" ) type hpcMapper struct { - Config *Config + Config *appconfig.Config } -func newHPCMapper(c *Config) *hpcMapper { - logrus.Infof("HPC job mapping is enabled and watch for the %q directory", c.HPCJobMappingDir) +func newHPCMapper(c *appconfig.Config) *hpcMapper { + slog.Info(fmt.Sprintf("HPC job mapping is enabled and watch for the %q directory", c.HPCJobMappingDir)) return &hpcMapper{ Config: c, } @@ -40,10 +46,11 @@ func (p *hpcMapper) Name() string { return "hpcMapper" } -func (p *hpcMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error { +func (p *hpcMapper) Process(metrics collector.MetricsByCounter, _ deviceinfo.Provider) error { _, err := os.Stat(p.Config.HPCJobMappingDir) if err != nil { - logrus.WithError(err).Warnf("Unable to access HPC job mapping file directory '%s' - directory not found. Ignoring.", p.Config.HPCJobMappingDir) + slog.Error(fmt.Sprintf("Unable to access HPC job mapping file directory '%s' - directory not found. 
Ignoring.", + p.Config.HPCJobMappingDir), slog.String(logging.ErrorKey, err.Error())) return nil } @@ -54,7 +61,7 @@ func (p *hpcMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error gpuToJobMap := make(map[string][]string) - logrus.Debugf("HPC job mapping files: %#v", gpuFiles) + slog.Debug(fmt.Sprintf("HPC job mapping files: %#v", gpuFiles)) for _, gpuFileName := range gpuFiles { jobs, err := readFile(path.Join(p.Config.HPCJobMappingDir, gpuFileName)) @@ -68,17 +75,18 @@ func (p *hpcMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error gpuToJobMap[gpuFileName] = append(gpuToJobMap[gpuFileName], jobs...) } - logrus.Debugf("GPU to job mapping: %+v", gpuToJobMap) + slog.Debug(fmt.Sprintf("GPU to job mapping: %+v", gpuToJobMap)) for counter := range metrics { - var modifiedMetrics []Metric + var modifiedMetrics []collector.Metric for _, metric := range metrics[counter] { jobs, exists := gpuToJobMap[metric.GPU] if exists { for _, job := range jobs { - modifiedMetric, err := deepCopy(metric) + modifiedMetric, err := utils.DeepCopy(metric) if err != nil { - logrus.WithError(err).Errorf("Can not create deepCopy for the value: %v", metric) + slog.Error(fmt.Sprintf("Can not create deepCopy for the value: %v", metric), + slog.String(logging.ErrorKey, err.Error())) continue } modifiedMetric.Attributes[hpcJobAttribute] = job @@ -104,7 +112,8 @@ func readFile(path string) ([]string, error) { defer func(file *sysOS.File) { err := file.Close() if err != nil { - logrus.WithError(err).Errorf("Failed for close the file: %s", file.Name()) + slog.Error(fmt.Sprintf("Failed for close the file: %s", file.Name()), + slog.String(logging.ErrorKey, err.Error())) } }(file) @@ -130,25 +139,25 @@ func getGPUFiles(dirPath string) ([]string, error) { return nil, err } - logrus.Debugf("hpc mapper: %d files in the %q found", len(files), dirPath) + slog.Debug(fmt.Sprintf("hpc mapper: %d files in the %q found", len(files), dirPath)) var mappingFiles []string for _, file 
:= range files { finfo, err := file.Info() if err != nil { - logrus.Warnf("HPC mapper: can not get file info for the %s file.", file.Name()) + slog.Warn(fmt.Sprintf("HPC mapper: can not get file info for the %s file.", file.Name())) continue // Skip files that we can't read } if finfo.IsDir() { - logrus.Debugf("HPC mapper: the %q file is directory", file.Name()) + slog.Debug(fmt.Sprintf("HPC mapper: the %q file is directory", file.Name())) continue // Skip directories } _, err = strconv.Atoi(file.Name()) if err != nil { - logrus.Debugf("HPC mapper: file %q name doesn't match with GPU ID convention", file.Name()) + slog.Debug(fmt.Sprintf("HPC mapper: file %q name doesn't match with GPU ID convention", file.Name())) continue } mappingFiles = append(mappingFiles, file.Name()) diff --git a/pkg/dcgmexporter/hpc_test.go b/internal/pkg/transformation/hpc_test.go similarity index 77% rename from pkg/dcgmexporter/hpc_test.go rename to internal/pkg/transformation/hpc_test.go index 8b834955..0cb3730e 100644 --- a/pkg/dcgmexporter/hpc_test.go +++ b/internal/pkg/transformation/hpc_test.go @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package dcgmexporter +package transformation import ( "cmp" @@ -24,13 +24,15 @@ import ( "slices" "testing" - "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" - osmock "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" + mockos "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" ) @@ -39,39 +41,39 @@ func TestHPCProcess(t *testing.T) { tests := []struct { name string - config *Config + config *appconfig.Config fsState func() func() - assertion func(*testing.T, MetricsByCounter) + assertion func(*testing.T, collector.MetricsByCounter) wantErr assert.ErrorAssertionFunc }{ { name: "When all GPU have job files", - config: &Config{HPCJobMappingDir: "/var/run/nvidia/slurm"}, + config: &appconfig.Config{HPCJobMappingDir: "/var/run/nvidia/slurm"}, fsState: func() func() { ctrl := gomock.NewController(t) - mOS := osmock.NewMockOS(ctrl) - mFileInfoGPU0 := osmock.NewMockFileInfo(ctrl) + mOS := mockos.NewMockOS(ctrl) + mFileInfoGPU0 := mockos.NewMockFileInfo(ctrl) mFileInfoGPU0.EXPECT().IsDir().Return(false).AnyTimes() - mDirEntryGPU0 := osmock.NewMockDirEntry(ctrl) + mDirEntryGPU0 := mockos.NewMockDirEntry(ctrl) mDirEntryGPU0.EXPECT().Info().Return(mFileInfoGPU0, nil).AnyTimes() mDirEntryGPU0.EXPECT().Name().Return("0").AnyTimes() - mFileInfoGPU1 := osmock.NewMockFileInfo(ctrl) + mFileInfoGPU1 := mockos.NewMockFileInfo(ctrl) mFileInfoGPU1.EXPECT().IsDir().Return(false).AnyTimes() - mDirEntryGPU1 := osmock.NewMockDirEntry(ctrl) + mDirEntryGPU1 := mockos.NewMockDirEntry(ctrl) mDirEntryGPU1.EXPECT().Info().Return(mFileInfoGPU1, nil).AnyTimes() mDirEntryGPU1.EXPECT().Name().Return("1").AnyTimes() - mFileInfoDir := 
osmock.NewMockFileInfo(ctrl) + mFileInfoDir := mockos.NewMockFileInfo(ctrl) mFileInfoDir.EXPECT().IsDir().Return(true).AnyTimes() - mDirEntryDir := osmock.NewMockDirEntry(ctrl) + mDirEntryDir := mockos.NewMockDirEntry(ctrl) mDirEntryDir.EXPECT().Info().Return(mFileInfoDir, nil).AnyTimes() mDirEntryDir.EXPECT().Name().Return("iamdir").AnyTimes() - mDirEntryDamagedFile := osmock.NewMockDirEntry(ctrl) + mDirEntryDamagedFile := mockos.NewMockDirEntry(ctrl) mDirEntryDamagedFile.EXPECT().Info().Return(nil, errors.New("boom")).AnyTimes() mDirEntryDamagedFile.EXPECT().Name().Return("iamerror").AnyTimes() @@ -107,13 +109,13 @@ func TestHPCProcess(t *testing.T) { _ = realOS.Remove(slurm1.Name()) } }, - assertion: func(t *testing.T, mbc MetricsByCounter) { + assertion: func(t *testing.T, mbc collector.MetricsByCounter) { require.Len(t, mbc, 1, "metrics are expected for a single counter only.") // We get metric value with 0 index - metricValues := mbc[reflect.ValueOf(mbc).MapKeys()[0].Interface().(Counter)] + metricValues := mbc[reflect.ValueOf(mbc).MapKeys()[0].Interface().(counters.Counter)] require.Len(t, metricValues, 4, "received unexpected number of metric values.") // Sort metrics by GPU ID - slices.SortFunc(metricValues, func(a, b Metric) int { + slices.SortFunc(metricValues, func(a, b collector.Metric) int { return cmp.Compare(a.GPU, b.GPU) }) assert.Equal(t, "0", metricValues[0].GPU) @@ -141,20 +143,20 @@ func TestHPCProcess(t *testing.T) { defer cleanup() } - metrics := MetricsByCounter{} - counter := Counter{ + metrics := collector.MetricsByCounter{} + counter := counters.Counter{ FieldID: 155, FieldName: "DCGM_FI_DEV_POWER_USAGE", PromType: "gauge", } - metrics[counter] = append(metrics[counter], Metric{ + metrics[counter] = append(metrics[counter], collector.Metric{ GPU: "0", GPUUUID: uuid.New().String(), GPUDevice: "nvidia0", GPUInstanceID: "", Value: "42", - Counter: Counter{ + Counter: counters.Counter{ FieldID: 155, FieldName: "DCGM_FI_DEV_POWER_USAGE", 
PromType: "gauge", @@ -162,13 +164,13 @@ func TestHPCProcess(t *testing.T) { Attributes: map[string]string{}, }) - metrics[counter] = append(metrics[counter], Metric{ + metrics[counter] = append(metrics[counter], collector.Metric{ GPU: "1", GPUUUID: uuid.New().String(), GPUDevice: "nvidia1", GPUInstanceID: "1", Value: "451", - Counter: Counter{ + Counter: counters.Counter{ FieldID: 155, FieldName: "DCGM_FI_DEV_POWER_USAGE", PromType: "gauge", @@ -176,13 +178,13 @@ func TestHPCProcess(t *testing.T) { Attributes: map[string]string{}, }) - metrics[counter] = append(metrics[counter], Metric{ + metrics[counter] = append(metrics[counter], collector.Metric{ GPU: "2", GPUUUID: uuid.New().String(), GPUDevice: "nvidia3", GPUInstanceID: "2", Value: "1984", - Counter: Counter{ + Counter: counters.Counter{ FieldID: 155, FieldName: "DCGM_FI_DEV_POWER_USAGE", PromType: "gauge", @@ -190,26 +192,9 @@ func TestHPCProcess(t *testing.T) { Attributes: map[string]string{}, }) - sysInfo := SystemInfo{ - GPUCount: 2, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - DeviceInfo: dcgm.Device{ - UUID: "00000000-0000-0000-0000-000000000000", - GPU: 0, - }, - }, - { - DeviceInfo: dcgm.Device{ - UUID: "00000000-0000-0000-0000-000000000001", - GPU: 1, - }, - }, - }, - } mapper := newHPCMapper(tt.config) - err := mapper.Process(metrics, sysInfo) - if tt.wantErr != nil && !tt.wantErr(t, err, fmt.Sprintf("hpcMapper.Process(%v,%v)", metrics, sysInfo)) { + err := mapper.Process(metrics, nil) + if tt.wantErr != nil && !tt.wantErr(t, err, fmt.Sprintf("hpcMapper.Process(%v,%v)", metrics, nil)) { return } tt.assertion(t, metrics) @@ -218,5 +203,5 @@ func TestHPCProcess(t *testing.T) { } func TestHPCName(t *testing.T) { - assert.Equal(t, "hpcMapper", newHPCMapper(&Config{}).Name()) + assert.Equal(t, "hpcMapper", newHPCMapper(&appconfig.Config{}).Name()) } diff --git a/pkg/dcgmexporter/kubernetes.go b/internal/pkg/transformation/kubernetes.go similarity index 75% rename from pkg/dcgmexporter/kubernetes.go 
rename to internal/pkg/transformation/kubernetes.go index 8fb8d7d2..1121023e 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,22 +14,27 @@ * limitations under the License. */ -package dcgmexporter +package transformation import ( "context" "fmt" + "log/slog" "net" "regexp" "slices" "strings" "time" - "github.com/sirupsen/logrus" + "google.golang.org/grpc/resolver" + "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" ) @@ -38,26 +43,25 @@ var ( gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`) gkeVirtualGPUDeviceIDSeparator = "/vgpu" - nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID ) -func NewPodMapper(c *Config) (*PodMapper, error) { - logrus.Infof("Kubernetes metrics collection enabled!") +func NewPodMapper(c *appconfig.Config) *PodMapper { + slog.Info("Kubernetes metrics collection enabled!") return &PodMapper{ Config: c, - }, nil + } } func (p *PodMapper) Name() string { return "podMapper" } -func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error { +func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo deviceinfo.Provider) error { socketPath := p.Config.PodResourcesKubeletSocket _, err := os.Stat(socketPath) if os.IsNotExist(err) { - logrus.Info("No Kubelet socket, ignoring") + slog.Info("No Kubelet socket, ignoring") return nil } @@ 
-73,15 +77,17 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error return err } - deviceToPod := p.toDeviceToPod(pods, sysInfo) + slog.Debug(fmt.Sprintf("Podresources API response: %+v", pods)) + + deviceToPod := p.toDeviceToPod(pods, deviceInfo) - logrus.Debugf("Device to pod mapping: %+v", deviceToPod) + slog.Debug(fmt.Sprintf("Device to pod mapping: %+v", deviceToPod)) // Note: for loop are copies the value, if we want to change the value // and not the copy, we need to use the indexes for counter := range metrics { for j, val := range metrics[counter] { - deviceID, err := val.getIDOfType(p.Config.KubernetesGPUIdType) + deviceID, err := val.GetIDOfType(p.Config.KubernetesGPUIdType) if err != nil { return err } @@ -105,20 +111,17 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error } func connectToServer(socket string) (*grpc.ClientConn, func(), error) { - ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout) - defer cancel() - - conn, err := grpc.DialContext(ctx, + resolver.SetDefaultScheme("passthrough") + conn, err := grpc.NewClient( socket, grpc.WithTransportCredentials(insecure.NewCredentials()), - grpc.WithBlock(), grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { d := net.Dialer{} return d.DialContext(ctx, "unix", addr) }), ) if err != nil { - return nil, func() {}, fmt.Errorf("failure connecting to '%s'; err: %w", socket, err) + return nil, doNothing, fmt.Errorf("failure connecting to '%s'; err: %w", socket, err) } return conn, func() { conn.Close() }, nil @@ -139,7 +142,7 @@ func (p *PodMapper) listPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodRes } func (p *PodMapper) toDeviceToPod( - devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo, + devicePods *podresourcesapi.ListPodResourcesResponse, deviceInfo deviceinfo.Provider, ) map[string]PodInfo { deviceToPodMap := make(map[string]PodInfo) @@ -148,9 +151,9 @@ func 
(p *PodMapper) toDeviceToPod( for _, device := range container.GetDevices() { resourceName := device.GetResourceName() - if resourceName != nvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) { + if resourceName != appconfig.NvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) { // Mig resources appear differently than GPU resources - if !strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) { + if !strings.HasPrefix(resourceName, appconfig.NvidiaMigResourcePrefix) { continue } } @@ -162,14 +165,14 @@ func (p *PodMapper) toDeviceToPod( } for _, deviceID := range device.GetDeviceIds() { - if strings.HasPrefix(deviceID, MIG_UUID_PREFIX) { - migDevice, err := nvmlGetMIGDeviceInfoByIDHook(deviceID) + if strings.HasPrefix(deviceID, appconfig.MIG_UUID_PREFIX) { + migDevice, err := nvmlprovider.Client().GetMIGDeviceInfoByID(deviceID) if err == nil { - giIdentifier := GetGPUInstanceIdentifier(sysInfo, migDevice.ParentUUID, + giIdentifier := deviceinfo.GetGPUInstanceIdentifier(deviceInfo, migDevice.ParentUUID, uint(migDevice.GPUInstanceID)) deviceToPodMap[giIdentifier] = podInfo } - gpuUUID := deviceID[len(MIG_UUID_PREFIX):] + gpuUUID := deviceID[len(appconfig.MIG_UUID_PREFIX):] deviceToPodMap[gpuUUID] = podInfo } else if gkeMigDeviceIDMatches := gkeMigDeviceIDRegex.FindStringSubmatch(deviceID); gkeMigDeviceIDMatches != nil { var gpuIndex string diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go new file mode 100644 index 00000000..72f26c99 --- /dev/null +++ b/internal/pkg/transformation/kubernetes_test.go @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package transformation + +import ( + "fmt" + "reflect" + "testing" + + "github.com/sirupsen/logrus" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + "google.golang.org/grpc" + podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" + + mockdeviceinfo "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/deviceinfo" + mocknvmlprovider "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/nvmlprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" +) + +func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { + testutils.RequireLinux(t) + logrus.SetLevel(logrus.DebugLevel) + type TestCase struct { + KubernetesGPUIDType appconfig.KubernetesGPUIDType + GPUInstanceID uint + ResourceName string + MetricGPUID string + MetricGPUDevice string + MetricMigProfile string + PODGPUID string + NvidiaResourceNames []string + } + + testCases := []TestCase{ + { + KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: appconfig.NvidiaResourceName, + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + 
KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: appconfig.NvidiaResourceName, + MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + MetricMigProfile: "", + }, + { + KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: appconfig.NvidiaResourceName, + GPUInstanceID: 3, + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + MetricMigProfile: "", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + KubernetesGPUIDType: appconfig.DeviceName, + ResourceName: appconfig.NvidiaResourceName, + GPUInstanceID: 3, + MetricMigProfile: "mig", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + KubernetesGPUIDType: appconfig.DeviceName, + ResourceName: appconfig.NvidiaResourceName, + MetricMigProfile: "mig", + PODGPUID: "nvidia0/gi0", + }, + { + KubernetesGPUIDType: appconfig.DeviceName, + ResourceName: appconfig.NvidiaResourceName, + MetricGPUDevice: "0", + PODGPUID: "0/vgpu", + }, + { + KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: appconfig.NvidiaResourceName, + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::", + }, + { + KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: "nvidia.com/mig-1g.10gb", + MetricMigProfile: "1g.10gb", + MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + MetricGPUDevice: "0", + GPUInstanceID: 3, + }, + { + KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: "nvidia.com/a100", + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + NvidiaResourceNames: []string{"nvidia.com/a100"}, + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("when type %s, pod device id %s metric device id %s and gpu device %s", + tc.KubernetesGPUIDType, + tc.PODGPUID, + tc.MetricGPUID, + tc.MetricGPUDevice, + ), + func(t *testing.T) { + tmpDir, cleanup := 
testutils.CreateTmpDir(t) + defer cleanup() + socketPath := tmpDir + "/kubelet.sock" + server := grpc.NewServer() + + config := &appconfig.Config{ + UseRemoteHE: false, + } + + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + gpus := []string{tc.PODGPUID} + podresourcesapi.RegisterPodResourcesListerServer(server, + testutils.NewMockPodResourcesServer(tc.ResourceName, gpus)) + + cleanup = testutils.StartMockServer(t, server, socketPath) + defer cleanup() + + migDeviceInfo := &nvmlprovider.MIGDeviceInfo{ + ParentUUID: "00000000-0000-0000-0000-000000000000", + GPUInstanceID: 3, + ComputeInstanceID: 0, + } + + ctrl := gomock.NewController(t) + mockNVMLProvider := mocknvmlprovider.NewMockNVML(ctrl) + mockNVMLProvider.EXPECT().GetMIGDeviceInfoByID(gomock.Any()).Return(migDeviceInfo, nil).AnyTimes() + nvmlprovider.SetClient(mockNVMLProvider) + + podMapper := NewPodMapper(&appconfig.Config{ + KubernetesGPUIdType: tc.KubernetesGPUIDType, + PodResourcesKubeletSocket: socketPath, + NvidiaResourceNames: tc.NvidiaResourceNames, + }) + require.NotNil(t, podMapper) + metrics := collector.MetricsByCounter{} + counter := counters.Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + } + + metrics[counter] = append(metrics[counter], collector.Metric{ + GPU: "0", + GPUUUID: tc.MetricGPUID, + GPUDevice: tc.MetricGPUDevice, + GPUInstanceID: fmt.Sprint(tc.GPUInstanceID), + Value: "42", + MigProfile: tc.MetricMigProfile, + Counter: counters.Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + Attributes: map[string]string{}, + }) + + mockGPU := deviceinfo.GPUInfo{ + DeviceInfo: dcgm.Device{ + UUID: "00000000-0000-0000-0000-000000000000", + GPU: 0, + }, + MigEnabled: true, + } + + mockSystemInfo := mockdeviceinfo.NewMockProvider(ctrl) + mockSystemInfo.EXPECT().GPUCount().Return(uint(1)).AnyTimes() + mockSystemInfo.EXPECT().GPU(uint(0)).Return(mockGPU).AnyTimes() + + err := 
podMapper.Process(metrics, mockSystemInfo) + require.NoError(t, err) + assert.Len(t, metrics, 1) + for _, metric := range metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(counters.Counter)] { + require.Contains(t, metric.Attributes, podAttribute) + require.Contains(t, metric.Attributes, namespaceAttribute) + require.Contains(t, metric.Attributes, containerAttribute) + + // TODO currently we rely on ordering and implicit expectations of the mock implementation + // This should be a table comparison + require.Equal(t, fmt.Sprintf("gpu-pod-%d", 0), metric.Attributes[podAttribute]) + require.Equal(t, "default", metric.Attributes[namespaceAttribute]) + require.Equal(t, "default", metric.Attributes[containerAttribute]) + } + }) + } +} diff --git a/internal/pkg/transformation/transformer.go b/internal/pkg/transformation/transformer.go new file mode 100644 index 00000000..86f56c84 --- /dev/null +++ b/internal/pkg/transformation/transformer.go @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package transformation + +import ( + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" +) + +// GetTransformations return list of transformation applicable for metrics +func GetTransformations(c *appconfig.Config) []Transform { + var transformations []Transform + if c.Kubernetes { + podMapper := NewPodMapper(c) + transformations = append(transformations, podMapper) + } + + if c.HPCJobMappingDir != "" { + hpcMapper := newHPCMapper(c) + transformations = append(transformations, hpcMapper) + } + + return transformations +} diff --git a/internal/pkg/transformation/transformer_test.go b/internal/pkg/transformation/transformer_test.go new file mode 100644 index 00000000..f2ab6652 --- /dev/null +++ b/internal/pkg/transformation/transformer_test.go @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package transformation + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" +) + +func TestGetTransformations(t *testing.T) { + tests := []struct { + name string + config *appconfig.Config + assert func(*testing.T, []Transform) + }{ + { + name: "The environment is not kubernetes", + config: &appconfig.Config{ + Kubernetes: false, + }, + assert: func(t *testing.T, transforms []Transform) { + assert.Len(t, transforms, 0) + }, + }, + { + name: "The environment is kubernetes", + config: &appconfig.Config{ + Kubernetes: true, + }, + assert: func(t *testing.T, transforms []Transform) { + assert.Len(t, transforms, 1) + }, + }, + { + name: "The environment is HPC cluster", + config: &appconfig.Config{ + HPCJobMappingDir: "/var/run/nvidia/slurm", + }, + assert: func(t *testing.T, transforms []Transform) { + assert.Len(t, transforms, 1) + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + transformations := GetTransformations(tt.config) + tt.assert(t, transformations) + }) + } +} diff --git a/internal/pkg/transformation/types.go b/internal/pkg/transformation/types.go new file mode 100644 index 00000000..2bc896fc --- /dev/null +++ b/internal/pkg/transformation/types.go @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package transformation + +import ( + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" +) + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/transformations/mock_transformer.go -package=transformation -copyright_file=../../../hack/header.txt . Transform + +type Transform interface { + Process(metrics collector.MetricsByCounter, deviceInfo deviceinfo.Provider) error + Name() string +} + +type PodMapper struct { + Config *appconfig.Config +} + +type PodInfo struct { + Name string + Namespace string + Container string +} diff --git a/internal/pkg/transformation/variables.go b/internal/pkg/transformation/variables.go new file mode 100644 index 00000000..93e7da6c --- /dev/null +++ b/internal/pkg/transformation/variables.go @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package transformation + +import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" + +var os osinterface.OS = osinterface.RealOS{} + +var doNothing = func() { + // This function is intentionally left blank +} diff --git a/pkg/dcgmexporter/utils.go b/internal/pkg/utils/utils.go similarity index 73% rename from pkg/dcgmexporter/utils.go rename to internal/pkg/utils/utils.go index 6f5391bc..b194a4ae 100644 --- a/pkg/dcgmexporter/utils.go +++ b/internal/pkg/utils/utils.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,12 @@ * limitations under the License. */ -package dcgmexporter +package utils import ( "bytes" + "crypto/rand" + "encoding/binary" "encoding/gob" "fmt" "sync" @@ -38,7 +40,17 @@ func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error { } } -func deepCopy[T any](src T) (dst T, err error) { +func RandUint64() (uint64, error) { + var num uint64 + err := binary.Read(rand.Reader, binary.BigEndian, &num) + if err != nil { + return 0, fmt.Errorf("failed to generate random 64-bit number; err: %w", err) + } + + return num, nil +} + +func DeepCopy[T any](src T) (dst T, err error) { var buf bytes.Buffer defer func() { @@ -63,3 +75,11 @@ func deepCopy[T any](src T) (dst T, err error) { return dst, nil } + +func CleanupOnError(cleanups []func()) []func() { + for _, cleanup := range cleanups { + cleanup() + } + + return nil +} diff --git a/pkg/dcgmexporter/utils_test.go b/internal/pkg/utils/utils_test.go similarity index 50% rename from pkg/dcgmexporter/utils_test.go rename to internal/pkg/utils/utils_test.go index c36e1e9f..3c9c488b 100644 --- a/pkg/dcgmexporter/utils_test.go +++ b/internal/pkg/utils/utils_test.go @@ -14,15 +14,19 @@ * limitations under the License. 
*/ -package dcgmexporter +package utils import ( + "crypto/rand" + "fmt" "sync" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" ) func TestWaitWithTimeout(t *testing.T) { @@ -46,17 +50,82 @@ func TestWaitWithTimeout(t *testing.T) { }) } +func TestRandUint64_Success(t *testing.T) { + num, err := RandUint64() + assert.Nil(t, err, "Unexpected error: %v", err) + assert.NotZero(t, num, "Expected a non-zero uint64, but got 0") +} + +func TestRandUint64_Failure(t *testing.T) { + // Simulate a failure in rand.Reader using mock rand.Reader + mockReader := &testutils.MockReader{Err: fmt.Errorf("mock error")} + + originalReader := rand.Reader + rand.Reader = mockReader + defer func() { + rand.Reader = originalReader + }() + + num, err := RandUint64() + assert.NotNil(t, err, "Expected an error") + assert.Zero(t, num, fmt.Sprintf("Expected a uint64, but got %d", num)) +} + func TestDeepCopy(t *testing.T) { t.Run("Return error when pointer value is nil", func(t *testing.T) { - got, err := deepCopy[*struct{}](nil) + got, err := DeepCopy[*struct{}](nil) assert.Nil(t, got) assert.Error(t, err) }) t.Run("Return error when src is unsupported type", func(t *testing.T) { ch := make(chan int) - got, err := deepCopy(ch) + got, err := DeepCopy(ch) assert.Nil(t, got) assert.Error(t, err) }) } + +func TestCleanupOnError(t *testing.T) { + tests := []struct { + name string + cleanups []func() + want []func() + }{ + { + name: "Nil cleanup functions", + cleanups: nil, + want: nil, + }, + { + name: "Empty cleanup functions", + cleanups: []func(){}, + want: nil, + }, + { + name: "One cleanup functions", + cleanups: []func(){ + func() {}, + }, + want: nil, + }, + { + name: "Multiple cleanup functions", + cleanups: []func(){ + func() {}, + func() { + func() { + // This function is intentionally left blank + }() + }, + func() {}, + }, + want: nil, + }, + } + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + assert.Equalf(t, tt.want, CleanupOnError(tt.cleanups), "expected output to be the same.") + }) + } +} diff --git a/packaging/config-files/systemd/nvidia-dcgm-exporter.service b/packaging/config-files/systemd/nvidia-dcgm-exporter.service new file mode 100644 index 00000000..aae6f167 --- /dev/null +++ b/packaging/config-files/systemd/nvidia-dcgm-exporter.service @@ -0,0 +1,33 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +[Unit] +Description=NVIDIA DCGM-exporter service +Wants=nvidia-dcgm.service +After=nvidia-dcgm.service + +[Service] +User=root +PrivateTmp=false + +StandardOutput=append:/var/dcgm-exporter.log +StandardError=append:/var/dcgm-exporter.log + +ExecStart=/usr/bin/dcgm-exporter -f /etc/dcgm-exporter/default-counters.csv + +Restart=on-abort + +[Install] +WantedBy=multi-user.target diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index c78f8434..7da26ca3 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "fmt" + "log/slog" "os" "os/signal" "runtime" @@ -17,11 +18,22 @@ import ( "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" - "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" - "github.com/NVIDIA/dcgm-exporter/pkg/stdout" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/hostname" + . "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/prerequisites" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/registry" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/server" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/stdout" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/utils" ) const ( @@ -150,9 +162,9 @@ func NewApp(buildVersion ...string) *cli.App { }, &cli.StringFlag{ Name: CLIKubernetesGPUIDType, - Value: string(dcgmexporter.GPUUID), + Value: string(appconfig.GPUUID), Usage: fmt.Sprintf("Choose Type of GPU ID to use to map kubernetes resources to pods. 
Possible values: '%s', '%s'", - dcgmexporter.GPUUID, dcgmexporter.DeviceName), + appconfig.GPUUID, appconfig.DeviceName), EnvVars: []string{"DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"}, }, &cli.StringFlag{ @@ -185,7 +197,7 @@ func NewApp(buildVersion ...string) *cli.App { &cli.StringFlag{ Name: CLIWebConfigFile, Value: "", - Usage: "TLS config file following webConfig spec.", + Usage: "Web configuration file following webConfig spec: https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md.", EnvVars: []string{"DCGM_EXPORTER_WEB_CONFIG_FILE"}, }, &cli.IntFlag{ @@ -222,7 +234,7 @@ func NewApp(buildVersion ...string) *cli.App { }, &cli.StringFlag{ Name: CLIDCGMLogLevel, - Value: dcgmexporter.DCGMDbgLvlNone, + Value: DCGMDbgLvlNone, Usage: "Specify the DCGM log verbosity level. This parameter is effective only when the '--enable-dcgm-log' option is set to 'true'. Possible values: NONE, FATAL, ERROR, WARN, INFO, DEBUG and VERB", EnvVars: []string{"DCGM_EXPORTER_DCGM_LOG_LEVEL"}, }, @@ -255,7 +267,8 @@ func NewApp(buildVersion ...string) *cli.App { }) } else { err := "dcgm-exporter is only supported on Linux." - logrus.Fatal(err) + slog.Error(err) + fatal() return nil } @@ -266,6 +279,10 @@ func NewApp(buildVersion ...string) *cli.App { return c } +func fatal() { + os.Exit(1) +} + func newOSWatcher(sigs ...os.Signal) chan os.Signal { sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, sigs...) @@ -280,7 +297,7 @@ func action(c *cli.Context) (err error) { // during initialization and return an error. 
defer func() { if r := recover(); r != nil { - logrus.WithField(dcgmexporter.LoggerStackTrace, string(debug.Stack())).Error("Encountered a failure.") + slog.Error("Encountered a failure.", slog.String(StackTrace, string(debug.Stack()))) err = fmt.Errorf("encountered a failure; err: %v", r) } }() @@ -291,7 +308,12 @@ func action(c *cli.Context) (err error) { func startDCGMExporter(c *cli.Context, cancel context.CancelFunc) error { restart: - logrus.Info("Starting dcgm-exporter") + var version string + if c != nil && c.App != nil { + version = c.App.Version + } + + slog.Info("Starting dcgm-exporter", slog.String("Version", version)) config, err := contextToConfig(c) if err != nil { @@ -300,41 +322,40 @@ restart: enableDebugLogging(config) - cleanupDCGM := initDCGM(config) - defer cleanupDCGM() + err = prerequisites.Validate() + if err != nil { + return err + } + + // Initialize DCGM Provider Instance + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() + + slog.Info("DCGM successfully initialized!") - logrus.Info("DCGM successfully initialized!") + // Initialize NVML Provider Instance + nvmlprovider.Initialize() + defer nvmlprovider.Client().Cleanup() - dcgm.FieldsInit() - defer dcgm.FieldsTerm() + slog.Info("NVML provider successfully initialized!") fillConfigMetricGroups(config) cs := getCounters(config) - fieldEntityGroupTypeSystemInfo := getFieldEntityGroupTypeSystemInfo(cs, config) + deviceWatchListManager := startDeviceWatchListManager(cs, config) - hostname, err := dcgmexporter.GetHostname(config) + hostname, err := hostname.GetHostname(config) if err != nil { return err } - pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, - cs.DCGMCounters, - hostname, - dcgmexporter.NewDCGMCollector, - fieldEntityGroupTypeSystemInfo, - ) - defer cleanup() - if err != nil { - logrus.Fatal(err) - } + cf := collector.InitCollectorFactory(cs, deviceWatchListManager, hostname, config) - cRegistry := dcgmexporter.NewRegistry() - - 
enableDCGMExpXIDErrorsCountCollector(cs, fieldEntityGroupTypeSystemInfo, hostname, config, cRegistry) - - enableDCGMExpClockEventsCount(cs, fieldEntityGroupTypeSystemInfo, hostname, config, cRegistry) + cRegistry := registry.NewRegistry() + for _, entityCollector := range cf.NewCollectors() { + cRegistry.Register(entityCollector) + } defer func() { cRegistry.Cleanup() @@ -346,11 +367,8 @@ restart: stop := make(chan interface{}) wg.Add(1) - go pipeline.Run(ch, stop, &wg) - wg.Add(1) - - server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry) + server, cleanup, err := server.NewMetricsServer(config, ch, deviceWatchListManager, cRegistry) defer cleanup() if err != nil { return err @@ -362,9 +380,10 @@ restart: sig := <-sigs close(stop) cancel() - err = dcgmexporter.WaitWithTimeout(&wg, time.Second*2) + err = utils.WaitWithTimeout(&wg, time.Second*2) if err != nil { - logrus.Fatal(err) + slog.Error(err.Error()) + fatal() } if sig == syscall.SIGHUP { @@ -374,69 +393,40 @@ restart: return nil } -func enableDCGMExpClockEventsCount(cs *dcgmexporter.CounterSet, fieldEntityGroupTypeSystemInfo *dcgmexporter.FieldEntityGroupTypeSystemInfo, hostname string, config *dcgmexporter.Config, cRegistry *dcgmexporter.Registry) { - if dcgmexporter.IsDCGMExpClockEventsCountEnabled(cs.ExporterCounters) { - item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - if !exists { - logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockEventsCount.String()) - } - clocksThrottleReasonsCollector, err := dcgmexporter.NewClockEventsCollector( - cs.ExporterCounters, hostname, config, item) - if err != nil { - logrus.Fatal(err) - } - - cRegistry.Register(clocksThrottleReasonsCollector) - - logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockEventsCount.String()) - } -} - -func enableDCGMExpXIDErrorsCountCollector(cs *dcgmexporter.CounterSet, fieldEntityGroupTypeSystemInfo *dcgmexporter.FieldEntityGroupTypeSystemInfo, hostname string, config 
*dcgmexporter.Config, cRegistry *dcgmexporter.Registry) { - if dcgmexporter.IsDCGMExpXIDErrorsCountEnabled(cs.ExporterCounters) { - item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - if !exists { - logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMXIDErrorsCount.String()) - } - - xidCollector, err := dcgmexporter.NewXIDCollector(cs.ExporterCounters, hostname, config, item) - if err != nil { - logrus.Fatal(err) - } - - cRegistry.Register(xidCollector) - - logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String()) - } -} - -func getFieldEntityGroupTypeSystemInfo(cs *dcgmexporter.CounterSet, config *dcgmexporter.Config) *dcgmexporter.FieldEntityGroupTypeSystemInfo { - var allCounters []dcgmexporter.Counter +func startDeviceWatchListManager( + cs *counters.CounterSet, config *appconfig.Config, +) devicewatchlistmanager.Manager { + // Create a list containing DCGM Collector, Exp Collectors and all the label Collectors + var allCounters counters.CounterList + var deviceWatchListManager devicewatchlistmanager.Manager allCounters = append(allCounters, cs.DCGMCounters...) 
allCounters = appendDCGMXIDErrorsCountDependency(allCounters, cs) allCounters = appendDCGMClockEventsCountDependency(cs, allCounters) - fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config) + deviceWatchListManager = devicewatchlistmanager.NewWatchListManager(allCounters, config) + deviceWatcher := devicewatcher.NewDeviceWatcher() - for _, egt := range dcgmexporter.FieldEntityGroupTypeToMonitor { - err := fieldEntityGroupTypeSystemInfo.Load(egt) + for _, deviceType := range devicewatchlistmanager.DeviceTypesToWatch { + err := deviceWatchListManager.CreateEntityWatchList(deviceType, deviceWatcher, int64(config.CollectInterval)) if err != nil { - logrus.Infof("Not collecting %s metrics; %s", egt.String(), err) + slog.Info(fmt.Sprintf("Not collecting %s metrics; %s", deviceType.String(), err)) } } - return fieldEntityGroupTypeSystemInfo + return deviceWatchListManager } // appendDCGMXIDErrorsCountDependency appends DCGM counters required for the DCGM_EXP_CLOCK_EVENTS_COUNT metric -func appendDCGMClockEventsCountDependency(cs *dcgmexporter.CounterSet, allCounters []dcgmexporter.Counter) []dcgmexporter.Counter { +func appendDCGMClockEventsCountDependency( + cs *counters.CounterSet, allCounters []counters.Counter, +) []counters.Counter { if len(cs.ExporterCounters) > 0 { - if containsField(cs.ExporterCounters, dcgmexporter.DCGMClockEventsCount) && - !containsField(allCounters, dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS) { + if containsField(cs.ExporterCounters, counters.DCGMClockEventsCount) && + !containsField(allCounters, dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS) { allCounters = append(allCounters, - dcgmexporter.Counter{ - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + counters.Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCKS_EVENT_REASONS, }) } } @@ -444,12 +434,14 @@ func appendDCGMClockEventsCountDependency(cs *dcgmexporter.CounterSet, allCounte } // appendDCGMXIDErrorsCountDependency appends DCGM counters required for the 
DCGM_EXP_XID_ERRORS_COUNT metric -func appendDCGMXIDErrorsCountDependency(allCounters []dcgmexporter.Counter, cs *dcgmexporter.CounterSet) []dcgmexporter.Counter { +func appendDCGMXIDErrorsCountDependency( + allCounters []counters.Counter, cs *counters.CounterSet, +) []counters.Counter { if len(cs.ExporterCounters) > 0 { - if containsField(cs.ExporterCounters, dcgmexporter.DCGMXIDErrorsCount) && + if containsField(cs.ExporterCounters, counters.DCGMXIDErrorsCount) && !containsField(allCounters, dcgm.DCGM_FI_DEV_XID_ERRORS) { allCounters = append(allCounters, - dcgmexporter.Counter{ + counters.Counter{ FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, }) } @@ -457,16 +449,17 @@ func appendDCGMXIDErrorsCountDependency(allCounters []dcgmexporter.Counter, cs * return allCounters } -func containsField(slice []dcgmexporter.Counter, fieldID dcgmexporter.ExporterCounter) bool { - return slices.ContainsFunc(slice, func(counter dcgmexporter.Counter) bool { +func containsField(slice []counters.Counter, fieldID counters.ExporterCounter) bool { + return slices.ContainsFunc(slice, func(counter counters.Counter) bool { return counter.FieldID == dcgm.Short(fieldID) }) } -func getCounters(config *dcgmexporter.Config) *dcgmexporter.CounterSet { - cs, err := dcgmexporter.GetCounterSet(config) +func getCounters(config *appconfig.Config) *counters.CounterSet { + cs, err := counters.GetCounterSet(config) if err != nil { - logrus.Fatal(err) + slog.Error(err.Error()) + os.Exit(1) } // Copy labels from DCGM Counters to ExporterCounters @@ -478,58 +471,32 @@ func getCounters(config *dcgmexporter.Config) *dcgmexporter.CounterSet { return cs } -func fillConfigMetricGroups(config *dcgmexporter.Config) { +func fillConfigMetricGroups(config *appconfig.Config) { var groups []dcgm.MetricGroup - groups, err := dcgm.GetSupportedMetricGroups(0) + groups, err := dcgmprovider.Client().GetSupportedMetricGroups(0) if err != nil { config.CollectDCP = false - logrus.Info("Not collecting DCP metrics: ", err) + 
slog.Info("Not collecting DCP metrics: " + err.Error()) } else { - logrus.Info("Collecting DCP Metrics") + slog.Info("Collecting DCP Metrics") config.MetricGroups = groups } } -func enableDebugLogging(config *dcgmexporter.Config) { +func enableDebugLogging(config *appconfig.Config) { if config.Debug { // enable debug logging - logrus.SetLevel(logrus.DebugLevel) - logrus.Debug("Debug output is enabled") + slog.SetLogLoggerLevel(slog.LevelDebug) + slog.Debug("Debug output is enabled") } - logrus.Debugf("Command line: %s", strings.Join(os.Args, " ")) + slog.Debug(fmt.Sprintf("Command line: %s", strings.Join(os.Args, " "))) - logrus.WithField(dcgmexporter.LoggerDumpKey, fmt.Sprintf("%+v", config)).Debug("Loaded configuration") -} - -func initDCGM(config *dcgmexporter.Config) func() { - if config.UseRemoteHE { - logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo) - cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") - if err != nil { - cleanup() - logrus.Fatal(err) - } - return cleanup - } else { - - if config.EnableDCGMLog { - os.Setenv("__DCGM_DBG_FILE", "-") - os.Setenv("__DCGM_DBG_LVL", config.DCGMLogLevel) - } - - cleanup, err := dcgm.Init(dcgm.Embedded) - if err != nil { - cleanup() - logrus.Fatal(err) - } - - return cleanup - } + slog.Debug("Loaded configuration", slog.String(DumpKey, fmt.Sprintf("%+v", config))) } -func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) { - var dOpt dcgmexporter.DeviceOptions +func parseDeviceOptions(devices string) (appconfig.DeviceOptions, error) { + var dOpt appconfig.DeviceOptions letterAndRange := strings.Split(devices, ":") count := len(letterAndRange) @@ -591,7 +558,7 @@ func parseDeviceOptions(devices string) (dcgmexporter.DeviceOptions, error) { return dOpt, nil } -func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { +func contextToConfig(c *cli.Context) (*appconfig.Config, error) { gOpt, err := parseDeviceOptions(c.String(CLIGPUDevices)) 
if err != nil { return nil, err @@ -608,23 +575,23 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { } dcgmLogLevel := c.String(CLIDCGMLogLevel) - if !slices.Contains(dcgmexporter.DCGMDbgLvlValues, dcgmLogLevel) { + if !slices.Contains(DCGMDbgLvlValues, dcgmLogLevel) { return nil, fmt.Errorf("invalid %s parameter value: %s", CLIDCGMLogLevel, dcgmLogLevel) } - return &dcgmexporter.Config{ + return &appconfig.Config{ CollectorsFile: c.String(CLIFieldsFile), Address: c.String(CLIAddress), CollectInterval: c.Int(CLICollectInterval), Kubernetes: c.Bool(CLIKubernetes), - KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), + KubernetesGPUIdType: appconfig.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), CollectDCP: true, UseOldNamespace: c.Bool(CLIUseOldNamespace), UseRemoteHE: c.IsSet(CLIRemoteHEInfo), RemoteHEInfo: c.String(CLIRemoteHEInfo), - GPUDevices: gOpt, - SwitchDevices: sOpt, - CPUDevices: cOpt, + GPUDeviceOptions: gOpt, + SwitchDeviceOptions: sOpt, + CPUDeviceOptions: cOpt, NoHostname: c.Bool(CLINoHostname), UseFakeGPUs: c.Bool(CLIUseFakeGPUs), ConfigMapData: c.String(CLIConfigMapData), diff --git a/pkg/cmd/app_test.go b/pkg/cmd/app_test.go index 9035c6bd..0bcd238f 100644 --- a/pkg/cmd/app_test.go +++ b/pkg/cmd/app_test.go @@ -23,27 +23,30 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" - "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" ) -func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { - config := &dcgmexporter.Config{ - GPUDevices: dcgmexporter.DeviceOptions{}, - SwitchDevices: dcgmexporter.DeviceOptions{}, - CPUDevices: 
dcgmexporter.DeviceOptions{}, - UseFakeGPUs: true, +func Test_getDeviceWatchListManager(t *testing.T) { + config := &appconfig.Config{ + GPUDeviceOptions: appconfig.DeviceOptions{}, + SwitchDeviceOptions: appconfig.DeviceOptions{}, + CPUDeviceOptions: appconfig.DeviceOptions{}, + UseFakeGPUs: true, } tests := []struct { name string - counterSet *dcgmexporter.CounterSet - assertion func(*testing.T, *dcgmexporter.FieldEntityGroupTypeSystemInfo) + counterSet *counters.CounterSet + assertion func(*testing.T, devicewatchlistmanager.Manager) }{ { name: "When DCGM_FI_DEV_XID_ERRORS and DCGM_EXP_XID_ERRORS_COUNT enabled", - counterSet: &dcgmexporter.CounterSet{ - DCGMCounters: []dcgmexporter.Counter{ + counterSet: &counters.CounterSet{ + DCGMCounters: []counters.Counter{ { FieldID: 230, FieldName: "DCGM_FI_DEV_XID_ERRORS", @@ -51,7 +54,7 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { Help: "Value of the last XID error encountered.", }, }, - ExporterCounters: []dcgmexporter.Counter{ + ExporterCounters: []counters.Counter{ { FieldID: 9001, FieldName: "DCGM_EXP_XID_ERRORS_COUNT", @@ -60,17 +63,17 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { }, }, }, - assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 1) assert.Equal(t, dcgm.Short(230), values[0].FieldID) }, }, { name: "When DCGM_FI_DEV_XID_ERRORS enabled", - counterSet: &dcgmexporter.CounterSet{ - DCGMCounters: []dcgmexporter.Counter{ + counterSet: &counters.CounterSet{ + DCGMCounters: []counters.Counter{ { FieldID: 230, FieldName: "DCGM_FI_DEV_XID_ERRORS", @@ -79,17 +82,17 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { }, }, }, - assertion: func(t 
*testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 1) assert.Equal(t, dcgm.Short(230), values[0].FieldID) }, }, { name: "When DCGM_EXP_XID_ERRORS_COUNT enabled", - counterSet: &dcgmexporter.CounterSet{ - ExporterCounters: []dcgmexporter.Counter{ + counterSet: &counters.CounterSet{ + ExporterCounters: []counters.Counter{ { FieldID: 9001, FieldName: "DCGM_EXP_XID_ERRORS_COUNT", @@ -98,33 +101,33 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { }, }, }, - assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 1) assert.Equal(t, dcgm.Short(230), values[0].FieldID) }, }, { name: "When no counters", - counterSet: &dcgmexporter.CounterSet{}, - assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + counterSet: &counters.CounterSet{}, + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 0) }, }, { name: "When DCGM_FI_DEV_CLOCK_THROTTLE_REASON and DCGM_EXP_CLOCK_EVENTS_COUNT enabled", - counterSet: &dcgmexporter.CounterSet{ - DCGMCounters: []dcgmexporter.Counter{ + counterSet: &counters.CounterSet{ + DCGMCounters: []counters.Counter{ { FieldID: 112, FieldName: 
"DCGM_FI_DEV_CLOCK_THROTTLE_REASON", PromType: "gauge", }, }, - ExporterCounters: []dcgmexporter.Counter{ + ExporterCounters: []counters.Counter{ { FieldID: 9002, FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", @@ -133,18 +136,18 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { }, }, }, - assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 1) assert.Equal(t, dcgm.Short(112), values[0].FieldID) }, }, { name: "When DCGM_FI_DEV_CLOCK_THROTTLE_REASON enabled", - counterSet: &dcgmexporter.CounterSet{ - DCGMCounters: []dcgmexporter.Counter{ + counterSet: &counters.CounterSet{ + DCGMCounters: []counters.Counter{ { FieldID: 112, FieldName: "DCGM_FI_DEV_CLOCK_THROTTLE_REASON", @@ -152,17 +155,17 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { }, }, }, - assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 1) assert.Equal(t, dcgm.Short(112), values[0].FieldID) }, }, { name: "When DCGM_EXP_CLOCK_EVENTS_COUNT enabled", - counterSet: &dcgmexporter.CounterSet{ - ExporterCounters: []dcgmexporter.Counter{ + counterSet: &counters.CounterSet{ + ExporterCounters: []counters.Counter{ { FieldID: 9002, FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", @@ -171,21 +174,21 @@ func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { }, }, }, - assertion: func(t *testing.T, got 
*dcgmexporter.FieldEntityGroupTypeSystemInfo) { + assertion: func(t *testing.T, got devicewatchlistmanager.Manager) { require.NotNil(t, got) - values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + values := testutils.GetStructPrivateFieldValue[[]counters.Counter](t, got, "counters") require.Len(t, values, 1) assert.Equal(t, dcgm.Short(112), values[0].FieldID) }, }, } - cleanupDCGM := initDCGM(config) - defer cleanupDCGM() + dcgmprovider.Initialize(config) + defer dcgmprovider.Client().Cleanup() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := getFieldEntityGroupTypeSystemInfo(tt.counterSet, config) + got := startDeviceWatchListManager(tt.counterSet, config) if tt.assertion == nil { t.Skip(tt.name) } diff --git a/pkg/dcgmexporter/const.go b/pkg/cmd/const.go similarity index 73% rename from pkg/dcgmexporter/const.go rename to pkg/cmd/const.go index 594356cc..49b4795c 100644 --- a/pkg/dcgmexporter/const.go +++ b/pkg/cmd/const.go @@ -14,23 +14,7 @@ * limitations under the License. */ -package dcgmexporter - -// Constants for logging fields -const ( - LoggerGroupIDKey = "groupID" - LoggerDumpKey = "dump" - LoggerStackTrace = "stacktrace" -) - -const ( - PARENT_ID_IGNORED = 0 - DCGM_ST_NOT_CONFIGURED = "Setting not configured" -) - -const ( - windowSizeInMSLabel = "window_size_in_ms" -) +package cmd // DCGMDbgLvl is a DCGM library debug level. const ( @@ -43,7 +27,8 @@ const ( DCGMDbgLvlVerb = "VERB" ) -var DCGMDbgLvlValues = []string{DCGMDbgLvlNone, +var DCGMDbgLvlValues = []string{ + DCGMDbgLvlNone, DCGMDbgLvlFatal, DCGMDbgLvlError, DCGMDbgLvlWarn, diff --git a/pkg/dcgmexporter/clock_events_collector_test.go b/pkg/dcgmexporter/clock_events_collector_test.go deleted file mode 100644 index 380715c5..00000000 --- a/pkg/dcgmexporter/clock_events_collector_test.go +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "fmt" - "reflect" - "slices" - "strconv" - "testing" - "time" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/stretchr/testify/require" - "google.golang.org/grpc" - podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" - - "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" -) - -func TestClockEventsCollector_Gather(t *testing.T) { - teardownTest := setupTest(t) - defer teardownTest(t) - runOnlyWithLiveGPUs(t) - testutils.RequireLinux(t) - - hostname := "local-test" - config := &Config{ - GPUDevices: DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), - } - - records := [][]string{ - {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - } - - cc, err := extractCounters(records, config) - require.NoError(t, err) - require.Len(t, cc.ExporterCounters, 1) - require.Len(t, cc.DCGMCounters, 1) - - for i := range cc.DCGMCounters { - if cc.DCGMCounters[i].PromType == "label" { - cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) - } - } - - // Create fake GPU - numGPUs, err := dcgm.GetAllDeviceCount() - require.NoError(t, err) - - if numGPUs+1 > dcgm.MAX_NUM_DEVICES { - t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) - } - - entityList := 
[]dcgm.MigHierarchyInfo{ - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - } - - gpuIDs, err := dcgm.CreateFakeEntities(entityList) - require.NoError(t, err) - require.NotEmpty(t, gpuIDs) - - type clockEventsCountExpectation map[string]string - expectations := map[string]clockEventsCountExpectation{} - - for i, gpuID := range gpuIDs { - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), - int64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL), - ) - require.NoError(t, err) - - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), - int64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL), - ) - require.NoError(t, err) - - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), - int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE), - ) - require.NoError(t, err) - - expectations[fmt.Sprint(gpuID)] = clockEventsCountExpectation{ - DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "2", - DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "2", - DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", - } - } - - // Create a fake K8S to emulate work on K8S environment - tmpDir, cleanup := CreateTmpDir(t) - defer cleanup() - socketPath := tmpDir + "/kubelet.sock" - server := grpc.NewServer() - - gpuIDsAsString := make([]string, len(gpuIDs)) - - for i, g := range gpuIDs { - gpuIDsAsString[i] = fmt.Sprint(g) - } - - podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpuIDsAsString)) - // Tell that the app is 
running on K8S - config.Kubernetes = true - config.PodResourcesKubeletSocket = socketPath - - allCounters := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) - err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - - collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) - require.NoError(t, err) - - defer func() { - collector.Cleanup() - }() - - metrics, err := collector.GetMetrics() - require.NoError(t, err) - require.NotEmpty(t, metrics) - // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT - require.Len(t, metrics, 1) - // We get metric value with 0 index - metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - - for i := 0; i < len(metricValues); i++ { - gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64) - if err == nil { - if !slices.Contains(gpuIDs, uint(gpuID)) { - metricValues = append(metricValues[:i], metricValues[i+1:]...) 
- } - } - } - - // We expect 9 records, because we have 3 fake GPU and each GPU experienced 3 CLOCK_EVENTS - require.Len(t, metricValues, 9) - for _, val := range metricValues { - require.Contains(t, val.Labels, "window_size_in_ms") - require.Equal(t, fmt.Sprint(config.ClockEventsCountWindowSize), val.Labels["window_size_in_ms"]) - expected, exists := expectations[val.GPU] - require.True(t, exists) - actualReason, exists := val.Labels["clock_event"] - require.True(t, exists) - expectedVal, exists := expected[actualReason] - require.True(t, exists) - require.Equal(t, expectedVal, val.Value) - } -} - -func TestClockEventsCollector_NewClocksThrottleReasonsCollector(t *testing.T) { - config := &Config{ - GPUDevices: DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - } - - teardownTest := setupTest(t) - defer teardownTest(t) - - allCounters := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) - err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - - t.Run("Should Return Error When DCGM_EXP_CLOCK_EVENTS_COUNT is not present", func(t *testing.T) { - records := [][]string{ - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - } - cc, err := extractCounters(records, config) - require.NoError(t, err) - require.Len(t, cc.ExporterCounters, 0) - require.Len(t, cc.DCGMCounters, 1) - collector, err := NewClockEventsCollector(cc.DCGMCounters, "", config, item) - require.Error(t, err) - require.Nil(t, collector) - }) - - t.Run("Should Return Error When Counter Param Is Empty", func(t *testing.T) { - counters := make([]Counter, 0) - collector, err := NewClockEventsCollector(counters, "", config, item) - require.Error(t, err) - require.Nil(t, collector) - }) - - t.Run("Should Not Return Error When DCGM_EXP_CLOCK_EVENTS_COUNT Present More Than 
Once", func(t *testing.T) { - records := [][]string{ - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, - {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, - {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, - } - cc, err := extractCounters(records, config) - require.NoError(t, err) - for i := range cc.DCGMCounters { - if cc.DCGMCounters[i].PromType == "label" { - cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) - } - } - collector, err := NewClockEventsCollector(cc.ExporterCounters, "", config, item) - require.NoError(t, err) - require.NotNil(t, collector) - }) -} - -func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) { - teardownTest := setupTest(t) - defer teardownTest(t) - runOnlyWithLiveGPUs(t) - - hostname := "local-test" - config := &Config{ - GPUDevices: DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), - } - - records := [][]string{ - {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - } - - cc, err := extractCounters(records, config) - require.NoError(t, err) - require.Len(t, cc.ExporterCounters, 1) - require.Len(t, cc.DCGMCounters, 1) - - for i := range cc.DCGMCounters { - if cc.DCGMCounters[i].PromType == "label" { - cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) - } - } - - // Create fake GPU - numGPUs, err := dcgm.GetAllDeviceCount() - require.NoError(t, err) - - if numGPUs+1 > dcgm.MAX_NUM_DEVICES { - t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) - } - - entityList := []dcgm.MigHierarchyInfo{ - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - } - - gpuIDs, err := dcgm.CreateFakeEntities(entityList) - require.NoError(t, err) - require.NotEmpty(t, gpuIDs) - - type clockThrottleReasonExpectation map[string]string - expectations := 
map[string]clockThrottleReasonExpectation{} - - require.Len(t, gpuIDs, 1) - gpuID := gpuIDs[0] - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(), - int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE| - DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING| - DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP| - DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN| - DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST| - DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL| - DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL| - DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE| - DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS), - ) - - require.NoError(t, err) - - expectations[fmt.Sprint(gpuID)] = clockThrottleReasonExpectation{ - DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE.String(): "1", - DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS.String(): "1", - } - - allCounters := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) - - err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - - collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) - require.NoError(t, err) - - defer func() { - collector.Cleanup() - }() - - metrics, err := collector.GetMetrics() - require.NoError(t, err) - require.NotEmpty(t, metrics) - // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT - require.Len(t, metrics, 1) - // We get metric value with 0 index 
- metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - - metricValues = getFakeGPUMetrics(metricValues, gpuIDs) - - // Expected 9 metric values, because we injected 9 reasons - require.Len(t, metricValues, 9) - for _, val := range metricValues { - require.Contains(t, val.Labels, "window_size_in_ms") - require.Equal(t, fmt.Sprint(config.ClockEventsCountWindowSize), val.Labels["window_size_in_ms"]) - expected, exists := expectations[val.GPU] - require.True(t, exists) - actualReason, exists := val.Labels["clock_event"] - require.True(t, exists) - expectedVal, exists := expected[actualReason] - require.True(t, exists) - require.Equal(t, expectedVal, val.Value) - } -} - -func TestClockEventsCollector_Gather_AllTheThings_WhenNoLabels(t *testing.T) { - teardownTest := setupTest(t) - defer teardownTest(t) - runOnlyWithLiveGPUs(t) - - hostname := "local-test" - config := &Config{ - GPUDevices: DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), - } - - records := [][]string{ - {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, - } - - cc, err := extractCounters(records, config) - require.NoError(t, err) - require.Len(t, cc.ExporterCounters, 1) - require.Len(t, cc.DCGMCounters, 0) - - // Create fake GPU - numGPUs, err := dcgm.GetAllDeviceCount() - require.NoError(t, err) - - if numGPUs+1 > dcgm.MAX_NUM_DEVICES { - t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) - } - - entityList := []dcgm.MigHierarchyInfo{ - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - } - - gpuIDs, err := dcgm.CreateFakeEntities(entityList) - require.NoError(t, err) - require.NotEmpty(t, gpuIDs) - - gpuID := gpuIDs[0] - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(), - int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE| - 
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING| - DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP| - DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN| - DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST| - DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL| - DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL| - DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE| - DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS), - ) - - require.NoError(t, err) - - allCounters := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) - - err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - - collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) - require.NoError(t, err) - - defer func() { - collector.Cleanup() - }() - - metrics, err := collector.GetMetrics() - require.NoError(t, err) - require.NotEmpty(t, metrics) - // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT - require.Len(t, metrics, 1) - // We get metric value with 0 index - metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - // Exclude the real GPU from the test - metricValues = getFakeGPUMetrics(metricValues, gpuIDs) - // Expected 9 metric values, because we injected 9 reasons - require.Len(t, metricValues, 9) -} - -func getFakeGPUMetrics(metricValues []Metric, gpuIDs []uint) []Metric { - for i := 0; i < len(metricValues); i++ { - gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64) - if err == nil { - if !slices.Contains(gpuIDs, uint(gpuID)) { - metricValues = append(metricValues[:i], metricValues[i+1:]...) - } - } - } - return metricValues -} diff --git a/pkg/dcgmexporter/dcgm.go b/pkg/dcgmexporter/dcgm.go deleted file mode 100644 index e348bf96..00000000 --- a/pkg/dcgmexporter/dcgm.go +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "fmt" - "math/rand" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" -) - -func NewGroup() (dcgm.GroupHandle, func(), error) { - group, err := dcgm.NewDefaultGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) - if err != nil { - return dcgm.GroupHandle{}, func() {}, err - } - - return group, func() { - err := dcgm.DestroyGroup(group) - if err != nil { - logrus.WithError(err).Warn("Cannot destroy field group.") - } - }, nil -} - -func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short { - var deviceFields []dcgm.Short - for _, f := range counters { - meta := dcgm.FieldGetById(f.FieldID) - - if meta.EntityLevel == entityType || meta.EntityLevel == dcgm.FE_NONE { - deviceFields = append(deviceFields, f.FieldID) - } else if entityType == dcgm.FE_GPU && (meta.EntityLevel == dcgm.FE_GPU_CI || meta.EntityLevel == dcgm.FE_GPU_I || meta.EntityLevel == dcgm.FE_VGPU) { - deviceFields = append(deviceFields, f.FieldID) - } else if entityType == dcgm.FE_CPU && (meta.EntityLevel == dcgm.FE_CPU || meta.EntityLevel == dcgm.FE_CPU_CORE) { - deviceFields = append(deviceFields, f.FieldID) - } - } - - return deviceFields -} - -func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) { - name := fmt.Sprintf("gpu-collector-fieldgroup-%d", rand.Uint64()) - fieldGroup, err := dcgm.FieldGroupCreate(name, 
deviceFields) - if err != nil { - return dcgm.FieldHandle{}, func() {}, err - } - - return fieldGroup, func() { - err := dcgm.FieldGroupDestroy(fieldGroup) - if err != nil { - logrus.WithError(err).Warn("Cannot destroy field group.") - } - }, nil -} - -func WatchFieldGroup( - group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32, -) error { - err := dcgm.WatchFieldsWithGroupEx(field, group, updateFreq, maxKeepAge, maxKeepSamples) - if err != nil { - return err - } - - return nil -} - -func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error) { - var err error - var cleanups []func() - var cleanup func() - var groups []dcgm.GroupHandle - var fieldGroup dcgm.FieldHandle - - if sysInfo.InfoType == dcgm.FE_LINK { - /* one group per-nvswitch is created for nvlinks */ - groups, cleanups, err = CreateLinkGroupsFromSystemInfo(sysInfo) - } else if sysInfo.InfoType == dcgm.FE_CPU_CORE { - /* one group per-CPU is created for cpu cores */ - groups, cleanups, err = CreateCoreGroupsFromSystemInfo(sysInfo) - } else { - group, cleanup, err := CreateGroupFromSystemInfo(sysInfo) - if err == nil { - groups = append(groups, group) - cleanups = append(cleanups, cleanup) - } - } - - if err != nil { - goto fail - } - - for _, gr := range groups { - fieldGroup, cleanup, err = NewFieldGroup(deviceFields) - if err != nil { - goto fail - } - - cleanups = append(cleanups, cleanup) - - err = WatchFieldGroup(gr, fieldGroup, collectIntervalUsec, 0.0, 1) - if err != nil { - goto fail - } - } - - return groups, fieldGroup, cleanups, nil - -fail: - for _, f := range cleanups { - f() - } - - return nil, dcgm.FieldHandle{}, nil, err -} diff --git a/pkg/dcgmexporter/expcollector.go b/pkg/dcgmexporter/expcollector.go deleted file mode 100644 index 68778db8..00000000 --- a/pkg/dcgmexporter/expcollector.go +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright 
(c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "fmt" - "io" - "maps" - "sync" - "sync/atomic" - "text/template" - "time" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" -) - -var expMetricsFormat = ` - -{{- range $counter, $metrics := . -}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -{{- range $k, $v := $metric.Attributes -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} - -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -// Collector interface -type Collector interface { - GetMetrics() (MetricsByCounter, error) - Cleanup() -} - -var getExpMetricTemplate = sync.OnceValue(func() *template.Template { - return template.Must(template.New("expMetrics").Parse(expMetricsFormat)) -}) - -func encodeExpMetrics(w io.Writer, metrics MetricsByCounter) error { - tmpl := getExpMetricTemplate() - return 
tmpl.Execute(w, metrics) -} - -var expCollectorFieldGroupIdx atomic.Uint32 - -type expCollector struct { - sysInfo SystemInfo // Hardware system info - counter Counter // Counter that collector - hostname string // Hostname - config *Config // Configuration settings - labelDeviceFields []dcgm.Short // Fields used for labels - counterDeviceFields []dcgm.Short // Fields used for the counter - labelsCounters []Counter // Counters used for labels - cleanups []func() // Cleanup functions - fieldValueParser func(val int64) []int64 // Function to parse the field value - labelFiller func(map[string]string, int64) // Function to fill labels - windowSize int // Window size - transformations []Transform // Transformers for metric postprocessing - deviceGroups []dcgm.GroupHandle - deviceFieldGroup dcgm.FieldHandle -} - -func (c *expCollector) getMetrics() (MetricsByCounter, error) { - err := dcgm.UpdateAllFields() - if err != nil { - return nil, err - } - - mapEntityIDToValues := map[uint]map[int64]int{} - - window := time.Now().Add(-time.Duration(c.windowSize) * time.Millisecond) - - for _, group := range c.deviceGroups { - values, _, err := dcgm.GetValuesSince(group, c.deviceFieldGroup, window) - if err != nil { - return nil, err - } - for _, val := range values { - if val.Status == 0 { - if _, exists := mapEntityIDToValues[val.EntityId]; !exists { - mapEntityIDToValues[val.EntityId] = map[int64]int{} - } - for _, v := range c.fieldValueParser(val.Int64()) { - mapEntityIDToValues[val.EntityId][v] += 1 - } - } - } - } - - labels := map[string]string{} - labels[windowSizeInMSLabel] = fmt.Sprint(c.windowSize) - - monitoringInfo := GetMonitoredEntities(c.sysInfo) - metrics := make(MetricsByCounter) - useOld := c.config.UseOldNamespace - uuid := "UUID" - if useOld { - uuid = "uuid" - } - for _, mi := range monitoringInfo { - if len(c.labelsCounters) > 0 { - err := c.getLabelsFromCounters(mi, labels) - if err != nil { - return nil, err - } - } - entityValues, exists := 
mapEntityIDToValues[mi.DeviceInfo.GPU] - if exists { - for entityValue, val := range entityValues { - - metricValueLabels := maps.Clone(labels) - c.labelFiller(metricValueLabels, entityValue) - - m := c.createMetric(metricValueLabels, mi, uuid, val) - - metrics[c.counter] = append(metrics[c.counter], m) - } - } else { - // Create metric with Zero value if group (mapEntityIDToValues) is empty - m := c.createMetric(labels, mi, uuid, 0) - metrics[c.counter] = append(metrics[c.counter], m) - } - } - - for _, transform := range c.transformations { - err := transform.Process(metrics, c.sysInfo) - if err != nil { - return nil, fmt.Errorf("failed to transform metrics for transform '%s'; err: %v", transform.Name(), err) - } - } - - return metrics, nil -} - -func (c *expCollector) createMetric(labels map[string]string, mi MonitoringInfo, uuid string, val int) Metric { - gpuModel := getGPUModel(mi.DeviceInfo, c.config.ReplaceBlanksInModelName) - - m := Metric{ - Counter: c.counter, - Value: fmt.Sprint(val), - UUID: uuid, - GPU: fmt.Sprintf("%d", mi.DeviceInfo.GPU), - GPUUUID: mi.DeviceInfo.UUID, - GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), - GPUModelName: gpuModel, - GPUPCIBusID: mi.DeviceInfo.PCI.BusID, - Hostname: c.hostname, - - Labels: labels, - Attributes: map[string]string{}, - } - if mi.InstanceInfo != nil { - m.MigProfile = mi.InstanceInfo.ProfileName - m.GPUInstanceID = fmt.Sprintf("%d", mi.InstanceInfo.Info.NvmlInstanceId) - } else { - m.MigProfile = "" - m.GPUInstanceID = "" - } - return m -} - -func (c *expCollector) getLabelsFromCounters(mi MonitoringInfo, labels map[string]string) error { - latestValues, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields) - if err != nil { - return err - } - // Extract Labels - for _, val := range latestValues { - v := ToString(val) - // Filter out counters with no value and ignored fields for this entity - if v == SkipDCGMValue { - continue - } - - counter, err := 
FindCounterField(c.labelsCounters, val.FieldId) - if err != nil { - continue - } - - if counter.PromType == "label" { - labels[counter.FieldName] = v - continue - } - } - return nil -} - -func (c *expCollector) Cleanup() { - for _, cleanup := range c.cleanups { - cleanup() - } -} - -// newExpCollector is a constructor for the expCollector -func newExpCollector( - counters []Counter, - hostname string, - counterDeviceFields []dcgm.Short, - config *Config, - fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, -) expCollector { - var labelsCounters []Counter - for i := 0; i < len(counters); i++ { - if counters[i].PromType == "label" { - labelsCounters = append(labelsCounters, counters[i]) - } - } - - labelDeviceFields := NewDeviceFields(labelsCounters, dcgm.FE_GPU) - - transformations := getTransformations(config) - - collector := expCollector{ - hostname: hostname, - config: config, - labelDeviceFields: labelDeviceFields, - labelsCounters: labelsCounters, - counterDeviceFields: counterDeviceFields, - fieldValueParser: func(val int64) []int64 { - return []int64{val} - }, - labelFiller: func(metricValueLabels map[string]string, entityValue int64) {}, - transformations: transformations, - } - - collector.sysInfo = fieldEntityGroupTypeSystemInfo.SystemInfo - - var err error - - collector.deviceGroups, collector.deviceFieldGroup, collector.cleanups, err = SetupDcgmFieldsWatch(collector.counterDeviceFields, - collector.sysInfo, - int64(config.CollectInterval)*1000) - if err != nil { - logrus.Fatal("Failed to watch metrics: ", err) - } - - return collector -} diff --git a/pkg/dcgmexporter/field_entity_group_system_info.go b/pkg/dcgmexporter/field_entity_group_system_info.go deleted file mode 100644 index e6ce4b53..00000000 --- a/pkg/dcgmexporter/field_entity_group_system_info.go +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "fmt" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" -) - -// FieldEntityGroupTypeToMonitor supported entity group types -var FieldEntityGroupTypeToMonitor = []dcgm.Field_Entity_Group{ - dcgm.FE_GPU, - dcgm.FE_SWITCH, - dcgm.FE_LINK, - dcgm.FE_CPU, - dcgm.FE_CPU_CORE, -} - -type FieldEntityGroupTypeSystemInfoItem struct { - SystemInfo SystemInfo - DeviceFields []dcgm.Short -} - -func (f FieldEntityGroupTypeSystemInfoItem) isEmpty() bool { - return len(f.DeviceFields) == 0 -} - -// FieldEntityGroupTypeSystemInfo represents a mapping between FieldEntityGroupType and SystemInfo -type FieldEntityGroupTypeSystemInfo struct { - items map[dcgm.Field_Entity_Group]FieldEntityGroupTypeSystemInfoItem - counters []Counter - gpuDevices DeviceOptions - switchDevices DeviceOptions - cpuDevices DeviceOptions - useFakeGPUs bool -} - -// NewEntityGroupTypeSystemInfo creates a new instance of the FieldEntityGroupTypeSystemInfo -func NewEntityGroupTypeSystemInfo(c []Counter, config *Config) *FieldEntityGroupTypeSystemInfo { - return &FieldEntityGroupTypeSystemInfo{ - items: make(map[dcgm.Field_Entity_Group]FieldEntityGroupTypeSystemInfoItem), - counters: c, - gpuDevices: config.GPUDevices, - switchDevices: config.SwitchDevices, - cpuDevices: config.CPUDevices, - useFakeGPUs: config.UseFakeGPUs, - } -} - -// Load loads SystemInfo for a provided Field_Entity_Group -func (e 
*FieldEntityGroupTypeSystemInfo) Load(entityType dcgm.Field_Entity_Group) error { - var deviceFields = NewDeviceFields(e.counters, entityType) - - if !ShouldMonitorDeviceType(deviceFields, entityType) { - return fmt.Errorf("no fields to watch for device type: %d", entityType) - } - - sysInfo, err := GetSystemInfo(&Config{ - GPUDevices: e.gpuDevices, - SwitchDevices: e.switchDevices, - CPUDevices: e.cpuDevices, - UseFakeGPUs: e.useFakeGPUs, - }, entityType) - if err != nil { - return err - } - - e.items[entityType] = FieldEntityGroupTypeSystemInfoItem{ - SystemInfo: *sysInfo, - DeviceFields: deviceFields, - } - - return err -} - -// Get returns FieldEntityGroupTypeSystemInfoItem, bool by dcgm.Field_Entity_Group -func (e *FieldEntityGroupTypeSystemInfo) Get(key dcgm.Field_Entity_Group) (FieldEntityGroupTypeSystemInfoItem, bool) { - val, exists := e.items[key] - return val, exists -} diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go deleted file mode 100644 index 2f38d442..00000000 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "fmt" - "reflect" - "testing" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var sampleCounters = []Counter{ - {dcgm.DCGM_FI_DEV_GPU_TEMP, "DCGM_FI_DEV_GPU_TEMP", "gauge", "Temperature Help info"}, - {dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"}, - {dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"}, - {dcgm.DCGM_FI_DRIVER_VERSION, "DCGM_FI_DRIVER_VERSION", "label", "Driver version"}, - /* test that switch and link metrics are filtered out automatically when devices are not detected */ - { - dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, - "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", - "gauge", - "switch temperature", - }, - { - dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, - "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", - "gauge", - "per-link flit errors", - }, - /* test that vgpu metrics are not filtered out */ - {dcgm.DCGM_FI_DEV_VGPU_LICENSE_STATUS, "DCGM_FI_DEV_VGPU_LICENSE_STATUS", "gauge", "vgpu license status"}, - /* test that cpu and cpu core metrics are filtered out automatically when devices are not detected */ - {dcgm.DCGM_FI_DEV_CPU_UTIL_TOTAL, "DCGM_FI_DEV_CPU_UTIL_TOTAL", "gauge", "Total CPU utilization"}, -} - -var expectedMetrics = map[string]bool{ - "DCGM_FI_DEV_GPU_TEMP": true, - "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": true, - "DCGM_FI_DEV_POWER_USAGE": true, - "DCGM_FI_DEV_VGPU_LICENSE_STATUS": true, -} - -var expectedCPUMetrics = map[string]bool{ - "DCGM_FI_DEV_CPU_UTIL_TOTAL": true, -} - -func TestDCGMCollector(t *testing.T) { - cleanup, err := dcgm.Init(dcgm.Embedded) - require.NoError(t, err) - defer cleanup() - - _, cleanup = testDCGMGPUCollector(t, sampleCounters) - cleanup() - - _, cleanup = testDCGMCPUCollector(t, sampleCounters) - cleanup() -} - -func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, 
func()) { - dOpt := DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - } - config := Config{ - GPUDevices: dOpt, - NoHostname: false, - UseOldNamespace: false, - UseFakeGPUs: false, - CollectInterval: 1, - } - - dcgmGetAllDeviceCount = func() (uint, error) { - return 1, nil - } - - dcgmGetDeviceInfo = func(gpuId uint) (dcgm.Device, error) { - dev := dcgm.Device{ - GPU: 0, - UUID: fmt.Sprintf("fake%d", gpuId), - PCI: dcgm.PCIInfo{ - BusID: "00000000:0000:0000.0", - }, - } - - return dev, nil - } - - dcgmGetGpuInstanceHierarchy = func() (dcgm.MigHierarchy_v2, error) { - hierarchy := dcgm.MigHierarchy_v2{ - Count: 0, - } - return hierarchy, nil - } - - dcgmAddEntityToGroup = func( - groupId dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, entityId uint, - ) (err error) { - return nil - } - - dcgmGetCpuHierarchy = func() (dcgm.CpuHierarchy_v1, error) { - CPU := dcgm.CpuHierarchyCpu_v1{ - CpuId: 0, - OwnedCores: []uint64{0}, - } - hierarchy := dcgm.CpuHierarchy_v1{ - Version: 0, - NumCpus: 1, - Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{CPU}, - } - - return hierarchy, nil - } - - defer func() { - dcgmGetAllDeviceCount = dcgm.GetAllDeviceCount - dcgmGetDeviceInfo = dcgm.GetDeviceInfo - dcgmGetGpuInstanceHierarchy = dcgm.GetGpuInstanceHierarchy - dcgmAddEntityToGroup = dcgm.AddEntityToGroup - }() - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(counters, &config) - - err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - gpuItem, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - require.True(t, exists) - - g, cleanup, err := NewDCGMCollector(counters, "", &config, gpuItem) - require.NoError(t, err) - - /* Test for error when no switches are available to monitor. 
*/ - switchItem, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_SWITCH) - assert.False(t, exists, "dcgm.FE_SWITCH should not be available") - - _, _, err = NewDCGMCollector(counters, "", &config, switchItem) - require.Error(t, err, "NewDCGMCollector should return error") - - /* Test for error when no cpus are available to monitor. */ - cpuItem, exist := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU) - require.False(t, exist, "dcgm.FE_CPU should not be available") - - _, _, err = NewDCGMCollector(counters, "", &config, cpuItem) - require.Error(t, err, "NewDCGMCollector should return error") - - out, err := g.GetMetrics() - require.NoError(t, err) - require.Greater(t, len(out), 0, "Check that you have a GPU on this node") - require.Len(t, out, len(expectedMetrics)) - - seenMetrics := map[string]bool{} - for _, metrics := range out { - for _, metric := range metrics { - seenMetrics[metric.Counter.FieldName] = true - require.NotEmpty(t, metric.GPU) - require.NotEmpty(t, metric.GPUUUID) - require.NotEmpty(t, metric.GPUPCIBusID) - require.NotEmpty(t, metric.Value) - require.NotEqual(t, metric.Value, FailedToConvert) - } - } - require.Equal(t, seenMetrics, expectedMetrics) - - return g, cleanup -} - -func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { - dOpt := DeviceOptions{true, []int{-1}, []int{-1}} - config := Config{ - CPUDevices: dOpt, - NoHostname: false, - UseOldNamespace: false, - UseFakeGPUs: false, - } - - dcgmGetAllDeviceCount = func() (uint, error) { - return 0, nil - } - - dcgmGetDeviceInfo = func(gpuId uint) (dcgm.Device, error) { - dev := dcgm.Device{ - GPU: 0, - DCGMSupported: "No", - UUID: fmt.Sprintf("fake%d", gpuId), - PCI: dcgm.PCIInfo{ - BusID: "00000000:0000:0000.0", - }, - } - - return dev, nil - } - - dcgmGetGpuInstanceHierarchy = func() (dcgm.MigHierarchy_v2, error) { - hierarchy := dcgm.MigHierarchy_v2{ - Count: 0, - } - return hierarchy, nil - } - - dcgmAddEntityToGroup = func( - groupId 
dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, entityId uint, - ) (err error) { - return nil - } - - dcgmGetCpuHierarchy = func() (dcgm.CpuHierarchy_v1, error) { - CPU := dcgm.CpuHierarchyCpu_v1{ - CpuId: 0, - OwnedCores: []uint64{0, 18446744073709551360, 65535}, - } - hierarchy := dcgm.CpuHierarchy_v1{ - Version: 0, - NumCpus: 1, - Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{CPU}, - } - - return hierarchy, nil - } - - defer func() { - dcgmGetAllDeviceCount = dcgm.GetAllDeviceCount - dcgmGetDeviceInfo = dcgm.GetDeviceInfo - dcgmGetGpuInstanceHierarchy = dcgm.GetGpuInstanceHierarchy - dcgmAddEntityToGroup = dcgm.AddEntityToGroup - }() - - /* Test that only cpu metrics are collected for cpu entities. */ - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(counters, &config) - err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_CPU) - require.NoError(t, err) - - err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_CPU) - require.NoError(t, err) - - cpuItem, cpuItemExist := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU) - require.True(t, cpuItemExist) - - c, cleanup, err := NewDCGMCollector(counters, "", &config, cpuItem) - require.NoError(t, err) - - out, err := c.GetMetrics() - require.NoError(t, err) - require.Greater(t, len(out), 0, "Check that the fake CPU has been registered") - - for _, dev := range out { - seenMetrics := map[string]bool{} - for _, metric := range dev { - seenMetrics[metric.Counter.FieldName] = true - require.NotEmpty(t, metric.GPU) - require.Empty(t, metric.GPUUUID) - require.Empty(t, metric.GPUPCIBusID) - require.NotEmpty(t, metric.Value) - require.NotEqual(t, metric.Value, FailedToConvert) - } - require.Equal(t, seenMetrics, expectedCPUMetrics) - } - - return c, cleanup -} - -func TestToMetric(t *testing.T) { - fieldValue := [4096]byte{} - fieldValue[0] = 42 - values := []dcgm.FieldValue_v1{ - { - FieldId: 150, - FieldType: dcgm.DCGM_FT_INT64, - Value: fieldValue, - }, - } - - c := []Counter{ - { - FieldID: 150, - 
FieldName: "DCGM_FI_DEV_GPU_TEMP", - PromType: "gauge", - Help: "Temperature Help info", - }, - } - - d := dcgm.Device{ - UUID: "fake0", - Identifiers: dcgm.DeviceIdentifiers{ - Model: "NVIDIA T400 4GB", - }, - PCI: dcgm.PCIInfo{ - BusID: "00000000:0000:0000.0", - }, - } - - var instanceInfo *GPUInstanceInfo = nil - - type testCase struct { - replaceBlanksInModelName bool - expectedGPUModelName string - } - - testCases := []testCase{ - { - replaceBlanksInModelName: true, - expectedGPUModelName: "NVIDIA-T400-4GB", - }, - { - replaceBlanksInModelName: false, - expectedGPUModelName: "NVIDIA T400 4GB", - }, - } - - for _, tc := range testCases { - t.Run(fmt.Sprintf("When replaceBlanksInModelName is %t", tc.replaceBlanksInModelName), func(t *testing.T) { - metrics := make(map[Counter][]Metric) - ToMetric(metrics, values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName) - assert.Len(t, metrics, 1) - // We get metric value with 0 index - metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - assert.Equal(t, "42", metricValues[0].Value) - assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName) - - assert.Equal(t, d.UUID, metricValues[0].GPUUUID) - assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) - }) - } -} - -func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { - c := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, - FieldName: "DCGM_FI_DEV_GPU_TEMP", - PromType: "gauge", - Help: "Temperature Help info", - }, - } - - d := dcgm.Device{ - UUID: "fake0", - Identifiers: dcgm.DeviceIdentifiers{ - Model: "NVIDIA T400 4GB", - }, - PCI: dcgm.PCIInfo{ - BusID: "00000000:0000:0000.0", - }, - } - - var instanceInfo *GPUInstanceInfo = nil - - type testCase struct { - name string - fieldValue byte - expectedErr string - } - - testCases := []testCase{ - { - name: "when DCGM_FI_DEV_XID_ERRORS has no error", - fieldValue: 0, - expectedErr: xidErrCodeToText[0], - }, - { - name: "when 
DCGM_FI_DEV_XID_ERRORS has known value", - fieldValue: 42, - expectedErr: xidErrCodeToText[42], - }, - { - name: "when DCGM_FI_DEV_XID_ERRORS has unknown value", - fieldValue: 255, - expectedErr: unknownErr, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - fieldValue := [4096]byte{} - fieldValue[0] = tc.fieldValue - values := []dcgm.FieldValue_v1{ - { - FieldId: dcgm.DCGM_FI_DEV_XID_ERRORS, - FieldType: dcgm.DCGM_FT_INT64, - Value: fieldValue, - }, - } - - metrics := make(map[Counter][]Metric) - ToMetric(metrics, values, c, d, instanceInfo, false, "", false) - assert.Len(t, metrics, 1) - // We get metric value with 0 index - metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Value) - assert.Contains(t, metricValues[0].Attributes, "err_code") - assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Attributes["err_code"]) - assert.Contains(t, metricValues[0].Attributes, "err_msg") - assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"]) - - assert.Equal(t, d.UUID, metricValues[0].GPUUUID) - assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) - }) - } -} - -func TestGPUCollector_GetMetrics(t *testing.T) { - teardownTest := setupTest(t) - defer teardownTest(t) - - runOnlyWithLiveGPUs(t) - // Create fake GPU - numGPUs, err := dcgm.GetAllDeviceCount() - require.NoError(t, err) - - if numGPUs+1 > dcgm.MAX_NUM_DEVICES { - t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) - } - - entityList := []dcgm.MigHierarchyInfo{ - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - } - - gpuIDs, err := dcgm.CreateFakeEntities(entityList) - require.NoError(t, err) - require.NotEmpty(t, gpuIDs) - - numGPUs, err = dcgm.GetAllDeviceCount() - require.NoError(t, err) - - 
counters := []Counter{ - { - FieldID: 100, - FieldName: "DCGM_FI_DEV_SM_CLOCK", - PromType: "gauge", - Help: "SM clock frequency (in MHz).", - }, - } - - dOpt := DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - } - config := Config{ - GPUDevices: dOpt, - NoHostname: false, - UseOldNamespace: false, - UseFakeGPUs: false, - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(counters, &config) - err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - gpuItem, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - require.True(t, exists) - - c, cleanup, err := NewDCGMCollector(counters, "", &config, gpuItem) - require.NoError(t, err) - - defer cleanup() - - out, err := c.GetMetrics() - require.NoError(t, err) - require.Len(t, out, 1) - - values := out[counters[0]] - - require.Equal(t, numGPUs, uint(len(values))) -} diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go deleted file mode 100644 index 3b48efe2..00000000 --- a/pkg/dcgmexporter/kubernetes_test.go +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "context" - "fmt" - "net" - "reflect" - "testing" - "time" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "google.golang.org/grpc" - podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" - - "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" - "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" -) - -func TestProcessPodMapper(t *testing.T) { - testutils.RequireLinux(t) - - tmpDir, cleanup := CreateTmpDir(t) - defer cleanup() - - cleanup, err := dcgm.Init(dcgm.Embedded) - require.NoError(t, err) - defer cleanup() - - c, cleanup := testDCGMGPUCollector(t, sampleCounters) - defer cleanup() - - out, err := c.GetMetrics() - require.NoError(t, err) - - original := out - - arbirtaryMetric := out[reflect.ValueOf(out).MapKeys()[0].Interface().(Counter)] - - socketPath := tmpDir + "/kubelet.sock" - server := grpc.NewServer() - gpus := GetGPUUUIDs(arbirtaryMetric) - podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpus)) - - cleanup = StartMockServer(t, server, socketPath) - defer cleanup() - - podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID, PodResourcesKubeletSocket: socketPath}) - require.NoError(t, err) - var sysInfo SystemInfo - err = podMapper.Process(out, sysInfo) - require.NoError(t, err) - - require.Len(t, out, len(original)) - for _, metrics := range out { - for _, metric := range metrics { - require.Contains(t, metric.Attributes, podAttribute) - require.Contains(t, metric.Attributes, namespaceAttribute) - require.Contains(t, metric.Attributes, containerAttribute) - require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%s", metric.GPU)) - require.Equal(t, metric.Attributes[namespaceAttribute], "default") - require.Equal(t, metric.Attributes[containerAttribute], "default") - } - } -} - -func GetGPUUUIDs(metrics []Metric) []string { - gpus 
:= make([]string, len(metrics)) - for i, dev := range metrics { - gpus[i] = dev.GPUUUID - } - - return gpus -} - -func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { - l, err := net.Listen("unix", socket) - require.NoError(t, err) - - stopped := make(chan interface{}) - - go func() { - err := server.Serve(l) - assert.NoError(t, err) - close(stopped) - }() - - return func() { - server.Stop() - select { - case <-stopped: - return - case <-time.After(1 * time.Second): - t.Fatal("Failed waiting for gRPC server to stop.") - } - } -} - -func CreateTmpDir(t *testing.T) (string, func()) { - path, err := os.MkdirTemp("", "dcgm-exporter") - require.NoError(t, err) - - return path, func() { - require.NoError(t, os.RemoveAll(path)) - } -} - -// Contains a list of UUIDs -type PodResourcesMockServer struct { - resourceName string - gpus []string -} - -func NewPodResourcesMockServer(resourceName string, gpus []string) *PodResourcesMockServer { - return &PodResourcesMockServer{ - resourceName: resourceName, - gpus: gpus, - } -} - -func (s *PodResourcesMockServer) List( - ctx context.Context, req *podresourcesapi.ListPodResourcesRequest, -) (*podresourcesapi.ListPodResourcesResponse, error) { - podResources := make([]*podresourcesapi.PodResources, len(s.gpus)) - - for i, gpu := range s.gpus { - podResources[i] = &podresourcesapi.PodResources{ - Name: fmt.Sprintf("gpu-pod-%d", i), - Namespace: "default", - Containers: []*podresourcesapi.ContainerResources{ - { - Name: "default", - Devices: []*podresourcesapi.ContainerDevices{ - { - ResourceName: s.resourceName, - DeviceIds: []string{gpu}, - }, - }, - }, - }, - } - } - - return &podresourcesapi.ListPodResourcesResponse{ - PodResources: podResources, - }, nil -} - -func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { - testutils.RequireLinux(t) - - type TestCase struct { - KubernetesGPUIDType KubernetesGPUIDType - GPUInstanceID uint - ResourceName string - MetricGPUID string - 
MetricGPUDevice string - MetricMigProfile string - PODGPUID string - NvidiaResourceNames []string - } - - testCases := []TestCase{ - { - KubernetesGPUIDType: GPUUID, - ResourceName: nvidiaResourceName, - MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", - PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", - }, - { - KubernetesGPUIDType: GPUUID, - ResourceName: nvidiaResourceName, - MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", - PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", - MetricMigProfile: "", - }, - { - KubernetesGPUIDType: GPUUID, - ResourceName: nvidiaResourceName, - GPUInstanceID: 3, - MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", - MetricMigProfile: "", - PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", - }, - { - KubernetesGPUIDType: DeviceName, - ResourceName: nvidiaResourceName, - GPUInstanceID: 3, - MetricMigProfile: "mig", - PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", - }, - { - KubernetesGPUIDType: DeviceName, - ResourceName: nvidiaResourceName, - MetricMigProfile: "mig", - PODGPUID: "nvidia0/gi0", - }, - { - KubernetesGPUIDType: DeviceName, - ResourceName: nvidiaResourceName, - MetricGPUDevice: "0", - PODGPUID: "0/vgpu", - }, - { - KubernetesGPUIDType: GPUUID, - ResourceName: nvidiaResourceName, - MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", - PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::", - }, - { - KubernetesGPUIDType: GPUUID, - ResourceName: "nvidia.com/mig-1g.10gb", - MetricMigProfile: "1g.10gb", - MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", - PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", - MetricGPUDevice: "0", - GPUInstanceID: 3, - }, - { - KubernetesGPUIDType: GPUUID, - ResourceName: "nvidia.com/a100", - MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", - PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", - NvidiaResourceNames: []string{"nvidia.com/a100"}, - }, - } - - for _, tc := range testCases { - t.Run(fmt.Sprintf("when type %s, pod device id %s metric device 
id %s and gpu device %s", - tc.KubernetesGPUIDType, - tc.PODGPUID, - tc.MetricGPUID, - tc.MetricGPUDevice, - ), - func(t *testing.T) { - tmpDir, cleanup := CreateTmpDir(t) - defer cleanup() - socketPath := tmpDir + "/kubelet.sock" - server := grpc.NewServer() - - cleanup, err := dcgm.Init(dcgm.Embedded) - require.NoError(t, err) - defer cleanup() - - gpus := []string{tc.PODGPUID} - podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(tc.ResourceName, gpus)) - - cleanup = StartMockServer(t, server, socketPath) - defer cleanup() - - nvmlGetMIGDeviceInfoByIDHook = func(uuid string) (*nvmlprovider.MIGDeviceInfo, error) { - return &nvmlprovider.MIGDeviceInfo{ - ParentUUID: "00000000-0000-0000-0000-000000000000", - GPUInstanceID: 3, - ComputeInstanceID: 0, - }, nil - } - - defer func() { - nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID - }() - - podMapper, err := NewPodMapper(&Config{ - KubernetesGPUIdType: tc.KubernetesGPUIDType, - PodResourcesKubeletSocket: socketPath, - NvidiaResourceNames: tc.NvidiaResourceNames, - }) - require.NoError(t, err) - require.NotNil(t, podMapper) - metrics := MetricsByCounter{} - counter := Counter{ - FieldID: 155, - FieldName: "DCGM_FI_DEV_POWER_USAGE", - PromType: "gauge", - } - - metrics[counter] = append(metrics[counter], Metric{ - GPU: "0", - GPUUUID: tc.MetricGPUID, - GPUDevice: tc.MetricGPUDevice, - GPUInstanceID: fmt.Sprint(tc.GPUInstanceID), - Value: "42", - MigProfile: tc.MetricMigProfile, - Counter: Counter{ - FieldID: 155, - FieldName: "DCGM_FI_DEV_POWER_USAGE", - PromType: "gauge", - }, - Attributes: map[string]string{}, - }) - - sysInfo := SystemInfo{ - GPUCount: 1, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - DeviceInfo: dcgm.Device{ - UUID: "00000000-0000-0000-0000-000000000000", - GPU: 0, - }, - MigEnabled: true, - }, - }, - } - err = podMapper.Process(metrics, sysInfo) - require.NoError(t, err) - assert.Len(t, metrics, 1) - for _, metric := range 
metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] { - require.Contains(t, metric.Attributes, podAttribute) - require.Contains(t, metric.Attributes, namespaceAttribute) - require.Contains(t, metric.Attributes, containerAttribute) - - // TODO currently we rely on ordering and implicit expectations of the mock implementation - // This should be a table comparison - require.Equal(t, fmt.Sprintf("gpu-pod-%d", 0), metric.Attributes[podAttribute]) - require.Equal(t, "default", metric.Attributes[namespaceAttribute]) - require.Equal(t, "default", metric.Attributes[containerAttribute]) - } - }) - } -} diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go deleted file mode 100644 index fd4b25c0..00000000 --- a/pkg/dcgmexporter/pipeline.go +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "bytes" - "fmt" - "sync" - "text/template" - "time" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" -) - -func NewMetricsPipeline(config *Config, - counters []Counter, - hostname string, - newDCGMCollector DCGMCollectorConstructor, - fieldEntityGroupTypeSystemInfo *FieldEntityGroupTypeSystemInfo, -) (*MetricsPipeline, func(), error) { - logrus.WithField(LoggerDumpKey, fmt.Sprintf("%+v", counters)).Debug("Counters are initialized") - - cleanups := []func(){} - - var ( - gpuCollector *DCGMCollector - switchCollector *DCGMCollector - linkCollector *DCGMCollector - cpuCollector *DCGMCollector - coreCollector *DCGMCollector - err error - ) - - if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU); exists { - var cleanup func() - gpuCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) - if err != nil { - logrus.Warn("Cannot create DCGMCollector for dcgm.FE_GPU") - } - cleanups = append(cleanups, cleanup) - } - - if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_SWITCH); exists { - var cleanup func() - switchCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) - if err != nil { - logrus.Warn("Cannot create DCGMCollector for dcgm.FE_SWITCH") - } - cleanups = append(cleanups, cleanup) - } - - if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_LINK); exists { - var cleanup func() - linkCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) - if err != nil { - logrus.Warn("Cannot create DCGMCollector for dcgm.FE_LINK") - } - cleanups = append(cleanups, cleanup) - } - - if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU); exists { - var cleanup func() - cpuCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) - if err != nil { - logrus.Warn("Cannot create DCGMCollector for dcgm.FE_CPU") - } - cleanups = append(cleanups, cleanup) - } - - if item, exists := 
fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU_CORE); exists { - var cleanup func() - coreCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) - if err != nil { - logrus.Warn("Cannot create DCGMCollector for dcgm.FE_CPU_CORE") - } - cleanups = append(cleanups, cleanup) - } - - transformations := getTransformations(config) - - return &MetricsPipeline{ - config: config, - - migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)), - switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)), - linkMetricsFormat: template.Must(template.New("switchMetrics").Parse(linkMetricsFormat)), - cpuMetricsFormat: template.Must(template.New("cpuMetrics").Parse(cpuMetricsFormat)), - cpuCoreMetricsFormat: template.Must(template.New("cpuMetrics").Parse(cpuCoreMetricsFormat)), - - counters: counters, - gpuCollector: gpuCollector, - switchCollector: switchCollector, - linkCollector: linkCollector, - transformations: transformations, - cpuCollector: cpuCollector, - coreCollector: coreCollector, - }, func() { - for _, cleanup := range cleanups { - cleanup() - } - }, nil -} - -func getTransformations(c *Config) []Transform { - transformations := []Transform{} - if c.Kubernetes { - podMapper, err := NewPodMapper(c) - if err != nil { - logrus.Warnf("Could not enable kubernetes metric collection: %v", err) - } else { - transformations = append(transformations, podMapper) - } - } - - if c.HPCJobMappingDir != "" { - hpcMapper := newHPCMapper(c) - transformations = append(transformations, hpcMapper) - } - - return transformations -} - -// Primarely for testing, caller expected to cleanup the collector -func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) { - return &MetricsPipeline{ - config: c, - - migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)), - switchMetricsFormat: 
template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)), - linkMetricsFormat: template.Must(template.New("switchMetrics").Parse(linkMetricsFormat)), - cpuMetricsFormat: template.Must(template.New("cpuMetrics").Parse(cpuMetricsFormat)), - cpuCoreMetricsFormat: template.Must(template.New("cpuMetrics").Parse(cpuCoreMetricsFormat)), - - counters: collector.Counters, - gpuCollector: collector, - }, func() {}, nil -} - -func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup) { - defer wg.Done() - - logrus.Info("Pipeline starting") - - // Note we are using a ticker so that we can stick as close as possible to the collect interval. - // e.g: The CollectInterval is 10s and the transformation pipeline takes 5s, the time will - // ensure we really collect metrics every 10s by firing an event 5s after the run function completes. - t := time.NewTicker(time.Millisecond * time.Duration(m.config.CollectInterval)) - defer t.Stop() - - for { - select { - case <-stop: - return - case <-t.C: - o, err := m.run() - if err != nil { - logrus.Errorf("Failed to collect metrics; err: %v", err) - /* flush output rather than output stale data */ - out <- "" - continue - } - - if len(out) == cap(out) { - logrus.Errorf("Channel is full skipping.") - } else { - out <- o - } - } - } -} - -func (m *MetricsPipeline) run() (string, error) { - var metrics map[Counter][]Metric - var err error - var formatted string - - if m.gpuCollector != nil { - /* Collect GPU Metrics */ - metrics, err = m.gpuCollector.GetMetrics() - if err != nil { - return "", fmt.Errorf("failed to collect gpu metrics; err: %w", err) - } - - for _, transform := range m.transformations { - err := transform.Process(metrics, m.gpuCollector.SysInfo) - if err != nil { - return "", fmt.Errorf("failed to transform metrics for transform '%s'; err: %w", transform.Name(), err) - } - } - - formatted, err = FormatMetrics(m.migMetricsFormat, metrics) - if err != nil { - return "", 
fmt.Errorf("failed to format metrics; err: %w", err) - } - } - - if m.switchCollector != nil { - /* Collect Switch Metrics */ - metrics, err = m.switchCollector.GetMetrics() - if err != nil { - return "", fmt.Errorf("failed to collect switch metrics; err: %w", err) - } - - if len(metrics) > 0 { - switchFormatted, err := FormatMetrics(m.switchMetricsFormat, metrics) - if err != nil { - logrus.Warnf("Failed to format switch metrics with error: %v", err) - } - - formatted = formatted + switchFormatted - } - } - - if m.linkCollector != nil { - /* Collect Link Metrics */ - metrics, err = m.linkCollector.GetMetrics() - if err != nil { - return "", fmt.Errorf("failed to collect link metrics; err: %w", err) - } - - if len(metrics) > 0 { - switchFormatted, err := FormatMetrics(m.linkMetricsFormat, metrics) - if err != nil { - logrus.Warnf("failed to format link metrics; err: %v", err) - } - - formatted = formatted + switchFormatted - } - } - - if m.cpuCollector != nil { - /* Collect CPU Metrics */ - metrics, err = m.cpuCollector.GetMetrics() - if err != nil { - return "", fmt.Errorf("failed to collect CPU metrics; err: %w", err) - } - - if len(metrics) > 0 { - cpuFormatted, err := FormatMetrics(m.cpuMetricsFormat, metrics) - if err != nil { - logrus.Warnf("Failed to format cpu metrics with error: %v", err) - } - - formatted = formatted + cpuFormatted - } - } - - if m.coreCollector != nil { - /* Collect cpu core Metrics */ - metrics, err = m.coreCollector.GetMetrics() - if err != nil { - return "", fmt.Errorf("failed to collect CPU core metrics; err: %w", err) - } - - if len(metrics) > 0 { - coreFormatted, err := FormatMetrics(m.cpuCoreMetricsFormat, metrics) - if err != nil { - logrus.Warnf("failed to format cpu core metrics; err: %v", err) - } - - formatted = formatted + coreFormatted - } - } - - return formatted, nil -} - -/* -* The goal here is to get to the following format: -* ``` -* # HELP FIELD_ID HELP_MSG -* # TYPE FIELD_ID PROM_TYPE -* 
FIELD_ID{gpu="GPU_INDEX_0",uuid="GPU_UUID", attr...} VALUE -* FIELD_ID{gpu="GPU_INDEX_N",uuid="GPU_UUID", attr...} VALUE -* ... -* ``` - */ - -var migMetricsFormat = ` -{{- range $counter, $metrics := . -}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -{{- range $k, $v := $metric.Attributes -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} - -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -var switchMetricsFormat = ` -{{- range $counter, $metrics := . -}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{nvswitch="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -var linkMetricsFormat = ` -{{- range $counter, $metrics := . -}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{nvlink="{{ $metric.GPU }}",nvswitch="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -var cpuMetricsFormat = ` -{{- range $counter, $metrics := . 
-}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{cpu="{{ $metric.GPU }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -var cpuCoreMetricsFormat = ` -{{- range $counter, $metrics := . -}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{cpucore="{{ $metric.GPU }}",cpu="{{ $metric.GPUDevice }}"{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -// FormatMetrics Template is passed here so that it isn't recompiled at each iteration -func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error) { - // Format metrics - var res bytes.Buffer - if err := t.Execute(&res, groupedMetrics); err != nil { - return "", err - } - - return res.String(), nil -} diff --git a/pkg/dcgmexporter/pipeline_test.go b/pkg/dcgmexporter/pipeline_test.go deleted file mode 100644 index f9385eda..00000000 --- a/pkg/dcgmexporter/pipeline_test.go +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "errors" - "testing" - - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/stretchr/testify/require" -) - -func TestRun(t *testing.T) { - cleanup, err := dcgm.Init(dcgm.Embedded) - require.NoError(t, err) - defer cleanup() - - c, cleanup := testDCGMGPUCollector(t, sampleCounters) - defer cleanup() - - p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c) - require.NoError(t, err) - defer cleanup() - require.NoError(t, err) - - out, err := p.run() - require.NoError(t, err) - require.NotEmpty(t, out) - - // Note it is pretty difficult to make non superficial tests without - // writting a full blown parser, always look at the results - // We'll be testing them more throughly in the e2e tests (e.g: by running prometheus). - t.Logf("Pipeline result is:\n%v", out) -} - -func testNewDCGMCollector(t *testing.T, - counter *int, enabledCollector map[dcgm.Field_Entity_Group]struct{}, -) DCGMCollectorConstructor { - t.Helper() - return func(c []Counter, - hostname string, - config *Config, - fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, - ) (*DCGMCollector, func(), error) { - // should always create GPU Collector - if fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType != dcgm.FE_GPU { - if _, ok := enabledCollector[fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType]; !ok { - t.Errorf("collector '%s' should not be created", fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType) - return nil, func() {}, nil - } - } - - collector := &DCGMCollector{} - cleanups := []func(){ - func() { - *counter++ - }, - } - collector.Cleanups = cleanups - - return collector, func() { collector.Cleanup() }, nil - } -} - -func TestCountPipelineCleanup(t *testing.T) { - f, err := os.CreateTemp("", "empty.*.csv") - require.NoError(t, err) - defer 
os.Remove(f.Name()) - defer f.Close() - - for _, c := range []struct { - name string - enabledCollector map[dcgm.Field_Entity_Group]struct{} - }{{ - name: "only_gpu", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_GPU: {}, - }, - }, { - name: "gpu_switch", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_SWITCH: {}, - }, - }, { - name: "gpu_link", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_LINK: {}, - }, - }, { - name: "gpu_cpu", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_CPU: {}, - }, - }, { - name: "gpu_core", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_CPU_CORE: {}, - }, - }, { - name: "gpu_switch_link", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_SWITCH: {}, - dcgm.FE_LINK: {}, - }, - }, { - name: "gpu_cpu_core", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_CPU: {}, - dcgm.FE_CPU_CORE: {}, - }, - }, { - name: "all", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_SWITCH: {}, - dcgm.FE_LINK: {}, - dcgm.FE_CPU: {}, - dcgm.FE_CPU_CORE: {}, - }, - }} { - t.Run(c.name, func(t *testing.T) { - cleanupCounter := 0 - - config := &Config{ - Kubernetes: false, - ConfigMapData: undefinedConfigMapData, - CollectorsFile: f.Name(), - } - - cc, err := GetCounterSet(config) - if err != nil { - logrus.Fatal(err) - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(cc.DCGMCounters, config) - - for egt := range c.enabledCollector { - // We inject system info for unit test purpose - fieldEntityGroupTypeSystemInfo.items[egt] = FieldEntityGroupTypeSystemInfoItem{ - SystemInfo: SystemInfo{ - InfoType: egt, - }, - } - } - - _, cleanup, err := NewMetricsPipeline(config, - cc.DCGMCounters, - "", - testNewDCGMCollector(t, &cleanupCounter, c.enabledCollector), - fieldEntityGroupTypeSystemInfo) - require.NoError(t, err, "case: %s failed", c.name) - - cleanup() - require.Equal(t, 
len(c.enabledCollector), cleanupCounter, "case: %s failed", c.name) - }) - } -} - -func TestNewMetricsPipelineWhenFieldEntityGroupTypeSystemInfoItemIsEmpty(t *testing.T) { - cleanup, err := dcgm.Init(dcgm.Embedded) - require.NoError(t, err) - defer cleanup() - - config := &Config{} - - fieldEntityGroupTypeSystemInfo := &FieldEntityGroupTypeSystemInfo{ - items: map[dcgm.Field_Entity_Group]FieldEntityGroupTypeSystemInfoItem{ - dcgm.FE_GPU: {}, - dcgm.FE_SWITCH: {}, - dcgm.FE_LINK: {}, - dcgm.FE_CPU: {}, - dcgm.FE_CPU_CORE: {}, - }, - } - - p, cleanup, err := NewMetricsPipeline(config, - sampleCounters, - "", - func(_ []Counter, _ string, _ *Config, item FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) { - assert.True(t, item.isEmpty()) - return nil, func() {}, errors.New("empty") - }, - fieldEntityGroupTypeSystemInfo, - ) - require.NoError(t, err) - defer cleanup() - require.NoError(t, err) - - out, err := p.run() - require.NoError(t, err) - require.Empty(t, out) -} diff --git a/pkg/dcgmexporter/registry.go b/pkg/dcgmexporter/registry.go deleted file mode 100644 index 3b62df4c..00000000 --- a/pkg/dcgmexporter/registry.go +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "sync" - - "golang.org/x/sync/errgroup" -) - -type Registry struct { - collectors []Collector - mtx sync.RWMutex -} - -func NewRegistry() *Registry { - return &Registry{ - collectors: make([]Collector, 0), - } -} - -// Register registers a collector with the registry. -func (r *Registry) Register(c Collector) { - r.collectors = append(r.collectors, c) -} - -// Gather gathers metrics from all registered collectors. -func (r *Registry) Gather() (MetricsByCounter, error) { - r.mtx.Lock() - defer r.mtx.Unlock() - - var wg sync.WaitGroup - wg.Add(len(r.collectors)) - - g := new(errgroup.Group) - - var sm sync.Map - - for _, c := range r.collectors { - c := c //creates new c, see https://golang.org/doc/faq#closures_and_goroutines - g.Go(func() error { - metrics, err := c.GetMetrics() - - if err != nil { - return err - } - - for counter, metricVals := range metrics { - val, _ := sm.LoadOrStore(counter, []Metric{}) - out := val.([]Metric) - out = append(out, metricVals...) - sm.Store(counter, out) - } - - return nil - }) - } - - if err := g.Wait(); err != nil { - return nil, err - } - - output := MetricsByCounter{} - - sm.Range(func(key, value interface{}) bool { - output[key.(Counter)] = value.([]Metric) - return true // continue iteration - }) - - return output, nil -} - -// Cleanup resources of registered collectors -func (r *Registry) Cleanup() { - for _, c := range r.collectors { - c.Cleanup() - } -} diff --git a/pkg/dcgmexporter/registry_test.go b/pkg/dcgmexporter/registry_test.go deleted file mode 100644 index f7da1ccf..00000000 --- a/pkg/dcgmexporter/registry_test.go +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "errors" - "testing" - - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" -) - -type mockCollector struct { - mock.Mock -} - -func (m *mockCollector) GetMetrics() (MetricsByCounter, error) { - args := m.Called() - return args.Get(0).(MetricsByCounter), args.Error(1) -} - -func (m *mockCollector) Cleanup() { - m.Called() -} - -func TestRegistry_Gather(t *testing.T) { - collector := new(mockCollector) - reg := NewRegistry() - - metrics := MetricsByCounter{} - counterA := Counter{ - FieldID: 155, - FieldName: "DCGM_FI_DEV_POWER_USAGE", - PromType: "gauge", - } - metrics[counterA] = append(metrics[counterA], Metric{ - GPU: "0", - Counter: counterA, - Attributes: map[string]string{}, - }) - - counterB := Counter{ - FieldName: "DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", - PromType: "gauge", - } - - metrics[counterB] = append(metrics[counterB], Metric{ - GPU: "0", - Counter: counterB, - Value: "42", - Attributes: map[string]string{}, - }) - - type test struct { - name string - collectorState func() *mock.Call - assert func(MetricsByCounter, error) - } - - tests := []test{ - { - name: "When collector return no errors", - collectorState: func() *mock.Call { - return collector.On("GetMetrics").Return(metrics, nil) - }, - assert: func(mbc MetricsByCounter, err error) { - require.NoError(t, err) - require.Len(t, mbc, 2) - }, - }, - { - name: "When collector return errors", - collectorState: func() *mock.Call { - return collector.On("GetMetrics").Return(MetricsByCounter{}, errors.New("Boom!")) 
- }, - assert: func(mbc MetricsByCounter, err error) { - require.Error(t, err) - require.Len(t, mbc, 0) - }, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - reg.collectors = nil - reg.Register(collector) - mockCall := tc.collectorState() - got, err := reg.Gather() - tc.assert(got, err) - mockCall.Unset() - }) - - } -} diff --git a/pkg/dcgmexporter/server.go b/pkg/dcgmexporter/server.go deleted file mode 100644 index 5910094d..00000000 --- a/pkg/dcgmexporter/server.go +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "context" - "net/http" - "sync" - "time" - - "github.com/gorilla/mux" - "github.com/prometheus/exporter-toolkit/web" - "github.com/sirupsen/logrus" - - "github.com/NVIDIA/dcgm-exporter/internal/pkg/logging" -) - -func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*MetricsServer, func(), error) { - router := mux.NewRouter() - serverv1 := &MetricsServer{ - server: &http.Server{ - Addr: c.Address, - Handler: router, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - }, - webConfig: &web.FlagConfig{ - WebListenAddresses: &[]string{c.Address}, - WebSystemdSocket: &c.WebSystemdSocket, - WebConfigFile: &c.WebConfigFile, - }, - metricsChan: metrics, - metrics: "", - registry: registry, - } - - router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("X-Content-Type-Options", "nosniff") - w.WriteHeader(http.StatusOK) - _, err := w.Write([]byte(` - GPU Exporter - -

GPU Exporter

-

Metrics

- - `)) - if err != nil { - logrus.WithError(err).Error("Failed to write response.") - http.Error(w, "failed to write response", http.StatusInternalServerError) - return - } - }) - - router.HandleFunc("/health", serverv1.Health) - router.HandleFunc("/metrics", serverv1.Metrics) - - return serverv1, func() {}, nil -} - -func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { - defer wg.Done() - // Wrap the logrus logger with the LogrusAdapter - logger := logging.NewLogrusAdapter(logrus.StandardLogger()) - - var httpwg sync.WaitGroup - httpwg.Add(1) - go func() { - defer httpwg.Done() - logrus.Info("Starting webserver") - if err := web.ListenAndServe(s.server, s.webConfig, logger); err != nil && err != http.ErrServerClosed { - logrus.WithError(err).Fatal("Failed to Listen and Server HTTP server.") - } - }() - - httpwg.Add(1) - go func() { - defer httpwg.Done() - for { - select { - case <-stop: - return - case m := <-s.metricsChan: - s.updateMetrics(m) - } - } - }() - - <-stop - if err := s.server.Shutdown(context.Background()); err != nil { - logrus.WithError(err).Fatal("Failed to shutdown HTTP server.") - } - - if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil { - logrus.WithError(err).Fatal("Failed waiting for HTTP server to shutdown.") - } -} - -func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) { - w.Header().Set("X-Content-Type-Options", "nosniff") - w.WriteHeader(http.StatusOK) - _, err := w.Write([]byte(s.getMetrics())) - if err != nil { - logrus.WithError(err).Error("Failed to write response.") - http.Error(w, "failed to write response", http.StatusInternalServerError) - return - } - metrics, err := s.registry.Gather() - if err != nil { - logrus.WithError(err).Error("Failed to write response.") - http.Error(w, "failed to write response", http.StatusInternalServerError) - return - } - err = encodeExpMetrics(w, metrics) - if err != nil { - http.Error(w, "failed to write response", http.StatusInternalServerError) 
- return - } -} - -func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) { - if s.getMetrics() == "" { - w.Header().Set("X-Content-Type-Options", "nosniff") - w.WriteHeader(http.StatusServiceUnavailable) - _, err := w.Write([]byte("KO")) - if err != nil { - logrus.WithError(err).Error("Failed to write response.") - http.Error(w, "failed to write response", http.StatusInternalServerError) - } - } else { - w.Header().Set("X-Content-Type-Options", "nosniff") - w.WriteHeader(http.StatusOK) - _, err := w.Write([]byte("OK")) - if err != nil { - logrus.WithError(err).Error("Failed to write response.") - http.Error(w, "failed to write response", http.StatusInternalServerError) - } - } -} - -func (s *MetricsServer) updateMetrics(m string) { - s.Lock() - defer s.Unlock() - - s.metrics = m -} - -func (s *MetricsServer) getMetrics() string { - s.Lock() - defer s.Unlock() - - return s.metrics -} diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go deleted file mode 100644 index 6d448828..00000000 --- a/pkg/dcgmexporter/system_info.go +++ /dev/null @@ -1,890 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "fmt" - "math/rand" - "slices" - "strings" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/bits-and-blooms/bitset" - "github.com/sirupsen/logrus" -) - -var ( - dcgmGetAllDeviceCount = dcgm.GetAllDeviceCount - dcgmGetDeviceInfo = dcgm.GetDeviceInfo - dcgmGetGpuInstanceHierarchy = dcgm.GetGpuInstanceHierarchy - dcgmAddEntityToGroup = dcgm.AddEntityToGroup - dcgmCreateGroup = dcgm.CreateGroup - dcgmGetCpuHierarchy = dcgm.GetCpuHierarchy -) - -type ComputeInstanceInfo struct { - InstanceInfo dcgm.MigEntityInfo - ProfileName string - EntityId uint -} - -type GPUInstanceInfo struct { - Info dcgm.MigEntityInfo - ProfileName string - EntityId uint - ComputeInstances []ComputeInstanceInfo -} - -type GPUInfo struct { - DeviceInfo dcgm.Device - GPUInstances []GPUInstanceInfo - MigEnabled bool -} - -type SwitchInfo struct { - EntityId uint - NvLinks []dcgm.NvLinkStatus -} - -type CPUInfo struct { - EntityId uint - Cores []uint -} - -type SystemInfo struct { - GPUCount uint - GPUs [dcgm.MAX_NUM_DEVICES]GPUInfo - gOpt DeviceOptions - sOpt DeviceOptions - cOpt DeviceOptions - InfoType dcgm.Field_Entity_Group - Switches []SwitchInfo - CPUs []CPUInfo -} - -type MonitoringInfo struct { - Entity dcgm.GroupEntityPair - DeviceInfo dcgm.Device - InstanceInfo *GPUInstanceInfo - ParentId uint -} - -func SetGPUInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool { - for i := uint(0); i < sysInfo.GPUCount; i++ { - for j := range sysInfo.GPUs[i].GPUInstances { - if sysInfo.GPUs[i].GPUInstances[j].EntityId == entityId { - sysInfo.GPUs[i].GPUInstances[j].ProfileName = profileName - return true - } - } - } - - return false -} - -func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error { - var err error - var errFound bool - errStr := "cannot find match for entities:" - - for _, v := range values { - if !SetGPUInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v)) { - errStr = 
fmt.Sprintf("%s group %d, id %d", errStr, v.EntityGroupId, v.EntityId) - errFound = true - } - } - - if errFound { - err = fmt.Errorf("%s", errStr) - } - - return err -} - -func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error { - if len(entities) == 0 { - // There are no entities to populate - return nil - } - - var fields []dcgm.Short - fields = append(fields, dcgm.DCGM_FI_DEV_NAME) - flags := dcgm.DCGM_FV_FLAG_LIVE_DATA - values, err := dcgm.EntitiesGetLatestValues(entities, fields, flags) - - if err != nil { - return err - } - - return SetMigProfileNames(sysInfo, values) -} - -func GPUIdExists(sysInfo *SystemInfo, gpuId int) bool { - for i := uint(0); i < sysInfo.GPUCount; i++ { - if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuId) { - return true - } - } - return false -} - -func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool { - for _, sw := range sysInfo.Switches { - if sw.EntityId == uint(switchId) { - return true - } - } - return false -} - -func CPUIdExists(sysInfo *SystemInfo, cpuId int) bool { - for _, cpu := range sysInfo.CPUs { - if cpu.EntityId == uint(cpuId) { - return true - } - } - return false -} - -func GPUInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool { - for i := uint(0); i < sysInfo.GPUCount; i++ { - for _, instance := range sysInfo.GPUs[i].GPUInstances { - if instance.EntityId == uint(gpuInstanceId) { - return true - } - } - } - return false -} - -func LinkIdExists(sysInfo *SystemInfo, linkId int) bool { - for _, sw := range sysInfo.Switches { - for _, link := range sw.NvLinks { - if link.Index == uint(linkId) { - return true - } - } - } - return false -} - -func CPUCoreIdExists(sysInfo *SystemInfo, coreId int) bool { - for _, cpu := range sysInfo.CPUs { - for _, core := range cpu.Cores { - if core == uint(coreId) { - return true - } - } - } - return false -} - -func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { - if sOpt.Flex { - return nil - } - - if 
len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 { - // Verify we can find all the specified Switches - for _, cpuID := range sOpt.MajorRange { - if !SwitchIdExists(sysInfo, cpuID) { - return fmt.Errorf("couldn't find requested CPU ID '%d'", cpuID) - } - } - } - - if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 { - for _, coreID := range sOpt.MinorRange { - if !CPUCoreIdExists(sysInfo, coreID) { - return fmt.Errorf("couldn't find requested CPU core '%d'", coreID) - } - } - } - - return nil -} - -func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { - if sOpt.Flex { - return nil - } - - if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 { - // Verify we can find all the specified Switches - for _, swID := range sOpt.MajorRange { - if !SwitchIdExists(sysInfo, swID) { - return fmt.Errorf("couldn't find requested NvSwitch ID '%d'", swID) - } - } - } - - if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 { - for _, linkID := range sOpt.MinorRange { - if !LinkIdExists(sysInfo, linkID) { - return fmt.Errorf("couldn't find requested NvLink '%d'", linkID) - } - } - } - - return nil -} - -func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error { - if gOpt.Flex { - return nil - } - - if len(gOpt.MajorRange) > 0 && gOpt.MajorRange[0] != -1 { - // Verify we can find all the specified GPUs - for _, gpuID := range gOpt.MajorRange { - if !GPUIdExists(sysInfo, gpuID) { - return fmt.Errorf("couldn't find requested GPU ID '%d'", gpuID) - } - } - } - - if len(gOpt.MinorRange) > 0 && gOpt.MinorRange[0] != -1 { - for _, gpuInstanceID := range gOpt.MinorRange { - if !GPUInstanceIdExists(sysInfo, gpuInstanceID) { - return fmt.Errorf("couldn't find requested GPU instance ID '%d'", gpuInstanceID) - } - } - } - - return nil -} - -func getCoreArray(bitmask []uint64) []uint { - - var cores []uint - bits := make([]uint64, dcgm.MAX_CPU_CORE_BITMASK_COUNT) - - for i := 0; i < len(bitmask); i++ { - bits[i] = uint64(bitmask[i]) - } - 
- b := bitset.From(bits) - - for i := uint(0); i < dcgm.MAX_NUM_CPU_CORES; i++ { - if b.Test(i) { - cores = append(cores, uint(i)) - } - } - - return cores -} - -func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error) { - hierarchy, err := dcgmGetCpuHierarchy() - if err != nil { - return sysInfo, err - } - - if hierarchy.NumCpus <= 0 { - return sysInfo, fmt.Errorf("no CPUs to monitor") - } - - for i := 0; i < int(hierarchy.NumCpus); i++ { - cores := getCoreArray([]uint64(hierarchy.Cpus[i].OwnedCores)) - - cpu := CPUInfo{ - hierarchy.Cpus[i].CpuId, - cores, - } - - sysInfo.CPUs = append(sysInfo.CPUs, cpu) - } - - sysInfo.cOpt = sOpt - - err = VerifyCPUDevicePresence(&sysInfo, sOpt) - if err != nil { - return sysInfo, err - } - logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType) - return sysInfo, nil -} - -func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error) { - switches, err := dcgm.GetEntityGroupEntities(dcgm.FE_SWITCH) - if err != nil { - return sysInfo, err - } - - if len(switches) <= 0 { - return sysInfo, fmt.Errorf("no switches to monitor") - } - - links, err := dcgm.GetNvLinkLinkStatus() - if err != nil { - return sysInfo, err - } - - for i := 0; i < len(switches); i++ { - var matchingLinks []dcgm.NvLinkStatus - for _, link := range links { - if link.ParentType == dcgm.FE_SWITCH && link.ParentId == uint(switches[i]) { - matchingLinks = append(matchingLinks, link) - } - } - - sw := SwitchInfo{ - switches[i], - matchingLinks, - } - - sysInfo.Switches = append(sysInfo.Switches, sw) - } - - sysInfo.sOpt = sOpt - err = VerifySwitchDevicePresence(&sysInfo, sOpt) - if err == nil { - logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType) - } - - return sysInfo, err -} - -func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error) { - gpuCount, err := dcgmGetAllDeviceCount() - if err != nil { - return sysInfo, err - } - 
sysInfo.GPUCount = gpuCount - - for i := uint(0); i < sysInfo.GPUCount; i++ { - // Default mig enabled to false - sysInfo.GPUs[i].MigEnabled = false - sysInfo.GPUs[i].DeviceInfo, err = dcgmGetDeviceInfo(i) - if err != nil { - if useFakeGPUs { - sysInfo.GPUs[i].DeviceInfo.GPU = i - sysInfo.GPUs[i].DeviceInfo.UUID = fmt.Sprintf("fake%d", i) - } else { - return sysInfo, err - } - } - } - - hierarchy, err := dcgmGetGpuInstanceHierarchy() - if err != nil { - return sysInfo, err - } - - if hierarchy.Count > 0 { - var entities []dcgm.GroupEntityPair - - gpuID := uint(0) - instanceIndex := 0 - for i := uint(0); i < hierarchy.Count; i++ { - if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU { - // We are adding a GPU instance - gpuID = hierarchy.EntityList[i].Parent.EntityId - entityID := hierarchy.EntityList[i].Entity.EntityId - instanceInfo := GPUInstanceInfo{ - Info: hierarchy.EntityList[i].Info, - ProfileName: "", - EntityId: entityID, - } - sysInfo.GPUs[gpuID].MigEnabled = true - sysInfo.GPUs[gpuID].GPUInstances = append(sysInfo.GPUs[gpuID].GPUInstances, instanceInfo) - entities = append(entities, dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: entityID}) - instanceIndex = len(sysInfo.GPUs[gpuID].GPUInstances) - 1 - } else if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU_I { - // Add the compute instance, gpuId is recorded previously - entityID := hierarchy.EntityList[i].Entity.EntityId - ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityID} - sysInfo.GPUs[gpuID].GPUInstances[instanceIndex].ComputeInstances = append(sysInfo.GPUs[gpuID].GPUInstances[instanceIndex].ComputeInstances, - ciInfo) - } - } - - err = PopulateMigProfileNames(&sysInfo, entities) - if err != nil { - return sysInfo, err - } - } - - sysInfo.gOpt = gOpt - err = VerifyDevicePresence(&sysInfo, gOpt) - if err == nil { - logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType) - } - return sysInfo, err -} - -func 
InitializeSystemInfo( - gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOptions, useFakeGPUs bool, entityType dcgm.Field_Entity_Group, -) (SystemInfo, error) { - sysInfo := SystemInfo{} - - logrus.Info("Initializing system entities of type: ", entityType) - switch entityType { - case dcgm.FE_LINK: - sysInfo.InfoType = dcgm.FE_LINK - return InitializeNvSwitchInfo(sysInfo, sOpt) - case dcgm.FE_SWITCH: - sysInfo.InfoType = dcgm.FE_SWITCH - return InitializeNvSwitchInfo(sysInfo, sOpt) - case dcgm.FE_GPU: - sysInfo.InfoType = dcgm.FE_GPU - return InitializeGPUInfo(sysInfo, gOpt, useFakeGPUs) - case dcgm.FE_CPU: - sysInfo.InfoType = dcgm.FE_CPU - return InitializeCPUInfo(sysInfo, cOpt) - case dcgm.FE_CPU_CORE: - sysInfo.InfoType = dcgm.FE_CPU_CORE - return InitializeCPUInfo(sysInfo, cOpt) - } - - return sysInfo, fmt.Errorf("unhandled entity type '%d'", entityType) -} - -func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) { - var groups []dcgm.GroupHandle - var cleanups []func() - var groupID dcgm.GroupHandle - var err error - - /* Create per-cpu core groups */ - for _, cpu := range sysInfo.CPUs { - if !IsCPUWatched(cpu.EntityId, sysInfo) { - continue - } - - for i, core := range cpu.Cores { - - if i == 0 || i%dcgm.DCGM_GROUP_MAX_ENTITIES == 0 { - groupID, err = dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) - if err != nil { - return nil, cleanups, err - } - - groups = append(groups, groupID) - } - - if !IsCoreWatched(core, cpu.EntityId, sysInfo) { - continue - } - - err = dcgm.AddEntityToGroup(groupID, dcgm.FE_CPU_CORE, core) - - if err != nil { - return groups, cleanups, err - } - - cleanups = append(cleanups, func() { - err := dcgm.DestroyGroup(groupID) - if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { - logrus.WithFields(logrus.Fields{ - LoggerGroupIDKey: groupID, - logrus.ErrorKey: err, - }).Warn("can not destroy group") - } - }) - } - } - - return groups, cleanups, nil -} 
- -func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) { - var groups []dcgm.GroupHandle - var cleanups []func() - - /* Create per-switch link groups */ - for _, sw := range sysInfo.Switches { - if !IsSwitchWatched(sw.EntityId, sysInfo) { - continue - } - - groupID, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) - if err != nil { - return nil, cleanups, err - } - - groups = append(groups, groupID) - - for _, link := range sw.NvLinks { - if link.State != dcgm.LS_UP { - continue - } - - if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) { - continue - } - - err = dcgm.AddLinkEntityToGroup(groupID, link.Index, link.ParentId) - - if err != nil { - return groups, cleanups, err - } - - cleanups = append(cleanups, func() { - err := dcgm.DestroyGroup(groupID) - if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { - logrus.WithFields(logrus.Fields{ - LoggerGroupIDKey: groupID, - logrus.ErrorKey: err, - }).Warn("can not destroy group") - } - }) - } - } - - return groups, cleanups, nil -} - -func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error) { - monitoringInfo := GetMonitoredEntities(sysInfo) - groupID, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) - if err != nil { - return dcgm.GroupHandle{}, func() {}, err - } - - for _, mi := range monitoringInfo { - err := dcgmAddEntityToGroup(groupID, mi.Entity.EntityGroupId, mi.Entity.EntityId) - if err != nil { - return groupID, func() { - err := dcgm.DestroyGroup(groupID) - if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { - logrus.WithFields(logrus.Fields{ - LoggerGroupIDKey: groupID, - logrus.ErrorKey: err, - }).Warn("can not destroy group") - } - }, err - } - } - - return groupID, func() { - err := dcgm.DestroyGroup(groupID) - if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { - logrus.WithFields(logrus.Fields{ - 
LoggerGroupIDKey: groupID, - logrus.ErrorKey: err, - }).Warn("can not destroy group") - } - }, nil -} - -func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo { - var monitoring []MonitoringInfo - - for i := uint(0); i < sysInfo.GPUCount; i++ { - mi := MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU}, - sysInfo.GPUs[i].DeviceInfo, - nil, - PARENT_ID_IGNORED, - } - monitoring = append(monitoring, mi) - } - - return monitoring -} - -func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo { - var monitoring []MonitoringInfo - - for _, sw := range sysInfo.Switches { - if !IsSwitchWatched(sw.EntityId, sysInfo) { - continue - } - - mi := MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: sw.EntityId}, - dcgm.Device{}, - nil, - PARENT_ID_IGNORED, - } - monitoring = append(monitoring, mi) - } - - return monitoring -} - -func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo { - var monitoring []MonitoringInfo - - for _, sw := range sysInfo.Switches { - for _, link := range sw.NvLinks { - if link.State != dcgm.LS_UP { - continue - } - - if !IsSwitchWatched(sw.EntityId, sysInfo) { - continue - } - - if !IsLinkWatched(link.Index, sw.EntityId, sysInfo) { - continue - } - - mi := MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: link.Index}, - dcgm.Device{}, - nil, - link.ParentId, - } - monitoring = append(monitoring, mi) - } - } - - return monitoring -} - -func IsSwitchWatched(switchID uint, sysInfo SystemInfo) bool { - if sysInfo.sOpt.Flex { - return true - } - - // When MajorRange contains -1 value, we do monitorig of all switches - if len(sysInfo.sOpt.MajorRange) > 0 && sysInfo.sOpt.MajorRange[0] == -1 { - return true - } - - return slices.Contains(sysInfo.sOpt.MajorRange, int(switchID)) -} - -func IsLinkWatched(linkIndex uint, switchID uint, sysInfo SystemInfo) bool { - if sysInfo.sOpt.Flex { - return true - } - - // Find a switch - switchIdx := 
slices.IndexFunc(sysInfo.Switches, func(si SwitchInfo) bool { - return si.EntityId == switchID && IsSwitchWatched(si.EntityId, sysInfo) - }) - - if switchIdx > -1 { - // Switch exists and is watched - sw := sysInfo.Switches[switchIdx] - - if len(sysInfo.sOpt.MinorRange) > 0 && sysInfo.sOpt.MinorRange[0] == -1 { - return true - } - - // The Link exists - if slices.ContainsFunc(sw.NvLinks, func(nls dcgm.NvLinkStatus) bool { - return nls.Index == linkIndex - }) { - // and the link index in the Minor range - return slices.Contains(sysInfo.sOpt.MinorRange, int(linkIndex)) - } - } - - return false -} - -func IsCPUWatched(cpuID uint, sysInfo SystemInfo) bool { - - if !slices.ContainsFunc(sysInfo.CPUs, func(cpu CPUInfo) bool { - return cpu.EntityId == cpuID - }) { - return false - } - - if sysInfo.cOpt.Flex { - return true - } - - if len(sysInfo.cOpt.MajorRange) > 0 && sysInfo.cOpt.MajorRange[0] == -1 { - return true - } - - return slices.ContainsFunc(sysInfo.cOpt.MajorRange, func(cpu int) bool { - return uint(cpu) == cpuID - }) -} - -func IsCoreWatched(coreID uint, cpuID uint, sysInfo SystemInfo) bool { - if sysInfo.cOpt.Flex { - return true - } - - // Find a CPU - cpuIdx := slices.IndexFunc(sysInfo.CPUs, func(cpu CPUInfo) bool { - return IsCPUWatched(cpu.EntityId, sysInfo) && cpu.EntityId == cpuID - }) - - if cpuIdx > -1 { - if len(sysInfo.cOpt.MinorRange) > 0 && sysInfo.cOpt.MinorRange[0] == -1 { - return true - } - - return slices.Contains(sysInfo.cOpt.MinorRange, int(coreID)) - } - - return false -} - -func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo { - var monitoring []MonitoringInfo - - for _, cpu := range sysInfo.CPUs { - if !IsCPUWatched(cpu.EntityId, sysInfo) { - continue - } - - mi := MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: cpu.EntityId}, - dcgm.Device{}, - nil, - PARENT_ID_IGNORED, - } - monitoring = append(monitoring, mi) - } - - return monitoring -} - -func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo { - var 
monitoring []MonitoringInfo - - for _, cpu := range sysInfo.CPUs { - for _, core := range cpu.Cores { - if !IsCPUWatched(cpu.EntityId, sysInfo) { - continue - } - - if !IsCoreWatched(core, cpu.EntityId, sysInfo) { - continue - } - - mi := MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: core}, - dcgm.Device{}, - nil, - cpu.EntityId, - } - monitoring = append(monitoring, mi) - } - } - - return monitoring -} - -func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { - var monitoring []MonitoringInfo - - for i := uint(0); i < sysInfo.GPUCount; i++ { - if addFlexibly && len(sysInfo.GPUs[i].GPUInstances) == 0 { - mi := MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU}, - sysInfo.GPUs[i].DeviceInfo, - nil, - PARENT_ID_IGNORED, - } - monitoring = append(monitoring, mi) - } else { - for j := 0; j < len(sysInfo.GPUs[i].GPUInstances); j++ { - mi := MonitoringInfo{ - dcgm.GroupEntityPair{ - EntityGroupId: dcgm.FE_GPU_I, - EntityId: sysInfo.GPUs[i].GPUInstances[j].EntityId, - }, - sysInfo.GPUs[i].DeviceInfo, - &sysInfo.GPUs[i].GPUInstances[j], - PARENT_ID_IGNORED, - } - monitoring = append(monitoring, mi) - } - } - } - - return monitoring -} - -func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuID int) *MonitoringInfo { - for i := uint(0); i < sysInfo.GPUCount; i++ { - if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuID) { - return &MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU}, - sysInfo.GPUs[i].DeviceInfo, - nil, - PARENT_ID_IGNORED, - } - } - } - - return nil -} - -func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceID int) *MonitoringInfo { - for i := uint(0); i < sysInfo.GPUCount; i++ { - for _, instance := range sysInfo.GPUs[i].GPUInstances { - if instance.EntityId == uint(gpuInstanceID) { - return &MonitoringInfo{ - dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: 
uint(gpuInstanceID)}, - sysInfo.GPUs[i].DeviceInfo, - &instance, - PARENT_ID_IGNORED, - } - } - } - } - - return nil -} - -func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo { - var monitoring []MonitoringInfo - - if sysInfo.InfoType == dcgm.FE_SWITCH { - monitoring = AddAllSwitches(sysInfo) - } else if sysInfo.InfoType == dcgm.FE_LINK { - monitoring = AddAllLinks(sysInfo) - } else if sysInfo.InfoType == dcgm.FE_CPU { - monitoring = AddAllCPUs(sysInfo) - } else if sysInfo.InfoType == dcgm.FE_CPU_CORE { - monitoring = AddAllCPUCores(sysInfo) - } else if sysInfo.gOpt.Flex { - monitoring = AddAllGPUInstances(sysInfo, true) - } else { - if len(sysInfo.gOpt.MajorRange) > 0 && sysInfo.gOpt.MajorRange[0] == -1 { - monitoring = AddAllGPUs(sysInfo) - } else { - for _, gpuID := range sysInfo.gOpt.MajorRange { - // We've already verified that everything in the options list exists - monitoring = append(monitoring, *GetMonitoringInfoForGPU(sysInfo, gpuID)) - } - } - - if len(sysInfo.gOpt.MinorRange) > 0 && sysInfo.gOpt.MinorRange[0] == -1 { - monitoring = AddAllGPUInstances(sysInfo, false) - } else { - for _, gpuInstanceID := range sysInfo.gOpt.MinorRange { - // We've already verified that everything in the options list exists - monitoring = append(monitoring, *GetMonitoringInfoForGPUInstance(sysInfo, gpuInstanceID)) - } - } - } - - return monitoring -} - -func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceID uint) string { - for i := uint(0); i < sysInfo.GPUCount; i++ { - if sysInfo.GPUs[i].DeviceInfo.UUID == gpuuuid { - identifier := fmt.Sprintf("%d-%d", sysInfo.GPUs[i].DeviceInfo.GPU, gpuInstanceID) - return identifier - } - } - - return "" -} diff --git a/pkg/dcgmexporter/system_info_test.go b/pkg/dcgmexporter/system_info_test.go deleted file mode 100644 index 6f2e45cc..00000000 --- a/pkg/dcgmexporter/system_info_test.go +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "fmt" - "testing" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var fakeProfileName = "2fake.4gb" - -func SpoofSwitchSystemInfo() SystemInfo { - var sysInfo SystemInfo - sysInfo.InfoType = dcgm.FE_SWITCH - sw1 := SwitchInfo{ - EntityId: 0, - } - sw2 := SwitchInfo{ - EntityId: 1, - } - - l1 := dcgm.NvLinkStatus{ - ParentId: 0, - ParentType: dcgm.FE_SWITCH, - State: 2, - Index: 0, - } - - l2 := dcgm.NvLinkStatus{ - ParentId: 0, - ParentType: dcgm.FE_SWITCH, - State: 3, - Index: 1, - } - - l3 := dcgm.NvLinkStatus{ - ParentId: 1, - ParentType: dcgm.FE_SWITCH, - State: 2, - Index: 0, - } - - l4 := dcgm.NvLinkStatus{ - ParentId: 1, - ParentType: dcgm.FE_SWITCH, - State: 3, - Index: 1, - } - - sw1.NvLinks = append(sw1.NvLinks, l1) - sw1.NvLinks = append(sw1.NvLinks, l2) - sw2.NvLinks = append(sw2.NvLinks, l3) - sw2.NvLinks = append(sw2.NvLinks, l4) - - sysInfo.Switches = append(sysInfo.Switches, sw1) - sysInfo.Switches = append(sysInfo.Switches, sw2) - - sysInfo.sOpt.MajorRange = []int{-1} - sysInfo.sOpt.MinorRange = []int{-1} - - return sysInfo -} - -func SpoofSystemInfo() SystemInfo { - var sysInfo SystemInfo - sysInfo.GPUCount = 2 - sysInfo.GPUs[0].DeviceInfo.GPU = 0 - gi := GPUInstanceInfo{ - Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlProfileSlices: 3}, - ProfileName: fakeProfileName, - 
EntityId: 0, - } - sysInfo.GPUs[0].GPUInstances = append(sysInfo.GPUs[0].GPUInstances, gi) - gi2 := GPUInstanceInfo{ - Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlInstanceId: 1, NvmlProfileSlices: 3}, - ProfileName: fakeProfileName, - EntityId: 14, - } - sysInfo.GPUs[1].GPUInstances = append(sysInfo.GPUs[1].GPUInstances, gi2) - sysInfo.GPUs[1].DeviceInfo.GPU = 1 - - return sysInfo -} - -func TestMonitoredEntities(t *testing.T) { - sysInfo := SpoofSystemInfo() - sysInfo.gOpt.Flex = true - - monitoring := GetMonitoredEntities(sysInfo) - require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring))) - instanceCount := 0 - gpuCount := 0 - for _, mi := range monitoring { - if mi.Entity.EntityGroupId == dcgm.FE_GPU_I { - instanceCount = instanceCount + 1 - require.NotEqual(t, mi.InstanceInfo, nil, "Expected InstanceInfo to be populated but it wasn't") - require.Equal(t, mi.InstanceInfo.ProfileName, fakeProfileName, "Expected profile named '%s' but found '%s'", - fakeProfileName, mi.InstanceInfo.ProfileName) - if mi.Entity.EntityId != uint(0) { - // One of these should be 0, the other should be 14 - require.Equal(t, mi.Entity.EntityId, uint(14), "Expected 14 as EntityId but found %s", - monitoring[1].Entity.EntityId) - } - } else { - gpuCount = gpuCount + 1 - require.Equal(t, mi.InstanceInfo, (*GPUInstanceInfo)(nil), "Expected InstanceInfo to be nil but it wasn't") - } - } - require.Equal(t, instanceCount, 2, "Expected 2 GPU instances but found %d", instanceCount) - require.Equal(t, gpuCount, 0, "Expected 0 GPUs but found %d", gpuCount) - - sysInfo.GPUs[0].GPUInstances = sysInfo.GPUs[0].GPUInstances[:0] - sysInfo.GPUs[1].GPUInstances = sysInfo.GPUs[1].GPUInstances[:0] - monitoring = GetMonitoredEntities(sysInfo) - require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring))) - for i, mi := range monitoring { - require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_GPU, 
"Expected FE_GPU but found %d", mi.Entity.EntityGroupId) - require.Equal(t, uint(i), mi.DeviceInfo.GPU, "Expected GPU %d but found %d", i, mi.DeviceInfo.GPU) - require.Equal(t, (*GPUInstanceInfo)(nil), mi.InstanceInfo, - "Expected InstanceInfo not to be populated but it was") - } -} - -func TestVerifyDevicePresence(t *testing.T) { - sysInfo := SpoofSystemInfo() - var dOpt DeviceOptions - dOpt.Flex = true - err := VerifyDevicePresence(&sysInfo, dOpt) - require.Equal(t, err, nil, "Expected to have no error, but found %s", err) - - dOpt.Flex = false - dOpt.MajorRange = append(dOpt.MajorRange, -1) - dOpt.MinorRange = append(dOpt.MinorRange, -1) - err = VerifyDevicePresence(&sysInfo, dOpt) - require.Equal(t, err, nil, "Expected to have no error, but found %s", err) - - dOpt.MinorRange[0] = 10 // this GPU instance doesn't exist - err = VerifyDevicePresence(&sysInfo, dOpt) - require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU instance, but none found") - - dOpt.MajorRange[0] = 10 // this GPU doesn't exist - dOpt.MinorRange[0] = -1 - err = VerifyDevicePresence(&sysInfo, dOpt) - require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU, but none found") - - // Add GPUs and instances that exist - dOpt.MajorRange[0] = 0 - dOpt.MajorRange = append(dOpt.MajorRange, 1) - dOpt.MinorRange[0] = 0 - dOpt.MinorRange = append(dOpt.MinorRange, 14) - err = VerifyDevicePresence(&sysInfo, dOpt) - require.Equal(t, err, nil, "Expected to have no error, but found %s", err) -} - -func TestMonitoredSwitches(t *testing.T) { - sysInfo := SpoofSwitchSystemInfo() - - /* test that only switches are returned */ - monitoring := GetMonitoredEntities(sysInfo) - require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored switches but found %d", len(monitoring))) - for _, mi := range monitoring { - require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_SWITCH, - fmt.Sprintf("Should have only returned switches but returned %d", 
mi.Entity.EntityGroupId)) - } - - /* test that only "up" links are monitored and 1 from each switch */ - sysInfo.InfoType = dcgm.FE_LINK - monitoring = GetMonitoredEntities(sysInfo) - require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored links but found %d", len(monitoring))) - for i, mi := range monitoring { - require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_LINK, - fmt.Sprintf("Should have only returned links but returned %d", mi.Entity.EntityGroupId)) - require.Equal(t, mi.ParentId, uint(i), "Link should reference switch parent") - } -} - -func TestIsSwitchWatched(t *testing.T) { - tests := []struct { - name string - switchID uint - sysInfo SystemInfo - want bool - }{ - { - name: "Monitor all devices", - switchID: 1, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - Flex: true, - }, - }, - want: true, - }, - { - name: "MajorRange empty", - switchID: 2, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{}, - }, - }, - want: false, - }, - { - name: "MajorRange contains -1 to watch all devices", - switchID: 3, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{-1}, - }, - }, - want: true, - }, - { - name: "SwitchID in MajorRange", - switchID: 4, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{3, 4, 5}, - }, - }, - want: true, - }, - { - name: "SwitchID not in MajorRange", - switchID: 5, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{3, 4, 6}, - }, - }, - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := IsSwitchWatched(tt.switchID, tt.sysInfo) - assert.Equal(t, tt.want, got) - }) - } -} - -func TestIsLinkWatched(t *testing.T) { - tests := []struct { - name string - linkIndex uint - switchID uint - sysInfo SystemInfo - want bool - }{ - { - name: "Monitor all devices", - linkIndex: 1, - sysInfo: SystemInfo{sOpt: DeviceOptions{Flex: true}}, - want: true, - }, - { - name: "No watched devices", - linkIndex: 1, - sysInfo: SystemInfo{}, - 
want: false, - }, - { - name: "Watched link with empty MinorRange", - linkIndex: 2, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{-1}, - }, - Switches: []SwitchInfo{ - { - EntityId: 1, - NvLinks: []dcgm.NvLinkStatus{ - {Index: 2}, - }, - }, - }, - }, - want: false, - }, - { - name: "MinorRange contains -1 to watch all links", - switchID: 1, - linkIndex: 3, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - Switches: []SwitchInfo{ - { - EntityId: 1, - NvLinks: []dcgm.NvLinkStatus{ - {Index: 3}, - }, - }, - }, - }, - want: true, - }, - { - name: "The link not in the watched switch", - switchID: 1, - linkIndex: 4, - sysInfo: SystemInfo{ - sOpt: DeviceOptions{ - MajorRange: []int{-1}, - MinorRange: []int{1, 2, 3}, - }, - Switches: []SwitchInfo{ - { - EntityId: 1, - NvLinks: []dcgm.NvLinkStatus{ - {Index: 4}, - }, - }, - }, - }, - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := IsLinkWatched(tt.linkIndex, tt.switchID, tt.sysInfo) - assert.Equal(t, tt.want, got) - }) - } -} - -func TestIsCPUWatched(t *testing.T) { - tests := []struct { - name string - cpuID uint - sysInfo SystemInfo - want bool - }{ - { - name: "Monitor all devices", - cpuID: 1, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{Flex: true}, - CPUs: []CPUInfo{ - { - EntityId: 1, - }, - }, - }, - want: true, - }, - { - name: "MajorRange Contains -1", - cpuID: 2, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{MajorRange: []int{-1}}, - CPUs: []CPUInfo{ - { - EntityId: 2, - }, - }, - }, - want: true, - }, - { - name: "CPU ID in MajorRange", - cpuID: 3, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{MajorRange: []int{1, 2, 3}}, - CPUs: []CPUInfo{ - { - EntityId: 3, - }, - }, - }, - want: true, - }, - { - name: "CPU ID Not in MajorRange", - cpuID: 4, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{MajorRange: []int{1, 2, 3}}, - CPUs: []CPUInfo{ - { - EntityId: 4, - }, - }, - }, - want: false, - }, - { 
- name: "MajorRange Empty", - cpuID: 5, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{MajorRange: []int{}}, - CPUs: []CPUInfo{ - { - EntityId: 5, - }, - }, - }, - want: false, - }, - { - name: "CPU not found", - cpuID: 6, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{MajorRange: []int{}}, - CPUs: []CPUInfo{ - { - EntityId: 5, - }, - }, - }, - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.want, IsCPUWatched(tt.cpuID, tt.sysInfo)) - }) - } -} - -func TestIsCoreWatched(t *testing.T) { - tests := []struct { - name string - coreID uint - cpuID uint - sysInfo SystemInfo - want bool - }{ - { - name: "Monitor all devices", - coreID: 1, - cpuID: 1, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{Flex: true}, - }, - want: true, - }, - { - name: "Core in MinorRange", - coreID: 2, - cpuID: 1, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{ - MinorRange: []int{1, 2, 3}, - MajorRange: []int{-1}, - }, - CPUs: []CPUInfo{{EntityId: 1}}, - }, - want: true, - }, - { - name: "Core Not in MinorRange", - coreID: 4, - cpuID: 1, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{ - MinorRange: []int{1, 2, 3}, - MajorRange: []int{-1}, - }, - CPUs: []CPUInfo{{EntityId: 1}}, - }, - want: false, - }, - { - name: "MinorRange Contains -1", - coreID: 5, - cpuID: 1, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{ - MinorRange: []int{-1}, - MajorRange: []int{-1}, - }, - CPUs: []CPUInfo{{EntityId: 1}}, - }, - want: true, - }, - { - name: "CPU Not Found", - coreID: 1, - cpuID: 2, - sysInfo: SystemInfo{ - cOpt: DeviceOptions{ - MinorRange: []int{1, 2, 3}, - MajorRange: []int{-1}, - }, - CPUs: []CPUInfo{{EntityId: 1}}, - }, - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.want, IsCoreWatched(tt.coreID, tt.cpuID, tt.sysInfo)) - }) - } -} - -func TestSetMigProfileNames(t *testing.T) { - tests := []struct { - name string - sysInfo SystemInfo - values []dcgm.FieldValue_v2 - valid bool - }{ - 
{ - name: "MIG profile found", - sysInfo: SystemInfo{ - GPUCount: 1, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 1}, - }, - }, - }, - }, - values: []dcgm.FieldValue_v2{ - { - EntityId: 1, - FieldType: dcgm.DCGM_FT_STRING, - StringValue: &fakeProfileName, - }, - }, - valid: true, - }, - { - name: "Multiple MIG GPUs", - sysInfo: SystemInfo{ - GPUCount: 3, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 1}, - }, - }, - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 2}, - }, - }, - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 3}, - }, - }, - }, - }, - values: []dcgm.FieldValue_v2{ - { - EntityId: 2, - FieldType: dcgm.DCGM_FT_STRING, - StringValue: &fakeProfileName, - }, - }, - valid: true, - }, - { - name: "Multiple MIG GPUs and Values", - sysInfo: SystemInfo{ - GPUCount: 3, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 1}, - }, - }, - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 2}, - }, - }, - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 3}, - }, - }, - }, - }, - values: []dcgm.FieldValue_v2{ - { - EntityId: 2, - FieldType: dcgm.DCGM_FT_STRING, - StringValue: &fakeProfileName, - }, - { - EntityId: 3, - FieldType: dcgm.DCGM_FT_STRING, - StringValue: &fakeProfileName, - }, - }, - valid: true, - }, - { - name: "MIG profile not found", - sysInfo: SystemInfo{ - GPUCount: 1, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 1}, - }, - }, - }, - }, - values: []dcgm.FieldValue_v2{ - { - EntityId: 2, - FieldType: dcgm.DCGM_FT_STRING, - StringValue: &fakeProfileName, - }, - }, - valid: false, - }, - { - name: "MIG profile not string type", - sysInfo: SystemInfo{ - GPUCount: 1, - GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ - { - GPUInstances: []GPUInstanceInfo{ - {EntityId: 1}, - }, - }, - }, - }, - values: []dcgm.FieldValue_v2{ - { - EntityId: 1, - FieldType: dcgm.DCGM_FT_BINARY, - 
StringValue: &fakeProfileName, - Value: [4096]byte{'1', '2', '3'}, - }, - }, - valid: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if tt.valid { - assert.NoError(t, SetMigProfileNames(&tt.sysInfo, tt.values), "Expected no error.") - } else { - assert.Error(t, SetMigProfileNames(&tt.sysInfo, tt.values), "Expected an error.") - } - }) - } -} diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go deleted file mode 100644 index 246afe02..00000000 --- a/pkg/dcgmexporter/types.go +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "fmt" - "net/http" - "sync" - "text/template" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/prometheus/exporter-toolkit/web" -) - -var ( - SkipDCGMValue = "SKIPPING DCGM VALUE" - FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" - - nvidiaResourceName = "nvidia.com/gpu" - nvidiaMigResourcePrefix = "nvidia.com/mig-" - MIG_UUID_PREFIX = "MIG-" - - // Note standard resource attributes - podAttribute = "pod" - namespaceAttribute = "namespace" - containerAttribute = "container" - - hpcJobAttribute = "hpc_job" - - oldPodAttribute = "pod_name" - oldNamespaceAttribute = "pod_namespace" - oldContainerAttribute = "container_name" - - undefinedConfigMapData = "none" -) - -type Transform interface { - Process(metrics MetricsByCounter, sysInfo SystemInfo) error - Name() string -} - -type MetricsPipeline struct { - config *Config - - transformations []Transform - migMetricsFormat *template.Template - switchMetricsFormat *template.Template - linkMetricsFormat *template.Template - cpuMetricsFormat *template.Template - cpuCoreMetricsFormat *template.Template - - counters []Counter - gpuCollector *DCGMCollector - switchCollector *DCGMCollector - linkCollector *DCGMCollector - cpuCollector *DCGMCollector - coreCollector *DCGMCollector -} - -type DCGMCollector struct { - Counters []Counter - DeviceFields []dcgm.Short - Cleanups []func() - UseOldNamespace bool - SysInfo SystemInfo - Hostname string - ReplaceBlanksInModelName bool -} - -type Counter struct { - FieldID dcgm.Short - FieldName string - PromType string - Help string -} - -type Metric struct { - Counter Counter - Value string - - GPU string - GPUUUID string - GPUDevice string - GPUModelName string - GPUPCIBusID string - - UUID string - - MigProfile string - GPUInstanceID string - Hostname string - - Labels map[string]string - Attributes map[string]string -} - -func (m Metric) getIDOfType(idType KubernetesGPUIDType) (string, error) { - // For MIG devices, return the 
MIG profile instead of - if m.MigProfile != "" { - return fmt.Sprintf("%s-%s", m.GPU, m.GPUInstanceID), nil - } - switch idType { - case GPUUID: - return m.GPUUUID, nil - case DeviceName: - return m.GPUDevice, nil - } - return "", fmt.Errorf("unsupported KubernetesGPUIDType for MetricID '%s'", idType) -} - -var promMetricType = map[string]bool{ - "gauge": true, - "counter": true, - "histogram": true, - "summary": true, - "label": true, -} - -type MetricsServer struct { - sync.Mutex - - server *http.Server - webConfig *web.FlagConfig - metrics string - metricsChan chan string - registry *Registry -} - -type PodMapper struct { - Config *Config -} - -type PodInfo struct { - Name string - Namespace string - Container string -} - -// MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects -type MetricsByCounter map[Counter][]Metric - -// CounterSet return -type CounterSet struct { - DCGMCounters []Counter - ExporterCounters []Counter -} diff --git a/pkg/dcgmexporter/xid_collector.go b/pkg/dcgmexporter/xid_collector.go deleted file mode 100644 index 88ca020c..00000000 --- a/pkg/dcgmexporter/xid_collector.go +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package dcgmexporter - -import ( - "fmt" - "slices" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" -) - -type xidCollector struct { - expCollector -} - -func (c *xidCollector) GetMetrics() (MetricsByCounter, error) { - return c.expCollector.getMetrics() -} - -func NewXIDCollector(counters []Counter, - hostname string, - config *Config, - fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) { - if !IsDCGMExpXIDErrorsCountEnabled(counters) { - logrus.Error(dcgmExpXIDErrorsCount + " collector is disabled") - return nil, fmt.Errorf(dcgmExpXIDErrorsCount + " collector is disabled") - } - - collector := xidCollector{} - collector.expCollector = newExpCollector(counters, - hostname, - []dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}, - config, - fieldEntityGroupTypeSystemInfo) - - collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool { - return c.FieldName == dcgmExpXIDErrorsCount - })] - - collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) { - metricValueLabels["xid"] = fmt.Sprint(entityValue) - } - - collector.windowSize = config.XIDCountWindowSize - - return &collector, nil -} - -func IsDCGMExpXIDErrorsCountEnabled(counters []Counter) bool { - return slices.ContainsFunc(counters, func(c Counter) bool { - return c.FieldName == dcgmExpXIDErrorsCount - }) -} diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go deleted file mode 100644 index ceaf02d1..00000000 --- a/pkg/dcgmexporter/xid_collector_test.go +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dcgmexporter - -import ( - "bytes" - "fmt" - "reflect" - "slices" - "testing" - "time" - - "github.com/NVIDIA/go-dcgm/pkg/dcgm" - io_prometheus_client "github.com/prometheus/client_model/go" - "github.com/prometheus/common/expfmt" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "k8s.io/utils/ptr" -) - -func TestXIDCollector_Gather_Encode(t *testing.T) { - teardownTest := setupTest(t) - defer teardownTest(t) - runOnlyWithLiveGPUs(t) - - hostname := "local-test" - config := &Config{ - GPUDevices: DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - XIDCountWindowSize: int(time.Duration(5) * time.Minute), - } - - records := [][]string{ - {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - } - - cc, err := extractCounters(records, config) - require.NoError(t, err) - require.Len(t, cc.ExporterCounters, 1) - require.Len(t, cc.DCGMCounters, 1) - - for i := range cc.DCGMCounters { - if cc.DCGMCounters[i].PromType == "label" { - cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) - } - } - - // Get a number of hardware GPUs - hardwareGPUs, err := dcgm.GetAllDeviceCount() - require.NoError(t, err) - - if hardwareGPUs+1 > dcgm.MAX_NUM_DEVICES { - t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) - } - - entityList := []dcgm.MigHierarchyInfo{ - {Entity: dcgm.GroupEntityPair{EntityGroupId: 
dcgm.FE_GPU}}, - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, - } - - // Create fake GPU - fakeGPUIDs, err := dcgm.CreateFakeEntities(entityList) - require.NoError(t, err) - require.NotEmpty(t, fakeGPUIDs) - - for i, gpuID := range fakeGPUIDs { - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_XID_ERRORS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), - int64(42), - ) - require.NoError(t, err) - - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_XID_ERRORS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), - int64(42), - ) - require.NoError(t, err) - - err = dcgm.InjectFieldValue(gpuID, - dcgm.DCGM_FI_DEV_XID_ERRORS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), - int64(46), - ) - require.NoError(t, err) - - } - - allCounters := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - } - - fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) - err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - require.True(t, exists) - - xidCollector, err := NewXIDCollector(cc.ExporterCounters, hostname, config, item) - require.NoError(t, err) - - defer func() { - xidCollector.Cleanup() - }() - - metrics, err := xidCollector.GetMetrics() - require.NoError(t, err) - require.NotEmpty(t, metrics) - // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT - require.Len(t, metrics, 1) - // We get metric value with 0 index - metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - - fakeGPUIDMap := map[string]struct{}{} - for _, fakeGPUID := range fakeGPUIDs { - fakeGPUIDMap[fmt.Sprint(fakeGPUID)] = struct{}{} - } - - conditionFakeGPUOnly := func(m Metric) bool { - _, exists := fakeGPUIDMap[m.GPU] - return exists - 
} - - // We want to filter out physical GPU and keep fake only - metricValues = filterMetrics(metricValues, conditionFakeGPUOnly) - - require.Len(t, metricValues, len(fakeGPUIDs)*2) - for _, val := range metricValues { - require.Contains(t, val.Labels, "window_size_in_ms") - require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) - } - - // We inject new error - err = dcgm.InjectFieldValue(fakeGPUIDs[0], - dcgm.DCGM_FI_DEV_XID_ERRORS, - dcgm.DCGM_FT_INT64, - 0, - time.Now().UnixMicro(), - int64(19), - ) - require.NoError(t, err) - - // Wait for 1 second - time.Sleep(1 * time.Second) - - metrics, err = xidCollector.GetMetrics() - require.NoError(t, err) - require.NotEmpty(t, metrics) - - // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT - require.Len(t, metrics, 1) - - // We get metric value with the last index - metricValues = metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - // We want to filter out physical GPU and keep fake only - metricValues = filterMetrics(metricValues, conditionFakeGPUOnly) - // We update metrics with slice, that doesn't contain physical GPU - metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] = metricValues - - // We have 3 fake GPU and each GPU experienced 3 XID errors: 42, 46, 19 to GPU0 - require.Len(t, metricValues, 1+(len(fakeGPUIDs)*2)) - for _, val := range metricValues { - require.Contains(t, val.Labels, "window_size_in_ms") - require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) - } - - // Now we check the metric rendering - var b bytes.Buffer - err = encodeExpMetrics(&b, metrics) - require.NoError(t, err) - require.NotEmpty(t, b) - - var parser expfmt.TextParser - mf, err := parser.TextToMetricFamilies(&b) - require.NoError(t, err) - require.NotEmpty(t, mf) - require.Len(t, mf, 1) - metricFamily := mf[reflect.ValueOf(mf).MapKeys()[0].Interface().(string)] - require.NotNil(t, metricFamily.Name) - assert.Equal(t, 
"DCGM_EXP_XID_ERRORS_COUNT", *metricFamily.Name) - assert.Equal(t, "Count of XID Errors within user-specified time window (see xid-count-window-size param).", *metricFamily.Help) - assert.Equal(t, io_prometheus_client.MetricType_GAUGE, *metricFamily.Type) - // We have 3 fake GPU and each GPU, except the one experienced XID errors: 42, 46, 19 - require.Len(t, metricFamily.Metric, 1+(len(fakeGPUIDs)*2)) - for _, mv := range metricFamily.Metric { - require.NotNil(t, mv.Gauge.Value) - if *(mv.Gauge.Value) == 0 { - // We don't inject XID errors into the hardware GPU, so we do not expect XID label - assert.Len(t, mv.Label, 7) - assert.False(t, slices.ContainsFunc(mv.Label, func(lp *io_prometheus_client.LabelPair) bool { - return ptr.Deref(lp.Name, "") == "xid" - })) - continue - } - assert.Len(t, mv.Label, 9) - assert.Equal(t, "gpu", *mv.Label[0].Name) - assert.Equal(t, "UUID", *mv.Label[1].Name) - assert.Equal(t, "pci_bus_id", *mv.Label[2].Name) - assert.NotEmpty(t, *mv.Label[2].Value) - assert.Equal(t, "device", *mv.Label[3].Name) - assert.Equal(t, "modelName", *mv.Label[4].Name) - assert.Equal(t, "Hostname", *mv.Label[5].Name) - assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[6].Name) - assert.Equal(t, "window_size_in_ms", *mv.Label[7].Name) - assert.Equal(t, "xid", *mv.Label[8].Name) - assert.NotEmpty(t, *mv.Label[8].Value) - } -} - -func filterMetrics(metricValues []Metric, condition func(Metric) bool) []Metric { - var result []Metric - for _, metricValue := range metricValues { - if condition(metricValue) { - result = append(result, metricValue) - } - } - return result -} - -func TestXIDCollector_NewXIDCollector(t *testing.T) { - config := &Config{ - GPUDevices: DeviceOptions{ - Flex: true, - MajorRange: []int{-1}, - MinorRange: []int{-1}, - }, - } - - teardownTest := setupTest(t) - defer teardownTest(t) - - allCounters := []Counter{ - { - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - } - - fieldEntityGroupTypeSystemInfo := 
NewEntityGroupTypeSystemInfo(allCounters, config) - err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) - require.NoError(t, err) - - item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - - t.Run("Should Return Error When DCGM_EXP_XID_ERRORS_COUNT is not present", func(t *testing.T) { - records := [][]string{ - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - } - cc, err := extractCounters(records, config) - require.NoError(t, err) - require.Len(t, cc.ExporterCounters, 0) - require.Len(t, cc.DCGMCounters, 1) - - xidCollector, err := NewXIDCollector(cc.DCGMCounters, "", config, item) - require.Error(t, err) - require.Nil(t, xidCollector) - }) - - t.Run("Should Return Error When Counters Param Is Empty", func(t *testing.T) { - counters := make([]Counter, 0) - xidCollector, err := NewXIDCollector(counters, "", config, item) - require.Error(t, err) - require.Nil(t, xidCollector) - }) - - t.Run("Should Not Return Error When DCGM_EXP_XID_ERRORS_COUNT Present More Than Once", func(t *testing.T) { - records := [][]string{ - {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, - {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, - {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, - } - cc, err := extractCounters(records, config) - require.NoError(t, err) - for i := range cc.DCGMCounters { - if cc.DCGMCounters[i].PromType == "label" { - cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) - } - } - xidCollector, err := NewXIDCollector(cc.ExporterCounters, "", config, item) - require.NoError(t, err) - require.NotNil(t, xidCollector) - }) -} diff --git a/scripts/test_coverage.sh b/scripts/test_coverage.sh index db49bd43..e7859be5 100644 --- 
a/scripts/test_coverage.sh +++ b/scripts/test_coverage.sh @@ -24,11 +24,30 @@ go test $(go list ./... | grep -v "/tests/e2e/") \ -coverprofile=unit_coverage.out \ -json > test_results.json +if [ $? -ne 0 ]; then + echo "Unit tests failed." + exit 1 +fi + +echo "Running integration tests..." +go test ./internal/pkg/integration_test/... \ + -count=1 \ + -timeout 5m \ + -covermode=count \ + -coverpkg=./internal/pkg/... \ + -coverprofile=integration_coverage.out \ + -json >> test_results.json + +if [ $? -ne 0 ]; then + echo "Integration tests failed." + exit 1 +fi + echo "Merging coverage profiles..." -gocovmerge unit_coverage.out > combined_coverage.out.tmp +gocovmerge unit_coverage.out integration_coverage.out > combined_coverage.out.tmp # Remove mocks from coverage cat combined_coverage.out.tmp | grep -v "mock_" > tests.cov # Cleanup -rm combined_coverage.out.tmp unit_coverage.out +rm combined_coverage.out.tmp integration_coverage.out unit_coverage.out \ No newline at end of file diff --git a/service-monitor.yaml b/service-monitor.yaml index 57aaa800..e70dc90a 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" + app.kubernetes.io/version: "4.0.0" endpoints: - port: "metrics" path: "/metrics" diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index ae9a6411..6607eb2a 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -16,8 +16,10 @@ GO_CMD ?= go NAMESPACE ?= "dcgm-exporter" CHART ?= "./../../deployment/" IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter" -IMAGE_TAG ?= "3.3.9-3.6.1-ubuntu22.04" +IMAGE_TAG ?= "4.0.0-4.0.0-ubuntu22.04" KUBECONFIG ?= "~/.kube/config" +RUNTIME_CLASS ?= "" +NO_CLEANUP ?= "false" define TEST_CMD @if [ -z ${KUBECONFIG} ]; then \ @@ -32,7 
+34,9 @@ define TEST_CMD -chart="$(CHART)" \ -namespace=$(NAMESPACE) \ -image-repository=$(IMAGE_REPOSITORY) \ - -image-tag=$(IMAGE_TAG) + -image-tag=$(IMAGE_TAG) \ + -runtime-class=$(RUNTIME_CLASS) \ + -no-cleanup=$(NO_CLEANUP) endef .PHONY: e2e-test @@ -45,5 +49,23 @@ e2e-test-no-profiling: @$(TEST_CMD) \ -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" +.PHONY: e2e-basic-auth +e2e-basic-auth: + @$(TEST_CMD) \ + -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \ + --ginkgo.label-filter=basicAuth + +.PHONY: e2e-tls +e2e-tls: + @$(TEST_CMD) \ + -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \ + --ginkgo.label-filter=tls + +.PHONY: e2e-default +e2e-default: + @$(TEST_CMD) \ + -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \ + --ginkgo.label-filter=default + binary: - go test -c --tags="e2e" . \ No newline at end of file + go test -c --tags="e2e" . diff --git a/tests/e2e/e2e_actions_test.go b/tests/e2e/e2e_actions_test.go index 40e6b15c..2c879fcb 100644 --- a/tests/e2e/e2e_actions_test.go +++ b/tests/e2e/e2e_actions_test.go @@ -21,13 +21,16 @@ import ( "context" "fmt" "os" + "time" - "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" "k8s.io/client-go/rest" restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + + "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" ) func shouldCreateK8SConfig() *restclient.Config { @@ -45,10 +48,10 @@ func shouldResolvePath() { } func shouldCreateNamespace(ctx context.Context, kubeClient *framework.KubeClient, labels map[string]string) { - _, _ = fmt.Fprintf(GinkgoWriter, "Creating namespace: %q started.\n", testContext.namespace) + By(fmt.Sprintf("Creating namespace: %q started.", testContext.namespace)) _, err := kubeClient.CreateNamespace(ctx, testContext.namespace, labels) Expect(err).ShouldNot(HaveOccurred(), "Creating namespace: failed") - _, _ = fmt.Fprintf(GinkgoWriter, "Creating namespace: %q completed\n", testContext.namespace) + By(fmt.Sprintf("Creating namespace: %q completed\n", testContext.namespace)) } func shouldCreateKubeClient(config *rest.Config) *framework.KubeClient { @@ -77,17 +80,17 @@ func shouldCreateHelmClient(config *rest.Config) *framework.HelmClient { func shouldUninstallHelmChart(helmClient *framework.HelmClient, helmReleaseName string) { if helmClient != nil && helmReleaseName != "" { - _, _ = fmt.Fprintf(GinkgoWriter, "Helm chart uninstall: release %q of the helm chart: %q started.\n", + By(fmt.Sprintf("Helm chart uninstall: release %q of the helm chart: %q started.", helmReleaseName, - testContext.chart) + testContext.chart)) err := helmClient.Uninstall(helmReleaseName) if err != nil { Fail(fmt.Sprintf("Helm chart uninstall: release: %s uninstall failed with error: %v", helmReleaseName, err)) } else { - _, _ = fmt.Fprintf(GinkgoWriter, "Helm chart uninstall: release %q of the helm chart: %q completed.\n", + By(fmt.Sprintf("Helm chart uninstall: release %q of the helm chart: %q completed.", helmReleaseName, - testContext.chart) + testContext.chart)) } } } @@ -102,13 +105,98 @@ func shouldCleanupHelmClient(helmClient *framework.HelmClient) { } func 
shouldDeleteNamespace(ctx context.Context, kubeClient *framework.KubeClient) { - _, _ = fmt.Fprintf(GinkgoWriter, "Namespace deletion: %q namespace started.\n", testContext.namespace) + By(fmt.Sprintf("Namespace deletion: %q namespace started.", testContext.namespace)) if kubeClient != nil { err := kubeClient.DeleteNamespace(ctx, testContext.namespace) if err != nil { - Fail(fmt.Sprintf("Namespace deletion: Failed to delete namespace %q with error: %v", testContext.namespace, err)) + Fail(fmt.Sprintf("Namespace deletion: Failed to delete namespace %q with error: %v", testContext.namespace, + err)) } else { - _, _ = fmt.Fprintf(GinkgoWriter, "Namespace deletion: %q namespace completed.\n", testContext.namespace) + By(fmt.Sprintf("Namespace deletion: %q namespace completed.\n", testContext.namespace)) } } } + +func shouldCheckIfPodCreated( + ctx context.Context, kubeClient *framework.KubeClient, labels map[string]string, +) *corev1.Pod { + By("Pod creation verification: started") + + var dcgmExpPod *corev1.Pod + + Eventually(func(ctx context.Context) bool { + pods, err := kubeClient.GetPodsByLabel(ctx, testContext.namespace, labels) + if err != nil { + Fail(fmt.Sprintf("Pod creation: Failed with error: %v", err)) + return false + } + + if len(pods) == 1 { + dcgmExpPod = &pods[0] + return true + } + + return false + }).WithPolling(time.Second).Within(15 * time.Minute).WithContext(ctx).Should(BeTrue()) + + By("Pod creation verification: completed") + + return dcgmExpPod +} + +func getDefaultHelmValues() []string { + values := []string{ + fmt.Sprintf("serviceMonitor.enabled=%v", false), + } + + if testContext.arguments != "" { + values = append(values, fmt.Sprintf("arguments=%s", testContext.arguments)) + } + + if testContext.imageRepository != "" { + values = append(values, fmt.Sprintf("image.repository=%s", testContext.imageRepository)) + } + + if testContext.imageTag != "" { + values = append(values, fmt.Sprintf("image.tag=%s", testContext.imageTag)) + } + + if 
testContext.runtimeClass != "" { + values = append(values, fmt.Sprintf("runtimeClassName=%s", testContext.runtimeClass)) + } + + return values +} + +func shouldCheckIfPodIsReady(ctx context.Context, kubeClient *framework.KubeClient, namespace, podName string) { + By("Checking pod status: started") + Eventually(func(ctx context.Context) bool { + isReady, err := kubeClient.CheckPodStatus(ctx, + namespace, + podName, + func(namespace, podName string, status corev1.PodStatus) (bool, error) { + for _, c := range status.Conditions { + if c.Type != corev1.PodReady { + continue + } + if c.Status == corev1.ConditionTrue { + return true, nil + } + } + + for _, c := range status.ContainerStatuses { + if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" { + return false, fmt.Errorf("pod %s in namespace %s is in CrashLoopBackOff", podName, namespace) + } + } + + return false, nil + }) + if err != nil { + Fail(fmt.Sprintf("Checking pod status: Failed with error: %v", err)) + } + + return isReady + }).WithPolling(time.Second).Within(15 * time.Minute).WithContext(ctx).Should(BeTrue()) + By("Checking pod status: completed") +} diff --git a/tests/e2e/e2e_suite_test.go b/tests/e2e/e2e_suite_test.go index 6e2c056c..a5831ed1 100644 --- a/tests/e2e/e2e_suite_test.go +++ b/tests/e2e/e2e_suite_test.go @@ -18,25 +18,19 @@ package e2e import ( - "bytes" "context" "fmt" - "slices" - "time" - corev1 "k8s.io/api/core/v1" - "k8s.io/utils/ptr" + . "github.com/onsi/ginkgo/v2" "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" - "github.com/prometheus/common/expfmt" ) const ( podLabel = "pod" namespaceLabel = "namespace" containerLabel = "container" + e2eRunIDLabel = "e2eRunID" dcgmExporterPort = 9400 @@ -48,7 +42,10 @@ const ( workloadImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04" ) -var expectedLabels = []string{podLabel, namespaceLabel, containerLabel} +var ( + expectedLabels = []string{podLabel, namespaceLabel, containerLabel} + dcgmExporterPodLabels = map[string]string{dcgmExporterPodNameLabel: dcgmExporterPodNameLabelValue} +) type testContextType struct { kubeconfig string @@ -57,240 +54,62 @@ type testContextType struct { imageTag string arguments string namespace string + runtimeClass string + noCleanup bool } var _ = Describe("dcgm-exporter-e2e-suite", func() { - When("DCGM exporter is deployed on kubernetes", Ordered, func() { + Context("DCGM exporter is deployed on kubernetes", Ordered, func() { // Init global suite vars var ( - kubeClient *framework.KubeClient - helmClient *framework.HelmClient - - labels = map[string]string{ - "e2eRunID": runID.String(), + kubeClient *framework.KubeClient + helmClient *framework.HelmClient + testRunLabels = map[string]string{ + e2eRunIDLabel: runID.String(), } - - helmReleaseName string - dcgmExpPod *corev1.Pod - workloadPod *corev1.Pod ) - BeforeAll(func(ctx context.Context) { - if testContext.kubeconfig == "" { - _, _ = fmt.Fprintln(GinkgoWriter, "kubeconfig parameter is empty. Defaulting to ~/.kube/config") - } - - if len(testContext.chart) == 0 { - Fail("chart parameter is empty") - } - - shouldResolvePath() - - kubeConfigShouldExists() + if testContext.kubeconfig == "" { + _, _ = fmt.Fprintln(GinkgoWriter, "kubeconfig parameter is empty. 
Defaulting to ~/.kube/config") + } - k8sConfig := shouldCreateK8SConfig() + if len(testContext.chart) == 0 { + Fail("chart parameter is empty") + } - kubeClient = shouldCreateKubeClient(k8sConfig) + shouldResolvePath() - helmClient = shouldCreateHelmClient(k8sConfig) - }) - - AfterAll(func(ctx context.Context) { - _, _ = fmt.Fprintln(GinkgoWriter, "Clean up: starting") + kubeConfigShouldExists() - shouldUninstallHelmChart(helmClient, helmReleaseName) - shouldCleanupHelmClient(helmClient) + k8sConfig := shouldCreateK8SConfig() - shouldDeleteNamespace(ctx, kubeClient) + kubeClient = shouldCreateKubeClient(k8sConfig) - _, _ = fmt.Fprintln(GinkgoWriter, "Clean up: completed") - }) + helmClient = shouldCreateHelmClient(k8sConfig) - It("should create namespace", func(ctx context.Context) { - shouldCreateNamespace(ctx, kubeClient, labels) + BeforeAll(func(ctx context.Context) { + shouldCreateNamespace(ctx, kubeClient, testRunLabels) }) - It("should install dcgm-exporter helm chart", func(ctx context.Context) { - _, _ = fmt.Fprintf(GinkgoWriter, "Helm chart installation: %q chart started.\n", - testContext.chart) - - values := []string{ - fmt.Sprintf("serviceMonitor.enabled=%v", false), - } - - if testContext.arguments != "" { - values = append(values, fmt.Sprintf("arguments=%s", testContext.arguments)) - } - - if testContext.imageRepository != "" { - values = append(values, fmt.Sprintf("image.repository=%s", testContext.imageRepository)) - } - if testContext.imageTag != "" { - values = append(values, fmt.Sprintf("image.tag=%s", testContext.imageTag)) + AfterAll(func(ctx context.Context) { + if testContext.noCleanup { + _, _ = fmt.Fprintln(GinkgoWriter, "Clean up: skipped") + Skip("Clean up skipped, by user request") } - var err error - - helmReleaseName, err = helmClient.Install(ctx, values, framework.HelmChartOptions{ - CleanupOnFail: true, - GenerateName: true, - Timeout: 5 * time.Minute, - Wait: true, - DryRun: false, - }) - Expect(err).ShouldNot(HaveOccurred(), "Helm 
chart installation: %q chart failed with error err: %v", testContext.chart, err) - - _, _ = fmt.Fprintf(GinkgoWriter, "Helm chart installation: %q completed.\n", - testContext.chart) - _, _ = fmt.Fprintf(GinkgoWriter, "Helm chart installation: new %q release name.\n", - helmReleaseName) - }) - - labelMap := map[string]string{dcgmExporterPodNameLabel: dcgmExporterPodNameLabelValue} - - It("should create dcgm-exporter pod", func(ctx context.Context) { - _, _ = fmt.Fprintln(GinkgoWriter, "Pod creation verification: started") - - Eventually(func(ctx context.Context) bool { - pods, err := kubeClient.GetPodsByLabel(ctx, testContext.namespace, labelMap) - if err != nil { - Fail(fmt.Sprintf("Pod creation: Failed with error: %v", err)) - return false - } - - if len(pods) == 1 { - dcgmExpPod = &pods[0] - return true - } - - return false - }).WithPolling(time.Second).Within(15 * time.Minute).WithContext(ctx).Should(BeTrue()) - - _, _ = fmt.Fprintln(GinkgoWriter, "Pod creation verification: completed") - }) - - It("should ensure that the dcgm-exporter pod is ready", func(ctx context.Context) { - _, _ = fmt.Fprintln(GinkgoWriter, "Checking pod status: started") - Eventually(func(ctx context.Context) bool { - isReady, err := kubeClient.CheckPodStatus(ctx, - testContext.namespace, - dcgmExpPod.Name, - func(namespace, podName string, status corev1.PodStatus) (bool, error) { - for _, c := range status.Conditions { - if c.Type != corev1.PodReady { - continue - } - if c.Status == corev1.ConditionTrue { - return true, nil - } - } - - for _, c := range status.ContainerStatuses { - if c.State.Waiting != nil && c.State.Waiting.Reason == "CrashLoopBackOff" { - return false, fmt.Errorf("pod %s in namespace %s is in CrashLoopBackOff", podName, namespace) - } - } - - return false, nil - }) - if err != nil { - Fail(fmt.Sprintf("Checking pod status: Failed with error: %v", err)) - } - - return isReady - }).WithPolling(time.Second).Within(15 * time.Minute).WithContext(ctx).Should(BeTrue()) - _, 
_ = fmt.Fprintln(GinkgoWriter, "Checking pod status: completed") - }) - - It("should create a workload pod", func(ctx context.Context) { - _, _ = fmt.Fprintln(GinkgoWriter, "Workload pod creation: started") - - var err error + By("Clean up: starting") - workloadPod, err = kubeClient.CreatePod(ctx, - testContext.namespace, - labels, - workloadPodName, - workloadContainerName, - workloadImage, - ) - - Expect(err).ShouldNot(HaveOccurred(), - "Workload pod creation: Failed create workload pod with err: %v", err) - Eventually(func(ctx context.Context) bool { - isReady, err := kubeClient.CheckPodStatus(ctx, - testContext.namespace, - workloadPod.Name, func(namespace, podName string, status corev1.PodStatus) (bool, error) { - return status.Phase == corev1.PodSucceeded, nil - }) - if err != nil { - Fail(fmt.Sprintf("Workload pod creation: Checking pod status: Failed with error: %v", err)) - } - - return isReady - }).WithPolling(time.Second).Within(15 * time.Minute).WithContext(ctx).Should(BeTrue()) - - _, _ = fmt.Fprintln(GinkgoWriter, "Workload pod creation: completed") - }) - - It("should wait for 30 seconds, to read metrics", func() { - time.Sleep(30 * time.Second) - }) - - var metricsResponse []byte - - It("should read metrics", func(ctx context.Context) { - _, _ = fmt.Fprintln(GinkgoWriter, "Read metrics: started") - - Eventually(func(ctx context.Context) bool { - var err error + shouldCleanupHelmClient(helmClient) - metricsResponse, err = kubeClient.DoHttpRequest(ctx, - testContext.namespace, - dcgmExpPod.Name, - dcgmExporterPort, - "metrics") - if err != nil { - Fail(fmt.Sprintf("Read metrics: Failed with error: %v", err)) - } + shouldDeleteNamespace(ctx, kubeClient) - return len(metricsResponse) > 0 - }).WithPolling(time.Second).Within(time.Minute).WithContext(ctx).Should(BeTrue()) - _, _ = fmt.Fprintln(GinkgoWriter, "Read metrics: completed") + By("Clean up: completed") }) - It("should verify metrics", func(ctx context.Context) { - 
Expect(metricsResponse).ShouldNot(BeEmpty()) + VerifyDefaultHelmConfiguration(kubeClient, helmClient, testRunLabels) - var parser expfmt.TextParser - metricFamilies, err := parser.TextToMetricFamilies(bytes.NewReader(metricsResponse)) - Expect(err).ShouldNot(HaveOccurred()) - Expect(len(metricFamilies)).Should(BeNumerically(">", 0)) + VerifyHelmConfigurationWhenTLSEnabled(kubeClient, helmClient, testRunLabels) - for _, metricFamily := range metricFamilies { - Expect(metricFamily).ShouldNot(BeNil()) - metrics := metricFamily.GetMetric() - Expect(metrics).ShouldNot(BeNil()) - - // Each metric must have namespace, pod and container labels - for _, metric := range metrics { - var actualLabels []string - for _, label := range metric.Label { - labelName := ptr.Deref(label.Name, "") - if slices.Contains(expectedLabels, labelName) { - actualLabels = append(actualLabels, labelName) - Expect(label.Value).ShouldNot(BeNil()) - Expect(ptr.Deref(label.Value, "")).ShouldNot(BeEmpty(), "The %s metric contains a label named %q label with empty value.", - ptr.Deref(metricFamily.Name, ""), - labelName, - ) - } - } - Expect(len(actualLabels)).Should(Equal(len(expectedLabels)), - "Metric %s doesn't contains expected labels: %v, actual labels: %v", - ptr.Deref(metricFamily.Name, ""), expectedLabels, metric.Label) - } - } - }) + VerifyHelmConfigurationWhenHttpBasicAuthEnabled(kubeClient, helmClient, testRunLabels) }) }) diff --git a/tests/e2e/e2e_verify_default_configuration_test.go b/tests/e2e/e2e_verify_default_configuration_test.go new file mode 100644 index 00000000..c96b8816 --- /dev/null +++ b/tests/e2e/e2e_verify_default_configuration_test.go @@ -0,0 +1,178 @@ +//go:build e2e + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package e2e + +import ( + "bytes" + "context" + "fmt" + "slices" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/prometheus/common/expfmt" + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/ptr" + + "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" +) + +// VerifyDefaultHelmConfiguration tests the helm chart with default configuration +var VerifyDefaultHelmConfiguration = func( + kubeClient *framework.KubeClient, + helmClient *framework.HelmClient, + testRunLabels map[string]string, +) bool { + return Context("and uses a default helm configuration", Label("default"), func() { + var ( + helmReleaseName string + dcgmExpPod *corev1.Pod + workloadPod *corev1.Pod + ) + + AfterAll(func(ctx context.Context) { + shouldUninstallHelmChart(helmClient, helmReleaseName) + }) + + It("should install dcgm-exporter helm chart", func(ctx context.Context) { + By(fmt.Sprintf("Helm chart installation: %q chart started.", + testContext.chart)) + + values := getDefaultHelmValues() + + var err error + + helmReleaseName, err = helmClient.Install(ctx, framework.HelmChartOptions{ + CleanupOnFail: true, + GenerateName: true, + Timeout: 5 * time.Minute, + Wait: true, + DryRun: false, + }, framework.WithValues(values...)) + Expect(err).ShouldNot(HaveOccurred(), "Helm chart installation: %q chart failed with error err: %v", + testContext.chart, err) + + By(fmt.Sprintf("Helm chart installation: %q completed.", + testContext.chart)) + By(fmt.Sprintf("Helm chart installation: new %q release name.", + helmReleaseName)) + }) + + 
It("should create dcgm-exporter pod", func(ctx context.Context) { + dcgmExpPod = shouldCheckIfPodCreated(ctx, kubeClient, dcgmExporterPodLabels) + }) + + It("should ensure that the dcgm-exporter pod is ready", func(ctx context.Context) { + shouldCheckIfPodIsReady(ctx, kubeClient, dcgmExpPod.Namespace, dcgmExpPod.Name) + }) + + It("should create a workload pod", func(ctx context.Context) { + _, _ = fmt.Fprintln(GinkgoWriter, "Workload pod creation: started") + + var err error + + workloadPod, err = kubeClient.CreatePod(ctx, + testContext.namespace, + testRunLabels, + workloadPodName, + workloadContainerName, + workloadImage, + testContext.runtimeClass, + ) + + Expect(err).ShouldNot(HaveOccurred(), + "Workload pod creation: Failed create workload pod with err: %v", err) + Eventually(func(ctx context.Context) bool { + isReady, err := kubeClient.CheckPodStatus(ctx, + testContext.namespace, + workloadPod.Name, func(namespace, podName string, status corev1.PodStatus) (bool, error) { + return status.Phase == corev1.PodSucceeded, nil + }) + if err != nil { + Fail(fmt.Sprintf("Workload pod creation: Checking pod status: Failed with error: %v", err)) + } + + return isReady + }).WithPolling(time.Second).Within(15 * time.Minute).WithContext(ctx).Should(BeTrue()) + + By("Workload pod creation: completed") + }) + + It("should wait for 30 seconds, to read metrics", func() { + time.Sleep(30 * time.Second) + }) + + var metricsResponse []byte + + It("should read metrics", func(ctx context.Context) { + _, _ = fmt.Fprintln(GinkgoWriter, "Read metrics: started") + + Eventually(func(ctx context.Context) bool { + var err error + + metricsResponse, err = kubeClient.DoHTTPRequest(ctx, + testContext.namespace, + dcgmExpPod.Name, + dcgmExporterPort, + "metrics") + if err != nil { + Fail(fmt.Sprintf("Read metrics: Failed with error: %v", err)) + } + + return len(metricsResponse) > 0 + }).WithPolling(time.Second).Within(time.Minute).WithContext(ctx).Should(BeTrue()) + _, _ = 
fmt.Fprintln(GinkgoWriter, "Read metrics: completed") + }) + + It("should verify metrics", func(ctx context.Context) { + Expect(metricsResponse).ShouldNot(BeEmpty()) + + var parser expfmt.TextParser + metricFamilies, err := parser.TextToMetricFamilies(bytes.NewReader(metricsResponse)) + Expect(err).ShouldNot(HaveOccurred()) + Expect(len(metricFamilies)).Should(BeNumerically(">", 0)) + + for _, metricFamily := range metricFamilies { + Expect(metricFamily).ShouldNot(BeNil()) + metrics := metricFamily.GetMetric() + Expect(metrics).ShouldNot(BeNil()) + + // Each metric must have namespace, pod and container labels + for _, metric := range metrics { + var actualLabels []string + for _, label := range metric.Label { + labelName := ptr.Deref(label.Name, "") + if slices.Contains(expectedLabels, labelName) { + actualLabels = append(actualLabels, labelName) + Expect(label.Value).ShouldNot(BeNil()) + Expect(ptr.Deref(label.Value, "")).ShouldNot(BeEmpty(), + "The %s metric contains a label named %q label with empty value.", + ptr.Deref(metricFamily.Name, ""), + labelName, + ) + } + } + Expect(len(actualLabels)).Should(Equal(len(expectedLabels)), + "Metric %s doesn't contains expected labels: %v, actual labels: %v", + ptr.Deref(metricFamily.Name, ""), expectedLabels, metric.Label) + } + } + }) + }) +} diff --git a/tests/e2e/e2e_verify_http_basic_auth_test.go b/tests/e2e/e2e_verify_http_basic_auth_test.go new file mode 100644 index 00000000..4dcedc9f --- /dev/null +++ b/tests/e2e/e2e_verify_http_basic_auth_test.go @@ -0,0 +1,134 @@ +//go:build e2e + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package e2e + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" +) + +// VerifyHelmConfigurationWhenHttpBasicAuthEnabled tests helm chart when Http basic authentication is enabled +var VerifyHelmConfigurationWhenHttpBasicAuthEnabled = func(kubeClient *framework.KubeClient, + helmClient *framework.HelmClient, + testRunLabels map[string]string, +) bool { + return Context("and HTTP basic auth is enabled", Label("basicAuth"), func() { + var ( + helmReleaseName string + dcgmExpPod *corev1.Pod + ) + + AfterAll(func(ctx context.Context) { + shouldUninstallHelmChart(helmClient, helmReleaseName) + }) + + userName := "alice" + userPassword := "Pa$$w0rd" + + It("should install dcgm-exporter helm chart", func(ctx context.Context) { + By(fmt.Sprintf("Helm chart installation: %q chart started.", + testContext.chart)) + + values := getDefaultHelmValues() + + var jsonValues []string + + type basicAuth struct { + Users map[string]string `json:"users"` + } + + basicAuthValue := basicAuth{ + Users: map[string]string{ + userName: userPassword, + }, + } + basicAuthValueJson, err := json.Marshal(basicAuthValue) + Expect(err).ShouldNot(HaveOccurred()) + + jsonValues = append(jsonValues, fmt.Sprintf("basicAuth=%s", string(basicAuthValueJson))) + + helmReleaseName, err = helmClient.Install(ctx, framework.HelmChartOptions{ + CleanupOnFail: true, + GenerateName: true, + 
Timeout: 5 * time.Minute, + Wait: true, + DryRun: false, + }, framework.WithValues(values...), framework.WithJSONValues(jsonValues...)) + Expect(err).ShouldNot(HaveOccurred(), "Helm chart installation: %q chart failed with error err: %v", testContext.chart, err) + + By(fmt.Sprintf("Helm chart installation: %q completed.", + testContext.chart)) + By(fmt.Sprintf("Helm chart installation: new %q release name.", + helmReleaseName)) + }) + + It("should create dcgm-exporter pod", func(ctx context.Context) { + dcgmExpPod = shouldCheckIfPodCreated(ctx, kubeClient, dcgmExporterPodLabels) + }) + + It("should ensure that the dcgm-exporter pod is ready", func(ctx context.Context) { + shouldCheckIfPodIsReady(ctx, kubeClient, dcgmExpPod.Namespace, dcgmExpPod.Name) + }) + + It("should check that the metric endpoint protected by basic HTTP auth", func(ctx context.Context) { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + kubeClient.ErrWriter = GinkgoWriter + kubeClient.OutWriter = GinkgoWriter + localPort, err := kubeClient.PortForward(ctx, dcgmExpPod.Namespace, dcgmExpPod.Name, 9400) + Expect(err).ShouldNot(HaveOccurred()) + Expect(localPort).Should(BeNumerically(">", 0)) + httpClient := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + }, + } + + By("Ensure that HTTP request returns 401 error when no credentials provided") + resp, err := httpClient.Get(fmt.Sprintf("http://localhost:%d/metrics", localPort)) + Expect(err).ShouldNot(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(401)) + body, err := io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + Expect(string(body)).To(ContainSubstring("Unauthorized")) + + By("Ensure that HTTP request returns 200 error") + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://localhost:%d/metrics", localPort), http.NoBody) + Expect(err).ShouldNot(HaveOccurred()) + req.SetBasicAuth(userName, userPassword) + resp, err = 
httpClient.Do(req) + Expect(err).ShouldNot(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(200)) + _, err = io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + }) + }) +} diff --git a/tests/e2e/e2e_verify_tls_test.go b/tests/e2e/e2e_verify_tls_test.go new file mode 100644 index 00000000..41f4ed7b --- /dev/null +++ b/tests/e2e/e2e_verify_tls_test.go @@ -0,0 +1,118 @@ +//go:build e2e + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package e2e + +import ( + "context" + "crypto/tls" + "fmt" + "io" + "net/http" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + + "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" +) + +// VerifyHelmConfigurationWhenTLSEnabled tests configuration when TLS is enabled +var VerifyHelmConfigurationWhenTLSEnabled = func( + kubeClient *framework.KubeClient, + helmClient *framework.HelmClient, + testRunLabels map[string]string, +) bool { + return Context("and TLS is enabled", Label("tls"), func() { + var ( + helmReleaseName string + dcgmExpPod *corev1.Pod + ) + + AfterAll(func(ctx context.Context) { + shouldUninstallHelmChart(helmClient, helmReleaseName) + }) + + It("should install dcgm-exporter helm chart", func(ctx context.Context) { + By(fmt.Sprintf("Helm chart installation: %q chart started.", + testContext.chart)) + + values := getDefaultHelmValues() + + values = append(values, "tlsServerConfig.enabled=true") + + var err error + + helmReleaseName, err = helmClient.Install(ctx, framework.HelmChartOptions{ + CleanupOnFail: true, + GenerateName: true, + Timeout: 5 * time.Minute, + Wait: true, + DryRun: false, + }, framework.WithValues(values...)) + Expect(err).ShouldNot(HaveOccurred(), "Helm chart installation: %q chart failed with error err: %v", + testContext.chart, err) + + By(fmt.Sprintf("Helm chart installation: %q completed.", + testContext.chart)) + By(fmt.Sprintf("Helm chart installation: new %q release name.", + helmReleaseName)) + }) + + It("should create dcgm-exporter pod", func(ctx context.Context) { + dcgmExpPod = shouldCheckIfPodCreated(ctx, kubeClient, dcgmExporterPodLabels) + }) + + It("should ensure that the dcgm-exporter pod is ready", func(ctx context.Context) { + shouldCheckIfPodIsReady(ctx, kubeClient, dcgmExpPod.Namespace, dcgmExpPod.Name) + }) + + It("should check that the port accepts TLS", func(ctx context.Context) { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + kubeClient.ErrWriter = GinkgoWriter + kubeClient.OutWriter = GinkgoWriter + localPort, err := kubeClient.PortForward(ctx, 
dcgmExpPod.Namespace, dcgmExpPod.Name, 9400) + Expect(err).ShouldNot(HaveOccurred()) + Expect(localPort).Should(BeNumerically(">", 0)) + httpClient := &http.Client{ + Timeout: 5 * time.Second, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + }, + } + + By("Ensure that HTTP request returns 400 error") + resp, err := httpClient.Get(fmt.Sprintf("http://localhost:%d/metrics", localPort)) + Expect(err).ShouldNot(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(400)) + body, err := io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + Expect(string(body)).To(ContainSubstring("Client sent an HTTP request to an HTTPS server")) + + By("Ensure that HTTP request returns 200 error") + resp, err = httpClient.Get(fmt.Sprintf("https://localhost:%d/metrics", localPort)) + Expect(err).ShouldNot(HaveOccurred()) + Expect(resp.StatusCode).To(Equal(200)) + _, err = io.ReadAll(resp.Body) + Expect(err).NotTo(HaveOccurred()) + }) + }) +} diff --git a/tests/e2e/internal/framework/helm.go b/tests/e2e/internal/framework/helm.go index 727771f1..49e8842f 100644 --- a/tests/e2e/internal/framework/helm.go +++ b/tests/e2e/internal/framework/helm.go @@ -111,10 +111,26 @@ type HelmChartOptions struct { DryRun bool } +type HelmChartValueOption func(*helmValues.Options) + +func WithValues(values ...string) HelmChartValueOption { + return func(o *helmValues.Options) { + o.Values = values + } +} + +func WithJSONValues(values ...string) HelmChartValueOption { + return func(o *helmValues.Options) { + o.JSONValues = values + } +} + // Install deploys the helm chart -func (c *HelmClient) Install(ctx context.Context, params []string, chartOpts HelmChartOptions) (string, error) { - values := helmValues.Options{ - Values: params, +func (c *HelmClient) Install(ctx context.Context, chartOpts HelmChartOptions, valuesOptions ...HelmChartValueOption) (string, error) { + values := helmValues.Options{} + + for _, valueOption := range valuesOptions { + 
valueOption(&values) } chartSpec := helm.ChartSpec{ @@ -136,7 +152,6 @@ func (c *HelmClient) Install(ctx context.Context, params []string, chartOpts Hel } res, err := c.client.InstallChart(ctx, &chartSpec, nil) - if err != nil { return "", fmt.Errorf("error installing the chart; err: %w", err) } diff --git a/tests/e2e/internal/framework/kube.go b/tests/e2e/internal/framework/kube.go index a9448c53..728a8f5d 100644 --- a/tests/e2e/internal/framework/kube.go +++ b/tests/e2e/internal/framework/kube.go @@ -19,6 +19,14 @@ package framework import ( "context" "fmt" + "io" + "net" + "net/http" + + "github.com/pkg/errors" + "k8s.io/client-go/transport/spdy" + + "k8s.io/client-go/tools/portforward" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -32,16 +40,25 @@ const nvidiaResourceName = "nvidia.com/gpu" // KubeClient is a kubernetes client type KubeClient struct { - client *kubernetes.Clientset + client *kubernetes.Clientset + restConfig *rest.Config + OutWriter io.Writer + ErrWriter io.Writer } // NewKubeClient creates a new KubeClient instance -func NewKubeClient(k8sConfig *rest.Config) (*KubeClient, error) { - client, err := kubernetes.NewForConfig(k8sConfig) +func NewKubeClient(restConfig *rest.Config) (*KubeClient, error) { + client, err := kubernetes.NewForConfig(restConfig) if err != nil { return nil, err } - return &KubeClient{client: client}, nil + + return &KubeClient{ + client: client, + restConfig: restConfig, + OutWriter: io.Discard, + ErrWriter: io.Discard, + }, nil } // CreateNamespace creates a new namespace @@ -70,7 +87,9 @@ func (c *KubeClient) DeleteNamespace( } // GetPodsByLabel returns a list of pods that matches with the label selector -func (c *KubeClient) GetPodsByLabel(ctx context.Context, namespace string, labelMap map[string]string) ([]corev1.Pod, error) { +func (c *KubeClient) GetPodsByLabel(ctx context.Context, namespace string, labelMap map[string]string) ([]corev1.Pod, + error, +) { podList, err := 
c.client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: labels.SelectorFromSet(labelMap).String(), }) @@ -80,7 +99,9 @@ func (c *KubeClient) GetPodsByLabel(ctx context.Context, namespace string, label return podList.Items, nil } -func (c *KubeClient) CheckPodStatus(ctx context.Context, +// CheckPodStatus checks the pod status +func (c *KubeClient) CheckPodStatus( + ctx context.Context, namespace, podName string, condition func(namespace, podName string, status corev1.PodStatus) (bool, error), ) (bool, error) { @@ -103,14 +124,23 @@ func (c *KubeClient) CheckPodStatus(ctx context.Context, } // CreatePod creates a new pod in the defined namespace -func (c *KubeClient) CreatePod(ctx context.Context, +func (c *KubeClient) CreatePod( + ctx context.Context, namespace string, labels map[string]string, name string, containerName string, image string, + runtimeClassName string, ) (*corev1.Pod, error) { + // RuntimeClassName does not accept a reference to an empty string; however, nil is acceptable. 
+ var runtimeClassNameRef *string + if runtimeClassName != "" { + runtimeClassNameRef = &runtimeClassName + } + quantity, _ := resource.ParseQuantity("1") + pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -118,7 +148,8 @@ func (c *KubeClient) CreatePod(ctx context.Context, Labels: labels, }, Spec: corev1.PodSpec{ - RestartPolicy: corev1.RestartPolicyNever, + RuntimeClassName: runtimeClassNameRef, + RestartPolicy: corev1.RestartPolicyNever, Containers: []corev1.Container{ { Name: containerName, @@ -132,19 +163,22 @@ func (c *KubeClient) CreatePod(ctx context.Context, }, }, } + return c.client.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{}) } // DeletePod deletes a pod in the defined namespace -func (c *KubeClient) DeletePod(ctx context.Context, +func (c *KubeClient) DeletePod( + ctx context.Context, namespace string, name string, ) error { return c.client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{}) } -// DoHttpRequest makes http request to path on the pod -func (c *KubeClient) DoHttpRequest(ctx context.Context, +// DoHTTPRequest makes an HTTP request to the given path on the pod +func (c *KubeClient) DoHTTPRequest( + ctx context.Context, namespace string, name string, port uint, @@ -169,3 +203,53 @@ func (c *KubeClient) DoHttpRequest(ctx context.Context, return rawResponse, nil } + +// PortForward turns on port forwarding for the pod +func (c *KubeClient) PortForward( + ctx context.Context, namespace string, + podName string, + targetPort int, +) (int, error) { + transport, upgrader, err := spdy.RoundTripperFor(c.restConfig) + if err != nil { + return -1, err + } + + req := c.client.CoreV1().RESTClient().Post(). + Resource("pods"). + Namespace(namespace). + Name(podName). 
+ SubResource("portforward") + + dialer := spdy.NewDialer(upgrader, &http.Client{Transport: transport}, "POST", req.URL()) + + // pick a random unused port by listening on port number 0 + ln, err := net.Listen("tcp", "localhost:0") + if err != nil { + return -1, err + } + + localPort := ln.Addr().(*net.TCPAddr).Port + ln.Close() + + fw, err := portforward.New(dialer, []string{fmt.Sprintf("%d:%d", localPort, targetPort)}, ctx.Done(), + make(chan struct{}), + c.OutWriter, + c.ErrWriter) + if err != nil { + return -1, err + } + + errCh := make(chan error, 1) + go func() { + errCh <- fw.ForwardPorts() + }() + + select { + case err = <-errCh: + return -1, errors.Wrap(err, "port forwarding failed") + case <-fw.Ready: + } + + return localPort, nil +} diff --git a/tests/e2e/main_test.go b/tests/e2e/main_test.go index e0850d63..37f482e3 100644 --- a/tests/e2e/main_test.go +++ b/tests/e2e/main_test.go @@ -63,6 +63,16 @@ func TestMain(m *testing.M) { "", `DCGM-exporter command line arguments. 
Example: -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}"`) + flag.BoolVar(&testContext.noCleanup, + "no-cleanup", + false, + `Skip clean up after tests execution`) + + flag.StringVar(&testContext.runtimeClass, + "runtime-class", + "", + "Runtime Class to use for the DCGM-exporter deployment and workload pods") + flag.Parse() os.Exit(m.Run()) diff --git a/tests/integration/start_read_test.go b/tests/integration/start_read_test.go index ca97b3ac..a398af6b 100644 --- a/tests/integration/start_read_test.go +++ b/tests/integration/start_read_test.go @@ -38,7 +38,7 @@ func TestStartAndReadMetrics(t *testing.T) { } app := cmd.NewApp() args := os.Args[0:1] - args = append(args, "-f=../../etc/default-counters.csv") // Append a file with default counters + args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters port := getRandomAvailablePort(t) args = append(args, fmt.Sprintf("-a=:%d", port)) ctx, cancel := context.WithCancel(context.Background()) @@ 
-47,9 +47,6 @@ func TestStartAndReadMetrics(t *testing.T) { require.NoError(t, err) }(ctx) - t.Log("The dcgm-exporter is running, we wait for 30 seconds to read metrics") - <-time.After(30 * time.Second) - t.Logf("Read metrics from http://localhost:%d/metrics", port) metricsResp, _ := retry.DoWithData( diff --git a/tests/integration/start_with_tls_test.go b/tests/integration/start_with_tls_test.go index 77af70ec..532cff5e 100644 --- a/tests/integration/start_with_tls_test.go +++ b/tests/integration/start_with_tls_test.go @@ -21,7 +21,7 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) { } app := cmd.NewApp() args := os.Args[0:1] - args = append(args, "-f=../../etc/default-counters.csv") // Append a file with default counters + args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters port := getRandomAvailablePort(t) args = append(args, fmt.Sprintf("-a=:%d", port)) args = append(args, "--web-config-file=./testdata/web-config.yml") @@ -59,7 +59,8 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) { } status, err := retry.DoWithData( func() (int, error) { - req := newRequestWithBasicAuth(t, "alice", "password", http.MethodGet, fmt.Sprintf("https://localhost:%d/metrics", port), nil) + req := newRequestWithBasicAuth(t, "alice", "password", http.MethodGet, + fmt.Sprintf("https://localhost:%d/metrics", port), nil) resp, err := client.Do(req) if err != nil { return -1, err @@ -84,7 +85,8 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) { } status, err := retry.DoWithData( func() (int, error) { - req := newRequestWithBasicAuth(t, "alice", "bad password", http.MethodGet, fmt.Sprintf("https://localhost:%d/metrics", port), nil) + req := newRequestWithBasicAuth(t, "alice", "bad password", http.MethodGet, + fmt.Sprintf("https://localhost:%d/metrics", port), nil) resp, err := client.Do(req) if err != nil { return -1, err diff --git a/tests/integration/testdata/default-counters.csv 
b/tests/integration/testdata/default-counters.csv new file mode 100644 index 00000000..ab5e545d --- /dev/null +++ b/tests/integration/testdata/default-counters.csv @@ -0,0 +1,77 @@ +# Format +# If line starts with a '#' it is considered a comment +# DCGM FIELD, Prometheus metric type, help message + +# Clocks +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). +# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param). + +# Temperature +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + +# PCIE +# DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +# DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +# DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). 
+# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). + +# ECC +# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 
+DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information. These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device