From c3ff5a2219750c9ffbcfe6af300dc1dbead03123 Mon Sep 17 00:00:00 2001 From: Vadym Fedorov <152634547+nvvfedorov@users.noreply.github.com> Date: Thu, 23 May 2024 10:06:58 -0500 Subject: [PATCH] DCGM-Exporter release version 3.3.7-3.5.0 --- .gitignore | 4 +- Makefile | 7 +- README.md | 2 +- dcgm-exporter.yaml | 12 +-- deployment/Chart.yaml | 4 +- deployment/templates/metrics-configmap.yaml | 16 ++-- deployment/values.yaml | 2 +- docker/Dockerfile.ubi9 | 20 +++-- docker/Dockerfile.ubuntu22.04 | 17 ++-- etc/dcp-metrics-included.csv | 16 ++-- go.mod | 73 ++++++++--------- go.sum | 88 +++++++++++++++++++++ hack/VERSION | 8 +- pkg/cmd/app.go | 8 ++ pkg/dcgmexporter/config.go | 1 + pkg/dcgmexporter/dcgm.go | 6 +- pkg/dcgmexporter/expcollector.go | 46 +++++------ pkg/dcgmexporter/gpu_collector.go | 5 +- pkg/dcgmexporter/gpu_collector_test.go | 24 +++++- pkg/dcgmexporter/kubernetes.go | 3 +- pkg/dcgmexporter/kubernetes_test.go | 9 +++ pkg/dcgmexporter/pipeline.go | 2 +- pkg/dcgmexporter/types.go | 1 + pkg/dcgmexporter/xid_collector_test.go | 18 +++-- scripts/test_coverage.sh | 34 ++++++++ service-monitor.yaml | 4 +- tests/e2e/Makefile | 2 +- 27 files changed, 304 insertions(+), 128 deletions(-) create mode 100644 scripts/test_coverage.sh diff --git a/.gitignore b/.gitignore index 19c0ff61..2b06b62b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,7 @@ tests.cov test_results.json .scannerwork dist/ - +.run/ ############################################################################### # JetBrains # https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore @@ -236,4 +236,4 @@ $RECYCLE.BIN/ *.msp # Windows shortcuts -*.lnk \ No newline at end of file +*.lnk diff --git a/Makefile b/Makefile index a1146b61..98c95003 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ MKDIR ?= mkdir GOLANGCILINT_TIMEOUT ?= 10m DCGM_VERSION := $(NEW_DCGM_VERSION) -GOLANG_VERSION := 1.21.5 +GOLANG_VERSION := 1.22.5 VERSION := $(NEW_EXPORTER_VERSION) FULL_VERSION := $(DCGM_VERSION)-$(VERSION) OUTPUT := type=oci,dest=/dev/null @@ -39,7 +39,7 @@ test-main: $(GO) test ./... -short install: binary - install -m 755 $(DIST_DIR)/dcgm-exporter /usr/bin/dcgm-exporter + install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv @@ -78,7 +78,8 @@ test-integration: go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/ test-coverage: - gocov test ./... | gocov report + sh scripts/test_coverage.sh + gocov convert tests.cov | gocov report .PHONY: lint lint: diff --git a/README.md b/README.md index ea6060ea..dcd112f4 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https To gather metrics on a GPU node, simply start the `dcgm-exporter` container: ```shell -docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04 +docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04 curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index cad69704..b9a18e46 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,23 +18,23 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" name: "dcgm-exporter" spec: containers: - - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04" + - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -64,11 +64,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" ports: - name: "metrics" port: 9400 diff --git a/deployment/Chart.yaml b/deployment/Chart.yaml index c4271839..5b861e9a 100644 --- a/deployment/Chart.yaml +++ b/deployment/Chart.yaml @@ -1,9 +1,9 @@ apiVersion: v2 name: dcgm-exporter description: A Helm chart for DCGM exporter -version: "3.4.2" +version: "3.5.0" kubeVersion: ">= 1.19.0-0" -appVersion: "3.4.2" +appVersion: "3.5.0" sources: - https://github.com/nvidia/dcgm-exporter home: https://github.com/nvidia/dcgm-exporter/ diff --git a/deployment/templates/metrics-configmap.yaml b/deployment/templates/metrics-configmap.yaml index faf8cfd8..6c5b7f6a 100644 --- a/deployment/templates/metrics-configmap.yaml +++ b/deployment/templates/metrics-configmap.yaml @@ -73,13 +73,13 @@ data: DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed # DCP metrics - DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). - # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). - # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). - DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). - # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). - # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). - # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). + DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. + DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. diff --git a/deployment/values.yaml b/deployment/values.yaml index c2cffade..7490f46c 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -17,7 +17,7 @@ image: pullPolicy: IfNotPresent # Image tag defaults to AppVersion, but you can use the tag key # for the image tag, e.g: - tag: 3.3.6-3.4.2-ubuntu22.04 + tag: 3.3.7-3.5.0-ubuntu22.04 # Change the following reference to "/etc/dcgm-exporter/default-counters.csv" # to stop profiling metrics from DCGM diff --git a/docker/Dockerfile.ubi9 b/docker/Dockerfile.ubi9 index 66ef969f..cc3f2614 100644 --- a/docker/Dockerfile.ubi9 +++ b/docker/Dockerfile.ubi9 @@ -1,5 +1,5 @@ -FROM nvcr.io/nvidia/cuda:12.4.1-base-ubi9 AS builder -ARG GOLANG_VERSION +FROM nvcr.io/nvidia/cuda:12.5.1-base-ubi9 AS builder +ARG GOLANG_VERSION=1.22.4 WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter RUN set -eux; \ dnf clean expire-cache; \ @@ -40,7 +40,7 @@ COPY . . RUN make binary check-format -FROM nvcr.io/nvidia/cuda:12.4.1-base-ubi9 +FROM nvcr.io/nvidia/cuda:12.5.1-base-ubi9 ARG DCGM_VERSION ARG VERSION ARG DIST_DIR @@ -53,9 +53,17 @@ LABEL release="N/A" LABEL summary="Exports GPU Metrics to Prometheus" LABEL description="See summary" -RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ubi-9-baseos-rpms -y && rm -rf /var/cache/yum \ - && dnf clean expire-cache && dnf install -y datacenter-gpu-manager-${DCGM_VERSION} libcap \ - && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite +RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ubi-9-baseos-rpms -y \ + && dnf install --nodocs --setopt=install_weak_deps=False -y datacenter-gpu-manager-${DCGM_VERSION} libcap \ + && dnf -y clean all\ + && rm -rf /var/cache/yum\ + && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \ + # DCGM exporter doesn't use libdcgm_cublas_proxy*.so. + && rm -rf /usr/lib64/libdcgm_cublas_proxy*.so \ + && rm -rf /usr/local/dcgm/scripts \ + && rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \ + && rm -rf /var/lib/rpm/rpmdb.sqlite /var/cache/* /var/lib/dnf/history.* /var/log/* /tmp/* /var/tmp/* \ + && rm -rf /usr/share/doc && rm -rf /usr/share/man COPY ./LICENSE ./licenses/LICENSE COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/ diff --git a/docker/Dockerfile.ubuntu22.04 b/docker/Dockerfile.ubuntu22.04 index 24191666..2a6078c5 100644 --- a/docker/Dockerfile.ubuntu22.04 +++ b/docker/Dockerfile.ubuntu22.04 @@ -1,5 +1,5 @@ -FROM nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 AS builder -ARG GOLANG_VERSION=1.21.5 +FROM nvcr.io/nvidia/cuda:12.5.1-base-ubuntu22.04 AS builder +ARG GOLANG_VERSION=1.22.4 WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter RUN set -eux; \ apt-get update; \ @@ -45,7 +45,7 @@ COPY . . RUN make binary check-format -FROM nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 +FROM nvcr.io/nvidia/cuda:12.5.1-base-ubuntu22.04 ARG VERSION ARG DCGM_VERSION @@ -65,9 +65,16 @@ COPY etc /etc/dcgm-exporter RUN apt-get update && apt-get install -y --no-install-recommends \ datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl \ + && apt-get -y clean \ + && apt-get -y autoclean \ && apt-get autoremove -y \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite + && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \ + # DCGM exporter doesn't use libdcgm_cublas_proxy*.so. + && rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \ + && rm -rf /usr/local/dcgm/scripts \ + && rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \ + && rm -rf /var/cache/debconf/* /var/lib/apt/lists/* /var/log/* /tmp/* /var/tmp/* \ + && rm -rf /usr/share/doc && rm -rf /usr/share/man # Required for DCP metrics ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 # disable all constraints on the configurations required by NVIDIA container toolkit diff --git a/etc/dcp-metrics-included.csv b/etc/dcp-metrics-included.csv index aa263b63..a934db09 100644 --- a/etc/dcp-metrics-included.csv +++ b/etc/dcp-metrics-included.csv @@ -77,14 +77,14 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version # DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device # DCP metrics -DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). -# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). -# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). -DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). -DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). -# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). -# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). -# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. +# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. +# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. +# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. +# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. +# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. diff --git a/go.mod b/go.mod index 00f3dc03..43c75dc7 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,8 @@ module github.com/NVIDIA/dcgm-exporter -go 1.21 +go 1.22.0 + +toolchain go1.22.4 require ( github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f @@ -8,9 +10,9 @@ require ( github.com/avast/retry-go/v4 v4.5.1 github.com/bits-and-blooms/bitset v1.13.0 github.com/go-kit/log v0.2.1 - github.com/google/uuid v1.5.0 + github.com/google/uuid v1.6.0 github.com/gorilla/mux v1.8.1 - github.com/mittwald/go-helm-client v0.12.8 + github.com/mittwald/go-helm-client v0.12.9 github.com/onsi/ginkgo/v2 v2.15.0 github.com/onsi/gomega v1.32.0 github.com/prometheus/client_model v0.6.0 @@ -20,13 +22,14 @@ require ( github.com/stretchr/testify v1.8.4 github.com/urfave/cli/v2 v2.27.1 go.uber.org/automaxprocs v1.5.3 - golang.org/x/sync v0.5.0 - google.golang.org/grpc v1.61.1 - k8s.io/api v0.29.2 - k8s.io/apimachinery v0.29.2 - k8s.io/client-go v0.29.2 - k8s.io/kubelet v0.29.2 - k8s.io/utils v0.0.0-20240102154912-e7106e64919e + go.uber.org/mock v0.4.0 + golang.org/x/sync v0.7.0 + google.golang.org/grpc v1.64.0 + k8s.io/api v0.30.2 + k8s.io/apimachinery v0.30.2 + k8s.io/client-go v0.30.2 + k8s.io/kubelet v0.30.2 + k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 ) require ( @@ -44,20 +47,19 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chai2010/gettext-go v1.0.2 // indirect - github.com/containerd/containerd v1.7.11 // indirect + github.com/containerd/containerd v1.7.12 // indirect github.com/containerd/log v0.1.0 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/distribution/reference v0.5.0 // indirect - github.com/docker/cli v24.0.7+incompatible // indirect + github.com/docker/cli v26.1.4+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect - github.com/docker/docker v24.0.7+incompatible // indirect + github.com/docker/docker v26.1.4+incompatible // indirect github.com/docker/docker-credential-helpers v0.8.0 // indirect - github.com/docker/go-connections v0.4.0 // indirect + github.com/docker/go-connections v0.5.0 // indirect github.com/docker/go-metrics v0.0.1 // indirect - github.com/docker/go-units v0.5.0 // indirect github.com/emicklei/go-restful/v3 v3.11.1 // indirect github.com/evanphx/json-patch v5.7.0+incompatible // indirect github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect @@ -74,7 +76,7 @@ require ( github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.3 // indirect + github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.2 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/google/go-cmp v0.6.0 // indirect @@ -111,12 +113,11 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect - github.com/morikuni/aec v1.0.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.0-rc5 // indirect + github.com/opencontainers/image-spec v1.1.0-rc6 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect @@ -140,32 +141,32 @@ require ( go.opentelemetry.io/otel/metric v1.21.0 // indirect go.opentelemetry.io/otel/trace v1.21.0 // indirect go.starlark.net v0.0.0-20231121155337-90ade8b19d09 // indirect - go.uber.org/mock v0.4.0 // indirect - golang.org/x/crypto v0.18.0 // indirect + golang.org/x/crypto v0.24.0 // indirect golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect - golang.org/x/net v0.20.0 // indirect - golang.org/x/oauth2 v0.16.0 // indirect - golang.org/x/sys v0.16.0 // indirect - golang.org/x/term v0.16.0 // indirect - golang.org/x/text v0.14.0 // indirect + golang.org/x/mod v0.17.0 // indirect + golang.org/x/net v0.26.0 // indirect + golang.org/x/oauth2 v0.18.0 // indirect + golang.org/x/sys v0.21.0 // indirect + golang.org/x/term v0.21.0 // indirect + golang.org/x/text v0.16.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.16.1 // indirect + golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect google.golang.org/appengine v1.6.8 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect google.golang.org/protobuf v1.33.0 // indirect gopkg.in/evanphx/json-patch.v5 v5.7.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - helm.sh/helm/v3 v3.14.2 // indirect - k8s.io/apiextensions-apiserver v0.29.0 // indirect - k8s.io/apiserver v0.29.2 // indirect - k8s.io/cli-runtime v0.29.0 // indirect - k8s.io/component-base v0.29.2 // indirect - k8s.io/klog/v2 v2.110.1 // indirect - k8s.io/kube-openapi v0.0.0-20240220201932-37d671a357a5 // indirect - k8s.io/kubectl v0.29.0 // indirect - oras.land/oras-go v1.2.4 // indirect + helm.sh/helm/v3 v3.15.2 // indirect + k8s.io/apiextensions-apiserver v0.30.0 // indirect + k8s.io/apiserver v0.30.2 // indirect + k8s.io/cli-runtime v0.30.0 // indirect + k8s.io/component-base v0.30.2 // indirect + k8s.io/klog/v2 v2.120.1 // indirect + k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect + k8s.io/kubectl v0.30.0 // indirect + oras.land/oras-go v1.2.5 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/kustomize/api v0.16.0 // indirect sigs.k8s.io/kustomize/kyaml v0.16.0 // indirect diff --git a/go.sum b/go.sum index 7ccb8941..48b7309a 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,7 @@ github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8 github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/DATA-DOG/go-sqlmock v1.5.0 h1:Shsta01QNfFxHCfpW6YH2STWB0MudeXXEWMr20OEh60= github.com/DATA-DOG/go-sqlmock v1.5.0/go.mod h1:f/Ixk793poVmq4qj/V1dPUg2JEAKC73Q5eFN3EC/SaM= +github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= @@ -62,6 +63,8 @@ github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaD github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/containerd v1.7.11 h1:lfGKw3eU35sjV0aG2eYZTiwFEY1pCzxdzicHP3SZILw= github.com/containerd/containerd v1.7.11/go.mod h1:5UluHxHTX2rdvYuZ5OJTC5m/KJNs0Zs9wVoJm9zf5ZE= +github.com/containerd/containerd v1.7.12 h1:+KQsnv4VnzyxWcfO9mlxxELaoztsDEjOuCMPAuPqgU0= +github.com/containerd/containerd v1.7.12/go.mod h1:/5OMpE1p0ylxtEUGY8kuCYkDRzJm9NO1TFMWjUpdevk= github.com/containerd/continuity v0.4.2 h1:v3y/4Yz5jwnvqPKJJ+7Wf93fyWoCB3F5EclWG023MDM= github.com/containerd/continuity v0.4.2/go.mod h1:F6PTNCKepoxEaXLQp3wDAjygEnImnZ/7o4JzpodfroQ= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= @@ -83,14 +86,24 @@ github.com/distribution/reference v0.5.0 h1:/FUIFXtfc/x2gpa5/VGfiGLuOIdYa1t65IKK github.com/distribution/reference v0.5.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/cli v24.0.7+incompatible h1:wa/nIwYFW7BVTGa7SWPVyyXU9lgORqUb1xfI36MSkFg= github.com/docker/cli v24.0.7+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/cli v25.0.1+incompatible h1:mFpqnrS6Hsm3v1k7Wa/BO23oz0k121MTbTO1lpcGSkU= +github.com/docker/cli v25.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/cli v26.1.4+incompatible h1:I8PHdc0MtxEADqYJZvhBrW9bo8gawKwwenxRM7/rLu8= +github.com/docker/cli v26.1.4+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/distribution v2.8.3+incompatible h1:AtKxIZ36LoNK51+Z6RpzLpddBirtxJnzDrHLEKxTAYk= github.com/docker/distribution v2.8.3+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/docker v24.0.7+incompatible h1:Wo6l37AuwP3JaMnZa226lzVXGA3F9Ig1seQen0cKYlM= github.com/docker/docker v24.0.7+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v25.0.5+incompatible h1:UmQydMduGkrD5nQde1mecF/YnSbTOaPeFIeP5C4W+DE= +github.com/docker/docker v25.0.5+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v26.1.4+incompatible h1:vuTpXDuoga+Z38m1OZHzl7NKisKWaWlhjQk7IDPSLsU= +github.com/docker/docker v26.1.4+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/docker-credential-helpers v0.8.0 h1:YQFtbBQb4VrpoPxhFuzEBPQ9E16qz5SpHLS+uswaCp8= github.com/docker/docker-credential-helpers v0.8.0/go.mod h1:UGFXcuoQ5TxPiB54nHOZ32AWRqQdECoh/Mg0AlEYb40= github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= +github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= +github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ+oDZB4KHQFypsfjYlq/C4rfL7D3g8= github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= github.com/docker/go-metrics v0.0.1 h1:AgB/0SvBxihN0X8OR4SjsblXkbMvalQ8cjmtKQ2rQV8= @@ -156,6 +169,8 @@ github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaS github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/gomodule/redigo v1.8.2 h1:H5XSIre1MB5NbPYFp+i1NBbb5qN1W8Y8YAQoAYbkm8k= github.com/gomodule/redigo v1.8.2/go.mod h1:P9dn9mFrCBvWhGE1wpxx6fgq7BAeLBk+UUUzlpkBYO0= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= @@ -177,6 +192,8 @@ github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3 github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/handlers v1.5.1 h1:9lRY6j8DEeeBT10CvO9hGW0gmky0BprnvDI5vfhUHH4= github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -259,6 +276,8 @@ github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zx github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mittwald/go-helm-client v0.12.8 h1:i85RsE+9j9onkju/Pp0j5dxD+QdDSUB9TknmAq3MLG4= github.com/mittwald/go-helm-client v0.12.8/go.mod h1:ukR3Et5zbfBij7bFL1ZnLvPytsbBXCrI2qQYr2yVi9I= +github.com/mittwald/go-helm-client v0.12.9 h1:tfI5ECgrbfAolA9TnlCeA5F2TEIvdsOxVmoSyW80lCI= +github.com/mittwald/go-helm-client v0.12.9/go.mod h1:ukR3Et5zbfBij7bFL1ZnLvPytsbBXCrI2qQYr2yVi9I= github.com/moby/locker v1.0.1 h1:fOXqR41zeveg4fFODix+1Ch4mj/gT0NE1XJbp/epuBg= github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc= github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= @@ -293,6 +312,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= +github.com/opencontainers/image-spec v1.1.0-rc6 h1:XDqvyKsJEbRtATzkgItUqBA7QHk58yxX1Ov9HERHNqU= +github.com/opencontainers/image-spec v1.1.0-rc6/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI= @@ -412,6 +433,9 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= +golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= +golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= +golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc h1:ao2WRsKSzW6KuUY9IWPwWahcHCgR0s52IfwutMfEbdM= golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -419,6 +443,8 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0= golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -430,8 +456,14 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/oauth2 v0.16.0 h1:aDkGMBSYxElaoP81NpoUoz2oo2R2wHdZpGToUxfyQrQ= golang.org/x/oauth2 v0.16.0/go.mod h1:hqZ+0LWXsiVoZpeld6jVt06P3adbS2Uu911W1SsJv2o= +golang.org/x/oauth2 v0.18.0 h1:09qnuIAgzdx1XplqJvW6CQqMCtGZykZWcXzPMPUusvI= +golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi8= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -440,6 +472,9 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -458,11 +493,17 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= @@ -470,6 +511,8 @@ golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -479,6 +522,9 @@ golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= +golang.org/x/tools v0.18.0/go.mod h1:GL7B4CwcLLeo59yx/9UWWuNOW1n3VZ4f5axWfML7Lcg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -487,8 +533,12 @@ google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAs google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 h1:6G8oQ016D88m1xAKljMlBOOGWDZkes4kMhgGFlf8WcQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917/go.mod h1:xtjpI3tXFPP051KaWnhvxkiubL/6dJ18vLVf7q2pTOU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 h1:NnYq6UN9ReLM9/Y01KWNOWyI5xQ9kbIms5GGJVwS/Yc= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY= google.golang.org/grpc v1.61.1 h1:kLAiWrZs7YeDM6MumDe7m3y4aM6wacLzM1Y/wiLP9XY= google.golang.org/grpc v1.61.1/go.mod h1:VUbo7IFqmF1QtCAstipjG0GIoq49KvMe9+h1jFLBNJs= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= @@ -514,32 +564,70 @@ gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= helm.sh/helm/v3 v3.14.2 h1:V71fv+NGZv0icBlr+in1MJXuUIHCiPG1hW9gEBISTIA= helm.sh/helm/v3 v3.14.2/go.mod h1:2itvvDv2WSZXTllknfQo6j7u3VVgMAvm8POCDgYH424= +helm.sh/helm/v3 v3.15.2 h1:/3XINUFinJOBjQplGnjw92eLGpgXXp1L8chWPkCkDuw= +helm.sh/helm/v3 v3.15.2/go.mod h1:FzSIP8jDQaa6WAVg9F+OkKz7J0ZmAga4MABtTbsb9WQ= k8s.io/api v0.29.2 h1:hBC7B9+MU+ptchxEqTNW2DkUosJpp1P+Wn6YncZ474A= k8s.io/api v0.29.2/go.mod h1:sdIaaKuU7P44aoyyLlikSLayT6Vb7bvJNCX105xZXY0= +k8s.io/api v0.30.0 h1:siWhRq7cNjy2iHssOB9SCGNCl2spiF1dO3dABqZ8niA= +k8s.io/api v0.30.0/go.mod h1:OPlaYhoHs8EQ1ql0R/TsUgaRPhpKNxIMrKQfWUp8QSE= +k8s.io/api v0.30.2 h1:+ZhRj+28QT4UOH+BKznu4CBgPWgkXO7XAvMcMl0qKvI= +k8s.io/api v0.30.2/go.mod h1:ULg5g9JvOev2dG0u2hig4Z7tQ2hHIuS+m8MNZ+X6EmI= k8s.io/apiextensions-apiserver v0.29.0 h1:0VuspFG7Hj+SxyF/Z/2T0uFbI5gb5LRgEyUVE3Q4lV0= k8s.io/apiextensions-apiserver v0.29.0/go.mod h1:TKmpy3bTS0mr9pylH0nOt/QzQRrW7/h7yLdRForMZwc= +k8s.io/apiextensions-apiserver v0.30.0 h1:jcZFKMqnICJfRxTgnC4E+Hpcq8UEhT8B2lhBcQ+6uAs= +k8s.io/apiextensions-apiserver v0.30.0/go.mod h1:N9ogQFGcrbWqAY9p2mUAL5mGxsLqwgtUce127VtRX5Y= k8s.io/apimachinery v0.29.2 h1:EWGpfJ856oj11C52NRCHuU7rFDwxev48z+6DSlGNsV8= k8s.io/apimachinery v0.29.2/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= +k8s.io/apimachinery v0.30.0 h1:qxVPsyDM5XS96NIh9Oj6LavoVFYff/Pon9cZeDIkHHA= +k8s.io/apimachinery v0.30.0/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= +k8s.io/apimachinery v0.30.2 h1:fEMcnBj6qkzzPGSVsAZtQThU62SmQ4ZymlXRC5yFSCg= +k8s.io/apimachinery v0.30.2/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= k8s.io/apiserver v0.29.2 h1:+Z9S0dSNr+CjnVXQePG8TcBWHr3Q7BmAr7NraHvsMiQ= k8s.io/apiserver v0.29.2/go.mod h1:B0LieKVoyU7ykQvPFm7XSdIHaCHSzCzQWPFa5bqbeMQ= +k8s.io/apiserver v0.30.0 h1:QCec+U72tMQ+9tR6A0sMBB5Vh6ImCEkoKkTDRABWq6M= +k8s.io/apiserver v0.30.0/go.mod h1:smOIBq8t0MbKZi7O7SyIpjPsiKJ8qa+llcFCluKyqiY= +k8s.io/apiserver v0.30.2 h1:ACouHiYl1yFI2VFI3YGM+lvxgy6ir4yK2oLOsLI1/tw= +k8s.io/apiserver v0.30.2/go.mod h1:BOTdFBIch9Sv0ypSEcUR6ew/NUFGocRFNl72Ra7wTm8= k8s.io/cli-runtime v0.29.0 h1:q2kC3cex4rOBLfPOnMSzV2BIrrQlx97gxHJs21KxKS4= k8s.io/cli-runtime v0.29.0/go.mod h1:VKudXp3X7wR45L+nER85YUzOQIru28HQpXr0mTdeCrk= +k8s.io/cli-runtime v0.30.0 h1:0vn6/XhOvn1RJ2KJOC6IRR2CGqrpT6QQF4+8pYpWQ48= +k8s.io/cli-runtime v0.30.0/go.mod h1:vATpDMATVTMA79sZ0YUCzlMelf6rUjoBzlp+RnoM+cg= k8s.io/client-go v0.29.2 h1:FEg85el1TeZp+/vYJM7hkDlSTFZ+c5nnK44DJ4FyoRg= k8s.io/client-go v0.29.2/go.mod h1:knlvFZE58VpqbQpJNbCbctTVXcd35mMyAAwBdpt4jrA= +k8s.io/client-go v0.30.0 h1:sB1AGGlhY/o7KCyCEQ0bPWzYDL0pwOZO4vAtTSh/gJQ= +k8s.io/client-go v0.30.0/go.mod h1:g7li5O5256qe6TYdAMyX/otJqMhIiGgTapdLchhmOaY= +k8s.io/client-go v0.30.2 h1:sBIVJdojUNPDU/jObC+18tXWcTJVcwyqS9diGdWHk50= +k8s.io/client-go v0.30.2/go.mod h1:JglKSWULm9xlJLx4KCkfLLQ7XwtlbflV6uFFSHTMgVs= k8s.io/component-base v0.29.2 h1:lpiLyuvPA9yV1aQwGLENYyK7n/8t6l3nn3zAtFTJYe8= k8s.io/component-base v0.29.2/go.mod h1:BfB3SLrefbZXiBfbM+2H1dlat21Uewg/5qtKOl8degM= +k8s.io/component-base v0.30.0 h1:cj6bp38g0ainlfYtaOQuRELh5KSYjhKxM+io7AUIk4o= +k8s.io/component-base v0.30.0/go.mod h1:V9x/0ePFNaKeKYA3bOvIbrNoluTSG+fSJKjLdjOoeXQ= +k8s.io/component-base v0.30.2 h1:pqGBczYoW1sno8q9ObExUqrYSKhtE5rW3y6gX88GZII= +k8s.io/component-base v0.30.2/go.mod h1:yQLkQDrkK8J6NtP+MGJOws+/PPeEXNpwFixsUI7h/OE= k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= +k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= +k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20240220201932-37d671a357a5 h1:QSpdNrZ9uRlV0VkqLvVO0Rqg8ioKi3oSw7O5P7pJV8M= k8s.io/kube-openapi v0.0.0-20240220201932-37d671a357a5/go.mod h1:Pa1PvrP7ACSkuX6I7KYomY6cmMA0Tx86waBhDUgoKPw= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/kubectl v0.29.0 h1:Oqi48gXjikDhrBF67AYuZRTcJV4lg2l42GmvsP7FmYI= k8s.io/kubectl v0.29.0/go.mod h1:0jMjGWIcMIQzmUaMgAzhSELv5WtHo2a8pq67DtviAJs= +k8s.io/kubectl v0.30.0 h1:xbPvzagbJ6RNYVMVuiHArC1grrV5vSmmIcSZuCdzRyk= +k8s.io/kubectl v0.30.0/go.mod h1:zgolRw2MQXLPwmic2l/+iHs239L49fhSeICuMhQQXTI= k8s.io/kubelet v0.29.2 h1:bQ2StqkUqPCFNLtGLsb3v3O2LKQHXNMju537zOGboRg= k8s.io/kubelet v0.29.2/go.mod h1:i5orNPqW/fAMrqptbCXFW/vLBBP12TZZc41IrrvF7SY= +k8s.io/kubelet v0.30.2 h1:Ck4E/pHndI20IzDXxS57dElhDGASPO5pzXF7BcKfmCY= +k8s.io/kubelet v0.30.2/go.mod h1:DSwwTbLQmdNkebAU7ypIALR4P9aXZNFwgRmedojUE94= k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak= +k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= oras.land/oras-go v1.2.4 h1:djpBY2/2Cs1PV87GSJlxv4voajVOMZxqqtq9AB8YNvY= oras.land/oras-go v1.2.4/go.mod h1:DYcGfb3YF1nKjcezfX2SNlDAeQFKSXmf+qrFmrh4324= +oras.land/oras-go v1.2.5 h1:XpYuAwAb0DfQsunIyMfeET92emK8km3W4yEzZvUbsTo= +oras.land/oras-go v1.2.5/go.mod h1:PuAwRShRZCsZb7g8Ar3jKKQR/2A/qN+pkYxIOd/FAoo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/kustomize/api v0.16.0 h1:/zAR4FOQDCkgSDmVzV2uiFbuy9bhu3jEzthrHCuvm1g= diff --git a/hack/VERSION b/hack/VERSION index 273f147f..be3c14df 100644 --- a/hack/VERSION +++ b/hack/VERSION @@ -1,4 +1,4 @@ -OLD_DCGM_VERSION=3.3.5 -OLD_EXPORTER_VERSION=3.4.1 -NEW_DCGM_VERSION=3.3.6 -NEW_EXPORTER_VERSION=3.4.2 +OLD_DCGM_VERSION=3.3.6 +OLD_EXPORTER_VERSION=3.4.2 +NEW_DCGM_VERSION=3.3.7 +NEW_EXPORTER_VERSION=3.5.0 diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 7bbd44c8..c78f8434 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -74,6 +74,7 @@ const ( CLIDCGMLogLevel = "dcgm-log-level" CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket" CLIHPCJobMappingDir = "hpc-job-mapping-dir" + CLINvidiaResourceNames = "nvidia-resource-names" ) func NewApp(buildVersion ...string) *cli.App { @@ -237,6 +238,12 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "Path to HPC job mapping file directory used for mapping GPUs to jobs.", EnvVars: []string{"DCGM_HPC_JOB_MAPPING_DIR"}, }, + &cli.StringSliceFlag{ + Name: CLINvidiaResourceNames, + Value: cli.NewStringSlice(), + Usage: "Nvidia resource names for specified GPU type like nvidia.com/a100, nvidia.com/a10.", + EnvVars: []string{"NVIDIA_RESOURCE_NAMES"}, + }, } if runtime.GOOS == "linux" { @@ -631,5 +638,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { DCGMLogLevel: dcgmLogLevel, PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket), HPCJobMappingDir: c.String(CLIHPCJobMappingDir), + NvidiaResourceNames: c.StringSlice(CLINvidiaResourceNames), }, nil } diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go index a9405111..f13c91db 100644 --- a/pkg/dcgmexporter/config.go +++ b/pkg/dcgmexporter/config.go @@ -58,4 +58,5 @@ type Config struct { DCGMLogLevel string PodResourcesKubeletSocket string HPCJobMappingDir string + NvidiaResourceNames []string } diff --git a/pkg/dcgmexporter/dcgm.go b/pkg/dcgmexporter/dcgm.go index a00bd366..e348bf96 100644 --- a/pkg/dcgmexporter/dcgm.go +++ b/pkg/dcgmexporter/dcgm.go @@ -81,7 +81,7 @@ func WatchFieldGroup( return nil } -func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]func(), error) { +func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error) { var err error var cleanups []func() var cleanup func() @@ -120,12 +120,12 @@ func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collect } } - return cleanups, nil + return groups, fieldGroup, cleanups, nil fail: for _, f := range cleanups { f() } - return nil, err + return nil, dcgm.FieldHandle{}, nil, err } diff --git a/pkg/dcgmexporter/expcollector.go b/pkg/dcgmexporter/expcollector.go index 1ddcea87..68778db8 100644 --- a/pkg/dcgmexporter/expcollector.go +++ b/pkg/dcgmexporter/expcollector.go @@ -35,7 +35,7 @@ var expMetricsFormat = ` # HELP {{ $counter.FieldName }} {{ $counter.Help }} # TYPE {{ $counter.FieldName }} {{ $counter.PromType }} {{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} {{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" @@ -78,22 +78,12 @@ type expCollector struct { labelFiller func(map[string]string, int64) // Function to fill labels windowSize int // Window size transformations []Transform // Transformers for metric postprocessing + deviceGroups []dcgm.GroupHandle + deviceFieldGroup dcgm.FieldHandle } func (c *expCollector) getMetrics() (MetricsByCounter, error) { - fieldGroupIdx := expCollectorFieldGroupIdx.Add(1) - - fieldGroupName := fmt.Sprintf("expCollectorFieldGroupName%d", fieldGroupIdx) - fieldsGroup, err := dcgm.FieldGroupCreate(fieldGroupName, c.counterDeviceFields) - if err != nil { - return nil, err - } - - defer func() { - _ = dcgm.FieldGroupDestroy(fieldsGroup) - }() - - err = dcgm.UpdateAllFields() + err := dcgm.UpdateAllFields() if err != nil { return nil, err } @@ -102,18 +92,19 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) { window := time.Now().Add(-time.Duration(c.windowSize) * time.Millisecond) - values, _, err := dcgm.GetValuesSince(dcgm.GroupAllGPUs(), fieldsGroup, window) - if err != nil { - return nil, err - } - - for _, val := range values { - if val.Status == 0 { - if _, exists := mapEntityIDToValues[val.EntityId]; !exists { - mapEntityIDToValues[val.EntityId] = map[int64]int{} - } - for _, v := range c.fieldValueParser(val.Int64()) { - mapEntityIDToValues[val.EntityId][v] += 1 + for _, group := range c.deviceGroups { + values, _, err := dcgm.GetValuesSince(group, c.deviceFieldGroup, window) + if err != nil { + return nil, err + } + for _, val := range values { + if val.Status == 0 { + if _, exists := mapEntityIDToValues[val.EntityId]; !exists { + mapEntityIDToValues[val.EntityId] = map[int64]int{} + } + for _, v := range c.fieldValueParser(val.Int64()) { + mapEntityIDToValues[val.EntityId][v] += 1 + } } } } @@ -174,6 +165,7 @@ func (c *expCollector) createMetric(labels map[string]string, mi MonitoringInfo, GPUUUID: mi.DeviceInfo.UUID, GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), GPUModelName: gpuModel, + GPUPCIBusID: mi.DeviceInfo.PCI.BusID, Hostname: c.hostname, Labels: labels, @@ -257,7 +249,7 @@ func newExpCollector( var err error - collector.cleanups, err = SetupDcgmFieldsWatch(collector.counterDeviceFields, + collector.deviceGroups, collector.deviceFieldGroup, collector.cleanups, err = SetupDcgmFieldsWatch(collector.counterDeviceFields, collector.sysInfo, int64(config.CollectInterval)*1000) if err != nil { diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index dcd640ca..e4cac498 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -56,7 +56,7 @@ func NewDCGMCollector( collector.UseOldNamespace = config.UseOldNamespace collector.ReplaceBlanksInModelName = config.ReplaceBlanksInModelName - cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, + _, _, cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, fieldEntityGroupTypeSystemInfo.SystemInfo, int64(config.CollectInterval)*1000) if err != nil { @@ -201,6 +201,7 @@ func ToSwitchMetric( GPUUUID: "", GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId), GPUModelName: "", + GPUPCIBusID: "", Hostname: hostname, Labels: labels, Attributes: nil, @@ -246,6 +247,7 @@ func ToCPUMetric( GPUUUID: "", GPUDevice: fmt.Sprintf("%d", mi.ParentId), GPUModelName: "", + GPUPCIBusID: "", Hostname: hostname, Labels: labels, Attributes: nil, @@ -311,6 +313,7 @@ func ToMetric( GPUUUID: d.UUID, GPUDevice: fmt.Sprintf("nvidia%d", d.GPU), GPUModelName: gpuModel, + GPUPCIBusID: d.PCI.BusID, Hostname: hostname, Labels: labels, diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index e38b2673..2f38d442 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -95,6 +95,9 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun dev := dcgm.Device{ GPU: 0, UUID: fmt.Sprintf("fake%d", gpuId), + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } return dev, nil @@ -169,7 +172,8 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun for _, metric := range metrics { seenMetrics[metric.Counter.FieldName] = true require.NotEmpty(t, metric.GPU) - + require.NotEmpty(t, metric.GPUUUID) + require.NotEmpty(t, metric.GPUPCIBusID) require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) } @@ -197,6 +201,9 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun GPU: 0, DCGMSupported: "No", UUID: fmt.Sprintf("fake%d", gpuId), + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } return dev, nil @@ -260,7 +267,8 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun for _, metric := range dev { seenMetrics[metric.Counter.FieldName] = true require.NotEmpty(t, metric.GPU) - + require.Empty(t, metric.GPUUUID) + require.Empty(t, metric.GPUPCIBusID) require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) } @@ -295,6 +303,9 @@ func TestToMetric(t *testing.T) { Identifiers: dcgm.DeviceIdentifiers{ Model: "NVIDIA T400 4GB", }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } var instanceInfo *GPUInstanceInfo = nil @@ -324,6 +335,9 @@ func TestToMetric(t *testing.T) { metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] assert.Equal(t, "42", metricValues[0].Value) assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName) + + assert.Equal(t, d.UUID, metricValues[0].GPUUUID) + assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) }) } } @@ -343,6 +357,9 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { Identifiers: dcgm.DeviceIdentifiers{ Model: "NVIDIA T400 4GB", }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } var instanceInfo *GPUInstanceInfo = nil @@ -393,6 +410,9 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Attributes["err_code"]) assert.Contains(t, metricValues[0].Attributes, "err_msg") assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"]) + + assert.Equal(t, d.UUID, metricValues[0].GPUUUID) + assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) }) } } diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index 1a04245b..8fb8d7d2 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -21,6 +21,7 @@ import ( "fmt" "net" "regexp" + "slices" "strings" "time" @@ -147,7 +148,7 @@ func (p *PodMapper) toDeviceToPod( for _, device := range container.GetDevices() { resourceName := device.GetResourceName() - if resourceName != nvidiaResourceName { + if resourceName != nvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) { // Mig resources appear differently than GPU resources if !strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) { continue diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index 7a9b2b86..3b48efe2 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -174,6 +174,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { MetricGPUDevice string MetricMigProfile string PODGPUID string + NvidiaResourceNames []string } testCases := []TestCase{ @@ -232,6 +233,13 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { MetricGPUDevice: "0", GPUInstanceID: 3, }, + { + KubernetesGPUIDType: GPUUID, + ResourceName: "nvidia.com/a100", + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + NvidiaResourceNames: []string{"nvidia.com/a100"}, + }, } for _, tc := range testCases { @@ -272,6 +280,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { podMapper, err := NewPodMapper(&Config{ KubernetesGPUIdType: tc.KubernetesGPUIDType, PodResourcesKubeletSocket: socketPath, + NvidiaResourceNames: tc.NvidiaResourceNames, }) require.NoError(t, err) require.NotNil(t, podMapper) diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index 69312403..fd4b25c0 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -296,7 +296,7 @@ var migMetricsFormat = ` # HELP {{ $counter.FieldName }} {{ $counter.Help }} # TYPE {{ $counter.FieldName }} {{ $counter.PromType }} {{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} {{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index fc4ba0f4..246afe02 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -96,6 +96,7 @@ type Metric struct { GPUUUID string GPUDevice string GPUModelName string + GPUPCIBusID string UUID string diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go index 96b3f9b9..ceaf02d1 100644 --- a/pkg/dcgmexporter/xid_collector_test.go +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -221,16 +221,18 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { })) continue } - assert.Len(t, mv.Label, 8) + assert.Len(t, mv.Label, 9) assert.Equal(t, "gpu", *mv.Label[0].Name) assert.Equal(t, "UUID", *mv.Label[1].Name) - assert.Equal(t, "device", *mv.Label[2].Name) - assert.Equal(t, "modelName", *mv.Label[3].Name) - assert.Equal(t, "Hostname", *mv.Label[4].Name) - assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[5].Name) - assert.Equal(t, "window_size_in_ms", *mv.Label[6].Name) - assert.Equal(t, "xid", *mv.Label[7].Name) - assert.NotEmpty(t, *mv.Label[7].Value) + assert.Equal(t, "pci_bus_id", *mv.Label[2].Name) + assert.NotEmpty(t, *mv.Label[2].Value) + assert.Equal(t, "device", *mv.Label[3].Name) + assert.Equal(t, "modelName", *mv.Label[4].Name) + assert.Equal(t, "Hostname", *mv.Label[5].Name) + assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[6].Name) + assert.Equal(t, "window_size_in_ms", *mv.Label[7].Name) + assert.Equal(t, "xid", *mv.Label[8].Name) + assert.NotEmpty(t, *mv.Label[8].Value) } } diff --git a/scripts/test_coverage.sh b/scripts/test_coverage.sh new file mode 100644 index 00000000..db49bd43 --- /dev/null +++ b/scripts/test_coverage.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +echo "Running unit tests..." +go test $(go list ./... | grep -v "/tests/e2e/") \ + -count=1 \ + -timeout 5m \ + -covermode=count \ + -coverprofile=unit_coverage.out \ + -json > test_results.json + +echo "Merging coverage profiles..." +gocovmerge unit_coverage.out > combined_coverage.out.tmp + +# Remove mocks from coverage +cat combined_coverage.out.tmp | grep -v "mock_" > tests.cov + +# Cleanup +rm combined_coverage.out.tmp unit_coverage.out diff --git a/service-monitor.yaml b/service-monitor.yaml index 7e89b3ee..885ddc41 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.2" + app.kubernetes.io/version: "3.5.0" endpoints: - port: "metrics" path: "/metrics" diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 23214afb..adde1f9b 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -16,7 +16,7 @@ GO_CMD ?= go NAMESPACE ?= "dcgm-exporter" CHART ?= "./../../deployment/" IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter" -IMAGE_TAG ?= "3.3.6-3.4.2-ubuntu22.04" +IMAGE_TAG ?= "3.3.7-3.5.0-ubuntu22.04" KUBECONFIG ?= "~/.kube/config" define TEST_CMD