Skip to content

Commit

Permalink
DCGM-Exporter release version 3.3.7-3.5.0 (#366)
Browse files Browse the repository at this point in the history
Co-authored-by: Vadym Fedorov <[email protected]>
  • Loading branch information
glowkey and nvvfedorov authored Jul 24, 2024
1 parent b4552f0 commit 6d499c6
Show file tree
Hide file tree
Showing 17 changed files with 228 additions and 98 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ tests.cov
test_results.json
.scannerwork
dist/

.run/
###############################################################################
# JetBrains
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
Expand Down Expand Up @@ -236,4 +236,4 @@ $RECYCLE.BIN/
*.msp

# Windows shortcuts
*.lnk
*.lnk
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ MKDIR ?= mkdir
GOLANGCILINT_TIMEOUT ?= 10m

DCGM_VERSION := $(NEW_DCGM_VERSION)
GOLANG_VERSION := 1.21.5
GOLANG_VERSION := 1.22.5
VERSION := $(NEW_EXPORTER_VERSION)
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/dev/null
Expand Down Expand Up @@ -78,7 +78,8 @@ test-integration:
go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/

test-coverage:
gocov test ./... | gocov report
sh scripts/test_coverage.sh
gocov convert tests.cov | gocov report

.PHONY: lint
lint:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:

```shell
docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04
docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04
curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.2"
app.kubernetes.io/version: "3.5.0"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.2"
app.kubernetes.io/version: "3.5.0"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.2"
app.kubernetes.io/version: "3.5.0"
name: "dcgm-exporter"
spec:
containers:
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04"
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -64,11 +64,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.2"
app.kubernetes.io/version: "3.5.0"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.2"
app.kubernetes.io/version: "3.5.0"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
apiVersion: v2
name: dcgm-exporter
description: A Helm chart for DCGM exporter
version: "3.4.2"
version: "3.5.0"
kubeVersion: ">= 1.19.0-0"
appVersion: "3.4.2"
appVersion: "3.5.0"
sources:
- https://github.com/nvidia/dcgm-exporter
home: https://github.com/nvidia/dcgm-exporter/
Expand Down
2 changes: 1 addition & 1 deletion deployment/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ image:
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g:
tag: 3.3.6-3.4.2-ubuntu22.04
tag: 3.3.7-3.5.0-ubuntu22.04

# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
# to stop profiling metrics from DCGM
Expand Down
20 changes: 14 additions & 6 deletions docker/Dockerfile.ubi9
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM nvcr.io/nvidia/cuda:12.4.1-base-ubi9 AS builder
ARG GOLANG_VERSION
FROM nvcr.io/nvidia/cuda:12.5.1-base-ubi9 AS builder
ARG GOLANG_VERSION=1.22.4
WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter
RUN set -eux; \
dnf clean expire-cache; \
Expand Down Expand Up @@ -40,7 +40,7 @@ COPY . .

RUN make binary check-format

FROM nvcr.io/nvidia/cuda:12.4.1-base-ubi9
FROM nvcr.io/nvidia/cuda:12.5.1-base-ubi9
ARG DCGM_VERSION
ARG VERSION
ARG DIST_DIR
Expand All @@ -53,9 +53,17 @@ LABEL release="N/A"
LABEL summary="Exports GPU Metrics to Prometheus"
LABEL description="See summary"

RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ubi-9-baseos-rpms -y && rm -rf /var/cache/yum \
&& dnf clean expire-cache && dnf install -y datacenter-gpu-manager-${DCGM_VERSION} libcap \
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite
RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ubi-9-baseos-rpms -y \
&& dnf install --nodocs --setopt=install_weak_deps=False -y datacenter-gpu-manager-${DCGM_VERSION} libcap \
&& dnf -y clean all\
&& rm -rf /var/cache/yum\
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \
# DCGM exporter doesn't use libdcgm_cublas_proxy*.so.
&& rm -rf /usr/lib64/libdcgm_cublas_proxy*.so \
&& rm -rf /usr/local/dcgm/scripts \
&& rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \
&& rm -rf /var/lib/rpm/rpmdb.sqlite /var/cache/* /var/lib/dnf/history.* /var/log/* /tmp/* /var/tmp/* \
&& rm -rf /usr/share/doc && rm -rf /usr/share/man

COPY ./LICENSE ./licenses/LICENSE
COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/
Expand Down
17 changes: 12 additions & 5 deletions docker/Dockerfile.ubuntu22.04
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 AS builder
ARG GOLANG_VERSION=1.21.5
FROM nvcr.io/nvidia/cuda:12.5.1-base-ubuntu22.04 AS builder
ARG GOLANG_VERSION=1.22.4
WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter
RUN set -eux; \
apt-get update; \
Expand Down Expand Up @@ -45,7 +45,7 @@ COPY . .

RUN make binary check-format

FROM nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04
FROM nvcr.io/nvidia/cuda:12.5.1-base-ubuntu22.04

ARG VERSION
ARG DCGM_VERSION
Expand All @@ -65,9 +65,16 @@ COPY etc /etc/dcgm-exporter

RUN apt-get update && apt-get install -y --no-install-recommends \
datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl \
&& apt-get -y clean \
&& apt-get -y autoclean \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \
# DCGM exporter doesn't use libdcgm_cublas_proxy*.so.
&& rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \
&& rm -rf /usr/local/dcgm/scripts \
&& rm -f /usr/include/*.h /usr/bin/DcgmProfTesterKernels.ptx /usr/bin/dcgmproftester* \
&& rm -rf /var/cache/debconf/* /var/lib/apt/lists/* /var/log/* /tmp/* /var/tmp/* \
&& rm -rf /usr/share/doc && rm -rf /usr/share/man
# Required for DCP metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
# disable all constraints on the configurations required by NVIDIA container toolkit
Expand Down
73 changes: 37 additions & 36 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
module github.com/NVIDIA/dcgm-exporter

go 1.21
go 1.22.0

toolchain go1.22.4

require (
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f
github.com/NVIDIA/go-nvml v0.12.0-2
github.com/avast/retry-go/v4 v4.5.1
github.com/bits-and-blooms/bitset v1.13.0
github.com/go-kit/log v0.2.1
github.com/google/uuid v1.5.0
github.com/google/uuid v1.6.0
github.com/gorilla/mux v1.8.1
github.com/mittwald/go-helm-client v0.12.8
github.com/mittwald/go-helm-client v0.12.9
github.com/onsi/ginkgo/v2 v2.15.0
github.com/onsi/gomega v1.32.0
github.com/prometheus/client_model v0.6.0
Expand All @@ -20,13 +22,14 @@ require (
github.com/stretchr/testify v1.8.4
github.com/urfave/cli/v2 v2.27.1
go.uber.org/automaxprocs v1.5.3
golang.org/x/sync v0.5.0
google.golang.org/grpc v1.61.1
k8s.io/api v0.29.2
k8s.io/apimachinery v0.29.2
k8s.io/client-go v0.29.2
k8s.io/kubelet v0.29.2
k8s.io/utils v0.0.0-20240102154912-e7106e64919e
go.uber.org/mock v0.4.0
golang.org/x/sync v0.7.0
google.golang.org/grpc v1.64.0
k8s.io/api v0.30.2
k8s.io/apimachinery v0.30.2
k8s.io/client-go v0.30.2
k8s.io/kubelet v0.30.2
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0
)

require (
Expand All @@ -44,20 +47,19 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/chai2010/gettext-go v1.0.2 // indirect
github.com/containerd/containerd v1.7.11 // indirect
github.com/containerd/containerd v1.7.12 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect
github.com/cyphar/filepath-securejoin v0.2.4 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/distribution/reference v0.5.0 // indirect
github.com/docker/cli v24.0.7+incompatible // indirect
github.com/docker/cli v26.1.4+incompatible // indirect
github.com/docker/distribution v2.8.3+incompatible // indirect
github.com/docker/docker v24.0.7+incompatible // indirect
github.com/docker/docker v26.1.4+incompatible // indirect
github.com/docker/docker-credential-helpers v0.8.0 // indirect
github.com/docker/go-connections v0.4.0 // indirect
github.com/docker/go-connections v0.5.0 // indirect
github.com/docker/go-metrics v0.0.1 // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/emicklei/go-restful/v3 v3.11.1 // indirect
github.com/evanphx/json-patch v5.7.0+incompatible // indirect
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect
Expand All @@ -74,7 +76,7 @@ require (
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.2 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
Expand Down Expand Up @@ -111,12 +113,11 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
github.com/morikuni/aec v1.0.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0-rc5 // indirect
github.com/opencontainers/image-spec v1.1.0-rc6 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
Expand All @@ -140,32 +141,32 @@ require (
go.opentelemetry.io/otel/metric v1.21.0 // indirect
go.opentelemetry.io/otel/trace v1.21.0 // indirect
go.starlark.net v0.0.0-20231121155337-90ade8b19d09 // indirect
go.uber.org/mock v0.4.0 // indirect
golang.org/x/crypto v0.18.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect
golang.org/x/net v0.20.0 // indirect
golang.org/x/oauth2 v0.16.0 // indirect
golang.org/x/sys v0.16.0 // indirect
golang.org/x/term v0.16.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/mod v0.17.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/oauth2 v0.18.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/term v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.16.1 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/evanphx/json-patch.v5 v5.7.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
helm.sh/helm/v3 v3.14.2 // indirect
k8s.io/apiextensions-apiserver v0.29.0 // indirect
k8s.io/apiserver v0.29.2 // indirect
k8s.io/cli-runtime v0.29.0 // indirect
k8s.io/component-base v0.29.2 // indirect
k8s.io/klog/v2 v2.110.1 // indirect
k8s.io/kube-openapi v0.0.0-20240220201932-37d671a357a5 // indirect
k8s.io/kubectl v0.29.0 // indirect
oras.land/oras-go v1.2.4 // indirect
helm.sh/helm/v3 v3.15.2 // indirect
k8s.io/apiextensions-apiserver v0.30.0 // indirect
k8s.io/apiserver v0.30.2 // indirect
k8s.io/cli-runtime v0.30.0 // indirect
k8s.io/component-base v0.30.2 // indirect
k8s.io/klog/v2 v2.120.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
k8s.io/kubectl v0.30.0 // indirect
oras.land/oras-go v1.2.5 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/kustomize/api v0.16.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.16.0 // indirect
Expand Down
Loading

0 comments on commit 6d499c6

Please sign in to comment.