Skip to content

Commit

Permalink
Update to DCGM 3.3.0 and ubi9
Browse files Browse the repository at this point in the history
  • Loading branch information
glowkey committed Nov 8, 2023
1 parent a8a6a05 commit 05b85eb
Show file tree
Hide file tree
Showing 10 changed files with 75 additions and 621 deletions.
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
FROM nvidia/cuda:12.2.0-base-ubuntu22.04
FROM nvidia/cuda:12.2.2-base-ubuntu22.04

ARG DCGM_VERSION
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends \
gnupg2 curl ca-certificates build-essential && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/7fa2af80.pub | apt-key add - && \
curl -s https://storage.googleapis.com/golang/go1.16.5.linux-amd64.tar.gz| tar -v -C /usr/local -xz && \
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get purge --autoremove -y curl \
&& rm -rf /var/lib/apt/lists/*

Expand Down
18 changes: 14 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
MKDIR ?= mkdir
REGISTRY ?= nvidia

DCGM_VERSION := 3.2.5
DCGM_VERSION := 3.3.0
GOLANG_VERSION := 1.20
VERSION := 3.1.8
VERSION := 3.2.0
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar
PLATFORMS := linux/amd64,linux/arm64
Expand All @@ -30,7 +30,7 @@ NON_TEST_FILES += cmd/dcgm-exporter/main.go
MAIN_TEST_FILES := pkg/dcgmexporter/system_info_test.go

.PHONY: all binary install check-format local
all: ubuntu22.04
all: ubuntu22.04 ubi9

binary:
cd cmd/dcgm-exporter; go build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
Expand All @@ -49,7 +49,7 @@ check-format:

push:
$(MAKE) ubuntu22.04 OUTPUT=type=registry
$(MAKE) ubi8 OUTPUT=type=registry
$(MAKE) ubi9 OUTPUT=type=registry

local:
ifeq ($(shell uname -p),aarch64)
Expand All @@ -66,3 +66,13 @@ ubuntu22.04:
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu22.04" \
--file docker/Dockerfile.ubuntu22.04 .

# Build the dcgm-exporter image from docker/Dockerfile.ubi9 for every platform
# in $(PLATFORMS) and tag it "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi9".
#
# $(OUTPUT) decides where the result goes; the `push` target re-invokes this
# target with OUTPUT=type=registry. VERSION is passed as a build arg so the
# Dockerfile can stamp it into the image's `version` label.
# NOTE(review): $(DOCKERCMD) is defined outside this section -- confirm it
# accepts --output/--platform.
#
# Declared phony because `ubi9` names a command, not a file: without this a
# file or directory called `ubi9` in the tree would make the target look
# up to date and the image would never be rebuilt.
.PHONY: ubi9
ubi9:
	$(DOCKERCMD) --pull \
		--output $(OUTPUT) \
		--platform $(PLATFORMS) \
		--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
		--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
		--build-arg "VERSION=$(FULL_VERSION)" \
		--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi9" \
		--file docker/Dockerfile.ubi9 .
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https

To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
```
$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.2.5-3.1.8-ubuntu20.04
$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04
$ curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.1.8"
app.kubernetes.io/version: "3.2.0"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.1.8"
app.kubernetes.io/version: "3.2.0"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.1.8"
app.kubernetes.io/version: "3.2.0"
name: "dcgm-exporter"
spec:
containers:
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.2.5-3.1.8-ubuntu20.04"
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -64,11 +64,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.1.8"
app.kubernetes.io/version: "3.2.0"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.1.8"
app.kubernetes.io/version: "3.2.0"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
apiVersion: v2
name: dcgm-exporter
description: A Helm chart for DCGM exporter
version: "3.1.8"
version: "3.2.0"
kubeVersion: ">= 1.19.0-0"
appVersion: "3.1.8"
appVersion: "3.2.0"
sources:
- https://github.com/nvidia/dcgm-exporter
home: https://github.com/nvidia/dcgm-exporter/
Expand Down
2 changes: 1 addition & 1 deletion deployment/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ image:
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g.:
tag: 3.2.5-3.1.8-ubuntu20.04
tag: 3.3.0-3.2.0-ubuntu22.04

# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
# to stop profiling metrics from DCGM
Expand Down
41 changes: 41 additions & 0 deletions docker/Dockerfile.ubi9
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# ---------------------------------------------------------------------------
# Stage 1 (builder): compile the dcgm-exporter binary.
# GOLANG_VERSION selects the golang base image tag and is supplied with
# --build-arg.
# ---------------------------------------------------------------------------
ARG GOLANG_VERSION
FROM golang:$GOLANG_VERSION AS builder
WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter

COPY . .

# Runs the repository Makefile's `binary` and `check-format` targets.
RUN make binary check-format

# ---------------------------------------------------------------------------
# Stage 2 (runtime): CUDA base image on UBI 9 with DCGM installed.
# ---------------------------------------------------------------------------
FROM nvcr.io/nvidia/cuda:12.2.2-base-ubi9
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"

# DCGM_VERSION pins the datacenter-gpu-manager package version installed below.
ARG DCGM_VERSION

# Install and clean the dnf caches in the same layer so the downloaded
# package data is not baked into the image.
RUN dnf clean expire-cache && \
    dnf install -y datacenter-gpu-manager-${DCGM_VERSION} libcap && \
    dnf clean all

COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/
COPY etc /etc/dcgm-exporter

ENV NVIDIA_VISIBLE_DEVICES=all
# disable all constraints on the configurations required by NVIDIA container toolkit
ENV NVIDIA_DISABLE_REQUIRE="true"
# Required for DCP metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32

# VERSION is supplied with --build-arg and recorded in the `version` label.
ARG VERSION

# Image metadata. io.k8s.display-name is set once, directly after FROM; a
# second, misspelled copy ("Eporter") used to sit here and overrode the
# correct value in the final image.
LABEL name="NVIDIA DCGM Exporter"
LABEL vendor="NVIDIA"
LABEL version="${VERSION}"
LABEL release="N/A"
LABEL summary="Exports GPU Metrics to Prometheus"
LABEL description="See summary"

# Relative destination: resolved against the working directory inherited
# from the base image (no WORKDIR is set in this stage).
COPY ./LICENSE ./licenses/LICENSE

# Defaults to empty. NOTE(review): presumably read by the entrypoint script to
# decide whether to skip setcap -- confirm in dcgm-exporter-entrypoint.sh.
ENV NO_SETCAP=
COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh

ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
5 changes: 1 addition & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ replace (
)

require (
github.com/NVIDIA/go-dcgm v0.0.0-20230816170901-d898cc7820fe
github.com/NVIDIA/go-dcgm v0.0.0-20231107230254-2e092a7526b3
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48
github.com/gorilla/mux v1.8.0
github.com/sirupsen/logrus v1.9.3
Expand All @@ -47,7 +47,6 @@ require (
github.com/Microsoft/go-winio v0.6.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.8.0 // indirect
github.com/blang/semver v3.5.1+incompatible // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
Expand All @@ -64,7 +63,6 @@ require (
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/googleapis/gnostic v0.5.5 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
Expand All @@ -81,7 +79,6 @@ require (
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/crypto v0.13.0 // indirect
golang.org/x/mod v0.10.0 // indirect
golang.org/x/net v0.15.0 // indirect
golang.org/x/oauth2 v0.12.0 // indirect
Expand Down
Loading

0 comments on commit 05b85eb

Please sign in to comment.