Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DCGM-Exporter 4.0.0 #437

Merged
merged 1 commit into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 39 additions & 31 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,39 @@
FROM nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04
ARG GOLANG_VERSION=1.21.5
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
ARG GOLANG_VERSION=1.22.5
ARG USERNAME=developer
ARG USER_UID=1000
ARG USER_GID=1000
ARG DCGM_VERSION=3.3.3
# Create a user 'developer' with UID=1000, add to 'developer' group, and add to 'sudo' group
RUN groupadd -g $USER_GID $USERNAME && \
useradd -m -u $USER_GID -g $USERNAME -s /bin/bash $USERNAME && \
usermod -aG sudo $USERNAME
useradd -m -u $USER_GID -g $USERNAME -s /bin/bash $USERNAME && \
usermod -aG sudo $USERNAME
# Allow 'developer' to use sudo without a password
RUN echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

RUN --mount=type=cache,target=/var/cache/apt \
set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
git \
ca-certificates \
g++ \
gcc \
libc6-dev \
make \
pkg-config \
wget \
datacenter-gpu-manager=1:${DCGM_VERSION} \
libcap2-bin \
&& apt-get autoremove -y \
git \
ca-certificates \
g++ \
gcc \
libc6-dev \
make \
pkg-config \
wget \
datacenter-gpu-manager-4-core \
libcap2-bin \
&& install -m 0755 -d /etc/apt/keyrings \
&& wget -O /etc/apt/keyrings/docker.asc https://download.docker.com/linux/ubuntu/gpg \
&& chmod a+r /etc/apt/keyrings/docker.asc \
&& echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
tee /etc/apt/sources.list.d/docker.list > /dev/null \
&& apt-get update \
&& apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io docker-buildx-plugin \
&& apt-get autoremove -y \
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \
# DCGM exporter doesn't use libdcgm_cublas_proxy*.so.
&& rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \
Expand All @@ -36,25 +44,25 @@ RUN set -eux; \
url=; \
echo "$arch"; \
case "$arch" in \
'amd64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
;; \
'arm64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
;; \
*) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
'amd64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
;; \
'arm64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
;; \
*) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
esac; \
build=; \
if [ -z "$url" ]; then \
# https://github.com/golang/go/issues/38536#issuecomment-616897960
build=1; \
url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
echo >&2; \
echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
echo >&2; \
# https://github.com/golang/go/issues/38536#issuecomment-616897960
build=1; \
url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
echo >&2; \
echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
echo >&2; \
fi; \
wget -O go.tgz "$url" --progress=dot:giga; \
tar -C /usr/local -xzf go.tgz; \
wget -O go.tgz "$url" --progress=dot:giga; \
tar -C /usr/local -xzf go.tgz; \
rm go.tgz
ENV GOTOOLCHAIN=local
ENV GOPATH /go
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v2
with:
go-version: 1.21
go-version: 1.22

- name: Build
run: make binary
Expand Down
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ tests.cov
test_results.json
.scannerwork
dist/
.run/
.run
dist/

###############################################################################
# JetBrains
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
Expand Down Expand Up @@ -236,4 +238,4 @@ $RECYCLE.BIN/
*.msp

# Windows shortcuts
*.lnk
*.lnk
22 changes: 14 additions & 8 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,18 @@
"request": "launch",
"mode": "test",
"program": "${workspaceFolder}/tests/e2e",
"args": ["-test.v",
"--ginkgo.v",
"-kubeconfig","~/.kube/config",
"-chart","./../../deployment/",
"-image-repository","nvidia/dcgm-exporter",
"-arguments","{-f=/etc/dcgm-exporter/default-counters.csv,--enable-dcgm-log=true,--dcgm-log-level=ERROR}"],
"args": [
"-test.v",
"--ginkgo.v",
"-kubeconfig",
"~/.kube/config",
"-chart",
"./../../deployment/",
"-image-repository",
"nvidia/dcgm-exporter",
"-arguments",
"{-f=/etc/dcgm-exporter/default-counters.csv}"
],
"env": {},
"buildFlags": "-tags=e2e"
},
Expand All @@ -30,8 +36,8 @@
"-f",
"./etc/default-counters.csv",
"--debug",
"--enable-dcgm-log",
"--dcgm-log-level=INFO"
"-r",
"localhost:5555"
]
}
]
Expand Down
64 changes: 0 additions & 64 deletions Jenkinsfile

This file was deleted.

76 changes: 62 additions & 14 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,29 @@ REGISTRY ?= nvidia
GO ?= go
MKDIR ?= mkdir
GOLANGCILINT_TIMEOUT ?= 10m
IMAGE_TAG ?= ""

DCGM_VERSION := $(NEW_DCGM_VERSION)
GOLANG_VERSION := 1.22.5
GOLANG_VERSION := 1.22.9
VERSION := $(NEW_EXPORTER_VERSION)
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/dev/null
PLATFORMS := linux/amd64,linux/arm64
DOCKERCMD := docker buildx build
DOCKERCMD := docker --debug buildx build
MODULE := github.com/NVIDIA/dcgm-exporter


.PHONY: all binary install check-format local
all: update-version ubuntu22.04 ubi9

binary: generate update-version
binary: update-version
cd cmd/dcgm-exporter; $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"

test-main:
test-main: generate
$(GO) test ./... -short

install: binary
install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter
install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv
install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv

check-format:
test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0
Expand All @@ -58,23 +57,71 @@ else
$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
endif

TARGETS = ubuntu22.04 ubi9
# Image targets. Each distro family is a pattern rule that forwards to the
# shared --docker-build-% rule; the values that rule reads (DOCKERFILE,
# BASE_IMAGE, IMAGE_TAG) are supplied here as pattern-/target-specific
# variables, which GNU Make also applies to a target's prerequisites.

# Any target named ubi<stem> builds from the UBI Dockerfile.
ubi%: DOCKERFILE = docker/Dockerfile.ubi
# The lone '@' is an empty, silenced recipe line — presumably there only so
# the pattern rule has a recipe; the real work is in --docker-build-<stem>.
ubi%: --docker-build-%
@
# Settings specific to the concrete ubi9 target.
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubi9
ubi9: IMAGE_TAG = ubi9

# Any target named ubuntu<stem> builds from the Ubuntu Dockerfile.
ubuntu%: DOCKERFILE = docker/Dockerfile.ubuntu
# Same forwarding rule as for ubi%: empty recipe, work done by the prerequisite.
ubuntu%: --docker-build-%
@
# Settings specific to the concrete ubuntu22.04 target.
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
ubuntu22.04: IMAGE_TAG = ubuntu22.04

DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04
DOCKERFILE.ubi9 = docker/Dockerfile.ubi9

$(TARGETS):
--docker-build-%:
@echo "Building for $@"
DOCKER_BUILDKIT=1 \
$(DOCKERCMD) --pull \
--output $(OUTPUT) \
--progress=plain \
--platform $(PLATFORMS) \
--build-arg BASEIMAGE="$(BASE_IMAGE)" \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--build-arg "VERSION=$(VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \
--file $(DOCKERFILE.$@) .
--tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if $(IMAGE_TAG),-$(IMAGE_TAG)) \
--file $(DOCKERFILE) .

.PHONY: packages package-arm64 package-amd64
# Produce the distributable archive for every supported architecture.
packages: package-amd64 package-arm64

# One sub-make per architecture: the stem after "package-" is the arch part of
# the PLATFORMS value handed to package-build.
package-arm64 package-amd64: package-%:
	$(MAKE) package-build PLATFORMS=linux/$*

# package-build: build the ubuntu22.04 image for the platform given in
# PLATFORMS, copy /usr/bin/dcgm-exporter and /etc/dcgm-exporter out of a
# container created from that image, add LICENSE and the systemd unit file,
# and pack the result into dist/dcgm_exporter-linux-<arch>-$(VERSION).tar.gz.
#
# PLATFORMS must be a single "os/arch" value (package-amd64 / package-arm64
# pass linux/amd64 and linux/arm64). For the archive name the arch part is
# mapped amd64 -> x86-64 and arm64 -> sbsa; other values are used as-is.
#
# The whole recipe is one shell invocation (backslash-continued) because the
# later steps rely on the ARCH, DIST_NAME, COMPONENT_NAME and I shell variables.
.PHONY: package-build
package-build: IMAGE_TAG = ubuntu22.04
package-build:
	ARCH=$$(echo $(PLATFORMS) | cut -d'/' -f2); \
	if [ "$$ARCH" = "amd64" ]; then \
		ARCH="x86-64"; \
	fi; \
	if [ "$$ARCH" = "arm64" ]; then \
		ARCH="sbsa"; \
	fi; \
	export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \
	export COMPONENT_NAME="dcgm_exporter"; \
	$(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \
	I=$$(docker create $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$(IMAGE_TAG)) && \
	docker cp $$I:/usr/bin/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin/ && \
	docker cp $$I:/etc/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/ && \
	cp ./LICENSE /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/ && \
	cp ./packaging/config-files/systemd/nvidia-dcgm-exporter.service \
		/tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/nvidia-dcgm-exporter.service && \
	docker rm -f $$I && \
	$(MKDIR) -p $(CURDIR)/dist && \
	cd "/tmp/$$DIST_NAME" && tar -czf $(CURDIR)/dist/$$DIST_NAME.tar.gz `ls -A` && \
	rm -rf "/tmp/$$DIST_NAME";

.PHONY: integration
test-integration:
test-integration: generate
go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/

test-coverage:
Expand All @@ -83,7 +130,7 @@ test-coverage:

.PHONY: lint
lint:
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1

.PHONY: validate-modules
validate-modules:
Expand All @@ -99,6 +146,7 @@ tools: ## Install required tools and utilities
go install github.com/axw/gocov/gocov@latest
go install golang.org/x/tools/cmd/goimports@latest
go install mvdan.cc/gofumpt@latest
go install github.com/wadey/gocovmerge@latest

fmt:
find . -name '*.go' | xargs gofumpt -l -w
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:

```shell
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04
curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down Expand Up @@ -111,8 +111,9 @@ To enable GPU-to-job mapping on the DCGM-exporter side, users must run the DCGM-

In order to build dcgm-exporter ensure you have the following:

* [Golang >= 1.21 installed](https://golang.org/)
* [Golang >= 1.22 installed](https://golang.org/)
* [DCGM installed](https://developer.nvidia.com/dcgm)
* A Linux machine with a DCGM-compatible GPU.

```shell
git clone https://github.com/NVIDIA/dcgm-exporter.git
Expand Down
Loading
Loading