Skip to content

Commit

Permalink
Make nvidia resource names configurable (#359)
Browse files Browse the repository at this point in the history
  • Loading branch information
nvvfedorov committed Jul 17, 2024
1 parent 84ada94 commit 0525b9b
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 12 deletions.
16 changes: 7 additions & 9 deletions docker/Dockerfile.ubuntu
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.5.0-base-ubuntu22.04

FROM --platform=$BUILDPLATFORM ubuntu:18.04 as builder
FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder

ARG GOLANG_VERSION=1.22.4

Expand Down Expand Up @@ -43,10 +43,8 @@ ENV PATH $PATH:/usr/local/go/bin
COPY . .
ARG TARGETOS
ARG TARGETARCH
RUN --mount=type=cache,target=/root/.cache/go-build \
--mount=type=cache,target=/go/pkg \
# when building aarch64 we have to target aarch64-linux-gnu-gcc compiler
if [ "$TARGETARCH" = "arm64" ]; then \
# When building for aarch64, we have to target the aarch64-linux-gnu-gcc compiler.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
export CC=aarch64-linux-gnu-gcc; \
export LD_LIBRARY_PATH=/usr/aarch64-linux-gnu/lib:$LD_LIBRARY_PATH; \
fi && \
Expand Down Expand Up @@ -83,12 +81,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/cache/debconf/* /var/lib/apt/lists/* /var/log/* /tmp/* /var/tmp/* \
&& rm -rf /usr/share/doc && rm -rf /usr/share/man
# Required for DCP metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,compat32
# disable all constraints on the configurations required by NVIDIA container toolkit
ENV NVIDIA_DISABLE_REQUIRE="true"
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DISABLE_REQUIRE "true"
ENV NVIDIA_VISIBLE_DEVICES all

ENV NO_SETCAP=
ENV NO_SETCAP ""
COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh

Expand Down
1 change: 1 addition & 0 deletions internal/pkg/appconfig/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,5 @@ type Config struct {
DCGMLogLevel string
PodResourcesKubeletSocket string
HPCJobMappingDir string
NvidiaResourceNames []string
}
3 changes: 2 additions & 1 deletion internal/pkg/transformation/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"net"
"regexp"
"slices"
"strings"
"time"

Expand Down Expand Up @@ -149,7 +150,7 @@ func (p *PodMapper) toDeviceToPod(
for _, device := range container.GetDevices() {

resourceName := device.GetResourceName()
if resourceName != appconfig.NvidiaResourceName {
if resourceName != appconfig.NvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) {
// MIG resources appear differently than GPU resources
if !strings.HasPrefix(resourceName, appconfig.NvidiaMigResourcePrefix) {
continue
Expand Down
9 changes: 9 additions & 0 deletions internal/pkg/transformation/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
MetricGPUDevice string
MetricMigProfile string
PODGPUID string
NvidiaResourceNames []string
}

testCases := []TestCase{
Expand Down Expand Up @@ -108,6 +109,13 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
MetricGPUDevice: "0",
GPUInstanceID: 3,
},
{
KubernetesGPUIDType: appconfig.GPUUID,
ResourceName: "nvidia.com/a100",
MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
NvidiaResourceNames: []string{"nvidia.com/a100"},
},
}

for _, tc := range testCases {
Expand Down Expand Up @@ -151,6 +159,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
podMapper := NewPodMapper(&appconfig.Config{
KubernetesGPUIdType: tc.KubernetesGPUIDType,
PodResourcesKubeletSocket: socketPath,
NvidiaResourceNames: tc.NvidiaResourceNames,
})
require.NotNil(t, podMapper)
metrics := collector.MetricsByCounter{}
Expand Down
8 changes: 8 additions & 0 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ const (
CLIDCGMLogLevel = "dcgm-log-level"
CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket"
CLIHPCJobMappingDir = "hpc-job-mapping-dir"
CLINvidiaResourceNames = "nvidia-resource-names"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -249,6 +250,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Path to HPC job mapping file directory used for mapping GPUs to jobs.",
EnvVars: []string{"DCGM_HPC_JOB_MAPPING_DIR"},
},
&cli.StringSliceFlag{
Name: CLINvidiaResourceNames,
Value: cli.NewStringSlice(),
Usage: "Nvidia resource names for specified GPU type like nvidia.com/a100, nvidia.com/a10.",
EnvVars: []string{"NVIDIA_RESOURCE_NAMES"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -591,5 +598,6 @@ func contextToConfig(c *cli.Context) (*appconfig.Config, error) {
DCGMLogLevel: dcgmLogLevel,
PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket),
HPCJobMappingDir: c.String(CLIHPCJobMappingDir),
NvidiaResourceNames: c.StringSlice(CLINvidiaResourceNames),
}, nil
}
2 changes: 1 addition & 1 deletion tests/integration/start_read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func TestStartAndReadMetrics(t *testing.T) {
}
app := cmd.NewApp()
args := os.Args[0:1]
args = append(args, "-f=../../etc/default-counters.csv") // Append a file with default counters
args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters
port := getRandomAvailablePort(t)
args = append(args, fmt.Sprintf("-a=:%d", port))
ctx, cancel := context.WithCancel(context.Background())
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/start_with_tls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) {
}
app := cmd.NewApp()
args := os.Args[0:1]
args = append(args, "-f=../../etc/default-counters.csv") // Append a file with default counters
args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters
port := getRandomAvailablePort(t)
args = append(args, fmt.Sprintf("-a=:%d", port))
args = append(args, "--web-config-file=./testdata/web-config.yml")
Expand Down
77 changes: 77 additions & 0 deletions tests/integration/testdata/default-counters.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Format
# If a line starts with a '#', it is considered a comment
# DCGM FIELD, Prometheus metric type, help message

# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).

# Power
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).

# PCIE
# DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
# DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
# DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.

# Utilization (the sample period varies depending on the product)
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).

# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).
# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).

# ECC
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.

# Retired pages
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.

# NVLink
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes

# VGPU License status
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status

# Remapped rows
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

# Static configuration information. These appear as labels on the other metrics
DCGM_FI_DRIVER_VERSION, label, Driver Version
# DCGM_FI_NVML_VERSION, label, NVML Version
# DCGM_FI_DEV_BRAND, label, Device Brand
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device

0 comments on commit 0525b9b

Please sign in to comment.