diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 57a8a459..e92d3b63 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -1,6 +1,6 @@ ARG BASEIMAGE=nvcr.io/nvidia/cuda:12.5.0-base-ubuntu22.04 -FROM --platform=$BUILDPLATFORM ubuntu:18.04 as builder +FROM --platform=$BUILDPLATFORM ubuntu:18.04 AS builder ARG GOLANG_VERSION=1.22.4 @@ -43,10 +43,8 @@ ENV PATH $PATH:/usr/local/go/bin COPY . . ARG TARGETOS ARG TARGETARCH -RUN --mount=type=cache,target=/root/.cache/go-build \ - --mount=type=cache,target=/go/pkg \ - # when building aarch64 we have to target aarch64-linux-gnu-gcc compiler - if [ "$TARGETARCH" = "arm64" ]; then \ +# when building aarch64 we have to target aarch64-linux-gnu-gcc compiler +RUN if [ "$TARGETARCH" = "arm64" ]; then \ export CC=aarch64-linux-gnu-gcc; \ export LD_LIBRARY_PATH=/usr/aarch64-linux-gnu/lib:$LD_LIBRARY_PATH; \ fi && \ @@ -83,12 +81,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/cache/debconf/* /var/lib/apt/lists/* /var/log/* /tmp/* /var/tmp/* \ && rm -rf /usr/share/doc && rm -rf /usr/share/man # Required for DCP metrics -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,compat32 # disable all constraints on the configurations required by NVIDIA container toolkit -ENV NVIDIA_DISABLE_REQUIRE="true" -ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DISABLE_REQUIRE "true" +ENV NVIDIA_VISIBLE_DEVICES all -ENV NO_SETCAP= +ENV NO_SETCAP "" COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh diff --git a/internal/pkg/appconfig/types.go b/internal/pkg/appconfig/types.go index 2db32870..6d369845 100644 --- a/internal/pkg/appconfig/types.go +++ b/internal/pkg/appconfig/types.go @@ -55,4 +55,5 @@ type Config struct { DCGMLogLevel string PodResourcesKubeletSocket string HPCJobMappingDir string + NvidiaResourceNames []string } diff --git a/internal/pkg/transformation/kubernetes.go b/internal/pkg/transformation/kubernetes.go index 846d71a4..bab6a6a8 100644 --- a/internal/pkg/transformation/kubernetes.go +++ b/internal/pkg/transformation/kubernetes.go @@ -21,6 +21,7 @@ import ( "fmt" "net" "regexp" + "slices" "strings" "time" @@ -149,7 +150,7 @@ func (p *PodMapper) toDeviceToPod( for _, device := range container.GetDevices() { resourceName := device.GetResourceName() - if resourceName != appconfig.NvidiaResourceName { + if resourceName != appconfig.NvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) { // Mig resources appear differently than GPU resources if !strings.HasPrefix(resourceName, appconfig.NvidiaMigResourcePrefix) { continue diff --git a/internal/pkg/transformation/kubernetes_test.go b/internal/pkg/transformation/kubernetes_test.go index e7ac38a7..1f82c713 100644 --- a/internal/pkg/transformation/kubernetes_test.go +++ b/internal/pkg/transformation/kubernetes_test.go @@ -50,6 +50,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { MetricGPUDevice string MetricMigProfile string PODGPUID string + NvidiaResourceNames []string } testCases := []TestCase{ @@ -108,6 +109,13 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { MetricGPUDevice: "0", GPUInstanceID: 3, }, + { + KubernetesGPUIDType: appconfig.GPUUID, + ResourceName: "nvidia.com/a100", + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + NvidiaResourceNames: []string{"nvidia.com/a100"}, + }, } for _, tc := range testCases { @@ -151,6 +159,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { podMapper := NewPodMapper(&appconfig.Config{ KubernetesGPUIdType: tc.KubernetesGPUIDType, PodResourcesKubeletSocket: socketPath, + NvidiaResourceNames: tc.NvidiaResourceNames, }) require.NotNil(t, podMapper) metrics := collector.MetricsByCounter{} diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index a5ab7e2b..2dc9c97c 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -86,6 +86,7 @@ const ( CLIDCGMLogLevel = "dcgm-log-level" CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket" CLIHPCJobMappingDir = "hpc-job-mapping-dir" + CLINvidiaResourceNames = "nvidia-resource-names" ) func NewApp(buildVersion ...string) *cli.App { @@ -249,6 +250,12 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "Path to HPC job mapping file directory used for mapping GPUs to jobs.", EnvVars: []string{"DCGM_HPC_JOB_MAPPING_DIR"}, }, + &cli.StringSliceFlag{ + Name: CLINvidiaResourceNames, + Value: cli.NewStringSlice(), + Usage: "Nvidia resource names for specified GPU type like nvidia.com/a100, nvidia.com/a10.", + EnvVars: []string{"NVIDIA_RESOURCE_NAMES"}, + }, } if runtime.GOOS == "linux" { @@ -591,5 +598,6 @@ func contextToConfig(c *cli.Context) (*appconfig.Config, error) { DCGMLogLevel: dcgmLogLevel, PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket), HPCJobMappingDir: c.String(CLIHPCJobMappingDir), + NvidiaResourceNames: c.StringSlice(CLINvidiaResourceNames), }, nil } diff --git a/tests/integration/start_read_test.go b/tests/integration/start_read_test.go index daf59a6d..a398af6b 100644 --- a/tests/integration/start_read_test.go +++ b/tests/integration/start_read_test.go @@ -38,7 +38,7 @@ func TestStartAndReadMetrics(t *testing.T) { } app := cmd.NewApp() args := os.Args[0:1] - args = append(args, "-f=../../etc/default-counters.csv") // Append a file with default counters + args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters port := getRandomAvailablePort(t) args = append(args, fmt.Sprintf("-a=:%d", port)) ctx, cancel := context.WithCancel(context.Background()) diff --git a/tests/integration/start_with_tls_test.go b/tests/integration/start_with_tls_test.go index eaaf0d03..532cff5e 100644 --- a/tests/integration/start_with_tls_test.go +++ b/tests/integration/start_with_tls_test.go @@ -21,7 +21,7 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) { } app := cmd.NewApp() args := os.Args[0:1] - args = append(args, "-f=../../etc/default-counters.csv") // Append a file with default counters + args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters port := getRandomAvailablePort(t) args = append(args, fmt.Sprintf("-a=:%d", port)) args = append(args, "--web-config-file=./testdata/web-config.yml") diff --git a/tests/integration/testdata/default-counters.csv b/tests/integration/testdata/default-counters.csv new file mode 100644 index 00000000..ab5e545d --- /dev/null +++ b/tests/integration/testdata/default-counters.csv @@ -0,0 +1,77 @@ +# Format +# If line starts with a '#' it is considered a comment +# DCGM FIELD, Prometheus metric type, help message + +# Clocks +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). +# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param). + +# Temperature +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + +# PCIE +# DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +# DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +# DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). + +# ECC +# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information. These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device