diff --git a/README.md b/README.md index dcd112f4..8134b5df 100644 --- a/README.md +++ b/README.md @@ -177,7 +177,7 @@ This project uses [docker buildx](https://docs.docker.com/buildx/working-with-bu Builds local images based on the machine architecture and makes them available in 'docker images' -``` +```shell make local ``` diff --git a/deployment/templates/metrics-configmap.yaml b/deployment/templates/metrics-configmap.yaml index 67a1c774..3bb37ddf 100644 --- a/deployment/templates/metrics-configmap.yaml +++ b/deployment/templates/metrics-configmap.yaml @@ -4,6 +4,10 @@ metadata: name: exporter-metrics-config-map namespace: {{ include "dcgm-exporter.namespace" . }} data: +{{- if .Values.customMetrics }} + metrics: | +{{- .Values.customMetrics | nindent 4 }} +{{- else }} metrics: | # Format # If line starts with a '#' it is considered a comment @@ -83,3 +87,4 @@ data: # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. 
+{{- end }} \ No newline at end of file diff --git a/deployment/values.yaml b/deployment/values.yaml index 7490f46c..9679bf70 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -73,8 +73,8 @@ podLabels: {} # Annotations to be added to dcgm-exporter pods podAnnotations: {} # Using this annotation which is required for prometheus scraping - # prometheus.io/scrape: "true" - # prometheus.io/port: "9400" + # prometheus.io/scrape: "true" + # prometheus.io/port: "9400" # The SecurityContext for the dcgm-exporter pods podSecurityContext: {} @@ -85,7 +85,7 @@ securityContext: runAsNonRoot: false runAsUser: 0 capabilities: - add: ["SYS_ADMIN"] + add: ["SYS_ADMIN"] # readOnlyRootFilesystem: true # Defines the dcgm-exporter service @@ -138,15 +138,18 @@ extraHostVolumes: [] #- name: host-binaries # hostPath: /opt/bin -extraConfigMapVolumes: [] -#- name: exporter-metrics-volume -# configMap: -# name: exporter-metrics-config-map +extraConfigMapVolumes: + - name: exporter-metrics-volume + configMap: + name: exporter-metrics-config-map + items: + - key: metrics + path: dcp-metrics-included.csv -extraVolumeMounts: [] -#- name: host-binaries -# mountPath: /opt/bin -# readOnly: true +extraVolumeMounts: + - name: exporter-metrics-volume + mountPath: /etc/dcgm-exporter/dcp-metrics-included.csv + subPath: dcp-metrics-included.csv extraEnv: [] #- name: EXTRA_VAR @@ -154,3 +157,85 @@ extraEnv: [] # Path to the kubelet socket for /pod-resources kubeletPath: "/var/lib/kubelet/pod-resources" + +# Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list. +# Must be the complete list and is not additive. If unset, the default list will take effect. +# customMetrics: | + # Format + # If line starts with a '#' it is considered a comment + # DCGM FIELD, Prometheus metric type, help message + + # Clocks + # DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + # DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 
+ + # Temperature + # DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). + # DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + + # Power + # DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + # DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + + # PCIE + # DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. + # DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. + # DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + + # Utilization (the sample period varies depending on the product) + # DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + # DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). + # DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). + # DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %). + + # Errors and violations + # DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. + # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). + # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). + # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). + # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). + # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). + # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + + # Memory usage + # DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + # DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + + # ECC + # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. + # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 
+ # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. + # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + + # Retired pages + # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. + # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. + # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + + # NVLink + # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. + # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. + # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. + # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. + # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + + # VGPU License status + # DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + + # Remapped rows + # DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors + # DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors + # DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + + # DCP metrics + # DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. + # DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. 
+ # DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. + # DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. + # DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.