From a3d0469c82b7a7889417622838966b643e213b4e Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Fri, 2 Feb 2024 13:41:50 -0600 Subject: [PATCH] Added new command line parameter: replace-blanks-in-model-name Signed-off-by: Vadym Fedorov --- pkg/cmd/app.go | 77 ++++++++++++++------------ pkg/dcgmexporter/gpu_collector.go | 40 ++++++++++--- pkg/dcgmexporter/gpu_collector_test.go | 60 ++++++++++++++++++++ pkg/dcgmexporter/types.go | 52 ++++++++--------- 4 files changed, 162 insertions(+), 67 deletions(-) diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 388bf60b..fb64028a 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -46,22 +46,23 @@ const ( ) const ( - CLIFieldsFile = "collectors" - CLIAddress = "address" - CLICollectInterval = "collect-interval" - CLIKubernetes = "kubernetes" - CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" - CLIUseOldNamespace = "use-old-namespace" - CLIRemoteHEInfo = "remote-hostengine-info" - CLIGPUDevices = "devices" - CLISwitchDevices = "switch-devices" - CLICPUDevices = "cpu-devices" - CLINoHostname = "no-hostname" - CLIUseFakeGPUs = "fake-gpus" - CLIConfigMapData = "configmap-data" - CLIWebSystemdSocket = "web-systemd-socket" - CLIWebConfigFile = "web-config-file" - CLIXIDCountWindowSize = "xid-count-window-size" + CLIFieldsFile = "collectors" + CLIAddress = "address" + CLICollectInterval = "collect-interval" + CLIKubernetes = "kubernetes" + CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" + CLIUseOldNamespace = "use-old-namespace" + CLIRemoteHEInfo = "remote-hostengine-info" + CLIGPUDevices = "devices" + CLISwitchDevices = "switch-devices" + CLICPUDevices = "cpu-devices" + CLINoHostname = "no-hostname" + CLIUseFakeGPUs = "fake-gpus" + CLIConfigMapData = "configmap-data" + CLIWebSystemdSocket = "web-systemd-socket" + CLIWebConfigFile = "web-config-file" + CLIXIDCountWindowSize = "xid-count-window-size" + CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" ) func NewApp(buildVersion ...string) *cli.App { @@ -182,6 +183,13 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.", EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"}, }, + &cli.BoolFlag{ + Name: CLIReplaceBlanksInModelName, + Aliases: []string{"rbmn"}, + Value: false, + Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.", + EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"}, + }, } if runtime.GOOS == "linux" { @@ -399,23 +407,24 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { } return &dcgmexporter.Config{ - CollectorsFile: c.String(CLIFieldsFile), - Address: c.String(CLIAddress), - CollectInterval: c.Int(CLICollectInterval), - Kubernetes: c.Bool(CLIKubernetes), - KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), - CollectDCP: true, - UseOldNamespace: c.Bool(CLIUseOldNamespace), - UseRemoteHE: c.IsSet(CLIRemoteHEInfo), - RemoteHEInfo: c.String(CLIRemoteHEInfo), - GPUDevices: gOpt, - SwitchDevices: sOpt, - CPUDevices: cOpt, - NoHostname: c.Bool(CLINoHostname), - UseFakeGPUs: c.Bool(CLIUseFakeGPUs), - ConfigMapData: c.String(CLIConfigMapData), - WebSystemdSocket: c.Bool(CLIWebSystemdSocket), - WebConfigFile: c.String(CLIWebConfigFile), - XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), + CollectorsFile: c.String(CLIFieldsFile), + Address: c.String(CLIAddress), + CollectInterval: c.Int(CLICollectInterval), + Kubernetes: c.Bool(CLIKubernetes), + KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), + CollectDCP: true, + UseOldNamespace: c.Bool(CLIUseOldNamespace), + UseRemoteHE: c.IsSet(CLIRemoteHEInfo), + RemoteHEInfo: c.String(CLIRemoteHEInfo), + GPUDevices: gOpt, + SwitchDevices: sOpt, + CPUDevices: cOpt, + NoHostname: c.Bool(CLINoHostname), + UseFakeGPUs: c.Bool(CLIUseFakeGPUs), + ConfigMapData: c.String(CLIConfigMapData), + WebSystemdSocket: c.Bool(CLIWebSystemdSocket), + WebConfigFile: c.String(CLIWebConfigFile), + XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), + ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), }, nil } diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index a3cc28cf..6bc4ce0d 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -19,6 +19,7 @@ package dcgmexporter import ( "fmt" "os" + "strings" "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" @@ -39,11 +40,12 @@ func NewDCGMCollector(c []Counter, config *Config, hostname string, entityType d } collector := &DCGMCollector{ - Counters: c, - DeviceFields: deviceFields, - UseOldNamespace: config.UseOldNamespace, - SysInfo: *sysInfo, - Hostname: hostname, + Counters: c, + DeviceFields: deviceFields, + UseOldNamespace: config.UseOldNamespace, + SysInfo: *sysInfo, + Hostname: hostname, + ReplaceBlanksInModelName: config.ReplaceBlanksInModelName, } cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, *sysInfo, int64(config.CollectInterval)*1000) @@ -116,7 +118,13 @@ func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) { } else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE { metrics = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else { - metrics = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) + metrics = ToMetric(vals, + c.Counters, + mi.DeviceInfo, + mi.InstanceInfo, + c.UseOldNamespace, + c.Hostname, + c.ReplaceBlanksInModelName) } } @@ -235,7 +243,14 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us return metrics } -func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string) map[Counter][]Metric { +func ToMetric(values []dcgm.FieldValue_v1, + c []Counter, + d dcgm.Device, + instanceInfo *GPUInstanceInfo, + useOld bool, + hostname string, + replaceBlanksInModelName bool, +) map[Counter][]Metric { metrics := make(map[Counter][]Metric) var labels = map[string]string{} @@ -259,6 +274,15 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI if useOld { uuid = "uuid" } + + gpuModel := d.Identifiers.Model + + if replaceBlanksInModelName { + parts := strings.Fields(gpuModel) + gpuModel = strings.Join(parts, " ") + gpuModel = strings.ReplaceAll(gpuModel, " ", "-") + } + m := Metric{ Counter: counter, Value: v, @@ -267,7 +291,7 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI GPU: fmt.Sprintf("%d", d.GPU), GPUUUID: d.UUID, GPUDevice: fmt.Sprintf("nvidia%d", d.GPU), - GPUModelName: d.Identifiers.Model, + GPUModelName: gpuModel, Hostname: hostname, Labels: labels, diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index 760e3072..97199f14 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -18,9 +18,11 @@ package dcgmexporter import ( "fmt" + "reflect" "testing" "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -227,3 +229,61 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun return c, cleanup } + +func TestToMetric(t *testing.T) { + + fieldValue := [4096]byte{} + fieldValue[0] = 42 + values := []dcgm.FieldValue_v1{ + { + FieldId: 150, + FieldType: dcgm.DCGM_FT_INT64, + Value: fieldValue, + }, + } + + c := []Counter{ + { + FieldID: 150, + FieldName: "DCGM_FI_DEV_GPU_TEMP", + PromType: "gauge", + Help: "Temperature Help info", + }, + } + + d := dcgm.Device{ + UUID: "fake0", + Identifiers: dcgm.DeviceIdentifiers{ + Model: "NVIDIA T400 4GB", + }, + } + + var instanceInfo *GPUInstanceInfo = nil + + type testCase struct { + replaceBlanksInModelName bool + expectedGPUModelName string + } + + testCases := []testCase{ + { + replaceBlanksInModelName: true, + expectedGPUModelName: "NVIDIA-T400-4GB", + }, + { + replaceBlanksInModelName: false, + expectedGPUModelName: "NVIDIA T400 4GB", + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("When replaceBlanksInModelName is %t", tc.replaceBlanksInModelName), func(t *testing.T) { + metrics := ToMetric(values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName) + assert.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] + assert.Equal(t, "42", metricValues[0].Value) + assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName) + }) + } +} diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index 9844c7ed..db4b1f5e 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -60,25 +60,26 @@ type DeviceOptions struct { } type Config struct { - CollectorsFile string - Address string - CollectInterval int - Kubernetes bool - KubernetesGPUIdType KubernetesGPUIDType - CollectDCP bool - UseOldNamespace bool - UseRemoteHE bool - RemoteHEInfo string - GPUDevices DeviceOptions - SwitchDevices DeviceOptions - CPUDevices DeviceOptions - NoHostname bool - UseFakeGPUs bool - ConfigMapData string - MetricGroups []dcgm.MetricGroup - WebSystemdSocket bool - WebConfigFile string - XIDCountWindowSize int + CollectorsFile string + Address string + CollectInterval int + Kubernetes bool + KubernetesGPUIdType KubernetesGPUIDType + CollectDCP bool + UseOldNamespace bool + UseRemoteHE bool + RemoteHEInfo string + GPUDevices DeviceOptions + SwitchDevices DeviceOptions + CPUDevices DeviceOptions + NoHostname bool + UseFakeGPUs bool + ConfigMapData string + MetricGroups []dcgm.MetricGroup + WebSystemdSocket bool + WebConfigFile string + XIDCountWindowSize int + ReplaceBlanksInModelName bool } type Transform interface { @@ -105,12 +106,13 @@ type MetricsPipeline struct { } type DCGMCollector struct { - Counters []Counter - DeviceFields []dcgm.Short - Cleanups []func() - UseOldNamespace bool - SysInfo SystemInfo - Hostname string + Counters []Counter + DeviceFields []dcgm.Short + Cleanups []func() + UseOldNamespace bool + SysInfo SystemInfo + Hostname string + ReplaceBlanksInModelName bool } type Counter struct {