From 9ee63de39af4e5b186b960eadad9cb37574b4a35 Mon Sep 17 00:00:00 2001 From: Garen Date: Mon, 10 Jun 2024 13:13:03 +0800 Subject: [PATCH] feat: add pci_bus_id label for metrics Signed-off-by: Garen Fang --- pkg/dcgmexporter/expcollector.go | 3 ++- pkg/dcgmexporter/gpu_collector.go | 3 +++ pkg/dcgmexporter/gpu_collector_test.go | 24 ++++++++++++++++++++++-- pkg/dcgmexporter/pipeline.go | 2 +- pkg/dcgmexporter/types.go | 1 + pkg/dcgmexporter/xid_collector_test.go | 18 ++++++++++-------- 6 files changed, 39 insertions(+), 12 deletions(-) diff --git a/pkg/dcgmexporter/expcollector.go b/pkg/dcgmexporter/expcollector.go index 1ddcea87..be92f241 100644 --- a/pkg/dcgmexporter/expcollector.go +++ b/pkg/dcgmexporter/expcollector.go @@ -35,7 +35,7 @@ var expMetricsFormat = ` # HELP {{ $counter.FieldName }} {{ $counter.Help }} # TYPE {{ $counter.FieldName }} {{ $counter.PromType }} {{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} {{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" @@ -174,6 +174,7 @@ func (c *expCollector) createMetric(labels map[string]string, mi MonitoringInfo, GPUUUID: mi.DeviceInfo.UUID, GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), GPUModelName: gpuModel, + GPUPCIBusID: mi.DeviceInfo.PCI.BusID, Hostname: c.hostname, Labels: labels, diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index dcd640ca..bed71866 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -201,6 +201,7 @@ func ToSwitchMetric( GPUUUID: "", GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId), GPUModelName: "", + GPUPCIBusID: "", Hostname: hostname, Labels: labels, Attributes: nil, @@ -246,6 +247,7 @@ func ToCPUMetric( GPUUUID: "", GPUDevice: fmt.Sprintf("%d", mi.ParentId), GPUModelName: "", + GPUPCIBusID: "", Hostname: hostname, Labels: labels, Attributes: nil, @@ -311,6 +313,7 @@ func ToMetric( GPUUUID: d.UUID, GPUDevice: fmt.Sprintf("nvidia%d", d.GPU), GPUModelName: gpuModel, + GPUPCIBusID: d.PCI.BusID, Hostname: hostname, Labels: labels, diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index e38b2673..2f38d442 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -95,6 +95,9 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun dev := dcgm.Device{ GPU: 0, UUID: fmt.Sprintf("fake%d", gpuId), + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } return dev, nil @@ -169,7 +172,8 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun for _, metric := range metrics { seenMetrics[metric.Counter.FieldName] = true require.NotEmpty(t, metric.GPU) - + require.NotEmpty(t, metric.GPUUUID) + require.NotEmpty(t, metric.GPUPCIBusID) require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) } @@ -197,6 +201,9 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun GPU: 0, DCGMSupported: "No", UUID: fmt.Sprintf("fake%d", gpuId), + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } return dev, nil @@ -260,7 +267,8 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun for _, metric := range dev { seenMetrics[metric.Counter.FieldName] = true require.NotEmpty(t, metric.GPU) - + require.Empty(t, metric.GPUUUID) + require.Empty(t, metric.GPUPCIBusID) require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) } @@ -295,6 +303,9 @@ func TestToMetric(t *testing.T) { Identifiers: dcgm.DeviceIdentifiers{ Model: "NVIDIA T400 4GB", }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } var instanceInfo *GPUInstanceInfo = nil @@ -324,6 +335,9 @@ func TestToMetric(t *testing.T) { metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] assert.Equal(t, "42", metricValues[0].Value) assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName) + + assert.Equal(t, d.UUID, metricValues[0].GPUUUID) + assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) }) } } @@ -343,6 +357,9 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { Identifiers: dcgm.DeviceIdentifiers{ Model: "NVIDIA T400 4GB", }, + PCI: dcgm.PCIInfo{ + BusID: "00000000:0000:0000.0", + }, } var instanceInfo *GPUInstanceInfo = nil @@ -393,6 +410,9 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Attributes["err_code"]) assert.Contains(t, metricValues[0].Attributes, "err_msg") assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"]) + + assert.Equal(t, d.UUID, metricValues[0].GPUUUID) + assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID) }) } } diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index 69312403..fd4b25c0 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -296,7 +296,7 @@ var migMetricsFormat = ` # HELP {{ $counter.FieldName }} {{ $counter.Help }} # TYPE {{ $counter.FieldName }} {{ $counter.PromType }} {{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} {{- range $k, $v := $metric.Labels -}} ,{{ $k }}="{{ $v }}" diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index fc4ba0f4..246afe02 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -96,6 +96,7 @@ type Metric struct { GPUUUID string GPUDevice string GPUModelName string + GPUPCIBusID string UUID string diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go index 96b3f9b9..ceaf02d1 100644 --- a/pkg/dcgmexporter/xid_collector_test.go +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -221,16 +221,18 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { })) continue } - assert.Len(t, mv.Label, 8) + assert.Len(t, mv.Label, 9) assert.Equal(t, "gpu", *mv.Label[0].Name) assert.Equal(t, "UUID", *mv.Label[1].Name) - assert.Equal(t, "device", *mv.Label[2].Name) - assert.Equal(t, "modelName", *mv.Label[3].Name) - assert.Equal(t, "Hostname", *mv.Label[4].Name) - assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[5].Name) - assert.Equal(t, "window_size_in_ms", *mv.Label[6].Name) - assert.Equal(t, "xid", *mv.Label[7].Name) - assert.NotEmpty(t, *mv.Label[7].Value) + assert.Equal(t, "pci_bus_id", *mv.Label[2].Name) + assert.NotEmpty(t, *mv.Label[2].Value) + assert.Equal(t, "device", *mv.Label[3].Name) + assert.Equal(t, "modelName", *mv.Label[4].Name) + assert.Equal(t, "Hostname", *mv.Label[5].Name) + assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[6].Name) + assert.Equal(t, "window_size_in_ms", *mv.Label[7].Name) + assert.Equal(t, "xid", *mv.Label[8].Name) + assert.NotEmpty(t, *mv.Label[8].Value) } }