Skip to content

Commit

Permalink
feat: add pci_bus_id label for metrics (#326)
Browse files — browse the repository at this point in the history
Signed-off-by: Garen Fang <[email protected]>
  • Loading branch information
fungaren authored Jun 10, 2024
1 parent 478fab1 commit 961ee35
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 12 deletions.
3 changes: 2 additions & 1 deletion pkg/dcgmexporter/expcollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ var expMetricsFormat = `
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
Expand Down Expand Up @@ -174,6 +174,7 @@ func (c *expCollector) createMetric(labels map[string]string, mi MonitoringInfo,
GPUUUID: mi.DeviceInfo.UUID,
GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU),
GPUModelName: gpuModel,
GPUPCIBusID: mi.DeviceInfo.PCI.BusID,
Hostname: c.hostname,

Labels: labels,
Expand Down
3 changes: 3 additions & 0 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ func ToSwitchMetric(
GPUUUID: "",
GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId),
GPUModelName: "",
GPUPCIBusID: "",
Hostname: hostname,
Labels: labels,
Attributes: nil,
Expand Down Expand Up @@ -246,6 +247,7 @@ func ToCPUMetric(
GPUUUID: "",
GPUDevice: fmt.Sprintf("%d", mi.ParentId),
GPUModelName: "",
GPUPCIBusID: "",
Hostname: hostname,
Labels: labels,
Attributes: nil,
Expand Down Expand Up @@ -311,6 +313,7 @@ func ToMetric(
GPUUUID: d.UUID,
GPUDevice: fmt.Sprintf("nvidia%d", d.GPU),
GPUModelName: gpuModel,
GPUPCIBusID: d.PCI.BusID,
Hostname: hostname,

Labels: labels,
Expand Down
24 changes: 22 additions & 2 deletions pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
dev := dcgm.Device{
GPU: 0,
UUID: fmt.Sprintf("fake%d", gpuId),
PCI: dcgm.PCIInfo{
BusID: "00000000:0000:0000.0",
},
}

return dev, nil
Expand Down Expand Up @@ -169,7 +172,8 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
for _, metric := range metrics {
seenMetrics[metric.Counter.FieldName] = true
require.NotEmpty(t, metric.GPU)

require.NotEmpty(t, metric.GPUUUID)
require.NotEmpty(t, metric.GPUPCIBusID)
require.NotEmpty(t, metric.Value)
require.NotEqual(t, metric.Value, FailedToConvert)
}
Expand Down Expand Up @@ -197,6 +201,9 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
GPU: 0,
DCGMSupported: "No",
UUID: fmt.Sprintf("fake%d", gpuId),
PCI: dcgm.PCIInfo{
BusID: "00000000:0000:0000.0",
},
}

return dev, nil
Expand Down Expand Up @@ -260,7 +267,8 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
for _, metric := range dev {
seenMetrics[metric.Counter.FieldName] = true
require.NotEmpty(t, metric.GPU)

require.Empty(t, metric.GPUUUID)
require.Empty(t, metric.GPUPCIBusID)
require.NotEmpty(t, metric.Value)
require.NotEqual(t, metric.Value, FailedToConvert)
}
Expand Down Expand Up @@ -295,6 +303,9 @@ func TestToMetric(t *testing.T) {
Identifiers: dcgm.DeviceIdentifiers{
Model: "NVIDIA T400 4GB",
},
PCI: dcgm.PCIInfo{
BusID: "00000000:0000:0000.0",
},
}

var instanceInfo *GPUInstanceInfo = nil
Expand Down Expand Up @@ -324,6 +335,9 @@ func TestToMetric(t *testing.T) {
metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
assert.Equal(t, "42", metricValues[0].Value)
assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName)

assert.Equal(t, d.UUID, metricValues[0].GPUUUID)
assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID)
})
}
}
Expand All @@ -343,6 +357,9 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) {
Identifiers: dcgm.DeviceIdentifiers{
Model: "NVIDIA T400 4GB",
},
PCI: dcgm.PCIInfo{
BusID: "00000000:0000:0000.0",
},
}

var instanceInfo *GPUInstanceInfo = nil
Expand Down Expand Up @@ -393,6 +410,9 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) {
assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Attributes["err_code"])
assert.Contains(t, metricValues[0].Attributes, "err_msg")
assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"])

assert.Equal(t, d.UUID, metricValues[0].GPUUUID)
assert.Equal(t, d.PCI.BusID, metricValues[0].GPUPCIBusID)
})
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/dcgmexporter/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ var migMetricsFormat = `
# HELP {{ $counter.FieldName }} {{ $counter.Help }}
# TYPE {{ $counter.FieldName }} {{ $counter.PromType }}
{{- range $metric := $metrics }}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",pci_bus_id="{{ $metric.GPUPCIBusID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}}
{{- range $k, $v := $metric.Labels -}}
,{{ $k }}="{{ $v }}"
Expand Down
1 change: 1 addition & 0 deletions pkg/dcgmexporter/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ type Metric struct {
GPUUUID string
GPUDevice string
GPUModelName string
GPUPCIBusID string

UUID string

Expand Down
18 changes: 10 additions & 8 deletions pkg/dcgmexporter/xid_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,16 +221,18 @@ func TestXIDCollector_Gather_Encode(t *testing.T) {
}))
continue
}
assert.Len(t, mv.Label, 8)
assert.Len(t, mv.Label, 9)
assert.Equal(t, "gpu", *mv.Label[0].Name)
assert.Equal(t, "UUID", *mv.Label[1].Name)
assert.Equal(t, "device", *mv.Label[2].Name)
assert.Equal(t, "modelName", *mv.Label[3].Name)
assert.Equal(t, "Hostname", *mv.Label[4].Name)
assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[5].Name)
assert.Equal(t, "window_size_in_ms", *mv.Label[6].Name)
assert.Equal(t, "xid", *mv.Label[7].Name)
assert.NotEmpty(t, *mv.Label[7].Value)
assert.Equal(t, "pci_bus_id", *mv.Label[2].Name)
assert.NotEmpty(t, *mv.Label[2].Value)
assert.Equal(t, "device", *mv.Label[3].Name)
assert.Equal(t, "modelName", *mv.Label[4].Name)
assert.Equal(t, "Hostname", *mv.Label[5].Name)
assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[6].Name)
assert.Equal(t, "window_size_in_ms", *mv.Label[7].Name)
assert.Equal(t, "xid", *mv.Label[8].Name)
assert.NotEmpty(t, *mv.Label[8].Value)
}
}

Expand Down

0 comments on commit 961ee35

Please sign in to comment.