Skip to content

Commit

Permalink
The issue 250 dcgm-exporter on main does not report metrics for all GPUs
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov committed Feb 8, 2024
1 parent 8c80c51 commit 56285f3
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 20 deletions.
33 changes: 14 additions & 19 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,8 @@ func (c *DCGMCollector) Cleanup() {

func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) {
monitoringInfo := GetMonitoredEntities(c.SysInfo)
count := len(monitoringInfo)

metrics := make(map[Counter][]Metric, count)
metrics := make(map[Counter][]Metric)

for _, mi := range monitoringInfo {
var vals []dcgm.FieldValue_v1
Expand All @@ -114,11 +113,12 @@ func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) {

// InstanceInfo will be nil for GPUs
if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK {
metrics = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
ToSwitchMetric(metrics, vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
} else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE {
metrics = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
ToCPUMetric(metrics, vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
} else {
metrics = ToMetric(vals,
ToMetric(metrics,
vals,
c.Counters,
mi.DeviceInfo,
mi.InstanceInfo,
Expand Down Expand Up @@ -153,8 +153,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
return c[0], fmt.Errorf("Could not find corresponding counter")
}

func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) map[Counter][]Metric {
metrics := make(map[Counter][]Metric)
func ToSwitchMetric(metrics map[Counter][]Metric,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
labels := map[string]string{}

for _, val := range values {
Expand Down Expand Up @@ -194,12 +194,10 @@ func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo,

metrics[m.Counter] = append(metrics[m.Counter], m)
}

return metrics
}

func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) map[Counter][]Metric {
metrics := make(map[Counter][]Metric)
func ToCPUMetric(metrics map[Counter][]Metric,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
var labels = map[string]string{}

for _, val := range values {
Expand Down Expand Up @@ -239,19 +237,18 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us

metrics[m.Counter] = append(metrics[m.Counter], m)
}

return metrics
}

func ToMetric(values []dcgm.FieldValue_v1,
func ToMetric(
metrics map[Counter][]Metric,
values []dcgm.FieldValue_v1,
c []Counter,
d dcgm.Device,
instanceInfo *GPUInstanceInfo,
useOld bool,
hostname string,
replaceBlanksInModelName bool,
) map[Counter][]Metric {
metrics := make(map[Counter][]Metric)
) {
var labels = map[string]string{}

for _, val := range values {
Expand Down Expand Up @@ -305,10 +302,8 @@ func ToMetric(values []dcgm.FieldValue_v1,
m.GPUInstanceID = ""
}

metrics[m.Counter] = []Metric{m}
metrics[m.Counter] = append(metrics[m.Counter], m)
}

return metrics
}

func ToString(value dcgm.FieldValue_v1) string {
Expand Down
64 changes: 63 additions & 1 deletion pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,8 @@ func TestToMetric(t *testing.T) {

for _, tc := range testCases {
t.Run(fmt.Sprintf("When replaceBlanksInModelName is %t", tc.replaceBlanksInModelName), func(t *testing.T) {
metrics := ToMetric(values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName)
metrics := make(map[Counter][]Metric)
ToMetric(metrics, values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName)
assert.Len(t, metrics, 1)
// We get metric value with 0 index
metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
Expand All @@ -287,3 +288,64 @@ func TestToMetric(t *testing.T) {
})
}
}

func TestGPUCollector_GetMetrics(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)

runOnlyWithLiveGPUs(t)
// Create fake GPU
numGPUs, err := dcgm.GetAllDeviceCount()
require.NoError(t, err)

if numGPUs+1 > dcgm.MAX_NUM_DEVICES {
t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES)
}

entityList := []dcgm.MigHierarchyInfo{
{Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}},
{Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}},
{Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}},
}

gpuIDs, err := dcgm.CreateFakeEntities(entityList)
require.NoError(t, err)
require.NotEmpty(t, gpuIDs)

numGPUs, err = dcgm.GetAllDeviceCount()
require.NoError(t, err)

counters := []Counter{
{
FieldID: 100,
FieldName: "DCGM_FI_DEV_SM_CLOCK",
PromType: "gauge",
Help: "SM clock frequency (in MHz).",
},
}

dOpt := DeviceOptions{
Flex: true,
MajorRange: []int{-1},
MinorRange: []int{-1},
}
cfg := Config{
GPUDevices: dOpt,
NoHostname: false,
UseOldNamespace: false,
UseFakeGPUs: false,
}

c, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_GPU)
require.NoError(t, err)

defer cleanup()

out, err := c.GetMetrics()
require.NoError(t, err)
require.Len(t, out, 1)

values := out[counters[0]]

require.Equal(t, numGPUs, uint(len(values)))
}
9 changes: 9 additions & 0 deletions pkg/dcgmexporter/test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,12 @@ func setupTest(t *testing.T) func(t *testing.T) {
defer cleanup()
}
}

func runOnlyWithLiveGPUs(t *testing.T) {
t.Helper()
gpus, err := dcgm.GetSupportedDevices()
assert.NoError(t, err)
if len(gpus) < 1 {
t.Skip("Skipping test that requires live GPUs. None were found")
}
}
1 change: 1 addition & 0 deletions pkg/dcgmexporter/xid_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
func TestXIDCollector_Gather_Encode(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)
runOnlyWithLiveGPUs(t)

hostname := "local-test"
config := &Config{
Expand Down

0 comments on commit 56285f3

Please sign in to comment.