From 75c2dbad6b13c67dc0037e6a6700e9c959e1343f Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Thu, 8 Feb 2024 15:21:49 -0600 Subject: [PATCH] The issue 250 dcgm-exporter on main does not report metrics for all GPUs Signed-off-by: Vadym Fedorov --- .github/workflows/go.yml | 2 + pkg/dcgmexporter/gpu_collector.go | 33 ++++++------- pkg/dcgmexporter/gpu_collector_test.go | 64 +++++++++++++++++++++++++- pkg/dcgmexporter/test_utils.go | 9 ++++ pkg/dcgmexporter/xid_collector_test.go | 1 + 5 files changed, 89 insertions(+), 20 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index eb423060..25904dfc 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -23,3 +23,5 @@ jobs: - name: Lint run: make check-format + + diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index 6bc4ce0d..293641d8 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -90,9 +90,8 @@ func (c *DCGMCollector) Cleanup() { func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) { monitoringInfo := GetMonitoredEntities(c.SysInfo) - count := len(monitoringInfo) - metrics := make(map[Counter][]Metric, count) + metrics := make(map[Counter][]Metric) for _, mi := range monitoringInfo { var vals []dcgm.FieldValue_v1 @@ -114,11 +113,12 @@ func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) { // InstanceInfo will be nil for GPUs if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK { - metrics = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) + ToSwitchMetric(metrics, vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE { - metrics = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) + ToCPUMetric(metrics, vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else { - metrics = ToMetric(vals, + ToMetric(metrics, + vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, @@ -153,8 +153,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) { return c[0], fmt.Errorf("Could not find corresponding counter") } -func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) map[Counter][]Metric { - metrics := make(map[Counter][]Metric) +func ToSwitchMetric(metrics map[Counter][]Metric, + values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) { labels := map[string]string{} for _, val := range values { @@ -194,12 +194,10 @@ func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, metrics[m.Counter] = append(metrics[m.Counter], m) } - - return metrics } -func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) map[Counter][]Metric { - metrics := make(map[Counter][]Metric) +func ToCPUMetric(metrics map[Counter][]Metric, + values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) { var labels = map[string]string{} for _, val := range values { @@ -239,19 +237,18 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us metrics[m.Counter] = append(metrics[m.Counter], m) } - - return metrics } -func ToMetric(values []dcgm.FieldValue_v1, +func ToMetric( + metrics map[Counter][]Metric, + values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string, replaceBlanksInModelName bool, -) map[Counter][]Metric { - metrics := make(map[Counter][]Metric) +) { var labels = map[string]string{} for _, val := range values { @@ -305,10 +302,8 @@ func ToMetric(values []dcgm.FieldValue_v1, m.GPUInstanceID = "" } - metrics[m.Counter] = []Metric{m} + metrics[m.Counter] = append(metrics[m.Counter], m) } - - return metrics } func ToString(value dcgm.FieldValue_v1) string { diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index 97199f14..fe9acf19 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -278,7 +278,8 @@ func TestToMetric(t *testing.T) { for _, tc := range testCases { t.Run(fmt.Sprintf("When replaceBlanksInModelName is %t", tc.replaceBlanksInModelName), func(t *testing.T) { - metrics := ToMetric(values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName) + metrics := make(map[Counter][]Metric) + ToMetric(metrics, values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName) assert.Len(t, metrics, 1) // We get metric value with 0 index metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] @@ -287,3 +288,64 @@ func TestToMetric(t *testing.T) { }) } } + +func TestGPUCollector_GetMetrics(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + + runOnlyWithLiveGPUs(t) + // Create fake GPU + numGPUs, err := dcgm.GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgm.CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + numGPUs, err = dcgm.GetAllDeviceCount() + require.NoError(t, err) + + counters := []Counter{ + { + FieldID: 100, + FieldName: "DCGM_FI_DEV_SM_CLOCK", + PromType: "gauge", + Help: "SM clock frequency (in MHz).", + }, + } + + dOpt := DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + } + cfg := Config{ + GPUDevices: dOpt, + NoHostname: false, + UseOldNamespace: false, + UseFakeGPUs: false, + } + + c, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_GPU) + require.NoError(t, err) + + defer cleanup() + + out, err := c.GetMetrics() + require.NoError(t, err) + require.Len(t, out, 1) + + values := out[counters[0]] + + require.Equal(t, numGPUs, uint(len(values))) +} diff --git a/pkg/dcgmexporter/test_utils.go b/pkg/dcgmexporter/test_utils.go index ebac00c4..6c13aeaf 100644 --- a/pkg/dcgmexporter/test_utils.go +++ b/pkg/dcgmexporter/test_utils.go @@ -31,3 +31,12 @@ func setupTest(t *testing.T) func(t *testing.T) { defer cleanup() } } + +func runOnlyWithLiveGPUs(t *testing.T) { + t.Helper() + gpus, err := dcgm.GetSupportedDevices() + assert.NoError(t, err) + if len(gpus) < 1 { + t.Skip("Skipping test that requires live GPUs. None were found") + } +} diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go index b1c0eb79..430e9cd9 100644 --- a/pkg/dcgmexporter/xid_collector_test.go +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -33,6 +33,7 @@ import ( func TestXIDCollector_Gather_Encode(t *testing.T) { teardownTest := setupTest(t) defer teardownTest(t) + runOnlyWithLiveGPUs(t) hostname := "local-test" config := &Config{