diff --git a/.gitignore b/.gitignore index 0a1af636..8d41b92f 100644 --- a/.gitignore +++ b/.gitignore @@ -232,4 +232,4 @@ $RECYCLE.BIN/ *.msp # Windows shortcuts -*.lnk +*.lnk \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..a4f0acbf --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Run Debug", + "type": "go", + "request": "launch", + "mode": "debug", + "cwd": "${workspaceRoot}", + "program": "cmd/dcgm-exporter/main.go", + "args": [ + "-f", + "./etc/default-counters.csv", + "--debug" + ] + } + ] +} \ No newline at end of file diff --git a/etc/default-counters.csv b/etc/default-counters.csv index 92dabab3..db55f406 100644 --- a/etc/default-counters.csv +++ b/etc/default-counters.csv @@ -5,6 +5,7 @@ # Clocks DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). +# DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT, gauge, Count of clock throttle reasons within the user-specified time window (see clock-throttle-reasons-count-window-size param). # Temperature DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). @@ -34,7 +35,6 @@ DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encoun # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). # DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). - # Memory usage DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). @@ -74,4 +74,4 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version # DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version # DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version # DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version -# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device \ No newline at end of file diff --git a/go.mod b/go.mod index 38ee5255..72c5facd 100644 --- a/go.mod +++ b/go.mod @@ -39,6 +39,7 @@ require ( github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.8.4 github.com/urfave/cli/v2 v2.26.0 + golang.org/x/sync v0.5.0 google.golang.org/grpc v1.60.0 k8s.io/api v0.20.2 k8s.io/apimachinery v0.20.2 @@ -74,11 +75,11 @@ require ( github.com/prometheus/client_golang v1.17.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/stretchr/objx v0.5.0 // indirect github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect golang.org/x/crypto v0.17.0 // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.13.0 // indirect - golang.org/x/sync v0.5.0 // indirect golang.org/x/sys v0.15.0 // indirect golang.org/x/term v0.15.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index ded8cede..3465f4e2 100644 --- a/go.sum +++ b/go.sum @@ -647,6 +647,7 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index b229ff66..b4a51514 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -48,24 +48,25 @@ const ( ) const ( - CLIFieldsFile = "collectors" - CLIAddress = "address" - CLICollectInterval = "collect-interval" - CLIKubernetes = "kubernetes" - CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" - CLIUseOldNamespace = "use-old-namespace" - CLIRemoteHEInfo = "remote-hostengine-info" - CLIGPUDevices = "devices" - CLISwitchDevices = "switch-devices" - CLICPUDevices = "cpu-devices" - CLINoHostname = "no-hostname" - CLIUseFakeGPUs = "fake-gpus" - CLIConfigMapData = "configmap-data" - CLIWebSystemdSocket = "web-systemd-socket" - CLIWebConfigFile = "web-config-file" - CLIXIDCountWindowSize = "xid-count-window-size" - CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" - CLIDebugMode = "debug" + CLIFieldsFile = "collectors" + CLIAddress = "address" + CLICollectInterval = "collect-interval" + CLIKubernetes = "kubernetes" + CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" + CLIUseOldNamespace = "use-old-namespace" + CLIRemoteHEInfo = "remote-hostengine-info" + CLIGPUDevices = "devices" + CLISwitchDevices = "switch-devices" + CLICPUDevices = "cpu-devices" + CLINoHostname = "no-hostname" + CLIUseFakeGPUs = "fake-gpus" + CLIConfigMapData = "configmap-data" + CLIWebSystemdSocket = "web-systemd-socket" + CLIWebConfigFile = "web-config-file" + CLIXIDCountWindowSize = "xid-count-window-size" + CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" + CLIDebugMode = "debug" + CLIClockThrottleReasonsCountWindowSize = "clock-throttle-reasons-count-window-size" ) func NewApp(buildVersion ...string) *cli.App { @@ -199,6 +200,12 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "Enable debug output", EnvVars: []string{"DCGM_EXPORTER_DEBUG"}, }, + &cli.IntFlag{ + Name: CLIClockThrottleReasonsCountWindowSize, + Value: int((5 * time.Minute).Milliseconds()), + Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.", + EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"}, + }, } if runtime.GOOS == "linux" { @@ -285,15 +292,16 @@ restart: config.MetricGroups = groups } - counters, exporterCounters, err := dcgmexporter.ExtractCounters(config) + cs, err := dcgmexporter.GetCounterSet(config) + if err != nil { logrus.Fatal(err) } - // Copy labels from counters to exporterCounters - for i := range counters { - if counters[i].PromType == "label" { - exporterCounters = append(exporterCounters, counters[i]) + // Copy labels from DCGM Counters to ExporterCounters + for i := range cs.DCGMCounters { + if cs.DCGMCounters[i].PromType == "label" { + cs.ExporterCounters = append(cs.ExporterCounters, cs.DCGMCounters[i]) } } @@ -302,9 +310,35 @@ restart: return err } + allCounters := []dcgmexporter.Counter{} + + allCounters = append(allCounters, cs.DCGMCounters...) + allCounters = append(allCounters, + dcgmexporter.Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + dcgmexporter.Counter{ + FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, + }, + ) + + fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config) + + for _, egt := range dcgmexporter.FieldEntityGroupTypeToMonitor { + err := fieldEntityGroupTypeSystemInfo.Load(egt) + if err != nil { + logrus.Infof("Not collecting %s metrics", egt.String()) + } + } + ch := make(chan string, 10) - pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, counters, hostname, dcgmexporter.NewDCGMCollector) + pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, + cs.DCGMCounters, + hostname, + dcgmexporter.NewDCGMCollector, + fieldEntityGroupTypeSystemInfo, + ) defer cleanup() if err != nil { logrus.Fatal(err) @@ -312,19 +346,42 @@ restart: cRegistry := dcgmexporter.NewRegistry() - if dcgmexporter.IsdcgmExpXIDErrorsCountEnabled(exporterCounters) { - xidCollector, err := dcgmexporter.NewXIDCollector(config, exporterCounters, hostname) + if dcgmexporter.IsDCGMExpXIDErrorsCountEnabled(cs.ExporterCounters) { + item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + if !exists { + logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMXIDErrorsCount.String()) + } + + xidCollector, err := dcgmexporter.NewXIDCollector(cs.ExporterCounters, hostname, config, item) if err != nil { logrus.Fatal(err) } - defer func() { - xidCollector.Cleanup() - }() - cRegistry.Register(xidCollector) + + logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String()) } + if dcgmexporter.IsDCGMExpClockThrottleReasonsEnabledCount(cs.ExporterCounters) { + item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + if !exists { + logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String()) + } + clocksThrottleReasonsCollector, err := dcgmexporter.NewClocksThrottleReasonsCollector( + cs.ExporterCounters, hostname, config, item) + if err != nil { + logrus.Fatal(err) + } + + cRegistry.Register(clocksThrottleReasonsCollector) + + logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String()) + } + + defer func() { + cRegistry.Cleanup() + }() + server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry) defer cleanup() if err != nil { @@ -436,25 +493,26 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { } return &dcgmexporter.Config{ - CollectorsFile: c.String(CLIFieldsFile), - Address: c.String(CLIAddress), - CollectInterval: c.Int(CLICollectInterval), - Kubernetes: c.Bool(CLIKubernetes), - KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), - CollectDCP: true, - UseOldNamespace: c.Bool(CLIUseOldNamespace), - UseRemoteHE: c.IsSet(CLIRemoteHEInfo), - RemoteHEInfo: c.String(CLIRemoteHEInfo), - GPUDevices: gOpt, - SwitchDevices: sOpt, - CPUDevices: cOpt, - NoHostname: c.Bool(CLINoHostname), - UseFakeGPUs: c.Bool(CLIUseFakeGPUs), - ConfigMapData: c.String(CLIConfigMapData), - WebSystemdSocket: c.Bool(CLIWebSystemdSocket), - WebConfigFile: c.String(CLIWebConfigFile), - XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), - ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), - Debug: c.Bool(CLIDebugMode), + CollectorsFile: c.String(CLIFieldsFile), + Address: c.String(CLIAddress), + CollectInterval: c.Int(CLICollectInterval), + Kubernetes: c.Bool(CLIKubernetes), + KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), + CollectDCP: true, + UseOldNamespace: c.Bool(CLIUseOldNamespace), + UseRemoteHE: c.IsSet(CLIRemoteHEInfo), + RemoteHEInfo: c.String(CLIRemoteHEInfo), + GPUDevices: gOpt, + SwitchDevices: sOpt, + CPUDevices: cOpt, + NoHostname: c.Bool(CLINoHostname), + UseFakeGPUs: c.Bool(CLIUseFakeGPUs), + ConfigMapData: c.String(CLIConfigMapData), + WebSystemdSocket: c.Bool(CLIWebSystemdSocket), + WebConfigFile: c.String(CLIWebConfigFile), + XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), + ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), + Debug: c.Bool(CLIDebugMode), + ClockThrottleReasonsCountWindowSize: c.Int(CLIClockThrottleReasonsCountWindowSize), }, nil } diff --git a/pkg/dcgmexporter/clocks_throttle_reasons_collector.go b/pkg/dcgmexporter/clocks_throttle_reasons_collector.go new file mode 100644 index 00000000..f7fe7213 --- /dev/null +++ b/pkg/dcgmexporter/clocks_throttle_reasons_collector.go @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "fmt" + "slices" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" +) + +// IsDCGMExpClockThrottleReasonsEnabledCount checks if the DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT counter exists +func IsDCGMExpClockThrottleReasonsEnabledCount(counters []Counter) bool { + return slices.ContainsFunc(counters, + func(c Counter) bool { + return c.FieldName == dcgmExpClockThrottleReasonsCount + }) +} + +type clocksThrottleReasonsCollector struct { + expCollector +} + +type clocksThrottleReasonBitmask int64 + +const ( + // DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE Nothing is running on the GPU and the clocks are dropping to Idle state + DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clocksThrottleReasonBitmask = 0x0000000000000001 + // DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING GPU clocks are limited by current setting of applications clocks + DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clocksThrottleReasonBitmask = 0x0000000000000002 + // DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP SW Power Scaling algorithm is reducing the clocks below requested clocks + DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clocksThrottleReasonBitmask = 0x0000000000000004 + // DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clocksThrottleReasonBitmask = 0x0000000000000008 + // DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost + DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clocksThrottleReasonBitmask = 0x0000000000000010 + //SW Thermal Slowdown + DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000020 + // DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000040 + // DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clocksThrottleReasonBitmask = 0x0000000000000080 + // DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS GPU clocks are limited by current setting of Display clocks + DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clocksThrottleReasonBitmask = 0x0000000000000100 +) + +var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{ + // See: https://github.com/NVIDIA/DCGM/blob/6792b70c65b938d17ac9d791f59ceaadc0c7ef8a/dcgmi/CommandLineParser.cpp#L63 + DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE: "gpu_idle", + DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING: "clocks_setting", + DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP: "power_cap", + DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN: "hw_slowdown", + DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST: "sync_boost", + DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL: "sw_thermal", + DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL: "hw_thermal", + DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE: "hw_power_brake", + DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS: "display_clocks", +} + +// String method to convert the enum value to a string +func (enm clocksThrottleReasonBitmask) String() string { + return clocksThrottleReasonToString[enm] +} + +func (c *clocksThrottleReasonsCollector) GetMetrics() (MetricsByCounter, error) { + return c.expCollector.getMetrics() +} + +func NewClocksThrottleReasonsCollector(counters []Counter, + hostname string, + config *Config, + fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) { + if !IsDCGMExpClockThrottleReasonsEnabledCount(counters) { + logrus.Error(dcgmExpClockThrottleReasonsCount + " collector is disabled") + return nil, fmt.Errorf(dcgmExpClockThrottleReasonsCount + " collector is disabled") + } + + collector := clocksThrottleReasonsCollector{} + collector.expCollector = newExpCollector( + counters, + hostname, + []dcgm.Short{dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS}, + config, + fieldEntityGroupTypeSystemInfo, + ) + + collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool { + return c.FieldName == dcgmExpClockThrottleReasonsCount + })] + + collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) { + metricValueLabels["throttle_reason"] = clocksThrottleReasonBitmask(entityValue).String() + } + + collector.windowSize = config.ClockThrottleReasonsCountWindowSize + + collector.fieldValueParser = func(value int64) []int64 { + var reasons []int64 + + // The int64 value may represent multiple reasons. + // To extract a specific reason, we need to perform an XOR operation with a bitmask. + reasonBitmask := clocksThrottleReasonBitmask(value) + + for tr := range clocksThrottleReasonToString { + if reasonBitmask&tr != 0 { + reasons = append(reasons, int64(tr)) + } + } + + return reasons + } + + return &collector, nil +} diff --git a/pkg/dcgmexporter/clocks_throttle_reasons_collector_test.go b/pkg/dcgmexporter/clocks_throttle_reasons_collector_test.go new file mode 100644 index 00000000..c8c27e13 --- /dev/null +++ b/pkg/dcgmexporter/clocks_throttle_reasons_collector_test.go @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "fmt" + "reflect" + "slices" + "strconv" + "testing" + "time" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" +) + +func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + runOnlyWithLiveGPUs(t) + testutils.RequireLinux(t) + + hostname := "local-test" + config := &Config{ + GPUDevices: DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + ClockThrottleReasonsCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + + cc, err := extractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 1) + + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + + // Create fake GPU + numGPUs, err := dcgm.GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgm.CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + type clockThrottleReasonExpectation map[string]string + expectations := map[string]clockThrottleReasonExpectation{} + + for i, gpuID := range gpuIDs { + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL), + ) + require.NoError(t, err) + + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL), + ) + require.NoError(t, err) + + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE), + ) + require.NoError(t, err) + + expectations[fmt.Sprint(gpuID)] = clockThrottleReasonExpectation{ + DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "2", + DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "2", + DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", + } + } + + // Create a fake K8S to emulate work on K8S environment + tmpDir, cleanup := CreateTmpDir(t) + defer cleanup() + socketPath = tmpDir + "/kubelet.sock" + server := grpc.NewServer() + + gpuIDsAsString := make([]string, len(gpuIDs)) + + for i, g := range gpuIDs { + gpuIDsAsString[i] = fmt.Sprint(g) + } + + podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpuIDsAsString)) + // Tell that the app is running on K8S + config.Kubernetes = true + + allCounters := []Counter{ + Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + } + + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) + err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + + item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + + collector, err := NewClocksThrottleReasonsCollector(cc.ExporterCounters, hostname, config, item) + require.NoError(t, err) + + defer func() { + collector.Cleanup() + }() + + metrics, err := collector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] + + for i := 0; i < len(metricValues); i++ { + gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64) + if err == nil { + if !slices.Contains(gpuIDs, uint(gpuID)) { + metricValues = append(metricValues[:i], metricValues[i+1:]...) + } + } + } + + // We expect 9 records, because we have 3 fake GPU and each GPU experienced 3 CLOCK_THROTTLE errors + require.Len(t, metricValues, 9) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.ClockThrottleReasonsCountWindowSize), val.Labels["window_size_in_ms"]) + expected, exists := expectations[val.GPU] + require.True(t, exists) + actualReason, exists := val.Labels["throttle_reason"] + require.True(t, exists) + expectedVal, exists := expected[actualReason] + require.True(t, exists) + require.Equal(t, expectedVal, val.Value) + } +} + +func TestClocksThrottleReasonsCollector_NewClocksThrottleReasonsCollector(t *testing.T) { + config := &Config{ + GPUDevices: DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + } + + teardownTest := setupTest(t) + defer teardownTest(t) + + allCounters := []Counter{ + Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + } + + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) + err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + + t.Run("Should Return Error When DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT is not present", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + cc, err := extractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 0) + require.Len(t, cc.DCGMCounters, 1) + collector, err := NewClocksThrottleReasonsCollector(cc.DCGMCounters, "", config, item) + require.Error(t, err) + require.Nil(t, collector) + }) + + t.Run("Should Return Error When Counter Param Is Empty", func(t *testing.T) { + counters := make([]Counter, 0) + collector, err := NewClocksThrottleReasonsCollector(counters, "", config, item) + require.Error(t, err) + require.Nil(t, collector) + }) + + t.Run("Should Not Return Error When DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT Present More Than Once", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + } + cc, err := extractCounters(records, config) + require.NoError(t, err) + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + xidCollector, err := NewClocksThrottleReasonsCollector(cc.ExporterCounters, "", config, item) + require.NoError(t, err) + require.NotNil(t, xidCollector) + }) +} + +func TestClocksThrottleReasonsCollector_Gather_AllTheThings(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + runOnlyWithLiveGPUs(t) + + hostname := "local-test" + config := &Config{ + GPUDevices: DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + ClockThrottleReasonsCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + + cc, err := extractCounters(records, config) + require.NoError(t, err) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 1) + + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) + } + } + + // Create fake GPU + numGPUs, err := dcgm.GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgm.CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + type clockThrottleReasonExpectation map[string]string + expectations := map[string]clockThrottleReasonExpectation{} + + require.Len(t, gpuIDs, 1) + gpuID := gpuIDs[0] + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(), + int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE| + DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING| + DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP| + DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN| + DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST| + DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL| + DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL| + DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE| + DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS), + ) + + require.NoError(t, err) + + expectations[fmt.Sprint(gpuID)] = clockThrottleReasonExpectation{ + DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE.String(): "1", + DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS.String(): "1", + } + + allCounters := []Counter{ + Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + } + + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) + + err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + + item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + + collector, err := NewClocksThrottleReasonsCollector(cc.ExporterCounters, hostname, config, item) + require.NoError(t, err) + + defer func() { + collector.Cleanup() + }() + + metrics, err := collector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] + + for i := 0; i < len(metricValues); i++ { + gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64) + if err == nil { + if !slices.Contains(gpuIDs, uint(gpuID)) { + metricValues = append(metricValues[:i], metricValues[i+1:]...) + } + } + } + + // Expected 9 metric values, because we injected 9 reasons + require.Len(t, metricValues, 9) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.ClockThrottleReasonsCountWindowSize), val.Labels["window_size_in_ms"]) + expected, exists := expectations[val.GPU] + require.True(t, exists) + actualReason, exists := val.Labels["throttle_reason"] + require.True(t, exists) + expectedVal, exists := expected[actualReason] + require.True(t, exists) + require.Equal(t, expectedVal, val.Value) + } +} diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go new file mode 100644 index 00000000..eac923c6 --- /dev/null +++ b/pkg/dcgmexporter/config.go @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package dcgmexporter + +import "github.com/NVIDIA/go-dcgm/pkg/dcgm" + +type KubernetesGPUIDType string + +const ( + GPUUID KubernetesGPUIDType = "uid" + DeviceName KubernetesGPUIDType = "device-name" +) + +type DeviceOptions struct { + Flex bool // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled. + MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all + MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all +} + +type Config struct { + CollectorsFile string + Address string + CollectInterval int + Kubernetes bool + KubernetesGPUIdType KubernetesGPUIDType + CollectDCP bool + UseOldNamespace bool + UseRemoteHE bool + RemoteHEInfo string + GPUDevices DeviceOptions + SwitchDevices DeviceOptions + CPUDevices DeviceOptions + NoHostname bool + UseFakeGPUs bool + ConfigMapData string + MetricGroups []dcgm.MetricGroup + WebSystemdSocket bool + WebConfigFile string + XIDCountWindowSize int + ReplaceBlanksInModelName bool + Debug bool + ClockThrottleReasonsCountWindowSize int +} diff --git a/pkg/dcgmexporter/const.go b/pkg/dcgmexporter/const.go index 46ff202a..a780367e 100644 --- a/pkg/dcgmexporter/const.go +++ b/pkg/dcgmexporter/const.go @@ -16,39 +16,6 @@ package dcgmexporter -import "fmt" - -type DCGMExporterMetric uint16 - -const ( - DCGMFIUnknown DCGMExporterMetric = 0 - DCGMXIDErrorsCount DCGMExporterMetric = iota + 9000 -) - -// DCGMFields maps DCGMExporterMetric String to enum -var DCGMFields = map[string]DCGMExporterMetric{ - DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, - DCGMFIUnknown.String(): DCGMFIUnknown, -} - -// String method to convert the enum value to a string -func (d DCGMExporterMetric) String() string { - switch d { - case DCGMXIDErrorsCount: - return "DCGM_EXP_XID_ERRORS_COUNT" - default: - return "DCGM_FI_UNKNOWN" - } -} - -func IdentifyMetricType(s string) (DCGMExporterMetric, error) { - mv, ok := DCGMFields[s] - if !ok { - return mv, fmt.Errorf("unknown DCGMExporterMetric field '%s'", s) - } - return mv, nil -} - // Constants for logging fields const ( LoggerGroupIDKey = "groupID" @@ -60,3 +27,7 @@ const ( PARENT_ID_IGNORED = 0 DCGM_ST_NOT_CONFIGURED = "Setting not configured" ) + +const ( + windowSizeInMSLabel = "window_size_in_ms" +) diff --git a/pkg/dcgmexporter/expcollector.go b/pkg/dcgmexporter/expcollector.go new file mode 100644 index 00000000..c075eb50 --- /dev/null +++ b/pkg/dcgmexporter/expcollector.go @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "fmt" + "io" + "maps" + "slices" + "sync" + "sync/atomic" + "text/template" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" +) + +var expMetricsFormat = ` + +{{- range $counter, $metrics := . -}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +{{- range $k, $v := $metric.Attributes -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} + +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + +// Collector interface +type Collector interface { + GetMetrics() (MetricsByCounter, error) + Cleanup() +} + +var getExpMetricTemplate = sync.OnceValue(func() *template.Template { + return template.Must(template.New("expMetrics").Parse(expMetricsFormat)) + +}) + +func encodeExpMetrics(w io.Writer, metrics MetricsByCounter) error { + tmpl := getExpMetricTemplate() + return tmpl.Execute(w, metrics) +} + +var expCollectorFieldGroupIdx atomic.Uint32 + +type expCollector struct { + sysInfo SystemInfo // Hardware system info + counter Counter // Counter that collector + hostname string // Hostname + config *Config // Configuration settings + labelDeviceFields []dcgm.Short // Fields used for labels + counterDeviceFields []dcgm.Short // Fields used for the counter + labelsCounters []Counter // Counters used for labels + cleanups []func() // Cleanup functions + fieldValueParser func(val int64) []int64 // Function to parse the field value + labelFiller func(map[string]string, int64) // Function to fill labels + windowSize int // Window size + transformations []Transform // Transformers for metric postprocessing +} + +func (c *expCollector) getMetrics() (MetricsByCounter, error) { + + fieldGroupIdx := expCollectorFieldGroupIdx.Add(1) + + fieldGroupName := fmt.Sprintf("expCollectorFieldGroupName%d", fieldGroupIdx) + fieldsGroup, err := dcgm.FieldGroupCreate(fieldGroupName, c.counterDeviceFields) + if err != nil { + return nil, err + } + + defer func() { + _ = dcgm.FieldGroupDestroy(fieldsGroup) + }() + + err = dcgm.UpdateAllFields() + if err != nil { + return nil, err + } + + mapEntityIDToValues := map[uint]map[int64]int{} + + window := time.Now().Add(-time.Duration(c.windowSize) * time.Millisecond) + + values, _, err := dcgm.GetValuesSince(dcgm.GroupAllGPUs(), fieldsGroup, window) + if err != nil { + return nil, err + } + + for _, val := range values { + if val.Status == 0 { + if _, exists := mapEntityIDToValues[val.EntityId]; !exists { + mapEntityIDToValues[val.EntityId] = map[int64]int{} + } + for _, v := range c.fieldValueParser(val.Int64()) { + mapEntityIDToValues[val.EntityId][v] += 1 + } + } + } + + labels := map[string]string{} + labels[windowSizeInMSLabel] = fmt.Sprint(c.windowSize) + + monitoringInfo := GetMonitoredEntities(c.sysInfo) + metrics := make(MetricsByCounter) + useOld := c.config.UseOldNamespace + uuid := "UUID" + if useOld { + uuid = "uuid" + } + for _, mi := range monitoringInfo { + latestValues, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields) + if err != nil { + return nil, err + } + // Extract Labels + for _, val := range latestValues { + v := ToString(val) + // Filter out counters with no value and ignored fields for this entity + if v == SkipDCGMValue { + continue + } + + counter, err := FindCounterField(c.labelsCounters, val.FieldId) + if err != nil { + continue + } + + if counter.PromType == "label" { + labels[counter.FieldName] = v + continue + } + } + + entityValues, exists := mapEntityIDToValues[mi.DeviceInfo.GPU] + if exists { + for entityValue, val := range entityValues { + + metricValueLabels := maps.Clone(labels) + c.labelFiller(metricValueLabels, entityValue) + + gpuModel := getGPUModel(mi.DeviceInfo, c.config.ReplaceBlanksInModelName) + + m := Metric{ + Counter: c.counter, + Value: fmt.Sprint(val), + UUID: uuid, + GPU: fmt.Sprintf("%d", mi.DeviceInfo.GPU), + GPUUUID: mi.DeviceInfo.UUID, + GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), + GPUModelName: gpuModel, + Hostname: c.hostname, + + Labels: metricValueLabels, + Attributes: map[string]string{}, + } + if mi.InstanceInfo != nil { + m.MigProfile = mi.InstanceInfo.ProfileName + m.GPUInstanceID = fmt.Sprintf("%d", mi.InstanceInfo.Info.NvmlInstanceId) + } else { + m.MigProfile = "" + m.GPUInstanceID = "" + } + + metrics[c.counter] = append(metrics[c.counter], m) + } + } + } + + for _, transform := range c.transformations { + err := transform.Process(metrics, c.sysInfo) + if err != nil { + return nil, fmt.Errorf("failed to transform metrics for transform '%s'; err: %v", transform.Name(), err) + } + } + + return metrics, nil +} + +func (c *expCollector) Cleanup() { + for _, cleanup := range c.cleanups { + cleanup() + } +} + +// newExpCollector is a constructor for the expCollector +func newExpCollector( + counters []Counter, + hostname string, + counterDeviceFields []dcgm.Short, + config *Config, + fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, +) expCollector { + var labelsCounters []Counter + for i := 0; i < len(counters); i++ { + if counters[i].PromType == "label" { + labelsCounters = append(labelsCounters, counters[i]) + counters = slices.Delete(counters, i, i+1) + } + } + + labelDeviceFields := NewDeviceFields(labelsCounters, dcgm.FE_GPU) + + transformations := getTransformations(config) + + collector := expCollector{ + hostname: hostname, + config: config, + labelDeviceFields: labelDeviceFields, + labelsCounters: labelsCounters, + counterDeviceFields: counterDeviceFields, + fieldValueParser: func(val int64) []int64 { + return []int64{val} + }, + labelFiller: func(metricValueLabels map[string]string, entityValue int64) {}, + transformations: transformations, + } + + collector.sysInfo = fieldEntityGroupTypeSystemInfo.SystemInfo + + var err error + + collector.cleanups, err = SetupDcgmFieldsWatch(collector.counterDeviceFields, + collector.sysInfo, + int64(config.CollectInterval)*1000) + if err != nil { + logrus.Fatal("Failed to watch metrics: ", err) + } + + return collector +} diff --git a/pkg/dcgmexporter/exporter_metrics.go b/pkg/dcgmexporter/exporter_metrics.go new file mode 100644 index 00000000..b2b48478 --- /dev/null +++ b/pkg/dcgmexporter/exporter_metrics.go @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import "fmt" + +const ( + dcgmExpClockThrottleReasonsCount = "DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT" + dcgmExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" +) + +type ExporterCounter uint16 + +const ( + DCGMFIUnknown ExporterCounter = 0 + DCGMXIDErrorsCount ExporterCounter = iota + 9000 + DCGMClockThrottleReasonsCount ExporterCounter = iota +) + +// String method to convert the enum value to a string +func (enm ExporterCounter) String() string { + switch enm { + case DCGMXIDErrorsCount: + return dcgmExpXIDErrorsCount + case DCGMClockThrottleReasonsCount: + return dcgmExpClockThrottleReasonsCount + default: + return "DCGM_FI_UNKNOWN" + } +} + +// DCGMFields maps DCGMExporterMetric String to enum +var DCGMFields = map[string]ExporterCounter{ + DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, + DCGMClockThrottleReasonsCount.String(): DCGMClockThrottleReasonsCount, + DCGMFIUnknown.String(): DCGMFIUnknown, +} + +func IdentifyMetricType(s string) (ExporterCounter, error) { + mv, ok := DCGMFields[s] + if !ok { + return mv, fmt.Errorf("Unknown ExporterCounter field '%s'", s) + } + return mv, nil +} diff --git a/pkg/dcgmexporter/const_test.go b/pkg/dcgmexporter/exporter_metrics_test.go similarity index 98% rename from pkg/dcgmexporter/const_test.go rename to pkg/dcgmexporter/exporter_metrics_test.go index 14a8e910..60710393 100644 --- a/pkg/dcgmexporter/const_test.go +++ b/pkg/dcgmexporter/exporter_metrics_test.go @@ -26,7 +26,7 @@ func TestIdentifyMetricType(t *testing.T) { tests := []struct { name string field string - output DCGMExporterMetric + output ExporterCounter valid bool }{ { diff --git a/pkg/dcgmexporter/field_entity_group_system_info.go b/pkg/dcgmexporter/field_entity_group_system_info.go new file mode 100644 index 00000000..e6ce4b53 --- /dev/null +++ b/pkg/dcgmexporter/field_entity_group_system_info.go @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "fmt" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" +) + +// FieldEntityGroupTypeToMonitor supported entity group types +var FieldEntityGroupTypeToMonitor = []dcgm.Field_Entity_Group{ + dcgm.FE_GPU, + dcgm.FE_SWITCH, + dcgm.FE_LINK, + dcgm.FE_CPU, + dcgm.FE_CPU_CORE, +} + +type FieldEntityGroupTypeSystemInfoItem struct { + SystemInfo SystemInfo + DeviceFields []dcgm.Short +} + +func (f FieldEntityGroupTypeSystemInfoItem) isEmpty() bool { + return len(f.DeviceFields) == 0 +} + +// FieldEntityGroupTypeSystemInfo represents a mapping between FieldEntityGroupType and SystemInfo +type FieldEntityGroupTypeSystemInfo struct { + items map[dcgm.Field_Entity_Group]FieldEntityGroupTypeSystemInfoItem + counters []Counter + gpuDevices DeviceOptions + switchDevices DeviceOptions + cpuDevices DeviceOptions + useFakeGPUs bool +} + +// NewEntityGroupTypeSystemInfo creates a new instance of the FieldEntityGroupTypeSystemInfo +func NewEntityGroupTypeSystemInfo(c []Counter, config *Config) *FieldEntityGroupTypeSystemInfo { + return &FieldEntityGroupTypeSystemInfo{ + items: make(map[dcgm.Field_Entity_Group]FieldEntityGroupTypeSystemInfoItem), + counters: c, + gpuDevices: config.GPUDevices, + switchDevices: config.SwitchDevices, + cpuDevices: config.CPUDevices, + useFakeGPUs: config.UseFakeGPUs, + } +} + +// Load loads SystemInfo for a provided Field_Entity_Group +func (e *FieldEntityGroupTypeSystemInfo) Load(entityType dcgm.Field_Entity_Group) error { + var deviceFields = NewDeviceFields(e.counters, entityType) + + if !ShouldMonitorDeviceType(deviceFields, entityType) { + return fmt.Errorf("no fields to watch for device type: %d", entityType) + } + + sysInfo, err := GetSystemInfo(&Config{ + GPUDevices: e.gpuDevices, + SwitchDevices: e.switchDevices, + CPUDevices: e.cpuDevices, + UseFakeGPUs: e.useFakeGPUs, + }, entityType) + if err != nil { + return err + } + + e.items[entityType] = FieldEntityGroupTypeSystemInfoItem{ + SystemInfo: *sysInfo, + DeviceFields: deviceFields, + } + + return err +} + +// Get returns FieldEntityGroupTypeSystemInfoItem, bool by dcgm.Field_Entity_Group +func (e *FieldEntityGroupTypeSystemInfo) Get(key dcgm.Field_Entity_Group) (FieldEntityGroupTypeSystemInfoItem, bool) { + val, exists := e.items[key] + return val, exists +} diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index acd83ffe..02a2ca30 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -17,6 +17,7 @@ package dcgmexporter import ( + "errors" "fmt" "os" "strings" @@ -25,31 +26,35 @@ import ( "github.com/sirupsen/logrus" ) -type DCGMCollectorConstructor func([]Counter, *Config, string, dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) +type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) -func NewDCGMCollector(c []Counter, config *Config, hostname string, entityType dcgm.Field_Entity_Group) (*DCGMCollector, - func(), error) { - var deviceFields = NewDeviceFields(c, entityType) +func NewDCGMCollector(c []Counter, + hostname string, + config *Config, + fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) { - if !ShouldMonitorDeviceType(deviceFields, entityType) { - return nil, func() {}, fmt.Errorf("no fields to watch for device type '%d'", entityType) + if fieldEntityGroupTypeSystemInfo.isEmpty() { + return nil, func() {}, errors.New("fieldEntityGroupTypeSystemInfo is empty") } - sysInfo, err := GetSystemInfo(config, entityType) - if err != nil { - return nil, func() {}, err + collector := &DCGMCollector{ + Counters: c, + DeviceFields: fieldEntityGroupTypeSystemInfo.DeviceFields, + SysInfo: fieldEntityGroupTypeSystemInfo.SystemInfo, + Hostname: hostname, } - collector := &DCGMCollector{ - Counters: c, - DeviceFields: deviceFields, - UseOldNamespace: config.UseOldNamespace, - SysInfo: *sysInfo, - Hostname: hostname, - ReplaceBlanksInModelName: config.ReplaceBlanksInModelName, + if config == nil { + logrus.Warn("Config is empty") + return collector, func() { collector.Cleanup() }, nil } - cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, *sysInfo, int64(config.CollectInterval)*1000) + collector.UseOldNamespace = config.UseOldNamespace + collector.ReplaceBlanksInModelName = config.ReplaceBlanksInModelName + + cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, + fieldEntityGroupTypeSystemInfo.SystemInfo, + int64(config.CollectInterval)*1000) if err != nil { logrus.Fatal("Failed to watch metrics: ", err) } @@ -60,8 +65,10 @@ func NewDCGMCollector(c []Counter, config *Config, hostname string, entityType d } func GetSystemInfo(config *Config, entityType dcgm.Field_Entity_Group) (*SystemInfo, error) { - sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGPUs, - entityType) + sysInfo, err := InitializeSystemInfo(config.GPUDevices, + config.SwitchDevices, + config.CPUDevices, + config.UseFakeGPUs, entityType) if err != nil { return nil, err } @@ -155,10 +162,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) { return c[0], fmt.Errorf("could not find counter corresponding to field ID '%d'", fieldId) } -func ToSwitchMetric( - metrics MetricsByCounter, - values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, -) { +func ToSwitchMetric(metrics MetricsByCounter, + values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) { labels := map[string]string{} for _, val := range values { @@ -200,10 +205,8 @@ func ToSwitchMetric( } } -func ToCPUMetric( - metrics MetricsByCounter, - values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, -) { +func ToCPUMetric(metrics MetricsByCounter, + values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) { var labels = map[string]string{} for _, val := range values { @@ -278,13 +281,7 @@ func ToMetric( uuid = "uuid" } - gpuModel := d.Identifiers.Model - - if replaceBlanksInModelName { - parts := strings.Fields(gpuModel) - gpuModel = strings.Join(parts, " ") - gpuModel = strings.ReplaceAll(gpuModel, " ", "-") - } + gpuModel := getGPUModel(d, replaceBlanksInModelName) m := Metric{ Counter: counter, @@ -312,6 +309,17 @@ func ToMetric( } } +func getGPUModel(d dcgm.Device, replaceBlanksInModelName bool) string { + gpuModel := d.Identifiers.Model + + if replaceBlanksInModelName { + parts := strings.Fields(gpuModel) + gpuModel = strings.Join(parts, " ") + gpuModel = strings.ReplaceAll(gpuModel, " ", "-") + } + return gpuModel +} + func ToString(value dcgm.FieldValue_v1) string { switch value.FieldType { case dcgm.DCGM_FT_INT64: diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index fe9acf19..ddf82fc8 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -69,11 +69,12 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun MajorRange: []int{-1}, MinorRange: []int{-1}, } - cfg := Config{ + config := Config{ GPUDevices: dOpt, NoHostname: false, UseOldNamespace: false, UseFakeGPUs: false, + CollectInterval: 1, } dcgmGetAllDeviceCount = func() (uint, error) { @@ -121,16 +122,30 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun dcgmAddEntityToGroup = dcgm.AddEntityToGroup }() - g, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_GPU) + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(counters, &config) + + err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + + gpuItem, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + require.True(t, exists) + + g, cleanup, err := NewDCGMCollector(counters, "", &config, gpuItem) require.NoError(t, err) /* Test for error when no switches are available to monitor. */ - _, _, err = NewDCGMCollector(counters, &cfg, "", dcgm.FE_SWITCH) - require.Error(t, err) + switchItem, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_SWITCH) + assert.False(t, exists, "dcgm.FE_SWITCH should not be available") + + _, _, err = NewDCGMCollector(counters, "", &config, switchItem) + require.Error(t, err, "NewDCGMCollector should return error") /* Test for error when no cpus are available to monitor. */ - _, _, err = NewDCGMCollector(counters, &cfg, "", dcgm.FE_CPU) - require.NoError(t, err) + cpuItem, exist := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU) + require.False(t, exist, "dcgm.FE_CPU should not be available") + + _, _, err = NewDCGMCollector(counters, "", &config, cpuItem) + require.Error(t, err, "NewDCGMCollector should return error") out, err := g.GetMetrics() require.NoError(t, err) @@ -154,7 +169,7 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { dOpt := DeviceOptions{true, []int{-1}, []int{-1}} - cfg := Config{ + config := Config{ CPUDevices: dOpt, NoHostname: false, UseOldNamespace: false, @@ -208,7 +223,18 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun }() /* Test that only cpu metrics are collected for cpu entities. */ - c, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_CPU) + + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(counters, &config) + err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_CPU) + require.NoError(t, err) + + err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_CPU) + require.NoError(t, err) + + cpuItem, cpuItemExist := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU) + require.True(t, cpuItemExist) + + c, cleanup, err := NewDCGMCollector(counters, "", &config, cpuItem) require.NoError(t, err) out, err := c.GetMetrics() @@ -329,14 +355,21 @@ func TestGPUCollector_GetMetrics(t *testing.T) { MajorRange: []int{-1}, MinorRange: []int{-1}, } - cfg := Config{ + config := Config{ GPUDevices: dOpt, NoHostname: false, UseOldNamespace: false, UseFakeGPUs: false, } - c, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_GPU) + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(counters, &config) + err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + + gpuItem, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + require.True(t, exists) + + c, cleanup, err := NewDCGMCollector(counters, "", &config, gpuItem) require.NoError(t, err) defer cleanup() diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index f6cc2eb4..e4026070 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -35,13 +35,10 @@ import ( "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" ) -var tmpDir string - func TestProcessPodMapper(t *testing.T) { - testutils.RequireLinux(t) - cleanup := CreateTmpDir(t) + tmpDir, cleanup := CreateTmpDir(t) defer cleanup() cleanup, err := dcgm.Init(dcgm.Embedded) @@ -117,14 +114,12 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { } } -func CreateTmpDir(t *testing.T) func() { +func CreateTmpDir(t *testing.T) (string, func()) { path, err := os.MkdirTemp("", "dcgm-exporter") require.NoError(t, err) - tmpDir = path - - return func() { - require.NoError(t, os.RemoveAll(tmpDir)) + return path, func() { + require.NoError(t, os.RemoveAll(path)) } } @@ -230,7 +225,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { tc.MetricGPUDevice, ), func(t *testing.T) { - cleanup := CreateTmpDir(t) + tmpDir, cleanup := CreateTmpDir(t) defer cleanup() socketPath = tmpDir + "/kubelet.sock" server := grpc.NewServer() diff --git a/pkg/dcgmexporter/parser.go b/pkg/dcgmexporter/parser.go index 1bcb2c94..08ce36ba 100644 --- a/pkg/dcgmexporter/parser.go +++ b/pkg/dcgmexporter/parser.go @@ -33,13 +33,17 @@ import ( ) const ( - CPU_FIELDS_START = 1100 - DCP_FIELDS_START = 1000 + cpuFieldsStart = 1100 + dcpFieldsStart = 1000 ) -func ExtractCounters(c *Config) ([]Counter, []Counter, error) { - var err error - var records [][]string +func GetCounterSet(c *Config) (*CounterSet, error) { + var ( + err error + records [][]string + ) + + res := new(CounterSet) if c.ConfigMapData != undefinedConfigMapData { var client kubernetes.Interface @@ -52,7 +56,7 @@ func ExtractCounters(c *Config) ([]Counter, []Counter, error) { logrus.Fatal(err) } } else { - logrus.Infof("No configmap data specified") + err = fmt.Errorf("no configmap data specified") } if err != nil || c.ConfigMapData == undefinedConfigMapData { @@ -61,16 +65,16 @@ func ExtractCounters(c *Config) ([]Counter, []Counter, error) { records, err = ReadCSVFile(c.CollectorsFile) if err != nil { logrus.Errorf("Could not read metrics file '%s'; err: %v", c.CollectorsFile, err) - return nil, nil, err + return res, err } } - counters, extraCounters, err := extractCounters(records, c) + res, err = extractCounters(records, c) if err != nil { - return nil, nil, err + return res, err } - return counters, extraCounters, err + return res, err } func ReadCSVFile(filename string) ([][]string, error) { @@ -88,9 +92,8 @@ func ReadCSVFile(filename string) ([][]string, error) { return records, err } -func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error) { - f := make([]Counter, 0, len(records)) - expf := make([]Counter, 0) +func extractCounters(records [][]string, c *Config) (*CounterSet, error) { + res := CounterSet{} for i, record := range records { var useOld = false @@ -103,7 +106,7 @@ func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error } if len(record) != 3 { - return nil, nil, fmt.Errorf("malformed CSV record; err: failed to parse line %d (`%v`), "+ + return nil, fmt.Errorf("malformed CSV record; err: failed to parse line %d (`%v`), "+ "expected 3 fields", i, record) } @@ -114,9 +117,9 @@ func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error expField, err := IdentifyMetricType(record[0]) if err != nil { - return nil, nil, fmt.Errorf("could not find DCGM field; err: %w", err) + return nil, fmt.Errorf("could not find DCGM field; err: %w", err) } else if expField != DCGMFIUnknown { - expf = append(expf, Counter{dcgm.Short(expField), record[0], record[1], record[2]}) + res.ExporterCounters = append(res.ExporterCounters, Counter{dcgm.Short(expField), record[0], record[1], record[2]}) continue } } @@ -132,10 +135,10 @@ func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error } if _, ok := promMetricType[record[1]]; !ok { - return nil, nil, fmt.Errorf("could not find Prometheus metric type '%s'", record[1]) + return nil, fmt.Errorf("could not find Prometheus metric type '%s'", record[1]) } - f = append(f, Counter{fieldID, record[0], record[1], record[2]}) + res.DCGMCounters = append(res.DCGMCounters, Counter{fieldID, record[0], record[1], record[2]}) } else { if !fieldIsSupported(uint(oldFieldID), c) { logrus.Warnf("Skipping line %d ('%s'): metric not enabled", i, record[0]) @@ -143,19 +146,18 @@ func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error } if _, ok := promMetricType[record[1]]; !ok { - return nil, nil, fmt.Errorf("could not find Prometheus metric type '%s'", record[1]) + return nil, fmt.Errorf("could not find Prometheus metric type '%s'", record[1]) } - f = append(f, Counter{oldFieldID, record[0], record[1], record[2]}) - + res.DCGMCounters = append(res.DCGMCounters, Counter{oldFieldID, record[0], record[1], record[2]}) } } - return f, expf, nil + return &res, nil } func fieldIsSupported(fieldID uint, c *Config) bool { - if fieldID < DCP_FIELDS_START || fieldID >= CPU_FIELDS_START { + if fieldID < dcpFieldsStart || fieldID >= cpuFieldsStart { return true } diff --git a/pkg/dcgmexporter/parser_test.go b/pkg/dcgmexporter/parser_test.go index ef70de55..ede589b3 100644 --- a/pkg/dcgmexporter/parser_test.go +++ b/pkg/dcgmexporter/parser_test.go @@ -148,13 +148,12 @@ func extractCountersHelper(t *testing.T, input string, valid bool) { ConfigMapData: undefinedConfigMapData, CollectorsFile: tmpFile.Name(), } - records, extraCounters, err := ExtractCounters(&c) + cc, err := GetCounterSet(&c) if valid { assert.NoError(t, err, "Expected no error.") - assert.Equal(t, 1, len(records), "Expected 1 record counters.") + assert.Equal(t, 1, len(cc.DCGMCounters), "Expected 1 record counters.") } else { assert.Error(t, err, "Expected error.") - assert.Equal(t, 0, len(records), "Expected no counters.") - assert.Equal(t, 0, len(extraCounters), "Expected no extra counters.") + assert.Nil(t, cc, "Expected no counters.") } } diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index bef0763e..c167842c 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -27,60 +27,75 @@ import ( "github.com/sirupsen/logrus" ) -func NewMetricsPipeline( - c *Config, counters []Counter, hostname string, newDCGMCollector DCGMCollectorConstructor, +func NewMetricsPipeline(config *Config, + counters []Counter, + hostname string, + newDCGMCollector DCGMCollectorConstructor, + fieldEntityGroupTypeSystemInfo *FieldEntityGroupTypeSystemInfo, ) (*MetricsPipeline, func(), error) { logrus.WithField(LoggerDumpKey, fmt.Sprintf("%+v", counters)).Debug("Counters are initialized") cleanups := []func(){} - gpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_GPU) - if err != nil { - logrus.Info("Not collecting gpu metrics: ", err) - } else { - cleanups = append(cleanups, cleanup) - } - switchCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_SWITCH) - if err != nil { - logrus.Info("Not collecting switch metrics: ", err) - } else { + var ( + gpuCollector *DCGMCollector + switchCollector *DCGMCollector + linkCollector *DCGMCollector + cpuCollector *DCGMCollector + coreCollector *DCGMCollector + err error + ) + + if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU); exists { + var cleanup func() + gpuCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) + if err != nil { + logrus.Warn("Cannot create DCGMCollector for dcgm.FE_GPU") + } cleanups = append(cleanups, cleanup) } - linkCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_LINK) - if err != nil { - logrus.Info("Not collecting link metrics: ", err) - } else { + if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_SWITCH); exists { + var cleanup func() + switchCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) + if err != nil { + logrus.Warn("Cannot create DCGMCollector for dcgm.FE_SWITCH") + } cleanups = append(cleanups, cleanup) } - cpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_CPU) - if err != nil { - logrus.Info("Not collecting cpu metrics: ", err) - } else { + if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_LINK); exists { + var cleanup func() + linkCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) + if err != nil { + logrus.Warn("Cannot create DCGMCollector for dcgm.FE_LINK") + } cleanups = append(cleanups, cleanup) } - coreCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_CPU_CORE) - if err != nil { - logrus.Info("Not collecting cpu core metrics: ", err) - } else { + if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU); exists { + var cleanup func() + cpuCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) + if err != nil { + logrus.Warn("Cannot create DCGMCollector for dcgm.FE_CPU") + } cleanups = append(cleanups, cleanup) } - transformations := []Transform{} - if c.Kubernetes { - podMapper, err := NewPodMapper(c) + if item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_CPU_CORE); exists { + var cleanup func() + coreCollector, cleanup, err = newDCGMCollector(counters, hostname, config, item) if err != nil { - logrus.Warnf("Could not enable kubernetes metric collection: %v", err) - } else { - transformations = append(transformations, podMapper) + logrus.Warn("Cannot create DCGMCollector for dcgm.FE_CPU_CORE") } + cleanups = append(cleanups, cleanup) } + transformations := getTransformations(config) + return &MetricsPipeline{ - config: c, + config: config, migMetricsFormat: template.Must(template.New("migMetrics").Parse(migMetricsFormat)), switchMetricsFormat: template.Must(template.New("switchMetrics").Parse(switchMetricsFormat)), @@ -102,6 +117,20 @@ func NewMetricsPipeline( }, nil } +func getTransformations(c *Config) []Transform { + transformations := []Transform{} + if c.Kubernetes { + podMapper, err := NewPodMapper(c) + if err != nil { + logrus.Warnf("Could not enable kubernetes metric collection: %v", err) + } else { + transformations = append(transformations, podMapper) + } + } + + return transformations +} + // Primarely for testing, caller expected to cleanup the collector func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error) { return &MetricsPipeline{ diff --git a/pkg/dcgmexporter/pipeline_test.go b/pkg/dcgmexporter/pipeline_test.go index 4de3b437..ee4e9e69 100644 --- a/pkg/dcgmexporter/pipeline_test.go +++ b/pkg/dcgmexporter/pipeline_test.go @@ -17,12 +17,13 @@ package dcgmexporter import ( - "fmt" + "errors" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" "os" "testing" "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" ) @@ -49,15 +50,19 @@ func TestRun(t *testing.T) { t.Logf("Pipeline result is:\n%v", out) } -func testNewDCGMCollector( +func testNewDCGMCollector(t *testing.T, counter *int, enabledCollector map[dcgm.Field_Entity_Group]struct{}, ) DCGMCollectorConstructor { - return func(c []Counter, config *Config, hostname string, entityType dcgm.Field_Entity_Group) (*DCGMCollector, - func(), error) { + t.Helper() + return func(c []Counter, + hostname string, + config *Config, + fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) { // should always create GPU Collector - if entityType != dcgm.FE_GPU { - if _, ok := enabledCollector[entityType]; !ok { - return nil, func() {}, fmt.Errorf("collector '%s' should not be created", entityType) + if fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType != dcgm.FE_GPU { + if _, ok := enabledCollector[fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType]; !ok { + t.Errorf("collector '%s' should not be created", fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType) + return nil, func() {}, nil } } @@ -82,70 +87,122 @@ func TestCountPipelineCleanup(t *testing.T) { for _, c := range []struct { name string enabledCollector map[dcgm.Field_Entity_Group]struct{} - }{ - { - name: "only_gpu", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{}, - }, { - name: "gpu_switch", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_SWITCH: struct{}{}, - }, - }, { - name: "gpu_link", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_LINK: struct{}{}, - }, - }, { - name: "gpu_cpu", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_CPU: struct{}{}, - }, - }, { - name: "gpu_core", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_CPU_CORE: struct{}{}, - }, - }, { - name: "gpu_switch_link", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_SWITCH: struct{}{}, - dcgm.FE_LINK: struct{}{}, - }, - }, { - name: "gpu_cpu_core", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_CPU: struct{}{}, - dcgm.FE_CPU_CORE: struct{}{}, - }, - }, { - name: "all", - enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ - dcgm.FE_SWITCH: struct{}{}, - dcgm.FE_LINK: struct{}{}, - dcgm.FE_CPU: struct{}{}, - dcgm.FE_CPU_CORE: struct{}{}, - }, + }{{ + name: "only_gpu", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_GPU: struct{}{}, + }, + }, { + name: "gpu_switch", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_SWITCH: struct{}{}, + }, + }, { + name: "gpu_link", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_LINK: struct{}{}, }, - } { - cleanupCounter := 0 + }, { + name: "gpu_cpu", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_CPU: struct{}{}, + }, + }, { + name: "gpu_core", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_CPU_CORE: struct{}{}, + }, + }, { + name: "gpu_switch_link", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_SWITCH: struct{}{}, + dcgm.FE_LINK: struct{}{}, + }, + }, { + name: "gpu_cpu_core", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_CPU: struct{}{}, + dcgm.FE_CPU_CORE: struct{}{}, + }, + }, { + name: "all", + enabledCollector: map[dcgm.Field_Entity_Group]struct{}{ + dcgm.FE_SWITCH: struct{}{}, + dcgm.FE_LINK: struct{}{}, + dcgm.FE_CPU: struct{}{}, + dcgm.FE_CPU_CORE: struct{}{}, + }, + }} { - config := &Config{ - Kubernetes: false, - ConfigMapData: undefinedConfigMapData, - CollectorsFile: f.Name(), - } + t.Run(c.name, func(t *testing.T) { + cleanupCounter := 0 - counters, _, err := ExtractCounters(config) - if err != nil { - logrus.Fatal(err) - } + config := &Config{ + Kubernetes: false, + ConfigMapData: undefinedConfigMapData, + CollectorsFile: f.Name(), + } + + cc, err := GetCounterSet(config) + if err != nil { + logrus.Fatal(err) + } - _, cleanup, err := NewMetricsPipeline(config, counters, "", - testNewDCGMCollector(&cleanupCounter, c.enabledCollector)) - require.NoError(t, err, "case: %s failed", c.name) + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(cc.DCGMCounters, config) - cleanup() - require.Equal(t, len(c.enabledCollector)+1, cleanupCounter, "case: %s failed", c.name) + for egt := range c.enabledCollector { + // We inject system info for unit test purpose + fieldEntityGroupTypeSystemInfo.items[egt] = FieldEntityGroupTypeSystemInfoItem{ + SystemInfo: SystemInfo{ + InfoType: egt, + }, + } + } + + _, cleanup, err := NewMetricsPipeline(config, + cc.DCGMCounters, + "", + testNewDCGMCollector(t, &cleanupCounter, c.enabledCollector), + fieldEntityGroupTypeSystemInfo) + require.NoError(t, err, "case: %s failed", c.name) + + cleanup() + require.Equal(t, len(c.enabledCollector), cleanupCounter, "case: %s failed", c.name) + }) + } +} + +func TestNewMetricsPipelineWhenFieldEntityGroupTypeSystemInfoItemIsEmpty(t *testing.T) { + cleanup, err := dcgm.Init(dcgm.Embedded) + require.NoError(t, err) + defer cleanup() + + config := &Config{} + + fieldEntityGroupTypeSystemInfo := &FieldEntityGroupTypeSystemInfo{ + items: map[dcgm.Field_Entity_Group]FieldEntityGroupTypeSystemInfoItem{ + dcgm.FE_GPU: FieldEntityGroupTypeSystemInfoItem{}, + dcgm.FE_SWITCH: FieldEntityGroupTypeSystemInfoItem{}, + dcgm.FE_LINK: FieldEntityGroupTypeSystemInfoItem{}, + dcgm.FE_CPU: FieldEntityGroupTypeSystemInfoItem{}, + dcgm.FE_CPU_CORE: FieldEntityGroupTypeSystemInfoItem{}, + }, } + + p, cleanup, err := NewMetricsPipeline(config, + sampleCounters, + "", + func(_ []Counter, _ string, _ *Config, item FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) { + assert.True(t, item.isEmpty()) + return nil, func() {}, errors.New("empty") + }, + fieldEntityGroupTypeSystemInfo, + ) + require.NoError(t, err) + defer cleanup() + require.NoError(t, err) + + out, err := p.run() + require.NoError(t, err) + require.Empty(t, out) } diff --git a/pkg/dcgmexporter/registry.go b/pkg/dcgmexporter/registry.go index 112d5fb3..3b62df4c 100644 --- a/pkg/dcgmexporter/registry.go +++ b/pkg/dcgmexporter/registry.go @@ -16,7 +16,11 @@ package dcgmexporter -import "sync" +import ( + "sync" + + "golang.org/x/sync/errgroup" +) type Registry struct { collectors []Collector @@ -29,27 +33,60 @@ func NewRegistry() *Registry { } } +// Register registers a collector with the registry. func (r *Registry) Register(c Collector) { r.collectors = append(r.collectors, c) } -func (r *Registry) Gather() (map[Counter][]Metric, error) { +// Gather gathers metrics from all registered collectors. +func (r *Registry) Gather() (MetricsByCounter, error) { r.mtx.Lock() defer r.mtx.Unlock() - output := map[Counter][]Metric{} + var wg sync.WaitGroup + wg.Add(len(r.collectors)) + + g := new(errgroup.Group) + + var sm sync.Map for _, c := range r.collectors { - metrics, err := c.GetMetrics() + c := c //creates new c, see https://golang.org/doc/faq#closures_and_goroutines + g.Go(func() error { + metrics, err := c.GetMetrics() - if err != nil { - return nil, err - } + if err != nil { + return err + } + + for counter, metricVals := range metrics { + val, _ := sm.LoadOrStore(counter, []Metric{}) + out := val.([]Metric) + out = append(out, metricVals...) + sm.Store(counter, out) + } + + return nil + }) + } - for counter, metricVals := range metrics { - output[counter] = append(output[counter], metricVals...) - } + if err := g.Wait(); err != nil { + return nil, err } + output := MetricsByCounter{} + + sm.Range(func(key, value interface{}) bool { + output[key.(Counter)] = value.([]Metric) + return true // continue iteration + }) + return output, nil } + +// Cleanup resources of registered collectors +func (r *Registry) Cleanup() { + for _, c := range r.collectors { + c.Cleanup() + } +} diff --git a/pkg/dcgmexporter/registry_test.go b/pkg/dcgmexporter/registry_test.go new file mode 100644 index 00000000..f7da1ccf --- /dev/null +++ b/pkg/dcgmexporter/registry_test.go @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +type mockCollector struct { + mock.Mock +} + +func (m *mockCollector) GetMetrics() (MetricsByCounter, error) { + args := m.Called() + return args.Get(0).(MetricsByCounter), args.Error(1) +} + +func (m *mockCollector) Cleanup() { + m.Called() +} + +func TestRegistry_Gather(t *testing.T) { + collector := new(mockCollector) + reg := NewRegistry() + + metrics := MetricsByCounter{} + counterA := Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + } + metrics[counterA] = append(metrics[counterA], Metric{ + GPU: "0", + Counter: counterA, + Attributes: map[string]string{}, + }) + + counterB := Counter{ + FieldName: "DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", + PromType: "gauge", + } + + metrics[counterB] = append(metrics[counterB], Metric{ + GPU: "0", + Counter: counterB, + Value: "42", + Attributes: map[string]string{}, + }) + + type test struct { + name string + collectorState func() *mock.Call + assert func(MetricsByCounter, error) + } + + tests := []test{ + { + name: "When collector return no errors", + collectorState: func() *mock.Call { + return collector.On("GetMetrics").Return(metrics, nil) + }, + assert: func(mbc MetricsByCounter, err error) { + require.NoError(t, err) + require.Len(t, mbc, 2) + }, + }, + { + name: "When collector return errors", + collectorState: func() *mock.Call { + return collector.On("GetMetrics").Return(MetricsByCounter{}, errors.New("Boom!")) + }, + assert: func(mbc MetricsByCounter, err error) { + require.Error(t, err) + require.Len(t, mbc, 0) + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + reg.collectors = nil + reg.Register(collector) + mockCall := tc.collectorState() + got, err := reg.Gather() + tc.assert(got, err) + mockCall.Unset() + }) + + } +} diff --git a/pkg/dcgmexporter/server.go b/pkg/dcgmexporter/server.go index 80f3985b..51099308 100644 --- a/pkg/dcgmexporter/server.go +++ b/pkg/dcgmexporter/server.go @@ -117,13 +117,13 @@ func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) { http.Error(w, "failed to write response", http.StatusInternalServerError) return } - xidMetrics, err := s.registry.Gather() + metrics, err := s.registry.Gather() if err != nil { logrus.WithError(err).Error("Failed to write response.") http.Error(w, "failed to write response", http.StatusInternalServerError) return } - err = encodeXIDMetrics(w, xidMetrics) + err = encodeExpMetrics(w, metrics) if err != nil { http.Error(w, "failed to write response", http.StatusInternalServerError) return diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go index 1abffc85..6d448828 100644 --- a/pkg/dcgmexporter/system_info.go +++ b/pkg/dcgmexporter/system_info.go @@ -99,7 +99,7 @@ func SetGPUInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName s func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error { var err error var errFound bool - errStr := fmt.Sprintf("cannot find match for entities:") + errStr := "cannot find match for entities:" for _, v := range values { if !SetGPUInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v)) { diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index 9431b045..0dd91468 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -46,43 +46,6 @@ var ( undefinedConfigMapData = "none" ) -type KubernetesGPUIDType string - -const ( - GPUUID KubernetesGPUIDType = "uid" - DeviceName KubernetesGPUIDType = "device-name" -) - -type DeviceOptions struct { - Flex bool // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled. - MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all - MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all -} - -type Config struct { - CollectorsFile string - Address string - CollectInterval int - Kubernetes bool - KubernetesGPUIdType KubernetesGPUIDType - CollectDCP bool - UseOldNamespace bool - UseRemoteHE bool - RemoteHEInfo string - GPUDevices DeviceOptions - SwitchDevices DeviceOptions - CPUDevices DeviceOptions - NoHostname bool - UseFakeGPUs bool - ConfigMapData string - MetricGroups []dcgm.MetricGroup - WebSystemdSocket bool - WebConfigFile string - XIDCountWindowSize int - ReplaceBlanksInModelName bool - Debug bool -} - type Transform interface { Process(metrics MetricsByCounter, sysInfo SystemInfo) error Name() string @@ -184,5 +147,11 @@ type PodInfo struct { Container string } -// MetricsByCounter represeents a map where each Counter is associated with a slice of Metric objects +// MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects type MetricsByCounter map[Counter][]Metric + +// CounterSet return +type CounterSet struct { + DCGMCounters []Counter + ExporterCounters []Counter +} diff --git a/pkg/dcgmexporter/xid_collector.go b/pkg/dcgmexporter/xid_collector.go index fca7430a..88ca020c 100644 --- a/pkg/dcgmexporter/xid_collector.go +++ b/pkg/dcgmexporter/xid_collector.go @@ -18,228 +18,50 @@ package dcgmexporter import ( "fmt" - "io" - "maps" - "math/rand" "slices" - "sync" - "text/template" - "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" ) -const ( - dcgmExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" - windowSizeInMSLabel = "window_size_in_ms" -) - -var xidMetricsFormat = ` -{{- range $counter, $metrics := . -}} -# HELP {{ $counter.FieldName }} {{ $counter.Help }} -# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} -{{- range $metric := $metrics }} -{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} - -{{- range $k, $v := $metric.Labels -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} -{{- range $k, $v := $metric.Attributes -}} - ,{{ $k }}="{{ $v }}" -{{- end -}} - -} {{ $metric.Value -}} -{{- end }} -{{ end }}` - -type Collector interface { - GetMetrics() (map[Counter][]Metric, error) - Cleanup() -} - type xidCollector struct { - sysInfo *SystemInfo - counter Counter - hostname string - config *Config - deviceFields []dcgm.Short - labelsCounters []Counter - cleanups []func() -} - -func (c *xidCollector) GetMetrics() (map[Counter][]Metric, error) { - // Create a group of fields - const ( - xid int = iota - ) - - deviceFields := make([]dcgm.Short, 1) - deviceFields[xid] = dcgm.DCGM_FI_DEV_XID_ERRORS - - fieldGroupName := fmt.Sprintf("fieldGroupName%d", rand.Uint64()) - fieldsGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) - if err != nil { - return nil, err - } - - defer func() { - _ = dcgm.FieldGroupDestroy(fieldsGroup) - }() - - err = dcgm.UpdateAllFields() - if err != nil { - return nil, err - } - - mapEntityIDToValues := map[uint]map[int64]int{} - - window := time.Now().Add(-time.Duration(c.config.XIDCountWindowSize) * time.Millisecond) - - values, _, err := dcgm.GetValuesSince(dcgm.GroupAllGPUs(), fieldsGroup, window) - if err != nil { - return nil, err - } - - for _, val := range values { - if val.Status == 0 { - if _, exists := mapEntityIDToValues[val.EntityId]; !exists { - mapEntityIDToValues[val.EntityId] = map[int64]int{} - } - mapEntityIDToValues[val.EntityId][val.Int64()] += 1 - } - } - - labels := map[string]string{} - labels[windowSizeInMSLabel] = fmt.Sprint(c.config.XIDCountWindowSize) - - monitoringInfo := GetMonitoredEntities(*c.sysInfo) - metrics := make(map[Counter][]Metric) - useOld := c.config.UseOldNamespace - uuid := "UUID" - if useOld { - uuid = "uuid" - } - for _, mi := range monitoringInfo { - vals, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.deviceFields) - if err != nil { - return nil, err - } - // Extract Labels - for _, val := range vals { - v := ToString(val) - // Filter out counters with no value and ignored fields for this entity - if v == SkipDCGMValue { - continue - } - - counter, err := FindCounterField(c.labelsCounters, val.FieldId) - if err != nil { - continue - } - - if counter.PromType == "label" { - labels[counter.FieldName] = v - continue - } - } - - gpuXIDErrors, exists := mapEntityIDToValues[mi.DeviceInfo.GPU] - - if exists { - for xidErr, val := range gpuXIDErrors { - - metricValueLables := maps.Clone(labels) - metricValueLables["xid"] = fmt.Sprint(xidErr) - m := Metric{ - Counter: c.counter, - Value: fmt.Sprint(val), - UUID: uuid, - GPU: fmt.Sprintf("%d", mi.DeviceInfo.GPU), - GPUUUID: mi.DeviceInfo.UUID, - GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), - GPUModelName: mi.DeviceInfo.Identifiers.Model, - Hostname: c.hostname, - - Labels: metricValueLables, - Attributes: map[string]string{}, - } - if mi.InstanceInfo != nil { - m.MigProfile = mi.InstanceInfo.ProfileName - m.GPUInstanceID = fmt.Sprintf("%d", mi.InstanceInfo.Info.NvmlInstanceId) - } else { - m.MigProfile = "" - m.GPUInstanceID = "" - } - - metrics[c.counter] = append(metrics[c.counter], m) - } - } - } - - return metrics, nil + expCollector } -func (c *xidCollector) Cleanup() { - for _, cleanup := range c.cleanups { - cleanup() - } +func (c *xidCollector) GetMetrics() (MetricsByCounter, error) { + return c.expCollector.getMetrics() } -func NewXIDCollector(config *Config, counters []Counter, hostname string) (Collector, error) { - - if !IsdcgmExpXIDErrorsCountEnabled(counters) { +func NewXIDCollector(counters []Counter, + hostname string, + config *Config, + fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) { + if !IsDCGMExpXIDErrorsCountEnabled(counters) { logrus.Error(dcgmExpXIDErrorsCount + " collector is disabled") return nil, fmt.Errorf(dcgmExpXIDErrorsCount + " collector is disabled") } - sysInfo, err := GetSystemInfo(config, dcgm.FE_GPU) - if err != nil { - return nil, err - } - - labelsCounters := []Counter{} - for i := 0; i < len(counters); i++ { - if counters[i].PromType == "label" { - labelsCounters = append(labelsCounters, counters[i]) - counters = slices.Delete(counters, i, i+1) - } - } - - var deviceFields = NewDeviceFields(labelsCounters, dcgm.FE_GPU) + collector := xidCollector{} + collector.expCollector = newExpCollector(counters, + hostname, + []dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}, + config, + fieldEntityGroupTypeSystemInfo) - cleanups, err := SetupDcgmFieldsWatch([]dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}, *sysInfo, - int64(config.CollectInterval)*1000) - if err != nil { - logrus.Fatalf("Failed to watch metrics; err: %v", err) - } - - counter := counters[slices.IndexFunc(counters, func(c Counter) bool { + collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool { return c.FieldName == dcgmExpXIDErrorsCount })] - collector := xidCollector{ - sysInfo: sysInfo, - counter: counter, - hostname: hostname, - config: config, - deviceFields: deviceFields, - labelsCounters: labelsCounters, - cleanups: cleanups, + collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) { + metricValueLabels["xid"] = fmt.Sprint(entityValue) } - return &collector, nil -} -var getXIDMetricTemplate = sync.OnceValue(func() *template.Template { - return template.Must(template.New("xidMetrics").Parse(xidMetricsFormat)) -}) + collector.windowSize = config.XIDCountWindowSize -func encodeXIDMetrics(w io.Writer, metrics MetricsByCounter) error { - template := getXIDMetricTemplate() - return template.Execute(w, metrics) + return &collector, nil } -func IsdcgmExpXIDErrorsCountEnabled(counters []Counter) bool { +func IsDCGMExpXIDErrorsCountEnabled(counters []Counter) bool { return slices.ContainsFunc(counters, func(c Counter) bool { return c.FieldName == dcgmExpXIDErrorsCount }) diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go index 8bc1171a..a73b94ef 100644 --- a/pkg/dcgmexporter/xid_collector_test.go +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -50,14 +50,14 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, } - counters, extraCounters, err := extractCounters(records, config) + cc, err := extractCounters(records, config) require.NoError(t, err) - require.Len(t, extraCounters, 1) - require.Len(t, counters, 1) + require.Len(t, cc.ExporterCounters, 1) + require.Len(t, cc.DCGMCounters, 1) - for i := range counters { - if counters[i].PromType == "label" { - extraCounters = append(extraCounters, counters[i]) + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) } } @@ -109,7 +109,20 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { } - xidCollector, err := NewXIDCollector(config, extraCounters, hostname) + allCounters := []Counter{ + Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + } + + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) + err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + + item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + require.True(t, exists) + + xidCollector, err := NewXIDCollector(cc.ExporterCounters, hostname, config, item) require.NoError(t, err) defer func() { @@ -123,7 +136,7 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { require.Len(t, metrics, 1) // We get metric value with 0 index metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - // We expect 6 records, because we have 3 fake GPU and each GPU expirenced 2 XID errors: 42 and 46 + // We expect 6 records, because we have 3 fake GPU and each GPU experienced 2 XID errors: 42 and 46 require.Len(t, metricValues, 6) for _, val := range metricValues { require.Contains(t, val.Labels, "window_size_in_ms") @@ -159,7 +172,7 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { // Now we check the metric rendering var b bytes.Buffer - err = encodeXIDMetrics(&b, metrics) + err = encodeExpMetrics(&b, metrics) require.NoError(t, err) require.NotEmpty(t, b) @@ -198,22 +211,35 @@ func TestXIDCollector_NewXIDCollector(t *testing.T) { teardownTest := setupTest(t) defer teardownTest(t) + allCounters := []Counter{ + Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }, + } + + fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config) + err := fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU) + require.NoError(t, err) + + item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) + t.Run("Should Return Error When DCGM_EXP_XID_ERRORS_COUNT is not present", func(t *testing.T) { records := [][]string{ {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, } - counters, extraCounters, err := extractCounters(records, config) + cc, err := extractCounters(records, config) require.NoError(t, err) - require.Len(t, extraCounters, 0) - require.Len(t, counters, 1) - xidCollector, err := NewXIDCollector(config, counters, "") + require.Len(t, cc.ExporterCounters, 0) + require.Len(t, cc.DCGMCounters, 1) + + xidCollector, err := NewXIDCollector(cc.DCGMCounters, "", config, item) require.Error(t, err) require.Nil(t, xidCollector) }) - t.Run("Should Return Error When Counter Param Is Empty", func(t *testing.T) { + t.Run("Should Return Error When Counters Param Is Empty", func(t *testing.T) { counters := make([]Counter, 0) - xidCollector, err := NewXIDCollector(config, counters, "") + xidCollector, err := NewXIDCollector(counters, "", config, item) require.Error(t, err) require.Nil(t, xidCollector) }) @@ -225,15 +251,15 @@ func TestXIDCollector_NewXIDCollector(t *testing.T) { {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, } - counters, extraCounters, err := extractCounters(records, config) + cc, err := extractCounters(records, config) require.NoError(t, err) - for i := range counters { - if counters[i].PromType == "label" { - extraCounters = append(extraCounters, counters[i]) + for i := range cc.DCGMCounters { + if cc.DCGMCounters[i].PromType == "label" { + cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) } } - xidCollector, err := NewXIDCollector(config, counters, "") - require.Error(t, err) - require.Nil(t, xidCollector) + xidCollector, err := NewXIDCollector(cc.ExporterCounters, "", config, item) + require.NoError(t, err) + require.NotNil(t, xidCollector) }) }