From a3ae3470d9ef5ecc347cab62644d3b843776bd92 Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Thu, 22 Feb 2024 09:01:10 -0600 Subject: [PATCH] DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT renamed to the DCGM_EXP_CLOCK_EVENTS_COUNT Signed-off-by: Vadym Fedorov Fixed error in the metric description Signed-off-by: Vadym Fedorov --- etc/default-counters.csv | 2 +- pkg/cmd/app.go | 94 +++++++++---------- ...collector.go => clock_events_collector.go} | 61 ++++++------ ...test.go => clock_events_collector_test.go} | 56 +++++------ pkg/dcgmexporter/config.go | 44 ++++----- pkg/dcgmexporter/exporter_metrics.go | 20 ++-- 6 files changed, 139 insertions(+), 138 deletions(-) rename pkg/dcgmexporter/{clocks_throttle_reasons_collector.go => clock_events_collector.go} (59%) rename pkg/dcgmexporter/{clocks_throttle_reasons_collector_test.go => clock_events_collector_test.go} (82%) diff --git a/etc/default-counters.csv b/etc/default-counters.csv index db55f406..6637d39f 100644 --- a/etc/default-counters.csv +++ b/etc/default-counters.csv @@ -5,7 +5,7 @@ # Clocks DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). -# DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT, gauge, Count of clock throttle reasons within the user-specified time window (see clock-throttle-reasons-count-window-size param). +# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param). # Temperature DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). 
diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 981c922b..3fbce59f 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -48,25 +48,25 @@ const ( ) const ( - CLIFieldsFile = "collectors" - CLIAddress = "address" - CLICollectInterval = "collect-interval" - CLIKubernetes = "kubernetes" - CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" - CLIUseOldNamespace = "use-old-namespace" - CLIRemoteHEInfo = "remote-hostengine-info" - CLIGPUDevices = "devices" - CLISwitchDevices = "switch-devices" - CLICPUDevices = "cpu-devices" - CLINoHostname = "no-hostname" - CLIUseFakeGPUs = "fake-gpus" - CLIConfigMapData = "configmap-data" - CLIWebSystemdSocket = "web-systemd-socket" - CLIWebConfigFile = "web-config-file" - CLIXIDCountWindowSize = "xid-count-window-size" - CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" - CLIDebugMode = "debug" - CLIClockThrottleReasonsCountWindowSize = "clock-throttle-reasons-count-window-size" + CLIFieldsFile = "collectors" + CLIAddress = "address" + CLICollectInterval = "collect-interval" + CLIKubernetes = "kubernetes" + CLIKubernetesGPUIDType = "kubernetes-gpu-id-type" + CLIUseOldNamespace = "use-old-namespace" + CLIRemoteHEInfo = "remote-hostengine-info" + CLIGPUDevices = "devices" + CLISwitchDevices = "switch-devices" + CLICPUDevices = "cpu-devices" + CLINoHostname = "no-hostname" + CLIUseFakeGPUs = "fake-gpus" + CLIConfigMapData = "configmap-data" + CLIWebSystemdSocket = "web-systemd-socket" + CLIWebConfigFile = "web-config-file" + CLIXIDCountWindowSize = "xid-count-window-size" + CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" + CLIDebugMode = "debug" + CLIClockEventsCountWindowSize = "clock-events-count-window-size" ) func NewApp(buildVersion ...string) *cli.App { @@ -201,10 +201,10 @@ func NewApp(buildVersion ...string) *cli.App { EnvVars: []string{"DCGM_EXPORTER_DEBUG"}, }, &cli.IntFlag{ - Name: CLIClockThrottleReasonsCountWindowSize, + Name: CLIClockEventsCountWindowSize, Value: int((5 * time.Minute).Milliseconds()), 
- Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.", - EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"}, + Usage: "Set time window size in milliseconds (ms) for counting clock events in DCGM Exporter.", + EnvVars: []string{"DCGM_EXPORTER_CLOCK_EVENTS_COUNT_WINDOW_SIZE"}, }, } @@ -362,12 +362,12 @@ restart: logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String()) } - if dcgmexporter.IsDCGMExpClockThrottleReasonsEnabledCount(cs.ExporterCounters) { + if dcgmexporter.IsDCGMExpClockEventsCountEnabled(cs.ExporterCounters) { item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) if !exists { - logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String()) + logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockEventsCount.String()) } - clocksThrottleReasonsCollector, err := dcgmexporter.NewClocksThrottleReasonsCollector( + clocksThrottleReasonsCollector, err := dcgmexporter.NewClockEventsCollector( cs.ExporterCounters, hostname, config, item) if err != nil { logrus.Fatal(err) @@ -375,7 +375,7 @@ restart: cRegistry.Register(clocksThrottleReasonsCollector) - logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String()) + logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockEventsCount.String()) } defer func() { @@ -493,26 +493,26 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { } return &dcgmexporter.Config{ - CollectorsFile: c.String(CLIFieldsFile), - Address: c.String(CLIAddress), - CollectInterval: c.Int(CLICollectInterval), - Kubernetes: c.Bool(CLIKubernetes), - KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), - CollectDCP: true, - UseOldNamespace: c.Bool(CLIUseOldNamespace), - UseRemoteHE: c.IsSet(CLIRemoteHEInfo), - RemoteHEInfo: c.String(CLIRemoteHEInfo), - GPUDevices: gOpt, - SwitchDevices: sOpt, - CPUDevices: 
cOpt, - NoHostname: c.Bool(CLINoHostname), - UseFakeGPUs: c.Bool(CLIUseFakeGPUs), - ConfigMapData: c.String(CLIConfigMapData), - WebSystemdSocket: c.Bool(CLIWebSystemdSocket), - WebConfigFile: c.String(CLIWebConfigFile), - XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), - ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), - Debug: c.Bool(CLIDebugMode), - ClockThrottleReasonsCountWindowSize: c.Int(CLIClockThrottleReasonsCountWindowSize), + CollectorsFile: c.String(CLIFieldsFile), + Address: c.String(CLIAddress), + CollectInterval: c.Int(CLICollectInterval), + Kubernetes: c.Bool(CLIKubernetes), + KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), + CollectDCP: true, + UseOldNamespace: c.Bool(CLIUseOldNamespace), + UseRemoteHE: c.IsSet(CLIRemoteHEInfo), + RemoteHEInfo: c.String(CLIRemoteHEInfo), + GPUDevices: gOpt, + SwitchDevices: sOpt, + CPUDevices: cOpt, + NoHostname: c.Bool(CLINoHostname), + UseFakeGPUs: c.Bool(CLIUseFakeGPUs), + ConfigMapData: c.String(CLIConfigMapData), + WebSystemdSocket: c.Bool(CLIWebSystemdSocket), + WebConfigFile: c.String(CLIWebConfigFile), + XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), + ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), + Debug: c.Bool(CLIDebugMode), + ClockEventsCountWindowSize: c.Int(CLIClockEventsCountWindowSize), }, nil } diff --git a/pkg/dcgmexporter/clocks_throttle_reasons_collector.go b/pkg/dcgmexporter/clock_events_collector.go similarity index 59% rename from pkg/dcgmexporter/clocks_throttle_reasons_collector.go rename to pkg/dcgmexporter/clock_events_collector.go index f7fe7213..31eb0ff6 100644 --- a/pkg/dcgmexporter/clocks_throttle_reasons_collector.go +++ b/pkg/dcgmexporter/clock_events_collector.go @@ -24,42 +24,43 @@ import ( "github.com/sirupsen/logrus" ) -// IsDCGMExpClockThrottleReasonsEnabledCount checks if the DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT counter exists -func IsDCGMExpClockThrottleReasonsEnabledCount(counters []Counter) 
bool { +// IsDCGMExpClockEventsCountEnabled checks if the DCGM_EXP_CLOCK_EVENTS_COUNT counter exists +func IsDCGMExpClockEventsCountEnabled(counters []Counter) bool { return slices.ContainsFunc(counters, func(c Counter) bool { - return c.FieldName == dcgmExpClockThrottleReasonsCount + return c.FieldName == dcgmExpClockEventsCount }) } -type clocksThrottleReasonsCollector struct { +type clockEventsCollector struct { expCollector } -type clocksThrottleReasonBitmask int64 +type clockEventBitmask int64 +// Source of the const values: https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_fields.h const ( // DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE Nothing is running on the GPU and the clocks are dropping to Idle state - DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clocksThrottleReasonBitmask = 0x0000000000000001 + DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clockEventBitmask = 0x0000000000000001 // DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING GPU clocks are limited by current setting of applications clocks - DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clocksThrottleReasonBitmask = 0x0000000000000002 + DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clockEventBitmask = 0x0000000000000002 // DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP SW Power Scaling algorithm is reducing the clocks below requested clocks - DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clocksThrottleReasonBitmask = 0x0000000000000004 + DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clockEventBitmask = 0x0000000000000004 // DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clocksThrottleReasonBitmask = 0x0000000000000008 + DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clockEventBitmask = 0x0000000000000008 // DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost - DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clocksThrottleReasonBitmask = 0x0000000000000010 + DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clockEventBitmask = 0x0000000000000010 //SW Thermal 
Slowdown - DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000020 + DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clockEventBitmask = 0x0000000000000020 // DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000040 + DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clockEventBitmask = 0x0000000000000040 // DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged - DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clocksThrottleReasonBitmask = 0x0000000000000080 + DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clockEventBitmask = 0x0000000000000080 // DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS GPU clocks are limited by current setting of Display clocks - DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clocksThrottleReasonBitmask = 0x0000000000000100 + DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clockEventBitmask = 0x0000000000000100 ) -var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{ +var clockEventToString = map[clockEventBitmask]string{ // See: https://github.com/NVIDIA/DCGM/blob/6792b70c65b938d17ac9d791f59ceaadc0c7ef8a/dcgmi/CommandLineParser.cpp#L63 DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE: "gpu_idle", DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING: "clocks_setting", @@ -73,24 +74,24 @@ var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{ } // String method to convert the enum value to a string -func (enm clocksThrottleReasonBitmask) String() string { - return clocksThrottleReasonToString[enm] +func (enm clockEventBitmask) String() string { + return clockEventToString[enm] } -func (c *clocksThrottleReasonsCollector) GetMetrics() (MetricsByCounter, error) { +func (c *clockEventsCollector) GetMetrics() (MetricsByCounter, error) { return c.expCollector.getMetrics() } -func 
NewClocksThrottleReasonsCollector(counters []Counter, +func NewClockEventsCollector(counters []Counter, hostname string, config *Config, fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) { - if !IsDCGMExpClockThrottleReasonsEnabledCount(counters) { - logrus.Error(dcgmExpClockThrottleReasonsCount + " collector is disabled") - return nil, fmt.Errorf(dcgmExpClockThrottleReasonsCount + " collector is disabled") + if !IsDCGMExpClockEventsCountEnabled(counters) { + logrus.Error(dcgmExpClockEventsCount + " collector is disabled") + return nil, fmt.Errorf(dcgmExpClockEventsCount + " collector is disabled") } - collector := clocksThrottleReasonsCollector{} + collector := clockEventsCollector{} collector.expCollector = newExpCollector( counters, hostname, @@ -100,23 +101,23 @@ func NewClocksThrottleReasonsCollector(counters []Counter, ) collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool { - return c.FieldName == dcgmExpClockThrottleReasonsCount + return c.FieldName == dcgmExpClockEventsCount })] collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) { - metricValueLabels["throttle_reason"] = clocksThrottleReasonBitmask(entityValue).String() + metricValueLabels["clock_event"] = clockEventBitmask(entityValue).String() } - collector.windowSize = config.ClockThrottleReasonsCountWindowSize + collector.windowSize = config.ClockEventsCountWindowSize collector.fieldValueParser = func(value int64) []int64 { var reasons []int64 - // The int64 value may represent multiple reasons. - // To extract a specific reason, we need to perform an XOR operation with a bitmask. - reasonBitmask := clocksThrottleReasonBitmask(value) + // The int64 value may represent multiple events. + // To extract a specific event, we need to perform a bitwise AND operation with a bitmask.
+ reasonBitmask := clockEventBitmask(value) - for tr := range clocksThrottleReasonToString { + for tr := range clockEventToString { if reasonBitmask&tr != 0 { reasons = append(reasons, int64(tr)) } diff --git a/pkg/dcgmexporter/clocks_throttle_reasons_collector_test.go b/pkg/dcgmexporter/clock_events_collector_test.go similarity index 82% rename from pkg/dcgmexporter/clocks_throttle_reasons_collector_test.go rename to pkg/dcgmexporter/clock_events_collector_test.go index c8c27e13..496fbcfb 100644 --- a/pkg/dcgmexporter/clocks_throttle_reasons_collector_test.go +++ b/pkg/dcgmexporter/clock_events_collector_test.go @@ -31,7 +31,7 @@ import ( podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" ) -func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { +func TestClockEventsCollector_Gather(t *testing.T) { teardownTest := setupTest(t) defer teardownTest(t) runOnlyWithLiveGPUs(t) @@ -44,11 +44,11 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { MajorRange: []int{-1}, MinorRange: []int{-1}, }, - ClockThrottleReasonsCountWindowSize: int(time.Duration(5) * time.Minute), + ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), } records := [][]string{ - {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, } @@ -81,8 +81,8 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { require.NoError(t, err) require.NotEmpty(t, gpuIDs) - type clockThrottleReasonExpectation map[string]string - expectations := map[string]clockThrottleReasonExpectation{} + type clockEventsCountExpectation map[string]string + expectations := map[string]clockEventsCountExpectation{} for i, gpuID := range gpuIDs { err = dcgm.InjectFieldValue(gpuID, @@ -112,7 +112,7 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { ) require.NoError(t, err) - expectations[fmt.Sprint(gpuID)] = clockThrottleReasonExpectation{ + 
expectations[fmt.Sprint(gpuID)] = clockEventsCountExpectation{ DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL.String(): "2", DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL.String(): "2", DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE.String(): "1", @@ -147,7 +147,7 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - collector, err := NewClocksThrottleReasonsCollector(cc.ExporterCounters, hostname, config, item) + collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) require.NoError(t, err) defer func() { @@ -157,7 +157,7 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { metrics, err := collector.GetMetrics() require.NoError(t, err) require.NotEmpty(t, metrics) - // We expect 1 metric: DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT + // We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT require.Len(t, metrics, 1) // We get metric value with 0 index metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] @@ -171,14 +171,14 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { } } - // We expect 9 records, because we have 3 fake GPU and each GPU experienced 3 CLOCK_THROTTLE errors + // We expect 9 records, because we have 3 fake GPUs and each GPU experienced 3 CLOCK_EVENTS require.Len(t, metricValues, 9) for _, val := range metricValues { require.Contains(t, val.Labels, "window_size_in_ms") - require.Equal(t, fmt.Sprint(config.ClockThrottleReasonsCountWindowSize), val.Labels["window_size_in_ms"]) + require.Equal(t, fmt.Sprint(config.ClockEventsCountWindowSize), val.Labels["window_size_in_ms"]) expected, exists := expectations[val.GPU] require.True(t, exists) - actualReason, exists := val.Labels["throttle_reason"] + actualReason, exists := val.Labels["clock_event"] require.True(t, exists) expectedVal, exists := expected[actualReason] require.True(t, exists) @@ -186,7 +186,7 @@ func TestClocksThrottleReasonsCollector_Gather(t *testing.T) { } }
-func TestClocksThrottleReasonsCollector_NewClocksThrottleReasonsCollector(t *testing.T) { +func TestClockEventsCollector_NewClocksThrottleReasonsCollector(t *testing.T) { config := &Config{ GPUDevices: DeviceOptions{ Flex: true, @@ -209,7 +209,7 @@ func TestClocksThrottleReasonsCollector_NewClocksThrottleReasonsCollector(t *tes require.NoError(t, err) item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - t.Run("Should Return Error When DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT is not present", func(t *testing.T) { + t.Run("Should Return Error When DCGM_EXP_CLOCK_EVENTS_COUNT is not present", func(t *testing.T) { records := [][]string{ {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, } @@ -217,24 +217,24 @@ func TestClocksThrottleReasonsCollector_NewClocksThrottleReasonsCollector(t *tes require.NoError(t, err) require.Len(t, cc.ExporterCounters, 0) require.Len(t, cc.DCGMCounters, 1) - collector, err := NewClocksThrottleReasonsCollector(cc.DCGMCounters, "", config, item) + collector, err := NewClockEventsCollector(cc.DCGMCounters, "", config, item) require.Error(t, err) require.Nil(t, collector) }) t.Run("Should Return Error When Counter Param Is Empty", func(t *testing.T) { counters := make([]Counter, 0) - collector, err := NewClocksThrottleReasonsCollector(counters, "", config, item) + collector, err := NewClockEventsCollector(counters, "", config, item) require.Error(t, err) require.Nil(t, collector) }) - t.Run("Should Not Return Error When DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT Present More Than Once", func(t *testing.T) { + t.Run("Should Not Return Error When DCGM_EXP_CLOCK_EVENTS_COUNT Present More Than Once", func(t *testing.T) { records := [][]string{ {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, - {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, - {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, - {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + 
{"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, } cc, err := extractCounters(records, config) require.NoError(t, err) @@ -243,13 +243,13 @@ func TestClocksThrottleReasonsCollector_NewClocksThrottleReasonsCollector(t *tes cc.ExporterCounters = append(cc.ExporterCounters, cc.DCGMCounters[i]) } } - xidCollector, err := NewClocksThrottleReasonsCollector(cc.ExporterCounters, "", config, item) + collector, err := NewClockEventsCollector(cc.ExporterCounters, "", config, item) require.NoError(t, err) - require.NotNil(t, xidCollector) + require.NotNil(t, collector) }) } -func TestClocksThrottleReasonsCollector_Gather_AllTheThings(t *testing.T) { +func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) { teardownTest := setupTest(t) defer teardownTest(t) runOnlyWithLiveGPUs(t) @@ -261,11 +261,11 @@ func TestClocksThrottleReasonsCollector_Gather_AllTheThings(t *testing.T) { MajorRange: []int{-1}, MinorRange: []int{-1}, }, - ClockThrottleReasonsCountWindowSize: int(time.Duration(5) * time.Minute), + ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute), } records := [][]string{ - {"DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT", "gauge", ""}, + {"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""}, {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, } @@ -344,7 +344,7 @@ func TestClocksThrottleReasonsCollector_Gather_AllTheThings(t *testing.T) { item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU) - collector, err := NewClocksThrottleReasonsCollector(cc.ExporterCounters, hostname, config, item) + collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item) require.NoError(t, err) defer func() { @@ -354,7 +354,7 @@ func TestClocksThrottleReasonsCollector_Gather_AllTheThings(t *testing.T) { metrics, err := collector.GetMetrics() require.NoError(t, err) require.NotEmpty(t, metrics) - // We expect 1 metric: DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT + // We expect 1 metric: 
DCGM_EXP_CLOCK_EVENTS_COUNT require.Len(t, metrics, 1) // We get metric value with 0 index metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] @@ -372,10 +372,10 @@ func TestClocksThrottleReasonsCollector_Gather_AllTheThings(t *testing.T) { require.Len(t, metricValues, 9) for _, val := range metricValues { require.Contains(t, val.Labels, "window_size_in_ms") - require.Equal(t, fmt.Sprint(config.ClockThrottleReasonsCountWindowSize), val.Labels["window_size_in_ms"]) + require.Equal(t, fmt.Sprint(config.ClockEventsCountWindowSize), val.Labels["window_size_in_ms"]) expected, exists := expectations[val.GPU] require.True(t, exists) - actualReason, exists := val.Labels["throttle_reason"] + actualReason, exists := val.Labels["clock_event"] require.True(t, exists) expectedVal, exists := expected[actualReason] require.True(t, exists) diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go index eac923c6..0dacf770 100644 --- a/pkg/dcgmexporter/config.go +++ b/pkg/dcgmexporter/config.go @@ -31,26 +31,26 @@ type DeviceOptions struct { } type Config struct { - CollectorsFile string - Address string - CollectInterval int - Kubernetes bool - KubernetesGPUIdType KubernetesGPUIDType - CollectDCP bool - UseOldNamespace bool - UseRemoteHE bool - RemoteHEInfo string - GPUDevices DeviceOptions - SwitchDevices DeviceOptions - CPUDevices DeviceOptions - NoHostname bool - UseFakeGPUs bool - ConfigMapData string - MetricGroups []dcgm.MetricGroup - WebSystemdSocket bool - WebConfigFile string - XIDCountWindowSize int - ReplaceBlanksInModelName bool - Debug bool - ClockThrottleReasonsCountWindowSize int + CollectorsFile string + Address string + CollectInterval int + Kubernetes bool + KubernetesGPUIdType KubernetesGPUIDType + CollectDCP bool + UseOldNamespace bool + UseRemoteHE bool + RemoteHEInfo string + GPUDevices DeviceOptions + SwitchDevices DeviceOptions + CPUDevices DeviceOptions + NoHostname bool + UseFakeGPUs bool + ConfigMapData string + 
MetricGroups []dcgm.MetricGroup + WebSystemdSocket bool + WebConfigFile string + XIDCountWindowSize int + ReplaceBlanksInModelName bool + Debug bool + ClockEventsCountWindowSize int } diff --git a/pkg/dcgmexporter/exporter_metrics.go b/pkg/dcgmexporter/exporter_metrics.go index b2b48478..90a31acd 100644 --- a/pkg/dcgmexporter/exporter_metrics.go +++ b/pkg/dcgmexporter/exporter_metrics.go @@ -19,16 +19,16 @@ package dcgmexporter import "fmt" const ( - dcgmExpClockThrottleReasonsCount = "DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT" - dcgmExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" + dcgmExpClockEventsCount = "DCGM_EXP_CLOCK_EVENTS_COUNT" + dcgmExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" ) type ExporterCounter uint16 const ( - DCGMFIUnknown ExporterCounter = 0 - DCGMXIDErrorsCount ExporterCounter = iota + 9000 - DCGMClockThrottleReasonsCount ExporterCounter = iota + DCGMFIUnknown ExporterCounter = 0 + DCGMXIDErrorsCount ExporterCounter = iota + 9000 + DCGMClockEventsCount ExporterCounter = iota ) // String method to convert the enum value to a string @@ -36,8 +36,8 @@ func (enm ExporterCounter) String() string { switch enm { case DCGMXIDErrorsCount: return dcgmExpXIDErrorsCount - case DCGMClockThrottleReasonsCount: - return dcgmExpClockThrottleReasonsCount + case DCGMClockEventsCount: + return dcgmExpClockEventsCount default: return "DCGM_FI_UNKNOWN" } @@ -45,9 +45,9 @@ func (enm ExporterCounter) String() string { // DCGMFields maps DCGMExporterMetric String to enum var DCGMFields = map[string]ExporterCounter{ - DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, - DCGMClockThrottleReasonsCount.String(): DCGMClockThrottleReasonsCount, - DCGMFIUnknown.String(): DCGMFIUnknown, + DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, + DCGMClockEventsCount.String(): DCGMClockEventsCount, + DCGMFIUnknown.String(): DCGMFIUnknown, } func IdentifyMetricType(s string) (ExporterCounter, error) {