Skip to content

Commit

Permalink
DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT renamed to the DCGM_EXP_CLOCK_EVENTS_COUNT
Browse files Browse the repository at this point in the history

Signed-off-by: Vadym Fedorov <[email protected]>
Fixed error in the metric description

Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov committed Feb 22, 2024
1 parent 0e64327 commit a3ae347
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 138 deletions.
2 changes: 1 addition & 1 deletion etc/default-counters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
# DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT, gauge, Count of clock throttle reasons within the user-specified time window (see clock-throttle-reasons-count-window-size param).
# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
Expand Down
94 changes: 47 additions & 47 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,25 @@ const (
)

const (
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
CLIClockThrottleReasonsCountWindowSize = "clock-throttle-reasons-count-window-size"
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
CLIClockEventsCountWindowSize = "clock-events-count-window-size"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -201,10 +201,10 @@ func NewApp(buildVersion ...string) *cli.App {
EnvVars: []string{"DCGM_EXPORTER_DEBUG"},
},
&cli.IntFlag{
Name: CLIClockThrottleReasonsCountWindowSize,
Name: CLIClockEventsCountWindowSize,
Value: int((5 * time.Minute).Milliseconds()),
Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.",
EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"},
Usage: "Set time window size in milliseconds (ms) for counting clock events in DCGM Exporter.",
EnvVars: []string{"DCGM_EXPORTER_CLOCK_EVENTS_COUNT_WINDOW_SIZE"},
},
}

Expand Down Expand Up @@ -362,20 +362,20 @@ restart:
logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String())
}

if dcgmexporter.IsDCGMExpClockThrottleReasonsEnabledCount(cs.ExporterCounters) {
if dcgmexporter.IsDCGMExpClockEventsCountEnabled(cs.ExporterCounters) {
item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU)
if !exists {
logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String())
logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockEventsCount.String())
}
clocksThrottleReasonsCollector, err := dcgmexporter.NewClocksThrottleReasonsCollector(
clocksThrottleReasonsCollector, err := dcgmexporter.NewClockEventsCollector(
cs.ExporterCounters, hostname, config, item)
if err != nil {
logrus.Fatal(err)
}

cRegistry.Register(clocksThrottleReasonsCollector)

logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String())
logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockEventsCount.String())
}

defer func() {
Expand Down Expand Up @@ -493,26 +493,26 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
}

return &dcgmexporter.Config{
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
ClockThrottleReasonsCountWindowSize: c.Int(CLIClockThrottleReasonsCountWindowSize),
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
ClockEventsCountWindowSize: c.Int(CLIClockEventsCountWindowSize),
}, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,42 +24,43 @@ import (
"github.com/sirupsen/logrus"
)

// IsDCGMExpClockThrottleReasonsEnabledCount checks if the DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT counter exists
func IsDCGMExpClockThrottleReasonsEnabledCount(counters []Counter) bool {
// IsDCGMExpClockEventsCountEnabled checks if the DCGM_EXP_CLOCK_EVENTS_COUNT counter exists
func IsDCGMExpClockEventsCountEnabled(counters []Counter) bool {
return slices.ContainsFunc(counters,
func(c Counter) bool {
return c.FieldName == dcgmExpClockThrottleReasonsCount
return c.FieldName == dcgmExpClockEventsCount
})
}

type clocksThrottleReasonsCollector struct {
type clockEventsCollector struct {
expCollector
}

type clocksThrottleReasonBitmask int64
type clockEventBitmask int64

// Source of the const values: https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_fields.h
const (
// DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE Nothing is running on the GPU and the clocks are dropping to Idle state
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clocksThrottleReasonBitmask = 0x0000000000000001
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clockEventBitmask = 0x0000000000000001
// DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING GPU clocks are limited by current setting of applications clocks
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clocksThrottleReasonBitmask = 0x0000000000000002
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clockEventBitmask = 0x0000000000000002
// DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP SW Power Scaling algorithm is reducing the clocks below requested clocks
DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clocksThrottleReasonBitmask = 0x0000000000000004
DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clockEventBitmask = 0x0000000000000004
// DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clocksThrottleReasonBitmask = 0x0000000000000008
DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clockEventBitmask = 0x0000000000000008
// DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost
DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clocksThrottleReasonBitmask = 0x0000000000000010
DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clockEventBitmask = 0x0000000000000010
// DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL SW Thermal Slowdown
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000020
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clockEventBitmask = 0x0000000000000020
// DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000040
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clockEventBitmask = 0x0000000000000040
// DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clocksThrottleReasonBitmask = 0x0000000000000080
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clockEventBitmask = 0x0000000000000080
// DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS GPU clocks are limited by current setting of Display clocks
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clocksThrottleReasonBitmask = 0x0000000000000100
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clockEventBitmask = 0x0000000000000100
)

var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{
var clockEventToString = map[clockEventBitmask]string{
// See: https://github.com/NVIDIA/DCGM/blob/6792b70c65b938d17ac9d791f59ceaadc0c7ef8a/dcgmi/CommandLineParser.cpp#L63
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE: "gpu_idle",
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING: "clocks_setting",
Expand All @@ -73,24 +74,24 @@ var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{
}

// String method to convert the enum value to a string
func (enm clocksThrottleReasonBitmask) String() string {
return clocksThrottleReasonToString[enm]
func (enm clockEventBitmask) String() string {
return clockEventToString[enm]
}

func (c *clocksThrottleReasonsCollector) GetMetrics() (MetricsByCounter, error) {
func (c *clockEventsCollector) GetMetrics() (MetricsByCounter, error) {
return c.expCollector.getMetrics()
}

func NewClocksThrottleReasonsCollector(counters []Counter,
func NewClockEventsCollector(counters []Counter,
hostname string,
config *Config,
fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) {
if !IsDCGMExpClockThrottleReasonsEnabledCount(counters) {
logrus.Error(dcgmExpClockThrottleReasonsCount + " collector is disabled")
return nil, fmt.Errorf(dcgmExpClockThrottleReasonsCount + " collector is disabled")
if !IsDCGMExpClockEventsCountEnabled(counters) {
logrus.Error(dcgmExpClockEventsCount + " collector is disabled")
return nil, fmt.Errorf(dcgmExpClockEventsCount + " collector is disabled")
}

collector := clocksThrottleReasonsCollector{}
collector := clockEventsCollector{}
collector.expCollector = newExpCollector(
counters,
hostname,
Expand All @@ -100,23 +101,23 @@ func NewClocksThrottleReasonsCollector(counters []Counter,
)

collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool {
return c.FieldName == dcgmExpClockThrottleReasonsCount
return c.FieldName == dcgmExpClockEventsCount
})]

collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) {
metricValueLabels["throttle_reason"] = clocksThrottleReasonBitmask(entityValue).String()
metricValueLabels["clock_event"] = clockEventBitmask(entityValue).String()
}

collector.windowSize = config.ClockThrottleReasonsCountWindowSize
collector.windowSize = config.ClockEventsCountWindowSize

collector.fieldValueParser = func(value int64) []int64 {
var reasons []int64

// The int64 value may represent multiple reasons.
// To extract a specific reason, we need to perform an XOR operation with a bitmask.
reasonBitmask := clocksThrottleReasonBitmask(value)
// The int64 value is a bit field that may have several clock events set at once.
// To test for a specific event, we perform a bitwise AND with that event's bitmask.
reasonBitmask := clockEventBitmask(value)

for tr := range clocksThrottleReasonToString {
for tr := range clockEventToString {
if reasonBitmask&tr != 0 {
reasons = append(reasons, int64(tr))
}
Expand Down
Loading

0 comments on commit a3ae347

Please sign in to comment.