Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT was renamed to the DCGM_EXP_CLOCK_EVENTS_COUNT #261

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion etc/default-counters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
# DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT, gauge, Count of clock throttle reasons within the user-specified time window (see clock-throttle-reasons-count-window-size param).
# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
Expand Down
94 changes: 47 additions & 47 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,25 @@ const (
)

const (
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
CLIClockThrottleReasonsCountWindowSize = "clock-throttle-reasons-count-window-size"
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
CLIClockEventsCountWindowSize = "clock-events-count-window-size"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -201,10 +201,10 @@ func NewApp(buildVersion ...string) *cli.App {
EnvVars: []string{"DCGM_EXPORTER_DEBUG"},
},
&cli.IntFlag{
Name: CLIClockThrottleReasonsCountWindowSize,
Name: CLIClockEventsCountWindowSize,
Value: int((5 * time.Minute).Milliseconds()),
Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.",
EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"},
Usage: "Set time window size in milliseconds (ms) for counting clock events in DCGM Exporter.",
EnvVars: []string{"DCGM_EXPORTER_CLOCK_EVENTS_COUNT_WINDOW_SIZE"},
},
}

Expand Down Expand Up @@ -362,20 +362,20 @@ restart:
logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String())
}

if dcgmexporter.IsDCGMExpClockThrottleReasonsEnabledCount(cs.ExporterCounters) {
if dcgmexporter.IsDCGMExpClockEventsCountEnabled(cs.ExporterCounters) {
item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU)
if !exists {
logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String())
logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockEventsCount.String())
}
clocksThrottleReasonsCollector, err := dcgmexporter.NewClocksThrottleReasonsCollector(
clocksThrottleReasonsCollector, err := dcgmexporter.NewClockEventsCollector(
cs.ExporterCounters, hostname, config, item)
if err != nil {
logrus.Fatal(err)
}

cRegistry.Register(clocksThrottleReasonsCollector)

logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String())
logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockEventsCount.String())
}

defer func() {
Expand Down Expand Up @@ -493,26 +493,26 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
}

return &dcgmexporter.Config{
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
ClockThrottleReasonsCountWindowSize: c.Int(CLIClockThrottleReasonsCountWindowSize),
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
ClockEventsCountWindowSize: c.Int(CLIClockEventsCountWindowSize),
}, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,42 +24,43 @@ import (
"github.com/sirupsen/logrus"
)

// IsDCGMExpClockThrottleReasonsEnabledCount checks if the DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT counter exists
func IsDCGMExpClockThrottleReasonsEnabledCount(counters []Counter) bool {
// IsDCGMExpClockEventsCountEnabled checks if the DCGM_EXP_CLOCK_EVENTS_COUNT counter exists
func IsDCGMExpClockEventsCountEnabled(counters []Counter) bool {
return slices.ContainsFunc(counters,
func(c Counter) bool {
return c.FieldName == dcgmExpClockThrottleReasonsCount
return c.FieldName == dcgmExpClockEventsCount
})
}

type clocksThrottleReasonsCollector struct {
type clockEventsCollector struct {
expCollector
}

type clocksThrottleReasonBitmask int64
type clockEventBitmask int64

// Source of the const values: https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_fields.h
const (
// DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE Nothing is running on the GPU and the clocks are dropping to Idle state
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clocksThrottleReasonBitmask = 0x0000000000000001
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clockEventBitmask = 0x0000000000000001
// DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING GPU clocks are limited by current setting of applications clocks
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clocksThrottleReasonBitmask = 0x0000000000000002
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clockEventBitmask = 0x0000000000000002
// DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP SW Power Scaling algorithm is reducing the clocks below requested clocks
DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clocksThrottleReasonBitmask = 0x0000000000000004
DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clockEventBitmask = 0x0000000000000004
// DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clocksThrottleReasonBitmask = 0x0000000000000008
DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clockEventBitmask = 0x0000000000000008
// DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost
DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clocksThrottleReasonBitmask = 0x0000000000000010
DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clockEventBitmask = 0x0000000000000010
// DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL SW Thermal Slowdown
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000020
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clockEventBitmask = 0x0000000000000020
// DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clocksThrottleReasonBitmask = 0x0000000000000040
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clockEventBitmask = 0x0000000000000040
// DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clocksThrottleReasonBitmask = 0x0000000000000080
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clockEventBitmask = 0x0000000000000080
// DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS GPU clocks are limited by current setting of Display clocks
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clocksThrottleReasonBitmask = 0x0000000000000100
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clockEventBitmask = 0x0000000000000100
)

var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{
var clockEventToString = map[clockEventBitmask]string{
// See: https://github.com/NVIDIA/DCGM/blob/6792b70c65b938d17ac9d791f59ceaadc0c7ef8a/dcgmi/CommandLineParser.cpp#L63
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE: "gpu_idle",
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING: "clocks_setting",
Expand All @@ -73,24 +74,24 @@ var clocksThrottleReasonToString = map[clocksThrottleReasonBitmask]string{
}

// String method to convert the enum value to a string
func (enm clocksThrottleReasonBitmask) String() string {
return clocksThrottleReasonToString[enm]
func (enm clockEventBitmask) String() string {
return clockEventToString[enm]
}

func (c *clocksThrottleReasonsCollector) GetMetrics() (MetricsByCounter, error) {
func (c *clockEventsCollector) GetMetrics() (MetricsByCounter, error) {
return c.expCollector.getMetrics()
}

func NewClocksThrottleReasonsCollector(counters []Counter,
func NewClockEventsCollector(counters []Counter,
hostname string,
config *Config,
fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error) {
if !IsDCGMExpClockThrottleReasonsEnabledCount(counters) {
logrus.Error(dcgmExpClockThrottleReasonsCount + " collector is disabled")
return nil, fmt.Errorf(dcgmExpClockThrottleReasonsCount + " collector is disabled")
if !IsDCGMExpClockEventsCountEnabled(counters) {
logrus.Error(dcgmExpClockEventsCount + " collector is disabled")
return nil, fmt.Errorf(dcgmExpClockEventsCount + " collector is disabled")
}

collector := clocksThrottleReasonsCollector{}
collector := clockEventsCollector{}
collector.expCollector = newExpCollector(
counters,
hostname,
Expand All @@ -100,23 +101,23 @@ func NewClocksThrottleReasonsCollector(counters []Counter,
)

collector.counter = counters[slices.IndexFunc(counters, func(c Counter) bool {
return c.FieldName == dcgmExpClockThrottleReasonsCount
return c.FieldName == dcgmExpClockEventsCount
})]

collector.labelFiller = func(metricValueLabels map[string]string, entityValue int64) {
metricValueLabels["throttle_reason"] = clocksThrottleReasonBitmask(entityValue).String()
metricValueLabels["clock_event"] = clockEventBitmask(entityValue).String()
}

collector.windowSize = config.ClockThrottleReasonsCountWindowSize
collector.windowSize = config.ClockEventsCountWindowSize

collector.fieldValueParser = func(value int64) []int64 {
var reasons []int64

// The int64 value may represent multiple reasons.
// To extract a specific reason, we test the value with a bitwise AND against that reason's bitmask.
reasonBitmask := clocksThrottleReasonBitmask(value)
// The int64 value may represent multiple events.
// To extract a specific event, we test the value with a bitwise AND against that event's bitmask.
reasonBitmask := clockEventBitmask(value)

for tr := range clocksThrottleReasonToString {
for tr := range clockEventToString {
if reasonBitmask&tr != 0 {
reasons = append(reasons, int64(tr))
}
Expand Down
Loading
Loading