Skip to content

Commit

Permalink
Merge pull request #254 from NVIDIA/enable-dcgm_fi_dev_clock_events_a…
Browse files Browse the repository at this point in the history
…ctive

 Enable DCGM_FI_DEV_CLOCK_EVENTS_ACTIVE
  • Loading branch information
nvvfedorov authored Feb 21, 2024
2 parents e7dad0c + 4c388d2 commit 543d648
Show file tree
Hide file tree
Showing 28 changed files with 1,653 additions and 539 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -232,4 +232,4 @@ $RECYCLE.BIN/
*.msp

# Windows shortcuts
*.lnk
*.lnk
21 changes: 21 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Run Debug",
"type": "go",
"request": "launch",
"mode": "debug",
"cwd": "${workspaceRoot}",
"program": "cmd/dcgm-exporter/main.go",
"args": [
"-f",
"./etc/default-counters.csv",
"--debug"
]
}
]
}
4 changes: 2 additions & 2 deletions etc/default-counters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
# DCGM_FI_EXP_CLOCK_THROTTLE_REASONS_COUNT, gauge, Count of clock throttle reasons within the user-specified time window (see clock-throttle-reasons-count-window-size param).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
Expand Down Expand Up @@ -34,7 +35,6 @@ DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encoun
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).

# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).
Expand Down Expand Up @@ -74,4 +74,4 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ require (
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.8.4
github.com/urfave/cli/v2 v2.26.0
golang.org/x/sync v0.5.0
google.golang.org/grpc v1.60.0
k8s.io/api v0.20.2
k8s.io/apimachinery v0.20.2
Expand Down Expand Up @@ -74,11 +75,11 @@ require (
github.com/prometheus/client_golang v1.17.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/stretchr/objx v0.5.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/crypto v0.17.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.13.0 // indirect
golang.org/x/sync v0.5.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/term v0.15.0 // indirect
golang.org/x/text v0.14.0 // indirect
Expand Down
1 change: 1 addition & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,7 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
Expand Down
158 changes: 108 additions & 50 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,25 @@ const (
)

const (
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
CLIClockThrottleReasonsCountWindowSize = "clock-throttle-reasons-count-window-size"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -199,6 +200,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Enable debug output",
EnvVars: []string{"DCGM_EXPORTER_DEBUG"},
},
// Window size flag for the clock throttle reasons counter.
// The default is five minutes, expressed in milliseconds.
// This flag has its own environment variable: sharing
// DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE with the XID window flag would make
// one setting silently change both windows.
&cli.IntFlag{
	Name:    CLIClockThrottleReasonsCountWindowSize,
	Value:   int((5 * time.Minute).Milliseconds()),
	Usage:   "Set time window size in milliseconds (ms) for counting clock throttle reasons in DCGM Exporter.",
	EnvVars: []string{"DCGM_EXPORTER_CLOCK_THROTTLE_REASONS_COUNT_WINDOW_SIZE"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -285,15 +292,16 @@ restart:
config.MetricGroups = groups
}

counters, exporterCounters, err := dcgmexporter.ExtractCounters(config)
cs, err := dcgmexporter.GetCounterSet(config)

if err != nil {
logrus.Fatal(err)
}

// Copy labels from counters to exporterCounters
for i := range counters {
if counters[i].PromType == "label" {
exporterCounters = append(exporterCounters, counters[i])
// Copy labels from DCGM Counters to ExporterCounters
for i := range cs.DCGMCounters {
if cs.DCGMCounters[i].PromType == "label" {
cs.ExporterCounters = append(cs.ExporterCounters, cs.DCGMCounters[i])
}
}

Expand All @@ -302,29 +310,78 @@ restart:
return err
}

allCounters := []dcgmexporter.Counter{}

allCounters = append(allCounters, cs.DCGMCounters...)
allCounters = append(allCounters,
dcgmexporter.Counter{
FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
},
dcgmexporter.Counter{
FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS,
},
)

fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config)

// Try to load system info for every monitored entity group type. A group
// that fails to load is not fatal: it is skipped, and the cause is logged
// so the missing metrics can be diagnosed.
for _, egt := range dcgmexporter.FieldEntityGroupTypeToMonitor {
	if err := fieldEntityGroupTypeSystemInfo.Load(egt); err != nil {
		logrus.Infof("Not collecting %s metrics; %v", egt.String(), err)
	}
}

ch := make(chan string, 10)

pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, counters, hostname, dcgmexporter.NewDCGMCollector)
pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config,
cs.DCGMCounters,
hostname,
dcgmexporter.NewDCGMCollector,
fieldEntityGroupTypeSystemInfo,
)
defer cleanup()
if err != nil {
logrus.Fatal(err)
}

cRegistry := dcgmexporter.NewRegistry()

if dcgmexporter.IsdcgmExpXIDErrorsCountEnabled(exporterCounters) {
xidCollector, err := dcgmexporter.NewXIDCollector(config, exporterCounters, hostname)
if dcgmexporter.IsDCGMExpXIDErrorsCountEnabled(cs.ExporterCounters) {
item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU)
if !exists {
logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMXIDErrorsCount.String())
}

xidCollector, err := dcgmexporter.NewXIDCollector(cs.ExporterCounters, hostname, config, item)
if err != nil {
logrus.Fatal(err)
}

defer func() {
xidCollector.Cleanup()
}()

cRegistry.Register(xidCollector)

logrus.Infof("%s collector initialized", dcgmexporter.DCGMXIDErrorsCount.String())
}

if dcgmexporter.IsDCGMExpClockThrottleReasonsEnabledCount(cs.ExporterCounters) {
item, exists := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU)
if !exists {
logrus.Fatalf("%s collector cannot be initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String())
}
clocksThrottleReasonsCollector, err := dcgmexporter.NewClocksThrottleReasonsCollector(
cs.ExporterCounters, hostname, config, item)
if err != nil {
logrus.Fatal(err)
}

cRegistry.Register(clocksThrottleReasonsCollector)

logrus.Infof("%s collector initialized", dcgmexporter.DCGMClockThrottleReasonsCount.String())
}

defer func() {
cRegistry.Cleanup()
}()

server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry)
defer cleanup()
if err != nil {
Expand Down Expand Up @@ -436,25 +493,26 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
}

return &dcgmexporter.Config{
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
ClockThrottleReasonsCountWindowSize: c.Int(CLIClockThrottleReasonsCountWindowSize),
}, nil
}
Loading

0 comments on commit 543d648

Please sign in to comment.