Skip to content

Commit

Permalink
Merge pull request #239 from NVIDIA/adde-xid-metrics
Browse files Browse the repository at this point in the history
DCGM Xid Error Metrics Enhancements
  • Loading branch information
nvvfedorov authored Feb 5, 2024
2 parents 0518edc + a812077 commit afd3f28
Show file tree
Hide file tree
Showing 20 changed files with 837 additions and 146 deletions.
1 change: 0 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
"args": [
"-f",
"./etc/default-counters.csv",
"--web-config-file=./tests/integration/testdata/web-config.yml"
]
}
]
Expand Down
3 changes: 2 additions & 1 deletion etc/default-counters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,14 @@ DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).

# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).

# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ replace (
)

require (
github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc
github.com/avast/retry-go/v4 v4.5.1
github.com/bits-and-blooms/bitset v1.12.0
github.com/bits-and-blooms/bitset v1.13.0
github.com/gorilla/mux v1.8.1
github.com/prometheus/common v0.45.0
github.com/prometheus/exporter-toolkit v0.11.0
Expand Down
10 changes: 4 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF0
github.com/Microsoft/go-winio v0.4.14 h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU=
github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA=
github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg=
github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242 h1:H+Md4NKlMvN/rTNCVMFqRGXAgag0dRs2NsEEIfTRReM=
github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242/go.mod h1:eAZdHcOerdg1hyVoWwJ6jGQ+bxl95PfreT1S7ukI7mY=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8=
github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc h1:cpPqTnfDcYPZyvc55pdf+3PnHYZRolqp95HH9ORa12o=
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
Expand Down Expand Up @@ -101,9 +101,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
github.com/bifurcation/mint v0.0.0-20180715133206-93c51c6ce115/go.mod h1:zVt7zX3K/aDCk9Tj+VM7YymsX66ERvzCJzw8rFCX2JU=
github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84=
github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
Expand Down Expand Up @@ -367,7 +366,6 @@ github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEo
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg=
github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
Expand Down
48 changes: 45 additions & 3 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const (
and therefore reporting must occur at the GPU instance level.`
)

var (
const (
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
Expand All @@ -61,6 +61,7 @@ var (
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -174,6 +175,13 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "TLS config file following webConfig spec.",
EnvVars: []string{"DCGM_EXPORTER_WEB_CONFIG_FILE"},
},
&cli.IntFlag{
Name: CLIXIDCountWindowSize,
Aliases: []string{"x"},
Value: int((5 * time.Minute).Milliseconds()),
Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.",
EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -241,14 +249,47 @@ restart:
config.MetricGroups = groups
}

counters, exporterCounters, err := dcgmexporter.ExtractCounters(config)
if err != nil {
logrus.Fatal(err)
}

// Copy labels from counters to exporterCounters
for i := range counters {
if counters[i].PromType == "label" {
exporterCounters = append(exporterCounters, counters[i])
}
}

hostname, err := dcgmexporter.GetHostname(config)
if err != nil {
return err
}

ch := make(chan string, 10)
pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, dcgmexporter.NewDCGMCollector)

pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, counters, hostname, dcgmexporter.NewDCGMCollector)
defer cleanup()
if err != nil {
logrus.Fatal(err)
}

server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch)
cRegistry := dcgmexporter.NewRegistry()

if dcgmexporter.IsdcgmExpXIDErrorsCountEnabled(exporterCounters) {
xidCollector, err := dcgmexporter.NewXIDCollector(config, exporterCounters, hostname)
if err != nil {
logrus.Fatal(err)
}

defer func() {
xidCollector.Cleanup()
}()

cRegistry.Register(xidCollector)
}

server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry)
defer cleanup()
if err != nil {
return err
Expand Down Expand Up @@ -375,5 +416,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
}, nil
}
48 changes: 48 additions & 0 deletions pkg/dcgmexporter/const.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgmexporter

import "fmt"

// DCGMExporterMetric is a uint16-backed enumeration of the metric identifiers
// declared in this package.
type DCGMExporterMetric uint16

const (
	// DCGMFIUnknown is the zero value; String reports it as "DCGM_FI_UNKNOWN".
	DCGMFIUnknown DCGMExporterMetric = 0

	// DCGMXIDErrorsCount is the second spec of this const block, so iota is 1
	// here and the constant evaluates to 9001.
	DCGMXIDErrorsCount DCGMExporterMetric = iota + 9000
)

// String returns the metric name for enm. Every value other than
// DCGMXIDErrorsCount, including DCGMFIUnknown, yields "DCGM_FI_UNKNOWN".
func (enm DCGMExporterMetric) String() string {
	if enm == DCGMXIDErrorsCount {
		return "DCGM_EXP_XID_ERRORS_COUNT"
	}
	return "DCGM_FI_UNKNOWN"
}

// mustParseDCGMExporterMetric converts a metric name, as produced by
// DCGMExporterMetric.String, back into its DCGMExporterMetric value.
//
// It panics when s is not one of the known metric names.
func mustParseDCGMExporterMetric(s string) DCGMExporterMetric {
	// A switch avoids allocating a lookup map on every call; the case
	// expressions reuse String so the names stay defined in one place.
	switch s {
	case DCGMXIDErrorsCount.String():
		return DCGMXIDErrorsCount
	case DCGMFIUnknown.String():
		return DCGMFIUnknown
	default:
		panic(fmt.Sprintf(`cannot parse:[%s] as DCGMExporterMetric`, s))
	}
}
Loading

0 comments on commit afd3f28

Please sign in to comment.