diff --git a/.vscode/launch.json b/.vscode/launch.json index 1d80a557..bf62b591 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -14,7 +14,6 @@ "args": [ "-f", "./etc/default-counters.csv", - "--web-config-file=./tests/integration/testdata/web-config.yml" ] } ] diff --git a/etc/default-counters.csv b/etc/default-counters.csv index c5280a5c..92dabab3 100644 --- a/etc/default-counters.csv +++ b/etc/default-counters.csv @@ -26,13 +26,14 @@ DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). # Errors and violations -DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). # Memory usage DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). diff --git a/go.mod b/go.mod index c6cac73c..5a9bcdf1 100644 --- a/go.mod +++ b/go.mod @@ -28,10 +28,10 @@ replace ( ) require ( - github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242 + github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc github.com/avast/retry-go/v4 v4.5.1 - github.com/bits-and-blooms/bitset v1.12.0 + github.com/bits-and-blooms/bitset v1.13.0 github.com/gorilla/mux v1.8.1 github.com/prometheus/common v0.45.0 github.com/prometheus/exporter-toolkit v0.11.0 diff --git a/go.sum b/go.sum index 35afbc10..ded8cede 100644 --- a/go.sum +++ b/go.sum @@ -56,8 +56,8 @@ github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF0 github.com/Microsoft/go-winio v0.4.14 h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU= github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg= -github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242 h1:H+Md4NKlMvN/rTNCVMFqRGXAgag0dRs2NsEEIfTRReM= -github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242/go.mod h1:eAZdHcOerdg1hyVoWwJ6jGQ+bxl95PfreT1S7ukI7mY= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4= github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc h1:cpPqTnfDcYPZyvc55pdf+3PnHYZRolqp95HH9ORa12o= github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= @@ -101,9 +101,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bifurcation/mint v0.0.0-20180715133206-93c51c6ce115/go.mod h1:zVt7zX3K/aDCk9Tj+VM7YymsX66ERvzCJzw8rFCX2JU= -github.com/bits-and-blooms/bitset v1.2.1/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= -github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= -github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= @@ -367,7 +366,6 @@ github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEo github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= -github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 800db21e..388bf60b 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -45,7 +45,7 @@ const ( and therefore reporting must occur at the GPU instance level.` ) -var ( +const ( CLIFieldsFile = "collectors" CLIAddress = "address" CLICollectInterval = "collect-interval" @@ -61,6 +61,7 @@ var ( CLIConfigMapData = "configmap-data" CLIWebSystemdSocket = "web-systemd-socket" CLIWebConfigFile = "web-config-file" + CLIXIDCountWindowSize = "xid-count-window-size" ) func NewApp(buildVersion ...string) *cli.App { @@ -174,6 +175,13 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "TLS config file following webConfig spec.", EnvVars: []string{"DCGM_EXPORTER_WEB_CONFIG_FILE"}, }, + &cli.IntFlag{ + Name: CLIXIDCountWindowSize, + Aliases: []string{"x"}, + Value: int((5 * time.Minute).Milliseconds()), + Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.", + EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"}, + }, } if runtime.GOOS == "linux" { @@ -241,14 +249,47 @@ restart: config.MetricGroups = groups } + counters, exporterCounters, err := dcgmexporter.ExtractCounters(config) + if err != nil { + logrus.Fatal(err) + } + + // Copy labels from counters to exporterCounters + for i := range counters { + if counters[i].PromType == "label" { + exporterCounters = append(exporterCounters, counters[i]) + } + } + + hostname, err := dcgmexporter.GetHostname(config) + if err != nil { + return err + } + ch := make(chan string, 10) - pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, dcgmexporter.NewDCGMCollector) + + pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, counters, hostname, dcgmexporter.NewDCGMCollector) defer cleanup() if err != nil { logrus.Fatal(err) } - server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch) + cRegistry := dcgmexporter.NewRegistry() + + if dcgmexporter.IsdcgmExpXIDErrorsCountEnabled(exporterCounters) { + xidCollector, err := dcgmexporter.NewXIDCollector(config, exporterCounters, hostname) + if err != nil { + logrus.Fatal(err) + } + + defer func() { + xidCollector.Cleanup() + }() + + cRegistry.Register(xidCollector) + } + + server, cleanup, err := dcgmexporter.NewMetricsServer(config, ch, cRegistry) defer cleanup() if err != nil { return err @@ -375,5 +416,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { ConfigMapData: c.String(CLIConfigMapData), WebSystemdSocket: c.Bool(CLIWebSystemdSocket), WebConfigFile: c.String(CLIWebConfigFile), + XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), }, nil } diff --git a/pkg/dcgmexporter/const.go b/pkg/dcgmexporter/const.go new file mode 100644 index 00000000..c590da91 --- /dev/null +++ b/pkg/dcgmexporter/const.go @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import "fmt" + +type DCGMExporterMetric uint16 + +const ( + DCGMFIUnknown DCGMExporterMetric = 0 + DCGMXIDErrorsCount DCGMExporterMetric = iota + 9000 +) + +// String method to convert the enum value to a string +func (enm DCGMExporterMetric) String() string { + switch enm { + case DCGMXIDErrorsCount: + return "DCGM_EXP_XID_ERRORS_COUNT" + default: + return "DCGM_FI_UNKNOWN" + } +} + +func mustParseDCGMExporterMetric(s string) DCGMExporterMetric { + metrics := map[string]DCGMExporterMetric{ + DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, + DCGMFIUnknown.String(): DCGMFIUnknown, + } + mv, ok := metrics[s] + if !ok { + panic(fmt.Sprintf(`cannot parse:[%s] as DCGMExporterMetric`, s)) + } + return mv +} diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index 8f2ba2d5..a3cc28cf 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -24,41 +24,29 @@ import ( "github.com/sirupsen/logrus" ) -type DCGMCollectorConstructor func([]Counter, *Config, dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) +type DCGMCollectorConstructor func([]Counter, *Config, string, dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) -func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { +func NewDCGMCollector(c []Counter, config *Config, hostname string, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { var deviceFields = NewDeviceFields(c, entityType) if !ShouldMonitorDeviceType(deviceFields, entityType) { return nil, func() {}, fmt.Errorf("No fields to watch for device type: %d", entityType) } - sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGPUs, entityType) + sysInfo, err := GetSystemInfo(config, entityType) if err != nil { return nil, func() {}, err } - hostname := "" - if config.NoHostname == false { - if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { - hostname = nodeName - } else { - hostname, err = os.Hostname() - if err != nil { - return nil, func() {}, err - } - } - } - collector := &DCGMCollector{ Counters: c, DeviceFields: deviceFields, UseOldNamespace: config.UseOldNamespace, - SysInfo: sysInfo, + SysInfo: *sysInfo, Hostname: hostname, } - cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo, int64(config.CollectInterval)*1000) + cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, *sysInfo, int64(config.CollectInterval)*1000) if err != nil { logrus.Fatal("Failed to watch metrics: ", err) } @@ -68,19 +56,43 @@ func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_ return collector, func() { collector.Cleanup() }, nil } +func GetSystemInfo(config *Config, entityType dcgm.Field_Entity_Group) (*SystemInfo, error) { + sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGPUs, entityType) + if err != nil { + return nil, err + } + return &sysInfo, err +} + +func GetHostname(config *Config) (string, error) { + hostname := "" + var err error + if !config.NoHostname { + if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { + hostname = nodeName + } else { + hostname, err = os.Hostname() + if err != nil { + return "", err + } + } + } + return hostname, nil +} + func (c *DCGMCollector) Cleanup() { for _, c := range c.Cleanups { c() } } -func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { +func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) { monitoringInfo := GetMonitoredEntities(c.SysInfo) count := len(monitoringInfo) - metrics := make([][]Metric, count) + metrics := make(map[Counter][]Metric, count) - for i, mi := range monitoringInfo { + for _, mi := range monitoringInfo { var vals []dcgm.FieldValue_v1 var err error if mi.Entity.EntityGroupId == dcgm.FE_LINK { @@ -100,11 +112,11 @@ func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { // InstanceInfo will be nil for GPUs if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK { - metrics[i] = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) + metrics = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE { - metrics[i] = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) + metrics = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else { - metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) + metrics = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) } } @@ -123,19 +135,19 @@ func ShouldMonitorDeviceType(fields []dcgm.Short, entityType dcgm.Field_Entity_G return true } -func FindCounterField(c []Counter, fieldId uint) (*Counter, error) { +func FindCounterField(c []Counter, fieldId uint) (Counter, error) { for i := 0; i < len(c); i++ { if uint(c[i].FieldID) == fieldId { - return &c[i], nil + return c[i], nil } } - return &c[0], fmt.Errorf("Could not find corresponding counter") + return c[0], fmt.Errorf("Could not find corresponding counter") } -func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric { - var metrics []Metric - var labels = map[string]string{} +func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) map[Counter][]Metric { + metrics := make(map[Counter][]Metric) + labels := map[string]string{} for _, val := range values { v := ToString(val) @@ -167,18 +179,19 @@ func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, GPUDevice: fmt.Sprintf("nvswitch%d", mi.ParentId), GPUModelName: "", Hostname: hostname, - Labels: &labels, + Labels: labels, Attributes: nil, } } - metrics = append(metrics, m) + + metrics[m.Counter] = append(metrics[m.Counter], m) } return metrics } -func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric { - var metrics []Metric +func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) map[Counter][]Metric { + metrics := make(map[Counter][]Metric) var labels = map[string]string{} for _, val := range values { @@ -211,18 +224,19 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us GPUDevice: fmt.Sprintf("%d", mi.ParentId), GPUModelName: "", Hostname: hostname, - Labels: &labels, + Labels: labels, Attributes: nil, } } - metrics = append(metrics, m) + + metrics[m.Counter] = append(metrics[m.Counter], m) } return metrics } -func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string) []Metric { - var metrics []Metric +func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string) map[Counter][]Metric { + metrics := make(map[Counter][]Metric) var labels = map[string]string{} for _, val := range values { @@ -256,7 +270,7 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI GPUModelName: d.Identifiers.Model, Hostname: hostname, - Labels: &labels, + Labels: labels, Attributes: map[string]string{}, } if instanceInfo != nil { @@ -266,7 +280,8 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI m.MigProfile = "" m.GPUInstanceID = "" } - metrics = append(metrics, m) + + metrics[m.Counter] = []Metric{m} } return metrics @@ -321,8 +336,6 @@ func ToString(value dcgm.FieldValue_v1) string { default: return v } - default: - return FailedToConvert } return FailedToConvert diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index c1844e6a..760e3072 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -62,7 +62,11 @@ func TestDCGMCollector(t *testing.T) { } func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { - dOpt := DeviceOptions{true, []int{-1}, []int{-1}} + dOpt := DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + } cfg := Config{ GPUDevices: dOpt, NoHostname: false, @@ -115,33 +119,33 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun dcgmAddEntityToGroup = dcgm.AddEntityToGroup }() - g, cleanup, err := NewDCGMCollector(counters, &cfg, dcgm.FE_GPU) + g, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_GPU) require.NoError(t, err) /* Test for error when no switches are available to monitor. */ - _, _, err = NewDCGMCollector(counters, &cfg, dcgm.FE_SWITCH) + _, _, err = NewDCGMCollector(counters, &cfg, "", dcgm.FE_SWITCH) require.Error(t, err) /* Test for error when no cpus are available to monitor. */ - _, _, err = NewDCGMCollector(counters, &cfg, dcgm.FE_CPU) + _, _, err = NewDCGMCollector(counters, &cfg, "", dcgm.FE_CPU) require.NoError(t, err) out, err := g.GetMetrics() require.NoError(t, err) require.Greater(t, len(out), 0, "Check that you have a GPU on this node") - require.Len(t, out[0], len(expectedMetrics)) + require.Len(t, out, len(expectedMetrics)) - for i, dev := range out { - seenMetrics := map[string]bool{} - for _, metric := range dev { + seenMetrics := map[string]bool{} + for _, metrics := range out { + for _, metric := range metrics { seenMetrics[metric.Counter.FieldName] = true - require.Equal(t, metric.GPU, fmt.Sprintf("%d", i)) + require.NotEmpty(t, metric.GPU) require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) } - require.Equal(t, seenMetrics, expectedMetrics) } + require.Equal(t, seenMetrics, expectedMetrics) return g, cleanup } @@ -202,18 +206,18 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun }() /* Test that only cpu metrics are collected for cpu entities. */ - c, cleanup, err := NewDCGMCollector(counters, &cfg, dcgm.FE_CPU) + c, cleanup, err := NewDCGMCollector(counters, &cfg, "", dcgm.FE_CPU) require.NoError(t, err) out, err := c.GetMetrics() require.NoError(t, err) require.Greater(t, len(out), 0, "Check that the fake CPU has been registered") - for i, dev := range out { + for _, dev := range out { seenMetrics := map[string]bool{} for _, metric := range dev { seenMetrics[metric.Counter.FieldName] = true - require.Equal(t, metric.GPU, fmt.Sprintf("%d", i)) + require.NotEmpty(t, metric.GPU) require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index 357fd9a4..63c04f12 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -54,7 +54,7 @@ func (p *PodMapper) Name() string { return "podMapper" } -func (p *PodMapper) Process(metrics [][]Metric, sysInfo SystemInfo) error { +func (p *PodMapper) Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error { _, err := os.Stat(socketPath) if os.IsNotExist(err) { logrus.Infof("No Kubelet socket, ignoring") @@ -77,20 +77,20 @@ func (p *PodMapper) Process(metrics [][]Metric, sysInfo SystemInfo) error { // Note: for loop are copies the value, if we want to change the value // and not the copy, we need to use the indexes - for i, device := range metrics { - for j, val := range device { + for counter := range metrics { + for j, val := range metrics[counter] { deviceID, err := val.getIDOfType(p.Config.KubernetesGPUIdType) if err != nil { return err } if !p.Config.UseOldNamespace { - metrics[i][j].Attributes[podAttribute] = deviceToPod[deviceID].Name - metrics[i][j].Attributes[namespaceAttribute] = deviceToPod[deviceID].Namespace - metrics[i][j].Attributes[containerAttribute] = deviceToPod[deviceID].Container + metrics[counter][j].Attributes[podAttribute] = deviceToPod[deviceID].Name + metrics[counter][j].Attributes[namespaceAttribute] = deviceToPod[deviceID].Namespace + metrics[counter][j].Attributes[containerAttribute] = deviceToPod[deviceID].Container } else { - metrics[i][j].Attributes[oldPodAttribute] = deviceToPod[deviceID].Name - metrics[i][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceID].Namespace - metrics[i][j].Attributes[oldContainerAttribute] = deviceToPod[deviceID].Container + metrics[counter][j].Attributes[oldPodAttribute] = deviceToPod[deviceID].Name + metrics[counter][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceID].Namespace + metrics[counter][j].Attributes[oldContainerAttribute] = deviceToPod[deviceID].Container } } } diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index 3a8332a4..045315f9 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "os" + "reflect" "testing" "time" @@ -51,11 +52,14 @@ func TestProcessPodMapper(t *testing.T) { out, err := c.GetMetrics() require.NoError(t, err) - original := append(out[:0:0], out...) + + original := out + + arbirtaryMetric := out[reflect.ValueOf(out).MapKeys()[0].Interface().(Counter)] socketPath = tmpDir + "/kubelet.sock" server := grpc.NewServer() - gpus := GetGPUUUIDs(original) + gpus := GetGPUUUIDs(arbirtaryMetric) podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus)) cleanup = StartMockServer(t, server, socketPath) @@ -68,25 +72,22 @@ func TestProcessPodMapper(t *testing.T) { require.NoError(t, err) require.Len(t, out, len(original)) - for i, dev := range out { - for _, metric := range dev { + for _, metrics := range out { + for _, metric := range metrics { require.Contains(t, metric.Attributes, podAttribute) require.Contains(t, metric.Attributes, namespaceAttribute) require.Contains(t, metric.Attributes, containerAttribute) - - // TODO currently we rely on ordering and implicit expectations of the mock implementation - // This should be a table comparison - require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%d", i)) + require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%s", metric.GPU)) require.Equal(t, metric.Attributes[namespaceAttribute], "default") require.Equal(t, metric.Attributes[containerAttribute], "default") } } } -func GetGPUUUIDs(metrics [][]Metric) []string { +func GetGPUUUIDs(metrics []Metric) []string { gpus := make([]string, len(metrics)) for i, dev := range metrics { - gpus[i] = dev[0].GPUUUID + gpus[i] = dev.GPUUUID } return gpus @@ -255,24 +256,27 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: tc.KubernetesGPUIDType}) require.NoError(t, err) require.NotNil(t, podMapper) - metrics := [][]Metric{ - { - { - GPU: "0", - GPUUUID: tc.MetricGPUID, - GPUDevice: tc.MetricGPUDevice, - GPUInstanceID: fmt.Sprint(tc.GPUInstanceID), - Value: "42", - MigProfile: tc.MetricMigProfile, - Counter: &Counter{ - FieldID: 155, - FieldName: "DCGM_FI_DEV_POWER_USAGE", - PromType: "gauge", - }, - Attributes: map[string]string{}, - }, - }, + metrics := map[Counter][]Metric{} + counter := Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", } + metrics[counter] = append(metrics[counter], Metric{ + GPU: "0", + GPUUUID: tc.MetricGPUID, + GPUDevice: tc.MetricGPUDevice, + GPUInstanceID: fmt.Sprint(tc.GPUInstanceID), + Value: "42", + MigProfile: tc.MetricMigProfile, + Counter: Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + Attributes: map[string]string{}, + }) + sysInfo := SystemInfo{ GPUCount: 1, GPUs: [32]GPUInfo{ @@ -288,7 +292,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { err = podMapper.Process(metrics, sysInfo) require.NoError(t, err) assert.Len(t, metrics, 1) - for _, metric := range metrics[0] { + for _, metric := range metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] { require.Contains(t, metric.Attributes, podAttribute) require.Contains(t, metric.Attributes, namespaceAttribute) require.Contains(t, metric.Attributes, containerAttribute) diff --git a/pkg/dcgmexporter/parser.go b/pkg/dcgmexporter/parser.go index 67d8b7c8..0c76bab9 100644 --- a/pkg/dcgmexporter/parser.go +++ b/pkg/dcgmexporter/parser.go @@ -37,7 +37,7 @@ const ( DCP_FIELDS_START = 1000 ) -func ExtractCounters(c *Config) ([]Counter, error) { +func ExtractCounters(c *Config) ([]Counter, []Counter, error) { var err error var records [][]string @@ -61,16 +61,16 @@ func ExtractCounters(c *Config) ([]Counter, error) { records, err = ReadCSVFile(c.CollectorsFile) if err != nil { logrus.Errorf("Could not read metrics file '%s': %v\n", c.CollectorsFile, err) - return nil, err + return nil, nil, err } } - counters, err := extractCounters(records, c) + counters, extraCounters, err := extractCounters(records, c) if err != nil { - return nil, err + return nil, nil, err } - return counters, err + return counters, extraCounters, err } func ReadCSVFile(filename string) ([][]string, error) { @@ -88,8 +88,9 @@ func ReadCSVFile(filename string) ([][]string, error) { return records, err } -func extractCounters(records [][]string, c *Config) ([]Counter, error) { +func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error) { f := make([]Counter, 0, len(records)) + expf := make([]Counter, 0) for i, record := range records { var useOld = false @@ -102,13 +103,20 @@ func extractCounters(records [][]string, c *Config) ([]Counter, error) { } if len(record) != 3 { - return nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, record) + return nil, nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, record) } fieldID, ok := dcgm.DCGM_FI[record[0]] oldFieldID, oldOk := dcgm.OLD_DCGM_FI[record[0]] if !ok && !oldOk { - return nil, fmt.Errorf("Could not find DCGM field %s", record[0]) + + expField := mustParseDCGMExporterMetric(record[0]) + if expField != DCGMFIUnknown { + expf = append(expf, Counter{dcgm.Short(expField), record[0], record[1], record[2]}) + continue + } else { + return nil, nil, fmt.Errorf("Could not find DCGM field %s", record[0]) + } } if !ok && oldOk { @@ -122,7 +130,7 @@ func extractCounters(records [][]string, c *Config) ([]Counter, error) { } if _, ok := promMetricType[record[1]]; !ok { - return nil, fmt.Errorf("Could not find Prometheus metric type %s", record[1]) + return nil, nil, fmt.Errorf("Could not find Prometheus metric type %s", record[1]) } f = append(f, Counter{fieldID, record[0], record[1], record[2]}) @@ -133,7 +141,7 @@ func extractCounters(records [][]string, c *Config) ([]Counter, error) { } if _, ok := promMetricType[record[1]]; !ok { - return nil, fmt.Errorf("Could not find Prometheus metric type %s", record[1]) + return nil, nil, fmt.Errorf("Could not find Prometheus metric type %s", record[1]) } f = append(f, Counter{oldFieldID, record[0], record[1], record[2]}) @@ -141,7 +149,7 @@ func extractCounters(records [][]string, c *Config) ([]Counter, error) { } } - return f, nil + return f, expf, nil } func fieldIsSupported(fieldID uint, c *Config) bool { diff --git a/pkg/dcgmexporter/parser_test.go b/pkg/dcgmexporter/parser_test.go index 163535ea..eff8197c 100644 --- a/pkg/dcgmexporter/parser_test.go +++ b/pkg/dcgmexporter/parser_test.go @@ -1,7 +1,6 @@ package dcgmexporter import ( - "io/ioutil" "os" "testing" @@ -100,7 +99,7 @@ func TestInvalidConfigMapNamespace(t *testing.T) { } func TestExtractCounters(t *testing.T) { - tmpFile, err := ioutil.TempFile(os.TempDir(), "prefix-") + tmpFile, err := os.CreateTemp(os.TempDir(), "prefix-") if err != nil { t.Fatalf("Cannot create temporary file: %v", err) } @@ -122,7 +121,7 @@ func TestExtractCounters(t *testing.T) { ConfigMapData: undefinedConfigMapData, CollectorsFile: tmpFile.Name(), } - records, err := ExtractCounters(&c) + records, _, err := ExtractCounters(&c) if len(records) != 1 || err != nil { t.Fatalf("Should have succeeded: records (%d != 1) err=%v", len(records), err) } diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index 99410868..89888667 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -27,42 +27,37 @@ import ( "github.com/sirupsen/logrus" ) -func NewMetricsPipeline(c *Config, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) { - counters, err := ExtractCounters(c) - if err != nil { - return nil, func() {}, err - } - +func NewMetricsPipeline(c *Config, counters []Counter, hostname string, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) { cleanups := []func(){} - gpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_GPU) + gpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_GPU) if err != nil { logrus.Info("Not collecting gpu metrics: ", err) } else { cleanups = append(cleanups, cleanup) } - switchCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_SWITCH) + switchCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_SWITCH) if err != nil { logrus.Info("Not collecting switch metrics: ", err) } else { cleanups = append(cleanups, cleanup) } - linkCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_LINK) + linkCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_LINK) if err != nil { logrus.Info("Not collecting link metrics: ", err) } else { cleanups = append(cleanups, cleanup) } - cpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_CPU) + cpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_CPU) if err != nil { logrus.Info("Not collecting cpu metrics: ", err) } else { cleanups = append(cleanups, cleanup) } - coreCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_CPU_CORE) + coreCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_CPU_CORE) if err != nil { logrus.Info("Not collecting cpu core metrics: ", err) } else { @@ -152,7 +147,7 @@ func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.W } func (m *MetricsPipeline) run() (string, error) { - var metrics [][]Metric + var metrics map[Counter][]Metric var err error var formatted string @@ -333,15 +328,7 @@ var cpuCoreMetricsFormat = ` {{ end }}` // Template is passed here so that it isn't recompiled at each iteration -func FormatMetrics(t *template.Template, m [][]Metric) (string, error) { - // Group metrics by counter instead of by device - groupedMetrics := make(map[*Counter][]Metric) - for _, deviceMetrics := range m { - for _, deviceMetric := range deviceMetrics { - groupedMetrics[deviceMetric.Counter] = append(groupedMetrics[deviceMetric.Counter], deviceMetric) - } - } - +func FormatMetrics(t *template.Template, groupedMetrics map[Counter][]Metric) (string, error) { // Format metrics var res bytes.Buffer if err := t.Execute(&res, groupedMetrics); err != nil { diff --git a/pkg/dcgmexporter/pipeline_test.go b/pkg/dcgmexporter/pipeline_test.go index f86589f5..8e643ba0 100644 --- a/pkg/dcgmexporter/pipeline_test.go +++ b/pkg/dcgmexporter/pipeline_test.go @@ -22,6 +22,7 @@ import ( "testing" "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" ) @@ -34,6 +35,7 @@ func TestRun(t *testing.T) { defer cleanup() p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c) + require.NoError(t, err) defer cleanup() out, err := p.run() @@ -47,7 +49,7 @@ func TestRun(t *testing.T) { } func testNewDCGMCollector(counter *int, enabledCollector map[dcgm.Field_Entity_Group]struct{}) DCGMCollectorConstructor { - return func(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { + return func(c []Counter, config *Config, hostname string, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { // should always create GPU Collector if entityType != dcgm.FE_GPU { if _, ok := enabledCollector[entityType]; !ok { @@ -121,11 +123,19 @@ func TestCountPipelineCleanup(t *testing.T) { }, }} { cleanupCounter := 0 - _, cleanup, err := NewMetricsPipeline(&Config{ + + config := &Config{ Kubernetes: false, ConfigMapData: undefinedConfigMapData, CollectorsFile: f.Name(), - }, testNewDCGMCollector(&cleanupCounter, c.enabledCollector)) + } + + counters, _, err := ExtractCounters(config) + if err != nil { + logrus.Fatal(err) + } + + _, cleanup, err := NewMetricsPipeline(config, counters, "", testNewDCGMCollector(&cleanupCounter, c.enabledCollector)) require.NoError(t, err, "case: %s failed", c.name) cleanup() diff --git a/pkg/dcgmexporter/registry.go b/pkg/dcgmexporter/registry.go new file mode 100644 index 00000000..112d5fb3 --- /dev/null +++ b/pkg/dcgmexporter/registry.go @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import "sync" + +type Registry struct { + collectors []Collector + mtx sync.RWMutex +} + +func NewRegistry() *Registry { + return &Registry{ + collectors: make([]Collector, 0), + } +} + +func (r *Registry) Register(c Collector) { + r.collectors = append(r.collectors, c) +} + +func (r *Registry) Gather() (map[Counter][]Metric, error) { + r.mtx.Lock() + defer r.mtx.Unlock() + + output := map[Counter][]Metric{} + + for _, c := range r.collectors { + metrics, err := c.GetMetrics() + + if err != nil { + return nil, err + } + + for counter, metricVals := range metrics { + output[counter] = append(output[counter], metricVals...) + } + } + + return output, nil +} diff --git a/pkg/dcgmexporter/server.go b/pkg/dcgmexporter/server.go index 0383c46a..b144d367 100644 --- a/pkg/dcgmexporter/server.go +++ b/pkg/dcgmexporter/server.go @@ -28,7 +28,7 @@ import ( "github.com/sirupsen/logrus" ) -func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), error) { +func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*MetricsServer, func(), error) { router := mux.NewRouter() serverv1 := &MetricsServer{ server: &http.Server{ @@ -44,6 +44,7 @@ func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), e }, metricsChan: metrics, metrics: "", + registry: registry, } router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { @@ -106,6 +107,10 @@ func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) { w.Header().Set("X-Content-Type-Options", "nosniff") w.WriteHeader(http.StatusOK) w.Write([]byte(s.getMetrics())) + xidMetrics, err := s.registry.Gather() + if err == nil { + encodeXIDMetrics(w, xidMetrics) + } } func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) { diff --git a/pkg/dcgmexporter/test_utils.go b/pkg/dcgmexporter/test_utils.go new file mode 100644 index 00000000..ebac00c4 --- /dev/null +++ b/pkg/dcgmexporter/test_utils.go @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" +) + +func setupTest(t *testing.T) func(t *testing.T) { + cleanup, err := dcgm.Init(dcgm.Embedded) + assert.NoError(t, err) + + return func(t *testing.T) { + defer cleanup() + } +} diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index 799d70af..9844c7ed 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -78,10 +78,11 @@ type Config struct { MetricGroups []dcgm.MetricGroup WebSystemdSocket bool WebConfigFile string + XIDCountWindowSize int } type Transform interface { - Process(metrics [][]Metric, sysInfo SystemInfo) error + Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error Name() string } @@ -120,7 +121,7 @@ type Counter struct { } type Metric struct { - Counter *Counter + Counter Counter Value string GPU string @@ -134,7 +135,7 @@ type Metric struct { GPUInstanceID string Hostname string - Labels *map[string]string + Labels map[string]string Attributes map[string]string } @@ -167,6 +168,7 @@ type MetricsServer struct { webConfig *web.FlagConfig metrics string metricsChan chan string + registry *Registry } type PodMapper struct { diff --git a/pkg/dcgmexporter/xid_collector.go b/pkg/dcgmexporter/xid_collector.go new file mode 100644 index 00000000..202a1b70 --- /dev/null +++ b/pkg/dcgmexporter/xid_collector.go @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "fmt" + "io" + "maps" + "math/rand" + "slices" + "sync" + "text/template" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" +) + +const ( + dcgmExpXIDErrorsCount = "DCGM_EXP_XID_ERRORS_COUNT" + windowSizeInMSLabel = "window_size_in_ms" +) + +var xidMetricsFormat = ` +{{- range $counter, $metrics := . -}} +# HELP {{ $counter.FieldName }} {{ $counter.Help }} +# TYPE {{ $counter.FieldName }} {{ $counter.PromType }} +{{- range $metric := $metrics }} +{{ $counter.FieldName }}{gpu="{{ $metric.GPU }}",{{ $metric.UUID }}="{{ $metric.GPUUUID }}",device="{{ $metric.GPUDevice }}",modelName="{{ $metric.GPUModelName }}"{{if $metric.MigProfile}},GPU_I_PROFILE="{{ $metric.MigProfile }}",GPU_I_ID="{{ $metric.GPUInstanceID }}"{{end}}{{if $metric.Hostname }},Hostname="{{ $metric.Hostname }}"{{end}} + +{{- range $k, $v := $metric.Labels -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} +{{- range $k, $v := $metric.Attributes -}} + ,{{ $k }}="{{ $v }}" +{{- end -}} + +} {{ $metric.Value -}} +{{- end }} +{{ end }}` + +type Collector interface { + GetMetrics() (map[Counter][]Metric, error) + Cleanup() +} + +type xidCollector struct { + sysInfo *SystemInfo + counter Counter + hostname string + config *Config + deviceFields []dcgm.Short + labelsCounters []Counter + cleanups []func() +} + +func (c *xidCollector) GetMetrics() (map[Counter][]Metric, error) { + // Create a group of fields + const ( + xid int = iota + ) + + deviceFields := make([]dcgm.Short, 1) + deviceFields[xid] = dcgm.DCGM_FI_DEV_XID_ERRORS + + fieldGroupName := fmt.Sprintf("fieldGroupName%d", rand.Uint64()) + fieldsGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) + if err != nil { + return nil, err + } + + defer func() { + _ = dcgm.FieldGroupDestroy(fieldsGroup) + }() + + err = dcgm.UpdateAllFields() + if err != nil { + return nil, err + } + + mapEntityIDToValues := map[uint]map[int64]int{} + + window := time.Now().Add(-time.Duration(c.config.XIDCountWindowSize) * time.Millisecond) + + values, _, err := dcgm.GetValuesSince(dcgm.GroupAllGPUs(), fieldsGroup, window) + if err != nil { + return nil, err + } + + for _, val := range values { + if val.Status == 0 { + if _, exists := mapEntityIDToValues[val.EntityId]; !exists { + mapEntityIDToValues[val.EntityId] = map[int64]int{} + } + mapEntityIDToValues[val.EntityId][val.Int64()] += 1 + } + } + + labels := map[string]string{} + labels[windowSizeInMSLabel] = fmt.Sprint(c.config.XIDCountWindowSize) + + monitoringInfo := GetMonitoredEntities(*c.sysInfo) + metrics := make(map[Counter][]Metric) + useOld := c.config.UseOldNamespace + uuid := "UUID" + if useOld { + uuid = "uuid" + } + for _, mi := range monitoringInfo { + vals, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.deviceFields) + if err != nil { + return nil, err + } + // Extract Labels + for _, val := range vals { + v := ToString(val) + // Filter out counters with no value and ignored fields for this entity + if v == SkipDCGMValue { + continue + } + + counter, err := FindCounterField(c.labelsCounters, val.FieldId) + if err != nil { + continue + } + + if counter.PromType == "label" { + labels[counter.FieldName] = v + continue + } + } + + gpuXIDErrors, exists := mapEntityIDToValues[mi.DeviceInfo.GPU] + + if exists { + for xidErr, val := range gpuXIDErrors { + + metricValueLables := maps.Clone(labels) + metricValueLables["xid"] = fmt.Sprint(xidErr) + m := Metric{ + Counter: c.counter, + Value: fmt.Sprint(val), + UUID: uuid, + GPU: fmt.Sprintf("%d", mi.DeviceInfo.GPU), + GPUUUID: mi.DeviceInfo.UUID, + GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), + GPUModelName: mi.DeviceInfo.Identifiers.Model, + Hostname: c.hostname, + + Labels: metricValueLables, + Attributes: map[string]string{}, + } + if mi.InstanceInfo != nil { + m.MigProfile = mi.InstanceInfo.ProfileName + m.GPUInstanceID = fmt.Sprintf("%d", mi.InstanceInfo.Info.NvmlInstanceId) + } else { + m.MigProfile = "" + m.GPUInstanceID = "" + } + + metrics[c.counter] = append(metrics[c.counter], m) + } + } + } + + return metrics, nil +} + +func (c *xidCollector) Cleanup() { + for _, cleanup := range c.cleanups { + cleanup() + } +} + +func NewXIDCollector(config *Config, counters []Counter, hostname string) (Collector, error) { + + if !IsdcgmExpXIDErrorsCountEnabled(counters) { + logrus.Error(dcgmExpXIDErrorsCount + " collector is disabled") + return nil, fmt.Errorf(dcgmExpXIDErrorsCount + " collector is disabled") + } + + sysInfo, err := GetSystemInfo(config, dcgm.FE_GPU) + if err != nil { + return nil, err + } + + labelsCounters := []Counter{} + for i := 0; i < len(counters); i++ { + if counters[i].PromType == "label" { + labelsCounters = append(labelsCounters, counters[i]) + counters = slices.Delete(counters, i, i+1) + } + } + + var deviceFields = NewDeviceFields(labelsCounters, dcgm.FE_GPU) + + cleanups, err := SetupDcgmFieldsWatch([]dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}, *sysInfo, int64(config.CollectInterval)*1000) + if err != nil { + logrus.Fatal("Failed to watch metrics: ", err) + } + + counter := counters[slices.IndexFunc(counters, func(c Counter) bool { + return c.FieldName == dcgmExpXIDErrorsCount + })] + + collector := xidCollector{ + sysInfo: sysInfo, + counter: counter, + hostname: hostname, + config: config, + deviceFields: deviceFields, + labelsCounters: labelsCounters, + cleanups: cleanups, + } + return &collector, nil +} + +var getXIDMetricTemplate = sync.OnceValue(func() *template.Template { + return template.Must(template.New("xidMetrics").Parse(xidMetricsFormat)) +}) + +func encodeXIDMetrics(w io.Writer, metrics map[Counter][]Metric) error { + template := getXIDMetricTemplate() + return template.Execute(w, metrics) +} + +func IsdcgmExpXIDErrorsCountEnabled(counters []Counter) bool { + return slices.ContainsFunc(counters, func(c Counter) bool { + return c.FieldName == dcgmExpXIDErrorsCount + }) +} diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go new file mode 100644 index 00000000..b1c0eb79 --- /dev/null +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "bytes" + "fmt" + "reflect" + "testing" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + io_prometheus_client "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestXIDCollector_Gather_Encode(t *testing.T) { + teardownTest := setupTest(t) + defer teardownTest(t) + + hostname := "local-test" + config := &Config{ + GPUDevices: DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + XIDCountWindowSize: int(time.Duration(5) * time.Minute), + } + + records := [][]string{ + {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + + counters, extraCounters, err := extractCounters(records, config) + require.NoError(t, err) + require.Len(t, extraCounters, 1) + require.Len(t, counters, 1) + + for i := range counters { + if counters[i].PromType == "label" { + extraCounters = append(extraCounters, counters[i]) + } + } + + // Create fake GPU + numGPUs, err := dcgm.GetAllDeviceCount() + require.NoError(t, err) + + if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) + } + + entityList := []dcgm.MigHierarchyInfo{ + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, + } + + gpuIDs, err := dcgm.CreateFakeEntities(entityList) + require.NoError(t, err) + require.NotEmpty(t, gpuIDs) + + for i, gpuID := range gpuIDs { + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(42), + ) + require.NoError(t, err) + + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(42), + ) + require.NoError(t, err) + + err = dcgm.InjectFieldValue(gpuID, + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), + int64(46), + ) + require.NoError(t, err) + + } + + xidCollector, err := NewXIDCollector(config, extraCounters, hostname) + require.NoError(t, err) + + defer func() { + xidCollector.Cleanup() + }() + + metrics, err := xidCollector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT + require.Len(t, metrics, 1) + // We get metric value with 0 index + metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] + // We expect 6 records, because we have 3 fake GPU and each GPU expirenced 2 XID errors: 42 and 46 + require.Len(t, metricValues, 6) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) + } + + // We inject new error + err = dcgm.InjectFieldValue(gpuIDs[0], + dcgm.DCGM_FI_DEV_XID_ERRORS, + dcgm.DCGM_FT_INT64, + 0, + time.Now().UnixMicro(), + int64(19), + ) + require.NoError(t, err) + + // Wait for 1 second + time.Sleep(1 * time.Second) + + metrics, err = xidCollector.GetMetrics() + require.NoError(t, err) + require.NotEmpty(t, metrics) + + // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT + require.Len(t, metrics, 1) + // We get metric value with the last index + metricValues = metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] + require.Len(t, metricValues, 6+1) + for _, val := range metricValues { + require.Contains(t, val.Labels, "window_size_in_ms") + require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) + } + + // Now we check the metric rendering + var b bytes.Buffer + encodeXIDMetrics(&b, metrics) + + require.NotEmpty(t, b) + + var parser expfmt.TextParser + mf, err := parser.TextToMetricFamilies(&b) + require.NoError(t, err) + require.NotEmpty(t, mf) + require.Len(t, mf, 1) + metricFamily := mf[reflect.ValueOf(mf).MapKeys()[0].Interface().(string)] + require.NotNil(t, metricFamily.Name) + assert.Equal(t, "DCGM_EXP_XID_ERRORS_COUNT", *metricFamily.Name) + assert.Equal(t, "Count of XID Errors within user-specified time window (see xid-count-window-size param).", *metricFamily.Help) + assert.Equal(t, io_prometheus_client.MetricType_GAUGE, *metricFamily.Type) + require.Len(t, metricFamily.Metric, 6+1) + assert.Len(t, metricFamily.Metric[0].Label, 8) + assert.Equal(t, "gpu", *metricFamily.Metric[0].Label[0].Name) + assert.Equal(t, "UUID", *metricFamily.Metric[0].Label[1].Name) + assert.Equal(t, "device", *metricFamily.Metric[0].Label[2].Name) + assert.Equal(t, "modelName", *metricFamily.Metric[0].Label[3].Name) + assert.Equal(t, "Hostname", *metricFamily.Metric[0].Label[4].Name) + assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *metricFamily.Metric[0].Label[5].Name) + assert.Equal(t, "window_size_in_ms", *metricFamily.Metric[0].Label[6].Name) + assert.Equal(t, "xid", *metricFamily.Metric[0].Label[7].Name) + assert.NotEmpty(t, *metricFamily.Metric[0].Label[7].Value) +} + +func TestXIDCollector_NewXIDCollector(t *testing.T) { + config := &Config{ + GPUDevices: DeviceOptions{ + Flex: true, + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + } + + teardownTest := setupTest(t) + defer teardownTest(t) + + t.Run("Should Return Error When DCGM_EXP_XID_ERRORS_COUNT is not present", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + } + counters, extraCounters, err := extractCounters(records, config) + require.NoError(t, err) + require.Len(t, extraCounters, 0) + require.Len(t, counters, 1) + xidCollector, err := NewXIDCollector(config, counters, "") + require.Error(t, err) + require.Nil(t, xidCollector) + }) + + t.Run("Should Return Error When Counter Param Is Empty", func(t *testing.T) { + counters := make([]Counter, 0) + xidCollector, err := NewXIDCollector(config, counters, "") + require.Error(t, err) + require.Nil(t, xidCollector) + }) + + t.Run("Should Not Return Error When DCGM_EXP_XID_ERRORS_COUNT Present More Than Once", func(t *testing.T) { + records := [][]string{ + {"DCGM_FI_DRIVER_VERSION", "label", "Driver Version"}, + {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, + {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, + {"DCGM_EXP_XID_ERRORS_COUNT", "gauge", "Count of XID Errors within user-specified time window (see xid-count-window-size param)."}, + } + counters, extraCounters, err := extractCounters(records, config) + require.NoError(t, err) + for i := range counters { + if counters[i].PromType == "label" { + extraCounters = append(extraCounters, counters[i]) + } + } + xidCollector, err := NewXIDCollector(config, counters, "") + require.Error(t, err) + require.Nil(t, xidCollector) + }) +}