Skip to content

Commit

Permalink
GPU Background Health Monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
nvvfedorov committed Jul 24, 2024
1 parent 0525b9b commit b904ccd
Show file tree
Hide file tree
Showing 23 changed files with 1,390 additions and 194 deletions.
22 changes: 14 additions & 8 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,18 @@
"request": "launch",
"mode": "test",
"program": "${workspaceFolder}/tests/e2e",
"args": ["-test.v",
"--ginkgo.v",
"-kubeconfig","~/.kube/config",
"-chart","./../../deployment/",
"-image-repository","nvidia/dcgm-exporter",
"-arguments","{-f=/etc/dcgm-exporter/default-counters.csv}"],
"args": [
"-test.v",
"--ginkgo.v",
"-kubeconfig",
"~/.kube/config",
"-chart",
"./../../deployment/",
"-image-repository",
"nvidia/dcgm-exporter",
"-arguments",
"{-f=/etc/dcgm-exporter/default-counters.csv}"
],
"env": {},
"buildFlags": "-tags=e2e"
},
Expand All @@ -30,8 +36,8 @@
"-f",
"./etc/default-counters.csv",
"--debug",
"--enable-dcgm-log",
"--dcgm-log-level=INFO"
"-r",
"localhost:5555"
]
}
]
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ GOLANGCILINT_TIMEOUT ?= 10m
IMAGE_TAG ?= ""

DCGM_VERSION := $(NEW_DCGM_VERSION)
GOLANG_VERSION := 1.22.4
GOLANG_VERSION := 1.22.5
VERSION := $(NEW_EXPORTER_VERSION)
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/dev/null
PLATFORMS := linux/amd64,linux/arm64
DOCKERCMD := docker buildx build
DOCKERCMD := docker --debug buildx build
MODULE := github.com/NVIDIA/dcgm-exporter

.PHONY: all binary install check-format local
Expand Down
8 changes: 6 additions & 2 deletions docker/Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ RUN set -eux; \
wget -O go.tgz "$url" --progress=dot:giga; \
tar -C /usr/local -xzf go.tgz; \
rm go.tgz;
ENV GOTOOLCHAIN=local
ENV GOTOOLCHAIN local
ENV GOPATH /go
ENV PATH $GOPATH/bin:$PATH
RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH"
Expand All @@ -48,7 +48,11 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
export CC=aarch64-linux-gnu-gcc; \
export LD_LIBRARY_PATH=/usr/aarch64-linux-gnu/lib:$LD_LIBRARY_PATH; \
fi && \
GOOS=$TARGETOS GOARCH=$TARGETARCH CC=$CC CGO_ENABLED=1 make install
export GOOS=$TARGETOS && \
export GOARCH=$TARGETARCH && \
export CGO_ENABLED=1 && \
go clean -cache -modcache &&\
make install

FROM ${BASEIMAGE}

Expand Down
3 changes: 3 additions & 0 deletions etc/default-counters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,6 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device

# GPU Background health monitoring metrics.
DCGM_EXP_GPU_HEALTH_STATUS, gauge, GPU Health Status
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ module github.com/NVIDIA/dcgm-exporter

go 1.22.0

toolchain go1.22.4
toolchain go1.22.5

require (
github.com/NVIDIA/go-dcgm v0.0.0-20240315184911-5e46e1bcb05a
github.com/NVIDIA/go-dcgm v0.0.0-20240724144738-f83cdef499b2
github.com/NVIDIA/go-nvml v0.12.0-3
github.com/avast/retry-go/v4 v4.5.1
github.com/bits-and-blooms/bitset v1.13.0
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7
github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w=
github.com/NVIDIA/go-dcgm v0.0.0-20240315184911-5e46e1bcb05a h1:qFqfDLey3/IaM3JYLrRY33CKEwXEzzGeJsruDkHka4s=
github.com/NVIDIA/go-dcgm v0.0.0-20240315184911-5e46e1bcb05a/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-dcgm v0.0.0-20240619211117-c0033eaed98e h1:xlrBVR72inSPNNoFaiQu8JUFsvl6Uil5nHGszWuTSLI=
github.com/NVIDIA/go-dcgm v0.0.0-20240619211117-c0033eaed98e/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-dcgm v0.0.0-20240625142512-8d2180486ef7 h1:w7RrIrMWqt6upSVbqnDZRwAfk7dywseHXrKjHCBjXoU=
github.com/NVIDIA/go-dcgm v0.0.0-20240625142512-8d2180486ef7/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-dcgm v0.0.0-20240702213448-784931ee1612 h1:V6R+2/SRN9bzRD0JehbX4ocrchX2iWLFc2N9ndkdRuM=
github.com/NVIDIA/go-dcgm v0.0.0-20240702213448-784931ee1612/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-dcgm v0.0.0-20240724144738-f83cdef499b2 h1:vm3BjlVoMnvllesEylfk+TAY5OhMfYO2vocJMEmuLmk=
github.com/NVIDIA/go-dcgm v0.0.0-20240724144738-f83cdef499b2/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4=
github.com/NVIDIA/go-nvml v0.12.0-3 h1:QwfjYxEqIQVRhl8327g2Y3ZvKResPydpGSKtCIIK9jE=
github.com/NVIDIA/go-nvml v0.12.0-3/go.mod h1:SOufGc5Wql+cxrIZ8RyJwVKDYxfbs4WPkHXqadcbfvA=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs=
Expand Down
59 changes: 59 additions & 0 deletions internal/mocks/pkg/dcgmprovider/mock_client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

98 changes: 98 additions & 0 deletions internal/pkg/collector/base_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package collector

import (
"fmt"

"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/counters"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager"
)

// baseExpCollector bundles the state and helper methods (createMetric,
// getLabelsFromCounters, Cleanup) that are shared by the collectors which
// embed it.
type baseExpCollector struct {
	deviceWatchList devicewatchlistmanager.WatchList // Device info and fields used for counters and labels; supplies LabelDeviceFields() for label lookups
	counter counters.Counter // Counter for a specific collector type; attached to every Metric built by createMetric
	labelsCounters []counters.Counter // Counters used for labels; searched by getLabelsFromCounters
	hostname string // Hostname copied into every Metric built by createMetric
	config *appconfig.Config // Configuration settings; createMetric reads ReplaceBlanksInModelName from it
	cleanups []func() // Cleanup functions, invoked in slice order by Cleanup
}

// createMetric builds a Metric for this collector's counter from the monitored
// entity described by mi.
//
// labels is stored on the metric as-is (not copied), uuid becomes the metric's
// UUID field, and val is rendered as the metric's string Value. The GPU index,
// UUID, PCI bus ID and model name are taken from mi.DeviceInfo; the model name
// goes through getGPUModel with the configured ReplaceBlanksInModelName flag.
// When mi.InstanceInfo is set, the MIG profile name and GPU instance ID are
// filled in as well; otherwise both stay empty.
func (c *baseExpCollector) createMetric(
	labels map[string]string, mi devicemonitoring.Info, uuid string, val int,
) Metric {
	modelName := getGPUModel(mi.DeviceInfo, c.config.ReplaceBlanksInModelName)

	metric := Metric{
		Counter:      c.counter,
		Value:        fmt.Sprint(val),
		UUID:         uuid,
		GPU:          fmt.Sprintf("%d", mi.DeviceInfo.GPU),
		GPUUUID:      mi.DeviceInfo.UUID,
		GPUDevice:    fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU),
		GPUModelName: modelName,
		GPUPCIBusID:  mi.DeviceInfo.PCI.BusID,
		Hostname:     c.hostname,

		Labels:     labels,
		Attributes: map[string]string{},
	}

	// MigProfile and GPUInstanceID are left at their zero value ("") unless
	// the entity carries GPU instance information.
	if instance := mi.InstanceInfo; instance != nil {
		metric.MigProfile = instance.ProfileName
		metric.GPUInstanceID = fmt.Sprintf("%d", instance.Info.NvmlInstanceId)
	}

	return metric
}

// getLabelsFromCounters fetches the latest values of the watch list's label
// device fields for the entity in mi and writes them into labels, keyed by the
// matching counter's FieldName.
//
// A value is skipped when its string form equals skipDCGMValue (counters with
// no value and fields ignored for this entity), when findCounterField reports
// an error for its field ID, or when the matching counter is not a label.
// Only a failure of the EntityGetLatestValues call itself is returned.
func (c *baseExpCollector) getLabelsFromCounters(mi devicemonitoring.Info, labels map[string]string) error {
	fieldValues, err := dcgmprovider.Client().EntityGetLatestValues(
		mi.Entity.EntityGroupId,
		mi.Entity.EntityId,
		c.deviceWatchList.LabelDeviceFields(),
	)
	if err != nil {
		return err
	}

	for _, fieldValue := range fieldValues {
		text := toString(fieldValue)
		if text == skipDCGMValue {
			continue
		}

		// Lookup failures are tolerated: the field simply contributes no label.
		labelCounter, lookupErr := findCounterField(c.labelsCounters, fieldValue.FieldId)
		if lookupErr == nil && labelCounter.IsLabel() {
			labels[labelCounter.FieldName] = text
		}
	}

	return nil
}

// Cleanup invokes every registered cleanup function once, in the order in
// which they appear in c.cleanups.
func (c *baseExpCollector) Cleanup() {
	pending := c.cleanups
	for i := 0; i < len(pending); i++ {
		pending[i]()
	}
}
44 changes: 29 additions & 15 deletions internal/pkg/collector/clock_events_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,15 @@ func TestNewClockEventsCollector(t *testing.T) {
deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS})
return &clockEventsCollector{
expCollector{
deviceWatchList: deviceWatchList,
counter: sampleDCGMExpClockEventsCounter,
labelsCounters: []counters.Counter{sampleLabelCounter},
hostname: hostname,
config: config,
cleanups: sampleCleanups,
windowSize: config.ClockEventsCountWindowSize,
baseExpCollector: baseExpCollector{
deviceWatchList: deviceWatchList,
counter: sampleDCGMExpClockEventsCounter,
labelsCounters: []counters.Counter{sampleLabelCounter},
hostname: hostname,
config: config,
cleanups: sampleCleanups,
},
windowSize: config.ClockEventsCountWindowSize,
},
}
},
Expand Down Expand Up @@ -232,13 +234,15 @@ func TestNewClockEventsCollector(t *testing.T) {
deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS})
return &clockEventsCollector{
expCollector{
deviceWatchList: deviceWatchList,
counter: sampleDCGMExpClockEventsCounter,
labelsCounters: nil,
hostname: hostname,
config: config,
cleanups: sampleCleanups,
windowSize: config.ClockEventsCountWindowSize,
baseExpCollector: baseExpCollector{
deviceWatchList: deviceWatchList,
counter: sampleDCGMExpClockEventsCounter,
labelsCounters: nil,
hostname: hostname,
config: config,
cleanups: sampleCleanups,
},
windowSize: config.ClockEventsCountWindowSize,
},
}
},
Expand Down Expand Up @@ -367,7 +371,12 @@ func Test_clockEventsCollector_GetMetrics(t *testing.T) {
mockCollectorInterval := int64(1)
mockConfig := appconfig.Config{}
mockHostname := "localhost"
var mockCleanups []func()
cleanupCalled := 0
mockCleanups := []func(){
func() {
cleanupCalled++
},
}

mockGroupHandle1 := dcgm.GroupHandle{}
mockGroupHandle1.SetHandle(uintptr(1))
Expand Down Expand Up @@ -759,6 +768,11 @@ func Test_clockEventsCollector_GetMetrics(t *testing.T) {
want, gpu1Value, gpu2Value := tt.want()
tt.conditions(mockDeviceWatcher, gpu1Value, gpu2Value)
c := tt.collector()
defer func() {
c.Cleanup()
assert.Equal(t, 1, cleanupCalled, "clean up function was not called")
cleanupCalled = 0 // reset to zero
}()

got, err := c.GetMetrics()

Expand Down
18 changes: 17 additions & 1 deletion internal/pkg/collector/collector_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ func (cf *collectorFactory) NewCollectors() []EntityCollectorTuple {
"Counters are being initialized.")

entityCollectorTuples := make([]EntityCollectorTuple, 0)

entityTypes := []dcgm.Field_Entity_Group{
dcgm.FE_GPU,
dcgm.FE_SWITCH,
Expand Down Expand Up @@ -108,6 +107,17 @@ func (cf *collectorFactory) NewCollectors() []EntityCollectorTuple {
}
}

if IsDCGMExpGPUHealthStatusEnabled(cf.counterSet.ExporterCounters) {
if newCollector, err := cf.enableExpCollector(counters.DCGMExpGPUHealthStatus); err != nil {
logrus.Fatalf("collector '%s' cannot be initialized; err: %v", counters.DCGMExpGPUHealthStatus, err)
} else {
entityCollectorTuples = append(entityCollectorTuples, EntityCollectorTuple{
entity: dcgm.FE_GPU,
collector: newCollector,
})
}
}

return entityCollectorTuples
}

Expand Down Expand Up @@ -139,6 +149,12 @@ func (cf *collectorFactory) enableExpCollector(expCollectorName string) (Collect
case counters.DCGMExpXIDErrorsCount:
newCollector, err = NewXIDCollector(cf.counterSet.ExporterCounters, cf.hostname, cf.config,
item)
case counters.DCGMExpGPUHealthStatus:
newCollector, err = NewGPUHealthStatusCollector(cf.counterSet.ExporterCounters,
cf.hostname,
cf.config,
item,
)
default:
err = fmt.Errorf("invalid collector '%s'", expCollectorName)
}
Expand Down
Loading

0 comments on commit b904ccd

Please sign in to comment.