diff --git a/.vscode/launch.json b/.vscode/launch.json
index bf62b591..a4f0acbf 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -14,6 +14,7 @@
"args": [
"-f",
"./etc/default-counters.csv",
+ "--debug"
]
}
]
diff --git a/docker/Dockerfile.ubi9 b/docker/Dockerfile.ubi9
index 0074585a..9978009b 100644
--- a/docker/Dockerfile.ubi9
+++ b/docker/Dockerfile.ubi9
@@ -4,7 +4,7 @@ WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter
RUN set -eux; \
dnf clean expire-cache; \
dnf install -y go-toolset make wget
-RUN dnf clean all
+RUN dnf clean all && rm -rf /usr/bin/go
# Install Go official release
RUN set -eux; \
diff --git a/go.mod b/go.mod
index 5a9bcdf1..38ee5255 100644
--- a/go.mod
+++ b/go.mod
@@ -33,6 +33,7 @@ require (
github.com/avast/retry-go/v4 v4.5.1
github.com/bits-and-blooms/bitset v1.13.0
github.com/gorilla/mux v1.8.1
+ github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16
github.com/prometheus/common v0.45.0
github.com/prometheus/exporter-toolkit v0.11.0
github.com/sirupsen/logrus v1.9.3
@@ -71,7 +72,6 @@ require (
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.17.0 // indirect
- github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go
index fb64028a..c90a585f 100644
--- a/pkg/cmd/app.go
+++ b/pkg/cmd/app.go
@@ -63,6 +63,7 @@ const (
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
+ CLIDebugMode = "debug"
)
func NewApp(buildVersion ...string) *cli.App {
@@ -190,6 +191,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.",
EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"},
},
+ &cli.BoolFlag{
+ Name: CLIDebugMode,
+ Value: false,
+ Usage: "Enable debug output",
+ EnvVars: []string{"DCGM_EXPORTER_DEBUG"},
+ },
}
if runtime.GOOS == "linux" {
@@ -228,6 +235,16 @@ restart:
return err
}
+ if config.Debug {
+ //enable debug logging
+ logrus.SetLevel(logrus.DebugLevel)
+ logrus.Debug("Debug output is enabled")
+ }
+
+ logrus.Debugf("Command line: %s", strings.Join(os.Args, " "))
+
+ logrus.WithField(dcgmexporter.LoggerDumpKey, fmt.Sprintf("%+v", config)).Debug("Loaded configuration")
+
if config.UseRemoteHE {
logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo)
cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0")
@@ -426,5 +443,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
+ Debug: c.Bool(CLIDebugMode),
}, nil
}
diff --git a/pkg/dcgmexporter/const.go b/pkg/dcgmexporter/const.go
index c590da91..4286cea8 100644
--- a/pkg/dcgmexporter/const.go
+++ b/pkg/dcgmexporter/const.go
@@ -46,3 +46,14 @@ func mustParseDCGMExporterMetric(s string) DCGMExporterMetric {
}
return mv
}
+
+// Constants for logging fields
+const (
+ LoggerGroupIDKey = "groupID"
+ LoggerDumpKey = "dump"
+)
+
+const (
+ PARENT_ID_IGNORED = 0
+ DCGM_ST_NOT_CONFIGURED = "Setting not configured"
+)
diff --git a/pkg/dcgmexporter/dcgm.go b/pkg/dcgmexporter/dcgm.go
index 7769b38e..2661b722 100644
--- a/pkg/dcgmexporter/dcgm.go
+++ b/pkg/dcgmexporter/dcgm.go
@@ -21,6 +21,7 @@ import (
"math/rand"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+ "github.com/sirupsen/logrus"
)
func NewGroup() (dcgm.GroupHandle, func(), error) {
@@ -29,7 +30,12 @@ func NewGroup() (dcgm.GroupHandle, func(), error) {
return dcgm.GroupHandle{}, func() {}, err
}
- return group, func() { dcgm.DestroyGroup(group) }, nil
+ return group, func() {
+ err := dcgm.DestroyGroup(group)
+ if err != nil {
+ logrus.WithError(err).Warn("Cannot destroy group") // entity group, not a field group
+ }
+ }, nil
}
func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
@@ -56,7 +62,12 @@ func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
return dcgm.FieldHandle{}, func() {}, err
}
- return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil
+ return fieldGroup, func() {
+ err := dcgm.FieldGroupDestroy(fieldGroup)
+ if err != nil {
+ logrus.WithError(err).Warn("Cannot destroy field group")
+ }
+ }, nil
}
func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error {
diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go
index 293641d8..805bea01 100644
--- a/pkg/dcgmexporter/gpu_collector.go
+++ b/pkg/dcgmexporter/gpu_collector.go
@@ -88,10 +88,10 @@ func (c *DCGMCollector) Cleanup() {
}
}
-func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) {
+func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error) {
monitoringInfo := GetMonitoredEntities(c.SysInfo)
- metrics := make(map[Counter][]Metric)
+ metrics := make(MetricsByCounter)
for _, mi := range monitoringInfo {
var vals []dcgm.FieldValue_v1
@@ -153,7 +153,7 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
return c[0], fmt.Errorf("Could not find corresponding counter")
}
-func ToSwitchMetric(metrics map[Counter][]Metric,
+func ToSwitchMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
labels := map[string]string{}
@@ -196,7 +196,7 @@ func ToSwitchMetric(metrics map[Counter][]Metric,
}
}
-func ToCPUMetric(metrics map[Counter][]Metric,
+func ToCPUMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
var labels = map[string]string{}
@@ -240,7 +240,7 @@ func ToCPUMetric(metrics map[Counter][]Metric,
}
func ToMetric(
- metrics map[Counter][]Metric,
+ metrics MetricsByCounter,
values []dcgm.FieldValue_v1,
c []Counter,
d dcgm.Device,
diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go
index 63c04f12..68340c84 100644
--- a/pkg/dcgmexporter/kubernetes.go
+++ b/pkg/dcgmexporter/kubernetes.go
@@ -28,6 +28,7 @@ import (
"github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider"
"github.com/sirupsen/logrus"
"google.golang.org/grpc"
+ "google.golang.org/grpc/credentials/insecure"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
)
@@ -54,7 +55,7 @@ func (p *PodMapper) Name() string {
return "podMapper"
}
-func (p *PodMapper) Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error {
+func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error {
_, err := os.Stat(socketPath)
if os.IsNotExist(err) {
logrus.Infof("No Kubelet socket, ignoring")
@@ -102,9 +103,13 @@ func connectToServer(socket string) (*grpc.ClientConn, func(), error) {
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
defer cancel()
- conn, err := grpc.DialContext(ctx, socket, grpc.WithInsecure(), grpc.WithBlock(),
- grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
- return net.DialTimeout("unix", addr, timeout)
+ conn, err := grpc.DialContext(ctx,
+ socket,
+ grpc.WithTransportCredentials(insecure.NewCredentials()),
+ grpc.WithBlock(),
+ grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
+ d := net.Dialer{}
+ return d.DialContext(ctx, "unix", addr)
}),
)
diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go
index 045315f9..5e1ab164 100644
--- a/pkg/dcgmexporter/kubernetes_test.go
+++ b/pkg/dcgmexporter/kubernetes_test.go
@@ -100,7 +100,8 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() {
stopped := make(chan interface{})
go func() {
- server.Serve(l)
+ err := server.Serve(l)
+ assert.NoError(t, err)
close(stopped)
}()
@@ -256,7 +257,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: tc.KubernetesGPUIDType})
require.NoError(t, err)
require.NotNil(t, podMapper)
- metrics := map[Counter][]Metric{}
+ metrics := MetricsByCounter{}
counter := Counter{
FieldID: 155,
FieldName: "DCGM_FI_DEV_POWER_USAGE",
diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go
index 89888667..e571276a 100644
--- a/pkg/dcgmexporter/pipeline.go
+++ b/pkg/dcgmexporter/pipeline.go
@@ -28,6 +28,9 @@ import (
)
func NewMetricsPipeline(c *Config, counters []Counter, hostname string, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) {
+
+ logrus.WithField(LoggerDumpKey, fmt.Sprintf("%+v", counters)).Debug("Counters are initialized")
+
cleanups := []func(){}
gpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_GPU)
if err != nil {
@@ -328,7 +331,7 @@ var cpuCoreMetricsFormat = `
{{ end }}`
// Template is passed here so that it isn't recompiled at each iteration
-func FormatMetrics(t *template.Template, groupedMetrics map[Counter][]Metric) (string, error) {
+func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error) {
// Format metrics
var res bytes.Buffer
if err := t.Execute(&res, groupedMetrics); err != nil {
diff --git a/pkg/dcgmexporter/pipeline_test.go b/pkg/dcgmexporter/pipeline_test.go
index 8e643ba0..f0cb2f89 100644
--- a/pkg/dcgmexporter/pipeline_test.go
+++ b/pkg/dcgmexporter/pipeline_test.go
@@ -37,6 +37,7 @@ func TestRun(t *testing.T) {
p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c)
require.NoError(t, err)
defer cleanup()
+ require.NoError(t, err)
out, err := p.run()
require.NoError(t, err)
diff --git a/pkg/dcgmexporter/server.go b/pkg/dcgmexporter/server.go
index b144d367..94808732 100644
--- a/pkg/dcgmexporter/server.go
+++ b/pkg/dcgmexporter/server.go
@@ -50,13 +50,18 @@ func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*Metr
router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
- w.Write([]byte(`
+ _, err := w.Write([]byte(`
GPU Exporter
GPU Exporter
Metrics
`))
+ if err != nil {
+ // Status 200 was already sent above, so http.Error cannot turn this into a 500; log only.
+ logrus.WithError(err).Error("Failed to write response")
+ return
+ }
})
router.HandleFunc("/health", serverv1.Health)
@@ -76,7 +81,7 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {
defer httpwg.Done()
logrus.Info("Starting webserver")
if err := web.ListenAndServe(s.server, s.webConfig, logger); err != nil && err != http.ErrServerClosed {
- logrus.Fatalf("Failed to Listen and Server HTTP server with err: `%v`", err)
+ logrus.WithError(err).Fatal("Failed to Listen and Server HTTP server")
}
}()
@@ -95,21 +100,33 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {
<-stop
if err := s.server.Shutdown(context.Background()); err != nil {
- logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err)
+ logrus.WithError(err).Fatal("Failed to shutdown HTTP server")
}
if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil {
- logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err)
+ logrus.WithError(err).Fatal("Failed waiting for HTTP server to shutdown")
}
}
func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
- w.Write([]byte(s.getMetrics()))
+ _, err := w.Write([]byte(s.getMetrics()))
+ if err != nil {
+ // Status 200 was already sent above, so http.Error cannot turn this into a 500; log only.
+ logrus.WithError(err).Error("Failed to write response")
+ return
+ }
xidMetrics, err := s.registry.Gather()
- if err == nil {
- encodeXIDMetrics(w, xidMetrics)
+ if err != nil {
+ // Do not append an error page to the metrics body that has already been written.
+ logrus.WithError(err).Error("Failed to gather XID metrics")
+ return
+ }
+ err = encodeXIDMetrics(w, xidMetrics)
+ if err != nil {
+ logrus.WithError(err).Error("Failed to encode XID metrics")
+ return
}
}
@@ -117,11 +134,19 @@ func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) {
if s.getMetrics() == "" {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusServiceUnavailable)
- w.Write([]byte("KO"))
+ _, err := w.Write([]byte("KO"))
+ if err != nil {
+ // Status 503 was already sent above, so http.Error cannot change it; log only.
+ logrus.WithError(err).Error("Failed to write response")
+ }
} else {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
- w.Write([]byte("OK"))
+ _, err := w.Write([]byte("OK"))
+ if err != nil {
+ // Status 200 was already sent above, so http.Error cannot change it; log only.
+ logrus.WithError(err).Error("Failed to write response")
+ }
}
}
diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go
index b11131aa..dc9bc632 100644
--- a/pkg/dcgmexporter/system_info.go
+++ b/pkg/dcgmexporter/system_info.go
@@ -19,14 +19,14 @@ package dcgmexporter
import (
"fmt"
"math/rand"
+ "slices"
+ "strings"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/bits-and-blooms/bitset"
"github.com/sirupsen/logrus"
)
-const PARENT_ID_IGNORED = 0
-
var (
dcgmGetAllDeviceCount = dcgm.GetAllDeviceCount
dcgmGetDeviceInfo = dcgm.GetDeviceInfo
@@ -36,11 +36,6 @@ var (
dcgmGetCpuHierarchy = dcgm.GetCpuHierarchy
)
-type GroupInfo struct {
- groupHandle dcgm.GroupHandle
- groupType dcgm.Field_Entity_Group
-}
-
type ComputeInstanceInfo struct {
InstanceInfo dcgm.MigEntityInfo
ProfileName string
@@ -106,7 +101,7 @@ func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error
err := fmt.Errorf("Cannot find match for entities:")
for _, v := range values {
found := SetGPUInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v))
- if found == false {
+ if !found {
err = fmt.Errorf("%s group %d, id %d", err, v.EntityGroupId, v.EntityId)
notFound = true
}
@@ -204,17 +199,17 @@ func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error {
if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 {
// Verify we can find all the specified Switches
- for _, cpuId := range sOpt.MajorRange {
- if !SwitchIdExists(sysInfo, cpuId) {
- return fmt.Errorf("couldn't find requested cpu id %d", cpuId)
+ for _, cpuID := range sOpt.MajorRange {
+ if !SwitchIdExists(sysInfo, cpuID) {
+ return fmt.Errorf("couldn't find requested cpu id %d", cpuID)
}
}
}
if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 {
- for _, coreId := range sOpt.MinorRange {
- if !CPUCoreIdExists(sysInfo, coreId) {
- return fmt.Errorf("couldn't find requested cpu core %d", coreId)
+ for _, coreID := range sOpt.MinorRange {
+ if !CPUCoreIdExists(sysInfo, coreID) {
+ return fmt.Errorf("couldn't find requested cpu core %d", coreID)
}
}
}
@@ -229,17 +224,17 @@ func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error {
if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 {
// Verify we can find all the specified Switches
- for _, swId := range sOpt.MajorRange {
- if !SwitchIdExists(sysInfo, swId) {
- return fmt.Errorf("couldn't find requested NvSwitch id %d", swId)
+ for _, swID := range sOpt.MajorRange {
+ if !SwitchIdExists(sysInfo, swID) {
+ return fmt.Errorf("couldn't find requested NvSwitch id %d", swID)
}
}
}
if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 {
- for _, linkId := range sOpt.MinorRange {
- if !LinkIdExists(sysInfo, linkId) {
- return fmt.Errorf("couldn't find requested NvLink %d", linkId)
+ for _, linkID := range sOpt.MinorRange {
+ if !LinkIdExists(sysInfo, linkID) {
+ return fmt.Errorf("couldn't find requested NvLink %d", linkID)
}
}
}
@@ -254,17 +249,17 @@ func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error {
if len(gOpt.MajorRange) > 0 && gOpt.MajorRange[0] != -1 {
// Verify we can find all the specified GPUs
- for _, gpuId := range gOpt.MajorRange {
- if GPUIdExists(sysInfo, gpuId) == false {
- return fmt.Errorf("Couldn't find requested GPU id %d", gpuId)
+ for _, gpuID := range gOpt.MajorRange {
+ if !GPUIdExists(sysInfo, gpuID) {
+ return fmt.Errorf("Couldn't find requested GPU id %d", gpuID)
}
}
}
if len(gOpt.MinorRange) > 0 && gOpt.MinorRange[0] != -1 {
- for _, gpuInstanceId := range gOpt.MinorRange {
- if GPUInstanceIdExists(sysInfo, gpuInstanceId) == false {
- return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceId)
+ for _, gpuInstanceID := range gOpt.MinorRange {
+ if !GPUInstanceIdExists(sysInfo, gpuInstanceID) {
+ return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceID)
}
}
}
@@ -314,8 +309,12 @@ func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, erro
}
sysInfo.cOpt = sOpt
- err = VerifyCPUDevicePresence(&sysInfo, sOpt)
+ err = VerifyCPUDevicePresence(&sysInfo, sOpt)
+ if err != nil {
+ return sysInfo, err
+ }
+ logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType)
return sysInfo, nil
}
@@ -352,8 +351,11 @@ func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo,
sysInfo.sOpt = sOpt
err = VerifySwitchDevicePresence(&sysInfo, sOpt)
+ if err == nil {
+ logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType)
+ }
- return sysInfo, nil
+ return sysInfo, err
}
func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error) {
@@ -385,27 +387,27 @@ func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool)
if hierarchy.Count > 0 {
var entities []dcgm.GroupEntityPair
- gpuId := uint(0)
+ gpuID := uint(0)
instanceIndex := 0
for i := uint(0); i < hierarchy.Count; i++ {
if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU {
// We are adding a GPU instance
- gpuId = hierarchy.EntityList[i].Parent.EntityId
- entityId := hierarchy.EntityList[i].Entity.EntityId
+ gpuID = hierarchy.EntityList[i].Parent.EntityId
+ entityID := hierarchy.EntityList[i].Entity.EntityId
instanceInfo := GPUInstanceInfo{
Info: hierarchy.EntityList[i].Info,
ProfileName: "",
- EntityId: entityId,
+ EntityId: entityID,
}
- sysInfo.GPUs[gpuId].MigEnabled = true
- sysInfo.GPUs[gpuId].GPUInstances = append(sysInfo.GPUs[gpuId].GPUInstances, instanceInfo)
- entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId})
- instanceIndex = len(sysInfo.GPUs[gpuId].GPUInstances) - 1
+ sysInfo.GPUs[gpuID].MigEnabled = true
+ sysInfo.GPUs[gpuID].GPUInstances = append(sysInfo.GPUs[gpuID].GPUInstances, instanceInfo)
+ entities = append(entities, dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: entityID})
+ instanceIndex = len(sysInfo.GPUs[gpuID].GPUInstances) - 1
} else if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU_I {
// Add the compute instance, gpuId is recorded previously
- entityId := hierarchy.EntityList[i].Entity.EntityId
- ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityId}
- sysInfo.GPUs[gpuId].GPUInstances[instanceIndex].ComputeInstances = append(sysInfo.GPUs[gpuId].GPUInstances[instanceIndex].ComputeInstances, ciInfo)
+ entityID := hierarchy.EntityList[i].Entity.EntityId
+ ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityID}
+ sysInfo.GPUs[gpuID].GPUInstances[instanceIndex].ComputeInstances = append(sysInfo.GPUs[gpuID].GPUInstances[instanceIndex].ComputeInstances, ciInfo)
}
}
@@ -417,7 +419,9 @@ func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool)
sysInfo.gOpt = gOpt
err = VerifyDevicePresence(&sysInfo, gOpt)
-
+ if err == nil {
+ logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType)
+ }
return sysInfo, err
}
@@ -449,7 +453,7 @@ func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOpt
func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) {
var groups []dcgm.GroupHandle
var cleanups []func()
- var groupId dcgm.GroupHandle
+ var groupID dcgm.GroupHandle
var err error
/* Create per-cpu core groups */
@@ -461,25 +465,33 @@ func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f
for i, core := range cpu.Cores {
if i == 0 || i%dcgm.DCGM_GROUP_MAX_ENTITIES == 0 {
- groupId, err = dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
+ groupID, err = dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
if err != nil {
return nil, cleanups, err
}
- groups = append(groups, groupId)
+ groups = append(groups, groupID)
}
if !IsCoreWatched(core, cpu.EntityId, sysInfo) {
continue
}
- err = dcgm.AddEntityToGroup(groupId, dcgm.FE_CPU_CORE, core)
+ err = dcgm.AddEntityToGroup(groupID, dcgm.FE_CPU_CORE, core)
if err != nil {
return groups, cleanups, err
}
- cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) })
+ // Bind the current handle: groupID is a single function-scoped variable that is
+ // reassigned for every new group, so the closure must not capture it directly.
+ gid := groupID
+ cleanups = append(cleanups, func() {
+ err := dcgm.DestroyGroup(gid)
+ if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) {
+ logrus.WithFields(logrus.Fields{LoggerGroupIDKey: gid, logrus.ErrorKey: err}).Warn("can not destroy group")
+ }
+ })
}
}
@@ -496,12 +508,12 @@ func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f
continue
}
- groupId, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
+ groupID, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
if err != nil {
return nil, cleanups, err
}
- groups = append(groups, groupId)
+ groups = append(groups, groupID)
for _, link := range sw.NvLinks {
if link.State != dcgm.LS_UP {
@@ -512,13 +524,21 @@ func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f
continue
}
- err = dcgm.AddLinkEntityToGroup(groupId, link.Index, link.ParentId)
+ err = dcgm.AddLinkEntityToGroup(groupID, link.Index, link.ParentId)
if err != nil {
return groups, cleanups, err
}
- cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) })
+ cleanups = append(cleanups, func() {
+ err := dcgm.DestroyGroup(groupID)
+ if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) {
+ logrus.WithFields(logrus.Fields{
+ LoggerGroupIDKey: groupID,
+ logrus.ErrorKey: err,
+ }).Warn("can not destroy group")
+ }
+ })
}
}
@@ -527,19 +547,35 @@ func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f
func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error) {
monitoringInfo := GetMonitoredEntities(sysInfo)
- groupId, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
+ groupID, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
if err != nil {
return dcgm.GroupHandle{}, func() {}, err
}
for _, mi := range monitoringInfo {
- err := dcgmAddEntityToGroup(groupId, mi.Entity.EntityGroupId, mi.Entity.EntityId)
+ err := dcgmAddEntityToGroup(groupID, mi.Entity.EntityGroupId, mi.Entity.EntityId)
if err != nil {
- return groupId, func() { dcgm.DestroyGroup(groupId) }, err
+ return groupID, func() {
+ err := dcgm.DestroyGroup(groupID)
+ if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) {
+ logrus.WithFields(logrus.Fields{
+ LoggerGroupIDKey: groupID,
+ logrus.ErrorKey: err,
+ }).Warn("can not destroy group")
+ }
+ }, err
}
}
- return groupId, func() { dcgm.DestroyGroup(groupId) }, nil
+ return groupID, func() {
+ err := dcgm.DestroyGroup(groupID)
+ if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) {
+ logrus.WithFields(logrus.Fields{
+ LoggerGroupIDKey: groupID,
+ logrus.ErrorKey: err,
+ }).Warn("can not destroy group")
+ }
+ }, nil
}
func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo {
@@ -547,7 +583,7 @@ func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo {
for i := uint(0); i < sysInfo.GPUCount; i++ {
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU},
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU},
sysInfo.GPUs[i].DeviceInfo,
nil,
PARENT_ID_IGNORED,
@@ -567,13 +603,8 @@ func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo {
}
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_SWITCH, sw.EntityId},
- dcgm.Device{
- 0, "", "", 0,
- dcgm.PCIInfo{"", 0, 0, 0},
- dcgm.DeviceIdentifiers{"", "", "", "", "", ""},
- nil, "",
- },
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: sw.EntityId},
+ dcgm.Device{},
nil,
PARENT_ID_IGNORED,
}
@@ -601,13 +632,8 @@ func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo {
}
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_LINK, link.Index},
- dcgm.Device{
- 0, "", "", 0,
- dcgm.PCIInfo{"", 0, 0, 0},
- dcgm.DeviceIdentifiers{"", "", "", "", "", ""},
- nil, "",
- },
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: link.Index},
+ dcgm.Device{},
nil,
link.ParentId,
}
@@ -618,87 +644,86 @@ func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo {
return monitoring
}
-func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool {
+func IsSwitchWatched(switchID uint, sysInfo SystemInfo) bool {
if sysInfo.sOpt.Flex {
return true
}
- if len(sysInfo.sOpt.MajorRange) <= 0 {
+ // When MajorRange contains the value -1, all switches are monitored
+ if len(sysInfo.sOpt.MajorRange) > 0 && sysInfo.sOpt.MajorRange[0] == -1 {
return true
}
- for _, sw := range sysInfo.sOpt.MajorRange {
- if uint(sw) == switchId {
- return true
- }
-
- }
- return false
+ return slices.Contains(sysInfo.sOpt.MajorRange, int(switchID))
}
-func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool {
+func IsLinkWatched(linkIndex uint, switchID uint, sysInfo SystemInfo) bool {
if sysInfo.sOpt.Flex {
return true
}
- for _, sw := range sysInfo.Switches {
- if !IsSwitchWatched(sw.EntityId, sysInfo) {
- return false
- }
+ // Find a switch
+ switchIdx := slices.IndexFunc(sysInfo.Switches, func(si SwitchInfo) bool {
+ return si.EntityId == switchID && IsSwitchWatched(si.EntityId, sysInfo)
+ })
+
+ if switchIdx > -1 {
+ // Switch exists and is watched
+ sw := sysInfo.Switches[switchIdx]
- if len(sysInfo.sOpt.MinorRange) <= 0 {
+ if len(sysInfo.sOpt.MinorRange) > 0 && sysInfo.sOpt.MinorRange[0] == -1 {
return true
}
- for _, link := range sysInfo.sOpt.MinorRange {
- if uint(link) == linkId {
- return true
- }
+ // The Link exists
+ if slices.ContainsFunc(sw.NvLinks, func(nls dcgm.NvLinkStatus) bool {
+ return nls.Index == linkIndex
+ }) {
+ // and the link index in the Minor range
+ return slices.Contains(sysInfo.sOpt.MinorRange, int(linkIndex))
}
- return false
}
return false
}
-func IsCPUWatched(cpuId uint, sysInfo SystemInfo) bool {
+func IsCPUWatched(cpuID uint, sysInfo SystemInfo) bool {
+
+ if !slices.ContainsFunc(sysInfo.CPUs, func(cpu CPUInfo) bool {
+ return cpu.EntityId == cpuID
+ }) {
+ return false
+ }
+
if sysInfo.cOpt.Flex {
return true
}
- if len(sysInfo.cOpt.MajorRange) <= 0 {
+ if len(sysInfo.cOpt.MajorRange) > 0 && sysInfo.cOpt.MajorRange[0] == -1 {
return true
}
- for _, cpu := range sysInfo.cOpt.MajorRange {
- if uint(cpu) == cpuId {
- return true
- }
-
- }
- return false
+ return slices.ContainsFunc(sysInfo.cOpt.MajorRange, func(cpu int) bool {
+ return uint(cpu) == cpuID
+ })
}
-func IsCoreWatched(coreId uint, cpuId uint, sysInfo SystemInfo) bool {
+func IsCoreWatched(coreID uint, cpuID uint, sysInfo SystemInfo) bool {
if sysInfo.cOpt.Flex {
return true
}
- for _, cpu := range sysInfo.CPUs {
- if !IsCPUWatched(cpu.EntityId, sysInfo) {
- return false
- }
+ // Find a CPU
+ cpuIdx := slices.IndexFunc(sysInfo.CPUs, func(cpu CPUInfo) bool {
+ return IsCPUWatched(cpu.EntityId, sysInfo) && cpu.EntityId == cpuID
+ })
- if len(sysInfo.cOpt.MinorRange) <= 0 {
+ if cpuIdx > -1 {
+ if len(sysInfo.cOpt.MinorRange) > 0 && sysInfo.cOpt.MinorRange[0] == -1 {
return true
}
- for _, core := range sysInfo.cOpt.MinorRange {
- if uint(core) == coreId {
- return true
- }
- }
- return false
+ return slices.Contains(sysInfo.cOpt.MinorRange, int(coreID))
}
return false
@@ -713,13 +738,8 @@ func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo {
}
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_CPU, cpu.EntityId},
- dcgm.Device{
- 0, "", "", 0,
- dcgm.PCIInfo{"", 0, 0, 0},
- dcgm.DeviceIdentifiers{"", "", "", "", "", ""},
- nil, "",
- },
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: cpu.EntityId},
+ dcgm.Device{},
nil,
PARENT_ID_IGNORED,
}
@@ -743,13 +763,8 @@ func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo {
}
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_CPU_CORE, core},
- dcgm.Device{
- 0, "", "", 0,
- dcgm.PCIInfo{"", 0, 0, 0},
- dcgm.DeviceIdentifiers{"", "", "", "", "", ""},
- nil, "",
- },
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: core},
+ dcgm.Device{},
nil,
cpu.EntityId,
}
@@ -764,9 +779,9 @@ func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
var monitoring []MonitoringInfo
for i := uint(0); i < sysInfo.GPUCount; i++ {
- if addFlexibly == true && len(sysInfo.GPUs[i].GPUInstances) == 0 {
+ if addFlexibly && len(sysInfo.GPUs[i].GPUInstances) == 0 {
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU},
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU},
sysInfo.GPUs[i].DeviceInfo,
nil,
PARENT_ID_IGNORED,
@@ -775,7 +790,7 @@ func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
} else {
for j := 0; j < len(sysInfo.GPUs[i].GPUInstances); j++ {
mi := MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.GPUs[i].GPUInstances[j].EntityId},
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: sysInfo.GPUs[i].GPUInstances[j].EntityId},
sysInfo.GPUs[i].DeviceInfo,
&sysInfo.GPUs[i].GPUInstances[j],
PARENT_ID_IGNORED,
@@ -788,11 +803,11 @@ func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo {
return monitoring
}
-func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
+func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuID int) *MonitoringInfo {
for i := uint(0); i < sysInfo.GPUCount; i++ {
- if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuId) {
+ if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuID) {
return &MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU},
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU},
sysInfo.GPUs[i].DeviceInfo,
nil,
PARENT_ID_IGNORED,
@@ -803,12 +818,12 @@ func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuId int) *MonitoringInfo {
return nil
}
-func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo {
+func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceID int) *MonitoringInfo {
for i := uint(0); i < sysInfo.GPUCount; i++ {
for _, instance := range sysInfo.GPUs[i].GPUInstances {
- if instance.EntityId == uint(gpuInstanceId) {
+ if instance.EntityId == uint(gpuInstanceID) {
return &MonitoringInfo{
- dcgm.GroupEntityPair{dcgm.FE_GPU_I, uint(gpuInstanceId)},
+ dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: uint(gpuInstanceID)},
sysInfo.GPUs[i].DeviceInfo,
&instance,
PARENT_ID_IGNORED,
@@ -831,24 +846,24 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
monitoring = AddAllCPUs(sysInfo)
} else if sysInfo.InfoType == dcgm.FE_CPU_CORE {
monitoring = AddAllCPUCores(sysInfo)
- } else if sysInfo.gOpt.Flex == true {
+ } else if sysInfo.gOpt.Flex {
monitoring = AddAllGPUInstances(sysInfo, true)
} else {
if len(sysInfo.gOpt.MajorRange) > 0 && sysInfo.gOpt.MajorRange[0] == -1 {
monitoring = AddAllGPUs(sysInfo)
} else {
- for _, gpuId := range sysInfo.gOpt.MajorRange {
+ for _, gpuID := range sysInfo.gOpt.MajorRange {
// We've already verified that everything in the options list exists
- monitoring = append(monitoring, *GetMonitoringInfoForGPU(sysInfo, gpuId))
+ monitoring = append(monitoring, *GetMonitoringInfoForGPU(sysInfo, gpuID))
}
}
if len(sysInfo.gOpt.MinorRange) > 0 && sysInfo.gOpt.MinorRange[0] == -1 {
monitoring = AddAllGPUInstances(sysInfo, false)
} else {
- for _, gpuInstanceId := range sysInfo.gOpt.MinorRange {
+ for _, gpuInstanceID := range sysInfo.gOpt.MinorRange {
// We've already verified that everything in the options list exists
- monitoring = append(monitoring, *GetMonitoringInfoForGPUInstance(sysInfo, gpuInstanceId))
+ monitoring = append(monitoring, *GetMonitoringInfoForGPUInstance(sysInfo, gpuInstanceID))
}
}
}
@@ -856,10 +871,10 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo {
return monitoring
}
-func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string {
+func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceID uint) string {
for i := uint(0); i < sysInfo.GPUCount; i++ {
if sysInfo.GPUs[i].DeviceInfo.UUID == gpuuuid {
- identifier := fmt.Sprintf("%d-%d", sysInfo.GPUs[i].DeviceInfo.GPU, gpuInstanceId)
+ identifier := fmt.Sprintf("%d-%d", sysInfo.GPUs[i].DeviceInfo.GPU, gpuInstanceID)
return identifier
}
}
diff --git a/pkg/dcgmexporter/system_info_test.go b/pkg/dcgmexporter/system_info_test.go
index a7f024a8..d1efed0c 100644
--- a/pkg/dcgmexporter/system_info_test.go
+++ b/pkg/dcgmexporter/system_info_test.go
@@ -18,9 +18,11 @@ package dcgmexporter
import (
"fmt"
+ "testing"
+
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
+ "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
- "testing"
)
const (
@@ -73,6 +75,9 @@ func SpoofSwitchSystemInfo() SystemInfo {
sysInfo.Switches = append(sysInfo.Switches, sw1)
sysInfo.Switches = append(sysInfo.Switches, sw2)
+ sysInfo.sOpt.MajorRange = []int{-1}
+ sysInfo.sOpt.MinorRange = []int{-1}
+
return sysInfo
}
@@ -81,13 +86,13 @@ func SpoofSystemInfo() SystemInfo {
sysInfo.GPUCount = 2
sysInfo.GPUs[0].DeviceInfo.GPU = 0
gi := GPUInstanceInfo{
- Info: dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3},
+ Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlProfileSlices: 3},
ProfileName: fakeProfileName,
EntityId: 0,
}
sysInfo.GPUs[0].GPUInstances = append(sysInfo.GPUs[0].GPUInstances, gi)
gi2 := GPUInstanceInfo{
- Info: dcgm.MigEntityInfo{"fake", 0, 1, 0, 0, 3},
+ Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlInstanceId: 1, NvmlProfileSlices: 3},
ProfileName: fakeProfileName,
EntityId: 14,
}
@@ -164,11 +169,6 @@ func TestVerifyDevicePresence(t *testing.T) {
require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
}
-//func TestMigProfileNames(t *testing.T) {
-// sysInfo := SpoofSystemInfo()
-// SetMigProfileNames(sysInfo, values)
-//}
-
func TestMonitoredSwitches(t *testing.T) {
sysInfo := SpoofSwitchSystemInfo()
@@ -185,6 +185,333 @@ func TestMonitoredSwitches(t *testing.T) {
require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored links but found %d", len(monitoring)))
for i, mi := range monitoring {
require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_LINK, fmt.Sprintf("Should have only returned links but returned %d", mi.Entity.EntityGroupId))
- require.Equal(t, mi.ParentId, uint(i), fmt.Sprint("Link should reference switch parent"))
+ require.Equal(t, mi.ParentId, uint(i), "Link should reference switch parent")
+ }
+}
+
+func TestIsSwitchWatched(t *testing.T) {
+ tests := []struct {
+ name string
+ switchID uint
+ sysInfo SystemInfo
+ want bool
+ }{
+ {
+ name: "Monitor all devices",
+ switchID: 1,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ Flex: true,
+ },
+ },
+ want: true,
+ },
+ {
+ name: "MajorRange empty",
+ switchID: 2,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{},
+ },
+ },
+ want: false,
+ },
+ {
+ name: "MajorRange contains -1 to watch all devices",
+ switchID: 3,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{-1},
+ },
+ },
+ want: true,
+ },
+ {
+ name: "SwitchID in MajorRange",
+ switchID: 4,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{3, 4, 5},
+ },
+ },
+ want: true,
+ },
+ {
+ name: "SwitchID not in MajorRange",
+ switchID: 5,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{3, 4, 6},
+ },
+ },
+ want: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := IsSwitchWatched(tt.switchID, tt.sysInfo)
+ assert.Equal(t, tt.want, got)
+ })
+ }
+}
+
+func TestIsLinkWatched(t *testing.T) {
+ tests := []struct {
+ name string
+ linkIndex uint
+ switchID uint
+ sysInfo SystemInfo
+ want bool
+ }{
+ {
+ name: "Monitor all devices",
+ linkIndex: 1,
+ sysInfo: SystemInfo{sOpt: DeviceOptions{Flex: true}},
+ want: true,
+ },
+ {
+ name: "No watched devices",
+ linkIndex: 1,
+ sysInfo: SystemInfo{},
+ want: false,
+ },
+ {
+ name: "Watched link with empty MinorRange",
+ linkIndex: 2,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{-1},
+ },
+ Switches: []SwitchInfo{
+ {
+ EntityId: 1,
+ NvLinks: []dcgm.NvLinkStatus{
+ {Index: 2},
+ },
+ },
+ },
+ },
+ want: false,
+ },
+ {
+ name: "MinorRange contains -1 to watch all links",
+ switchID: 1,
+ linkIndex: 3,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{-1},
+ MinorRange: []int{-1},
+ },
+ Switches: []SwitchInfo{
+ {
+ EntityId: 1,
+ NvLinks: []dcgm.NvLinkStatus{
+ {Index: 3},
+ },
+ },
+ },
+ },
+ want: true,
+ },
+ {
+ name: "The link not in the watched switch",
+ switchID: 1,
+ linkIndex: 4,
+ sysInfo: SystemInfo{
+ sOpt: DeviceOptions{
+ MajorRange: []int{-1},
+ MinorRange: []int{1, 2, 3},
+ },
+ Switches: []SwitchInfo{
+ {
+ EntityId: 1,
+ NvLinks: []dcgm.NvLinkStatus{
+ {Index: 4},
+ },
+ },
+ },
+ },
+ want: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := IsLinkWatched(tt.linkIndex, tt.switchID, tt.sysInfo)
+ assert.Equal(t, tt.want, got)
+ })
+ }
+}
+
+func TestIsCPUWatched(t *testing.T) {
+ tests := []struct {
+ name string
+ cpuID uint
+ sysInfo SystemInfo
+ want bool
+ }{
+ {
+ name: "Monitor all devices",
+ cpuID: 1,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{Flex: true},
+ CPUs: []CPUInfo{
+ {
+ EntityId: 1,
+ },
+ },
+ },
+ want: true,
+ },
+ {
+ name: "MajorRange Contains -1",
+ cpuID: 2,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{MajorRange: []int{-1}},
+ CPUs: []CPUInfo{
+ {
+ EntityId: 2,
+ },
+ },
+ },
+ want: true,
+ },
+ {
+ name: "CPU ID in MajorRange",
+ cpuID: 3,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{MajorRange: []int{1, 2, 3}},
+ CPUs: []CPUInfo{
+ {
+ EntityId: 3,
+ },
+ },
+ },
+ want: true,
+ },
+ {
+ name: "CPU ID Not in MajorRange",
+ cpuID: 4,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{MajorRange: []int{1, 2, 3}},
+ CPUs: []CPUInfo{
+ {
+ EntityId: 4,
+ },
+ },
+ },
+ want: false,
+ },
+ {
+ name: "MajorRange Empty",
+ cpuID: 5,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{MajorRange: []int{}},
+ CPUs: []CPUInfo{
+ {
+ EntityId: 5,
+ },
+ },
+ },
+ want: false,
+ },
+ {
+ name: "CPU not found",
+ cpuID: 6,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{MajorRange: []int{}},
+ CPUs: []CPUInfo{
+ {
+ EntityId: 5,
+ },
+ },
+ },
+ want: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert.Equal(t, tt.want, IsCPUWatched(tt.cpuID, tt.sysInfo))
+ })
+ }
+}
+
+func TestIsCoreWatched(t *testing.T) {
+ tests := []struct {
+ name string
+ coreID uint
+ cpuID uint
+ sysInfo SystemInfo
+ want bool
+ }{
+ {
+ name: "Monitor all devices",
+ coreID: 1,
+ cpuID: 1,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{Flex: true},
+ },
+ want: true,
+ },
+ {
+ name: "Core in MinorRange",
+ coreID: 2,
+ cpuID: 1,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{
+ MinorRange: []int{1, 2, 3},
+ MajorRange: []int{-1},
+ },
+ CPUs: []CPUInfo{{EntityId: 1}},
+ },
+ want: true,
+ },
+ {
+ name: "Core Not in MinorRange",
+ coreID: 4,
+ cpuID: 1,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{
+ MinorRange: []int{1, 2, 3},
+ MajorRange: []int{-1},
+ },
+ CPUs: []CPUInfo{{EntityId: 1}},
+ },
+ want: false,
+ },
+ {
+ name: "MinorRange Contains -1",
+ coreID: 5,
+ cpuID: 1,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{
+ MinorRange: []int{-1},
+ MajorRange: []int{-1},
+ },
+ CPUs: []CPUInfo{{EntityId: 1}},
+ },
+ want: true,
+ },
+ {
+ name: "CPU Not Found",
+ coreID: 1,
+ cpuID: 2,
+ sysInfo: SystemInfo{
+ cOpt: DeviceOptions{
+ MinorRange: []int{1, 2, 3},
+ MajorRange: []int{-1},
+ },
+ CPUs: []CPUInfo{{EntityId: 1}},
+ },
+ want: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert.Equal(t, tt.want, IsCoreWatched(tt.coreID, tt.cpuID, tt.sysInfo))
+ })
}
}
diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go
index db4b1f5e..9431b045 100644
--- a/pkg/dcgmexporter/types.go
+++ b/pkg/dcgmexporter/types.go
@@ -80,10 +80,11 @@ type Config struct {
WebConfigFile string
XIDCountWindowSize int
ReplaceBlanksInModelName bool
+ Debug bool
}
type Transform interface {
- Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error
+ Process(metrics MetricsByCounter, sysInfo SystemInfo) error
Name() string
}
@@ -182,3 +183,6 @@ type PodInfo struct {
Namespace string
Container string
}
+
+// MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects
+type MetricsByCounter map[Counter][]Metric
diff --git a/pkg/dcgmexporter/xid_collector.go b/pkg/dcgmexporter/xid_collector.go
index 202a1b70..fd4f61e6 100644
--- a/pkg/dcgmexporter/xid_collector.go
+++ b/pkg/dcgmexporter/xid_collector.go
@@ -233,7 +233,7 @@ var getXIDMetricTemplate = sync.OnceValue(func() *template.Template {
return template.Must(template.New("xidMetrics").Parse(xidMetricsFormat))
})
-func encodeXIDMetrics(w io.Writer, metrics map[Counter][]Metric) error {
+func encodeXIDMetrics(w io.Writer, metrics MetricsByCounter) error {
template := getXIDMetricTemplate()
return template.Execute(w, metrics)
}
diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go
index 430e9cd9..8bc1171a 100644
--- a/pkg/dcgmexporter/xid_collector_test.go
+++ b/pkg/dcgmexporter/xid_collector_test.go
@@ -159,8 +159,8 @@ func TestXIDCollector_Gather_Encode(t *testing.T) {
// Now we check the metric rendering
var b bytes.Buffer
- encodeXIDMetrics(&b, metrics)
-
+ err = encodeXIDMetrics(&b, metrics)
+ require.NoError(t, err)
require.NotEmpty(t, b)
var parser expfmt.TextParser
diff --git a/tests/integration/start_read_test.go b/tests/integration/start_read_test.go
index 750137fc..35c28363 100644
--- a/tests/integration/start_read_test.go
+++ b/tests/integration/start_read_test.go
@@ -42,7 +42,8 @@ func TestStartAndReadMetrics(t *testing.T) {
args = append(args, fmt.Sprintf("-a=:%d", port))
ctx, cancel := context.WithCancel(context.Background())
go func(ctx context.Context) {
- app.Run(args)
+ err := app.Run(args)
+ require.NoError(t, err)
}(ctx)
t.Log("The dcgm-exporter is running, we wait for 30 seconds to read metrics")
diff --git a/tests/integration/start_with_tls_test.go b/tests/integration/start_with_tls_test.go
index 8b710658..b39128ed 100644
--- a/tests/integration/start_with_tls_test.go
+++ b/tests/integration/start_with_tls_test.go
@@ -26,7 +26,8 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) {
args = append(args, "--web-config-file=./testdata/web-config.yml")
ctx, cancel := context.WithCancel(context.Background())
go func(ctx context.Context) {
- app.Run(args)
+ err := app.Run(args)
+ require.NoError(t, err)
}(ctx)
t.Run("server returns 400 if request uses HTTP and TLS enabled on the server",