diff --git a/.vscode/launch.json b/.vscode/launch.json index bf62b591..a4f0acbf 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -14,6 +14,7 @@ "args": [ "-f", "./etc/default-counters.csv", + "--debug" ] } ] diff --git a/docker/Dockerfile.ubi9 b/docker/Dockerfile.ubi9 index 0074585a..9978009b 100644 --- a/docker/Dockerfile.ubi9 +++ b/docker/Dockerfile.ubi9 @@ -4,7 +4,7 @@ WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter RUN set -eux; \ dnf clean expire-cache; \ dnf install -y go-toolset make wget -RUN dnf clean all +RUN dnf clean all && rm -rf /usr/bin/go # Install Go official release RUN set -eux; \ diff --git a/go.mod b/go.mod index 5a9bcdf1..38ee5255 100644 --- a/go.mod +++ b/go.mod @@ -33,6 +33,7 @@ require ( github.com/avast/retry-go/v4 v4.5.1 github.com/bits-and-blooms/bitset v1.13.0 github.com/gorilla/mux v1.8.1 + github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 github.com/prometheus/common v0.45.0 github.com/prometheus/exporter-toolkit v0.11.0 github.com/sirupsen/logrus v1.9.3 @@ -71,7 +72,6 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_golang v1.17.0 // indirect - github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect github.com/prometheus/procfs v0.11.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index fb64028a..c90a585f 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -63,6 +63,7 @@ const ( CLIWebConfigFile = "web-config-file" CLIXIDCountWindowSize = "xid-count-window-size" CLIReplaceBlanksInModelName = "replace-blanks-in-model-name" + CLIDebugMode = "debug" ) func NewApp(buildVersion ...string) *cli.App { @@ -190,6 +191,12 @@ func NewApp(buildVersion ...string) *cli.App { Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, 
space-free identifier.", EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"}, }, + &cli.BoolFlag{ + Name: CLIDebugMode, + Value: false, + Usage: "Enable debug output", + EnvVars: []string{"DCGM_EXPORTER_DEBUG"}, + }, } if runtime.GOOS == "linux" { @@ -228,6 +235,16 @@ restart: return err } + if config.Debug { + //enable debug logging + logrus.SetLevel(logrus.DebugLevel) + logrus.Debug("Debug output is enabled") + } + + logrus.Debugf("Command line: %s", strings.Join(os.Args, " ")) + + logrus.WithField(dcgmexporter.LoggerDumpKey, fmt.Sprintf("%+v", config)).Debug("Loaded configuration") + if config.UseRemoteHE { logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo) cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") @@ -426,5 +443,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { WebConfigFile: c.String(CLIWebConfigFile), XIDCountWindowSize: c.Int(CLIXIDCountWindowSize), ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName), + Debug: c.Bool(CLIDebugMode), }, nil } diff --git a/pkg/dcgmexporter/const.go b/pkg/dcgmexporter/const.go index c590da91..4286cea8 100644 --- a/pkg/dcgmexporter/const.go +++ b/pkg/dcgmexporter/const.go @@ -46,3 +46,14 @@ func mustParseDCGMExporterMetric(s string) DCGMExporterMetric { } return mv } + +// Constants for logging fields +const ( + LoggerGroupIDKey = "groupID" + LoggerDumpKey = "dump" +) + +const ( + PARENT_ID_IGNORED = 0 + DCGM_ST_NOT_CONFIGURED = "Setting not configured" +) diff --git a/pkg/dcgmexporter/dcgm.go b/pkg/dcgmexporter/dcgm.go index 7769b38e..2661b722 100644 --- a/pkg/dcgmexporter/dcgm.go +++ b/pkg/dcgmexporter/dcgm.go @@ -21,6 +21,7 @@ import ( "math/rand" "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/sirupsen/logrus" ) func NewGroup() (dcgm.GroupHandle, func(), error) { @@ -29,7 +30,12 @@ func NewGroup() (dcgm.GroupHandle, func(), error) { return dcgm.GroupHandle{}, func() {}, err } - return group, func() { 
dcgm.DestroyGroup(group) }, nil + return group, func() { + err := dcgm.DestroyGroup(group) + if err != nil { + logrus.WithError(err).Warn("Cannot destroy field group") + } + }, nil } func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short { @@ -56,7 +62,12 @@ func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) return dcgm.FieldHandle{}, func() {}, err } - return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil + return fieldGroup, func() { + err := dcgm.FieldGroupDestroy(fieldGroup) + if err != nil { + logrus.WithError(err).Warn("Cannot destroy field group") + } + }, nil } func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error { diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index 293641d8..805bea01 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -88,10 +88,10 @@ func (c *DCGMCollector) Cleanup() { } } -func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) { +func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error) { monitoringInfo := GetMonitoredEntities(c.SysInfo) - metrics := make(map[Counter][]Metric) + metrics := make(MetricsByCounter) for _, mi := range monitoringInfo { var vals []dcgm.FieldValue_v1 @@ -153,7 +153,7 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) { return c[0], fmt.Errorf("Could not find corresponding counter") } -func ToSwitchMetric(metrics map[Counter][]Metric, +func ToSwitchMetric(metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) { labels := map[string]string{} @@ -196,7 +196,7 @@ func ToSwitchMetric(metrics map[Counter][]Metric, } } -func ToCPUMetric(metrics map[Counter][]Metric, +func ToCPUMetric(metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname 
string) { var labels = map[string]string{} @@ -240,7 +240,7 @@ func ToCPUMetric(metrics map[Counter][]Metric, } func ToMetric( - metrics map[Counter][]Metric, + metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index 63c04f12..68340c84 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -28,6 +28,7 @@ import ( "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" "github.com/sirupsen/logrus" "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" ) @@ -54,7 +55,7 @@ func (p *PodMapper) Name() string { return "podMapper" } -func (p *PodMapper) Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error { +func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error { _, err := os.Stat(socketPath) if os.IsNotExist(err) { logrus.Infof("No Kubelet socket, ignoring") @@ -102,9 +103,13 @@ func connectToServer(socket string) (*grpc.ClientConn, func(), error) { ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout) defer cancel() - conn, err := grpc.DialContext(ctx, socket, grpc.WithInsecure(), grpc.WithBlock(), - grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { - return net.DialTimeout("unix", addr, timeout) + conn, err := grpc.DialContext(ctx, + socket, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { + d := net.Dialer{} + return d.DialContext(ctx, "unix", addr) }), ) diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index 045315f9..5e1ab164 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -100,7 +100,8 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() 
{ stopped := make(chan interface{}) go func() { - server.Serve(l) + err := server.Serve(l) + assert.NoError(t, err) close(stopped) }() @@ -256,7 +257,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: tc.KubernetesGPUIDType}) require.NoError(t, err) require.NotNil(t, podMapper) - metrics := map[Counter][]Metric{} + metrics := MetricsByCounter{} counter := Counter{ FieldID: 155, FieldName: "DCGM_FI_DEV_POWER_USAGE", diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index 89888667..e571276a 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -28,6 +28,9 @@ import ( ) func NewMetricsPipeline(c *Config, counters []Counter, hostname string, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) { + + logrus.WithField(LoggerDumpKey, fmt.Sprintf("%+v", counters)).Debug("Counters are initialized") + cleanups := []func(){} gpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_GPU) if err != nil { @@ -328,7 +331,7 @@ var cpuCoreMetricsFormat = ` {{ end }}` // Template is passed here so that it isn't recompiled at each iteration -func FormatMetrics(t *template.Template, groupedMetrics map[Counter][]Metric) (string, error) { +func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error) { // Format metrics var res bytes.Buffer if err := t.Execute(&res, groupedMetrics); err != nil { diff --git a/pkg/dcgmexporter/pipeline_test.go b/pkg/dcgmexporter/pipeline_test.go index 8e643ba0..f0cb2f89 100644 --- a/pkg/dcgmexporter/pipeline_test.go +++ b/pkg/dcgmexporter/pipeline_test.go @@ -37,6 +37,7 @@ func TestRun(t *testing.T) { p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c) require.NoError(t, err) defer cleanup() + require.NoError(t, err) out, err := p.run() require.NoError(t, err) diff --git a/pkg/dcgmexporter/server.go b/pkg/dcgmexporter/server.go 
index b144d367..94808732 100644 --- a/pkg/dcgmexporter/server.go +++ b/pkg/dcgmexporter/server.go @@ -50,13 +50,18 @@ func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*Metr router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("X-Content-Type-Options", "nosniff") w.WriteHeader(http.StatusOK) - w.Write([]byte(` + _, err := w.Write([]byte(` GPU Exporter

GPU Exporter

Metrics

`)) + if err != nil { + logrus.WithError(err).Error("Failed to write response") + http.Error(w, "Failed to write response", http.StatusInternalServerError) + return + } }) router.HandleFunc("/health", serverv1.Health) @@ -76,7 +81,7 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { defer httpwg.Done() logrus.Info("Starting webserver") if err := web.ListenAndServe(s.server, s.webConfig, logger); err != nil && err != http.ErrServerClosed { - logrus.Fatalf("Failed to Listen and Server HTTP server with err: `%v`", err) + logrus.WithError(err).Fatal("Failed to Listen and Server HTTP server") } }() @@ -95,21 +100,33 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { <-stop if err := s.server.Shutdown(context.Background()); err != nil { - logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err) + logrus.WithError(err).Fatal("Failed to shutdown HTTP server") } if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil { - logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err) + logrus.WithError(err).Fatal("Failed waiting for HTTP server to shutdown") } } func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) { w.Header().Set("X-Content-Type-Options", "nosniff") w.WriteHeader(http.StatusOK) - w.Write([]byte(s.getMetrics())) + _, err := w.Write([]byte(s.getMetrics())) + if err != nil { + logrus.WithError(err).Error("Failed to write response") + http.Error(w, "Failed to write response", http.StatusInternalServerError) + return + } xidMetrics, err := s.registry.Gather() - if err == nil { - encodeXIDMetrics(w, xidMetrics) + if err != nil { + logrus.WithError(err).Error("Failed to write response") + http.Error(w, "Failed to write response", http.StatusInternalServerError) + return + } + err = encodeXIDMetrics(w, xidMetrics) + if err != nil { + http.Error(w, "Failed to write response", http.StatusInternalServerError) + return } } @@ -117,11 +134,19 @@ func (s 
*MetricsServer) Health(w http.ResponseWriter, r *http.Request) { if s.getMetrics() == "" { w.Header().Set("X-Content-Type-Options", "nosniff") w.WriteHeader(http.StatusServiceUnavailable) - w.Write([]byte("KO")) + _, err := w.Write([]byte("KO")) + if err != nil { + logrus.WithError(err).Error("Failed to write response") + http.Error(w, "Failed to write response", http.StatusInternalServerError) + } } else { w.Header().Set("X-Content-Type-Options", "nosniff") w.WriteHeader(http.StatusOK) - w.Write([]byte("OK")) + _, err := w.Write([]byte("OK")) + if err != nil { + logrus.WithError(err).Error("Failed to write response") + http.Error(w, "Failed to write response", http.StatusInternalServerError) + } } } diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go index b11131aa..dc9bc632 100644 --- a/pkg/dcgmexporter/system_info.go +++ b/pkg/dcgmexporter/system_info.go @@ -19,14 +19,14 @@ package dcgmexporter import ( "fmt" "math/rand" + "slices" + "strings" "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/bits-and-blooms/bitset" "github.com/sirupsen/logrus" ) -const PARENT_ID_IGNORED = 0 - var ( dcgmGetAllDeviceCount = dcgm.GetAllDeviceCount dcgmGetDeviceInfo = dcgm.GetDeviceInfo @@ -36,11 +36,6 @@ var ( dcgmGetCpuHierarchy = dcgm.GetCpuHierarchy ) -type GroupInfo struct { - groupHandle dcgm.GroupHandle - groupType dcgm.Field_Entity_Group -} - type ComputeInstanceInfo struct { InstanceInfo dcgm.MigEntityInfo ProfileName string @@ -106,7 +101,7 @@ func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error err := fmt.Errorf("Cannot find match for entities:") for _, v := range values { found := SetGPUInstanceProfileName(sysInfo, v.EntityId, dcgm.Fv2_String(v)) - if found == false { + if !found { err = fmt.Errorf("%s group %d, id %d", err, v.EntityGroupId, v.EntityId) notFound = true } @@ -204,17 +199,17 @@ func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] 
!= -1 { // Verify we can find all the specified Switches - for _, cpuId := range sOpt.MajorRange { - if !SwitchIdExists(sysInfo, cpuId) { - return fmt.Errorf("couldn't find requested cpu id %d", cpuId) + for _, cpuID := range sOpt.MajorRange { + if !SwitchIdExists(sysInfo, cpuID) { + return fmt.Errorf("couldn't find requested cpu id %d", cpuID) } } } if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 { - for _, coreId := range sOpt.MinorRange { - if !CPUCoreIdExists(sysInfo, coreId) { - return fmt.Errorf("couldn't find requested cpu core %d", coreId) + for _, coreID := range sOpt.MinorRange { + if !CPUCoreIdExists(sysInfo, coreID) { + return fmt.Errorf("couldn't find requested cpu core %d", coreID) } } } @@ -229,17 +224,17 @@ func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { if len(sOpt.MajorRange) > 0 && sOpt.MajorRange[0] != -1 { // Verify we can find all the specified Switches - for _, swId := range sOpt.MajorRange { - if !SwitchIdExists(sysInfo, swId) { - return fmt.Errorf("couldn't find requested NvSwitch id %d", swId) + for _, swID := range sOpt.MajorRange { + if !SwitchIdExists(sysInfo, swID) { + return fmt.Errorf("couldn't find requested NvSwitch id %d", swID) } } } if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 { - for _, linkId := range sOpt.MinorRange { - if !LinkIdExists(sysInfo, linkId) { - return fmt.Errorf("couldn't find requested NvLink %d", linkId) + for _, linkID := range sOpt.MinorRange { + if !LinkIdExists(sysInfo, linkID) { + return fmt.Errorf("couldn't find requested NvLink %d", linkID) } } } @@ -254,17 +249,17 @@ func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error { if len(gOpt.MajorRange) > 0 && gOpt.MajorRange[0] != -1 { // Verify we can find all the specified GPUs - for _, gpuId := range gOpt.MajorRange { - if GPUIdExists(sysInfo, gpuId) == false { - return fmt.Errorf("Couldn't find requested GPU id %d", gpuId) + for _, gpuID := range gOpt.MajorRange { + if 
!GPUIdExists(sysInfo, gpuID) { + return fmt.Errorf("Couldn't find requested GPU id %d", gpuID) } } } if len(gOpt.MinorRange) > 0 && gOpt.MinorRange[0] != -1 { - for _, gpuInstanceId := range gOpt.MinorRange { - if GPUInstanceIdExists(sysInfo, gpuInstanceId) == false { - return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceId) + for _, gpuInstanceID := range gOpt.MinorRange { + if !GPUInstanceIdExists(sysInfo, gpuInstanceID) { + return fmt.Errorf("Couldn't find requested GPU instance id %d", gpuInstanceID) } } } @@ -314,8 +309,12 @@ func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, erro } sysInfo.cOpt = sOpt - err = VerifyCPUDevicePresence(&sysInfo, sOpt) + err = VerifyCPUDevicePresence(&sysInfo, sOpt) + if err != nil { + return sysInfo, err + } + logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType) return sysInfo, nil } @@ -352,8 +351,11 @@ func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, sysInfo.sOpt = sOpt err = VerifySwitchDevicePresence(&sysInfo, sOpt) + if err == nil { + logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType) + } - return sysInfo, nil + return sysInfo, err } func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error) { @@ -385,27 +387,27 @@ func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) if hierarchy.Count > 0 { var entities []dcgm.GroupEntityPair - gpuId := uint(0) + gpuID := uint(0) instanceIndex := 0 for i := uint(0); i < hierarchy.Count; i++ { if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU { // We are adding a GPU instance - gpuId = hierarchy.EntityList[i].Parent.EntityId - entityId := hierarchy.EntityList[i].Entity.EntityId + gpuID = hierarchy.EntityList[i].Parent.EntityId + entityID := hierarchy.EntityList[i].Entity.EntityId instanceInfo := GPUInstanceInfo{ Info: hierarchy.EntityList[i].Info, ProfileName: "", - EntityId: entityId, + 
EntityId: entityID, } - sysInfo.GPUs[gpuId].MigEnabled = true - sysInfo.GPUs[gpuId].GPUInstances = append(sysInfo.GPUs[gpuId].GPUInstances, instanceInfo) - entities = append(entities, dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId}) - instanceIndex = len(sysInfo.GPUs[gpuId].GPUInstances) - 1 + sysInfo.GPUs[gpuID].MigEnabled = true + sysInfo.GPUs[gpuID].GPUInstances = append(sysInfo.GPUs[gpuID].GPUInstances, instanceInfo) + entities = append(entities, dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: entityID}) + instanceIndex = len(sysInfo.GPUs[gpuID].GPUInstances) - 1 } else if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU_I { // Add the compute instance, gpuId is recorded previously - entityId := hierarchy.EntityList[i].Entity.EntityId - ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityId} - sysInfo.GPUs[gpuId].GPUInstances[instanceIndex].ComputeInstances = append(sysInfo.GPUs[gpuId].GPUInstances[instanceIndex].ComputeInstances, ciInfo) + entityID := hierarchy.EntityList[i].Entity.EntityId + ciInfo := ComputeInstanceInfo{hierarchy.EntityList[i].Info, "", entityID} + sysInfo.GPUs[gpuID].GPUInstances[instanceIndex].ComputeInstances = append(sysInfo.GPUs[gpuID].GPUInstances[instanceIndex].ComputeInstances, ciInfo) } } @@ -417,7 +419,9 @@ func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) sysInfo.gOpt = gOpt err = VerifyDevicePresence(&sysInfo, gOpt) - + if err == nil { + logrus.Debugf("System entities of type %s initialized", sysInfo.InfoType) + } return sysInfo, err } @@ -449,7 +453,7 @@ func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOpt func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error) { var groups []dcgm.GroupHandle var cleanups []func() - var groupId dcgm.GroupHandle + var groupID dcgm.GroupHandle var err error /* Create per-cpu core groups */ @@ -461,25 +465,33 @@ func CreateCoreGroupsFromSystemInfo(sysInfo 
SystemInfo) ([]dcgm.GroupHandle, []f for i, core := range cpu.Cores { if i == 0 || i%dcgm.DCGM_GROUP_MAX_ENTITIES == 0 { - groupId, err = dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) + groupID, err = dcgm.CreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) if err != nil { return nil, cleanups, err } - groups = append(groups, groupId) + groups = append(groups, groupID) } if !IsCoreWatched(core, cpu.EntityId, sysInfo) { continue } - err = dcgm.AddEntityToGroup(groupId, dcgm.FE_CPU_CORE, core) + err = dcgm.AddEntityToGroup(groupID, dcgm.FE_CPU_CORE, core) if err != nil { return groups, cleanups, err } - cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) }) + cleanups = append(cleanups, func() { + err := dcgm.DestroyGroup(groupID) + if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { + logrus.WithFields(logrus.Fields{ + LoggerGroupIDKey: groupID, + logrus.ErrorKey: err, + }).Warn("can not destroy group") + } + }) } } @@ -496,12 +508,12 @@ func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f continue } - groupId, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) + groupID, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) if err != nil { return nil, cleanups, err } - groups = append(groups, groupId) + groups = append(groups, groupID) for _, link := range sw.NvLinks { if link.State != dcgm.LS_UP { @@ -512,13 +524,21 @@ func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f continue } - err = dcgm.AddLinkEntityToGroup(groupId, link.Index, link.ParentId) + err = dcgm.AddLinkEntityToGroup(groupID, link.Index, link.ParentId) if err != nil { return groups, cleanups, err } - cleanups = append(cleanups, func() { dcgm.DestroyGroup(groupId) }) + cleanups = append(cleanups, func() { + err := dcgm.DestroyGroup(groupID) + if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { + 
logrus.WithFields(logrus.Fields{ + LoggerGroupIDKey: groupID, + logrus.ErrorKey: err, + }).Warn("can not destroy group") + } + }) } } @@ -527,19 +547,35 @@ func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error) { monitoringInfo := GetMonitoredEntities(sysInfo) - groupId, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) + groupID, err := dcgmCreateGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) if err != nil { return dcgm.GroupHandle{}, func() {}, err } for _, mi := range monitoringInfo { - err := dcgmAddEntityToGroup(groupId, mi.Entity.EntityGroupId, mi.Entity.EntityId) + err := dcgmAddEntityToGroup(groupID, mi.Entity.EntityGroupId, mi.Entity.EntityId) if err != nil { - return groupId, func() { dcgm.DestroyGroup(groupId) }, err + return groupID, func() { + err := dcgm.DestroyGroup(groupID) + if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { + logrus.WithFields(logrus.Fields{ + LoggerGroupIDKey: groupID, + logrus.ErrorKey: err, + }).Warn("can not destroy group") + } + }, err } } - return groupId, func() { dcgm.DestroyGroup(groupId) }, nil + return groupID, func() { + err := dcgm.DestroyGroup(groupID) + if err != nil && !strings.Contains(err.Error(), DCGM_ST_NOT_CONFIGURED) { + logrus.WithFields(logrus.Fields{ + LoggerGroupIDKey: groupID, + logrus.ErrorKey: err, + }).Warn("can not destroy group") + } + }, nil } func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo { @@ -547,7 +583,7 @@ func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo { for i := uint(0); i < sysInfo.GPUCount; i++ { mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU}, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU}, sysInfo.GPUs[i].DeviceInfo, nil, PARENT_ID_IGNORED, @@ -567,13 +603,8 @@ func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo { 
} mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_SWITCH, sw.EntityId}, - dcgm.Device{ - 0, "", "", 0, - dcgm.PCIInfo{"", 0, 0, 0}, - dcgm.DeviceIdentifiers{"", "", "", "", "", ""}, - nil, "", - }, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_SWITCH, EntityId: sw.EntityId}, + dcgm.Device{}, nil, PARENT_ID_IGNORED, } @@ -601,13 +632,8 @@ func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo { } mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_LINK, link.Index}, - dcgm.Device{ - 0, "", "", 0, - dcgm.PCIInfo{"", 0, 0, 0}, - dcgm.DeviceIdentifiers{"", "", "", "", "", ""}, - nil, "", - }, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_LINK, EntityId: link.Index}, + dcgm.Device{}, nil, link.ParentId, } @@ -618,87 +644,86 @@ func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo { return monitoring } -func IsSwitchWatched(switchId uint, sysInfo SystemInfo) bool { +func IsSwitchWatched(switchID uint, sysInfo SystemInfo) bool { if sysInfo.sOpt.Flex { return true } - if len(sysInfo.sOpt.MajorRange) <= 0 { + // When MajorRange contains -1 value, we do monitorig of all switches + if len(sysInfo.sOpt.MajorRange) > 0 && sysInfo.sOpt.MajorRange[0] == -1 { return true } - for _, sw := range sysInfo.sOpt.MajorRange { - if uint(sw) == switchId { - return true - } - - } - return false + return slices.Contains(sysInfo.sOpt.MajorRange, int(switchID)) } -func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool { +func IsLinkWatched(linkIndex uint, switchID uint, sysInfo SystemInfo) bool { if sysInfo.sOpt.Flex { return true } - for _, sw := range sysInfo.Switches { - if !IsSwitchWatched(sw.EntityId, sysInfo) { - return false - } + // Find a switch + switchIdx := slices.IndexFunc(sysInfo.Switches, func(si SwitchInfo) bool { + return si.EntityId == switchID && IsSwitchWatched(si.EntityId, sysInfo) + }) + + if switchIdx > -1 { + // Switch exists and is watched + sw := sysInfo.Switches[switchIdx] - if len(sysInfo.sOpt.MinorRange) <= 0 { + if 
len(sysInfo.sOpt.MinorRange) > 0 && sysInfo.sOpt.MinorRange[0] == -1 { return true } - for _, link := range sysInfo.sOpt.MinorRange { - if uint(link) == linkId { - return true - } + // The Link exists + if slices.ContainsFunc(sw.NvLinks, func(nls dcgm.NvLinkStatus) bool { + return nls.Index == linkIndex + }) { + // and the link index in the Minor range + return slices.Contains(sysInfo.sOpt.MinorRange, int(linkIndex)) } - return false } return false } -func IsCPUWatched(cpuId uint, sysInfo SystemInfo) bool { +func IsCPUWatched(cpuID uint, sysInfo SystemInfo) bool { + + if !slices.ContainsFunc(sysInfo.CPUs, func(cpu CPUInfo) bool { + return cpu.EntityId == cpuID + }) { + return false + } + if sysInfo.cOpt.Flex { return true } - if len(sysInfo.cOpt.MajorRange) <= 0 { + if len(sysInfo.cOpt.MajorRange) > 0 && sysInfo.cOpt.MajorRange[0] == -1 { return true } - for _, cpu := range sysInfo.cOpt.MajorRange { - if uint(cpu) == cpuId { - return true - } - - } - return false + return slices.ContainsFunc(sysInfo.cOpt.MajorRange, func(cpu int) bool { + return uint(cpu) == cpuID + }) } -func IsCoreWatched(coreId uint, cpuId uint, sysInfo SystemInfo) bool { +func IsCoreWatched(coreID uint, cpuID uint, sysInfo SystemInfo) bool { if sysInfo.cOpt.Flex { return true } - for _, cpu := range sysInfo.CPUs { - if !IsCPUWatched(cpu.EntityId, sysInfo) { - return false - } + // Find a CPU + cpuIdx := slices.IndexFunc(sysInfo.CPUs, func(cpu CPUInfo) bool { + return IsCPUWatched(cpu.EntityId, sysInfo) && cpu.EntityId == cpuID + }) - if len(sysInfo.cOpt.MinorRange) <= 0 { + if cpuIdx > -1 { + if len(sysInfo.cOpt.MinorRange) > 0 && sysInfo.cOpt.MinorRange[0] == -1 { return true } - for _, core := range sysInfo.cOpt.MinorRange { - if uint(core) == coreId { - return true - } - } - return false + return slices.Contains(sysInfo.cOpt.MinorRange, int(coreID)) } return false @@ -713,13 +738,8 @@ func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo { } mi := MonitoringInfo{ - 
dcgm.GroupEntityPair{dcgm.FE_CPU, cpu.EntityId}, - dcgm.Device{ - 0, "", "", 0, - dcgm.PCIInfo{"", 0, 0, 0}, - dcgm.DeviceIdentifiers{"", "", "", "", "", ""}, - nil, "", - }, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU, EntityId: cpu.EntityId}, + dcgm.Device{}, nil, PARENT_ID_IGNORED, } @@ -743,13 +763,8 @@ func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo { } mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_CPU_CORE, core}, - dcgm.Device{ - 0, "", "", 0, - dcgm.PCIInfo{"", 0, 0, 0}, - dcgm.DeviceIdentifiers{"", "", "", "", "", ""}, - nil, "", - }, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_CPU_CORE, EntityId: core}, + dcgm.Device{}, nil, cpu.EntityId, } @@ -764,9 +779,9 @@ func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { var monitoring []MonitoringInfo for i := uint(0); i < sysInfo.GPUCount; i++ { - if addFlexibly == true && len(sysInfo.GPUs[i].GPUInstances) == 0 { + if addFlexibly && len(sysInfo.GPUs[i].GPUInstances) == 0 { mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU}, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU}, sysInfo.GPUs[i].DeviceInfo, nil, PARENT_ID_IGNORED, @@ -775,7 +790,7 @@ func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { } else { for j := 0; j < len(sysInfo.GPUs[i].GPUInstances); j++ { mi := MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_GPU_I, sysInfo.GPUs[i].GPUInstances[j].EntityId}, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: sysInfo.GPUs[i].GPUInstances[j].EntityId}, sysInfo.GPUs[i].DeviceInfo, &sysInfo.GPUs[i].GPUInstances[j], PARENT_ID_IGNORED, @@ -788,11 +803,11 @@ func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo { return monitoring } -func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuId int) *MonitoringInfo { +func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuID int) *MonitoringInfo { for i := uint(0); i < 
sysInfo.GPUCount; i++ { - if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuId) { + if sysInfo.GPUs[i].DeviceInfo.GPU == uint(gpuID) { return &MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_GPU, sysInfo.GPUs[i].DeviceInfo.GPU}, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: sysInfo.GPUs[i].DeviceInfo.GPU}, sysInfo.GPUs[i].DeviceInfo, nil, PARENT_ID_IGNORED, @@ -803,12 +818,12 @@ func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuId int) *MonitoringInfo { return nil } -func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceId int) *MonitoringInfo { +func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceID int) *MonitoringInfo { for i := uint(0); i < sysInfo.GPUCount; i++ { for _, instance := range sysInfo.GPUs[i].GPUInstances { - if instance.EntityId == uint(gpuInstanceId) { + if instance.EntityId == uint(gpuInstanceID) { return &MonitoringInfo{ - dcgm.GroupEntityPair{dcgm.FE_GPU_I, uint(gpuInstanceId)}, + dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: uint(gpuInstanceID)}, sysInfo.GPUs[i].DeviceInfo, &instance, PARENT_ID_IGNORED, @@ -831,24 +846,24 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo { monitoring = AddAllCPUs(sysInfo) } else if sysInfo.InfoType == dcgm.FE_CPU_CORE { monitoring = AddAllCPUCores(sysInfo) - } else if sysInfo.gOpt.Flex == true { + } else if sysInfo.gOpt.Flex { monitoring = AddAllGPUInstances(sysInfo, true) } else { if len(sysInfo.gOpt.MajorRange) > 0 && sysInfo.gOpt.MajorRange[0] == -1 { monitoring = AddAllGPUs(sysInfo) } else { - for _, gpuId := range sysInfo.gOpt.MajorRange { + for _, gpuID := range sysInfo.gOpt.MajorRange { // We've already verified that everything in the options list exists - monitoring = append(monitoring, *GetMonitoringInfoForGPU(sysInfo, gpuId)) + monitoring = append(monitoring, *GetMonitoringInfoForGPU(sysInfo, gpuID)) } } if len(sysInfo.gOpt.MinorRange) > 0 && sysInfo.gOpt.MinorRange[0] == -1 { monitoring = AddAllGPUInstances(sysInfo, false) } 
else { - for _, gpuInstanceId := range sysInfo.gOpt.MinorRange { + for _, gpuInstanceID := range sysInfo.gOpt.MinorRange { // We've already verified that everything in the options list exists - monitoring = append(monitoring, *GetMonitoringInfoForGPUInstance(sysInfo, gpuInstanceId)) + monitoring = append(monitoring, *GetMonitoringInfoForGPUInstance(sysInfo, gpuInstanceID)) } } } @@ -856,10 +871,10 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo { return monitoring } -func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceId uint) string { +func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceID uint) string { for i := uint(0); i < sysInfo.GPUCount; i++ { if sysInfo.GPUs[i].DeviceInfo.UUID == gpuuuid { - identifier := fmt.Sprintf("%d-%d", sysInfo.GPUs[i].DeviceInfo.GPU, gpuInstanceId) + identifier := fmt.Sprintf("%d-%d", sysInfo.GPUs[i].DeviceInfo.GPU, gpuInstanceID) return identifier } } diff --git a/pkg/dcgmexporter/system_info_test.go b/pkg/dcgmexporter/system_info_test.go index a7f024a8..d1efed0c 100644 --- a/pkg/dcgmexporter/system_info_test.go +++ b/pkg/dcgmexporter/system_info_test.go @@ -18,9 +18,11 @@ package dcgmexporter import ( "fmt" + "testing" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "testing" ) const ( @@ -73,6 +75,9 @@ func SpoofSwitchSystemInfo() SystemInfo { sysInfo.Switches = append(sysInfo.Switches, sw1) sysInfo.Switches = append(sysInfo.Switches, sw2) + sysInfo.sOpt.MajorRange = []int{-1} + sysInfo.sOpt.MinorRange = []int{-1} + return sysInfo } @@ -81,13 +86,13 @@ func SpoofSystemInfo() SystemInfo { sysInfo.GPUCount = 2 sysInfo.GPUs[0].DeviceInfo.GPU = 0 gi := GPUInstanceInfo{ - Info: dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3}, + Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlProfileSlices: 3}, ProfileName: fakeProfileName, EntityId: 0, } sysInfo.GPUs[0].GPUInstances = append(sysInfo.GPUs[0].GPUInstances, gi) 
gi2 := GPUInstanceInfo{ - Info: dcgm.MigEntityInfo{"fake", 0, 1, 0, 0, 3}, + Info: dcgm.MigEntityInfo{GpuUuid: "fake", NvmlInstanceId: 1, NvmlProfileSlices: 3}, ProfileName: fakeProfileName, EntityId: 14, } @@ -164,11 +169,6 @@ func TestVerifyDevicePresence(t *testing.T) { require.Equal(t, err, nil, "Expected to have no error, but found %s", err) } -//func TestMigProfileNames(t *testing.T) { -// sysInfo := SpoofSystemInfo() -// SetMigProfileNames(sysInfo, values) -//} - func TestMonitoredSwitches(t *testing.T) { sysInfo := SpoofSwitchSystemInfo() @@ -185,6 +185,333 @@ func TestMonitoredSwitches(t *testing.T) { require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored links but found %d", len(monitoring))) for i, mi := range monitoring { require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_LINK, fmt.Sprintf("Should have only returned links but returned %d", mi.Entity.EntityGroupId)) - require.Equal(t, mi.ParentId, uint(i), fmt.Sprint("Link should reference switch parent")) + require.Equal(t, mi.ParentId, uint(i), "Link should reference switch parent") + } +} + +func TestIsSwitchWatched(t *testing.T) { + tests := []struct { + name string + switchID uint + sysInfo SystemInfo + want bool + }{ + { + name: "Monitor all devices", + switchID: 1, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + Flex: true, + }, + }, + want: true, + }, + { + name: "MajorRange empty", + switchID: 2, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{}, + }, + }, + want: false, + }, + { + name: "MajorRange contains -1 to watch all devices", + switchID: 3, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{-1}, + }, + }, + want: true, + }, + { + name: "SwitchID in MajorRange", + switchID: 4, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{3, 4, 5}, + }, + }, + want: true, + }, + { + name: "SwitchID not in MajorRange", + switchID: 5, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{3, 4, 6}, + }, + }, + want: false, + 
}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := IsSwitchWatched(tt.switchID, tt.sysInfo) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestIsLinkWatched(t *testing.T) { + tests := []struct { + name string + linkIndex uint + switchID uint + sysInfo SystemInfo + want bool + }{ + { + name: "Monitor all devices", + linkIndex: 1, + sysInfo: SystemInfo{sOpt: DeviceOptions{Flex: true}}, + want: true, + }, + { + name: "No watched devices", + linkIndex: 1, + sysInfo: SystemInfo{}, + want: false, + }, + { + name: "Watched link with empty MinorRange", + linkIndex: 2, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{-1}, + }, + Switches: []SwitchInfo{ + { + EntityId: 1, + NvLinks: []dcgm.NvLinkStatus{ + {Index: 2}, + }, + }, + }, + }, + want: false, + }, + { + name: "MinorRange contains -1 to watch all links", + switchID: 1, + linkIndex: 3, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{-1}, + MinorRange: []int{-1}, + }, + Switches: []SwitchInfo{ + { + EntityId: 1, + NvLinks: []dcgm.NvLinkStatus{ + {Index: 3}, + }, + }, + }, + }, + want: true, + }, + { + name: "The link not in the watched switch", + switchID: 1, + linkIndex: 4, + sysInfo: SystemInfo{ + sOpt: DeviceOptions{ + MajorRange: []int{-1}, + MinorRange: []int{1, 2, 3}, + }, + Switches: []SwitchInfo{ + { + EntityId: 1, + NvLinks: []dcgm.NvLinkStatus{ + {Index: 4}, + }, + }, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := IsLinkWatched(tt.linkIndex, tt.switchID, tt.sysInfo) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestIsCPUWatched(t *testing.T) { + tests := []struct { + name string + cpuID uint + sysInfo SystemInfo + want bool + }{ + { + name: "Monitor all devices", + cpuID: 1, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{Flex: true}, + CPUs: []CPUInfo{ + { + EntityId: 1, + }, + }, + }, + want: true, + }, + { + name: "MajorRange Contains -1", + cpuID: 2, + sysInfo: 
SystemInfo{ + cOpt: DeviceOptions{MajorRange: []int{-1}}, + CPUs: []CPUInfo{ + { + EntityId: 2, + }, + }, + }, + want: true, + }, + { + name: "CPU ID in MajorRange", + cpuID: 3, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{MajorRange: []int{1, 2, 3}}, + CPUs: []CPUInfo{ + { + EntityId: 3, + }, + }, + }, + want: true, + }, + { + name: "CPU ID Not in MajorRange", + cpuID: 4, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{MajorRange: []int{1, 2, 3}}, + CPUs: []CPUInfo{ + { + EntityId: 4, + }, + }, + }, + want: false, + }, + { + name: "MajorRange Empty", + cpuID: 5, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{MajorRange: []int{}}, + CPUs: []CPUInfo{ + { + EntityId: 5, + }, + }, + }, + want: false, + }, + { + name: "CPU not found", + cpuID: 6, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{MajorRange: []int{}}, + CPUs: []CPUInfo{ + { + EntityId: 5, + }, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, IsCPUWatched(tt.cpuID, tt.sysInfo)) + }) + } +} + +func TestIsCoreWatched(t *testing.T) { + tests := []struct { + name string + coreID uint + cpuID uint + sysInfo SystemInfo + want bool + }{ + { + name: "Monitor all devices", + coreID: 1, + cpuID: 1, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{Flex: true}, + }, + want: true, + }, + { + name: "Core in MinorRange", + coreID: 2, + cpuID: 1, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{ + MinorRange: []int{1, 2, 3}, + MajorRange: []int{-1}, + }, + CPUs: []CPUInfo{{EntityId: 1}}, + }, + want: true, + }, + { + name: "Core Not in MinorRange", + coreID: 4, + cpuID: 1, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{ + MinorRange: []int{1, 2, 3}, + MajorRange: []int{-1}, + }, + CPUs: []CPUInfo{{EntityId: 1}}, + }, + want: false, + }, + { + name: "MinorRange Contains -1", + coreID: 5, + cpuID: 1, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{ + MinorRange: []int{-1}, + MajorRange: []int{-1}, + }, + CPUs: []CPUInfo{{EntityId: 1}}, + }, + want: true, + }, + { + 
name: "CPU Not Found", + coreID: 1, + cpuID: 2, + sysInfo: SystemInfo{ + cOpt: DeviceOptions{ + MinorRange: []int{1, 2, 3}, + MajorRange: []int{-1}, + }, + CPUs: []CPUInfo{{EntityId: 1}}, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, IsCoreWatched(tt.coreID, tt.cpuID, tt.sysInfo)) + }) } } diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index db4b1f5e..9431b045 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -80,10 +80,11 @@ type Config struct { WebConfigFile string XIDCountWindowSize int ReplaceBlanksInModelName bool + Debug bool } type Transform interface { - Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error + Process(metrics MetricsByCounter, sysInfo SystemInfo) error Name() string } @@ -182,3 +183,6 @@ type PodInfo struct { Namespace string Container string } + +// MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects +type MetricsByCounter map[Counter][]Metric diff --git a/pkg/dcgmexporter/xid_collector.go b/pkg/dcgmexporter/xid_collector.go index 202a1b70..fd4f61e6 100644 --- a/pkg/dcgmexporter/xid_collector.go +++ b/pkg/dcgmexporter/xid_collector.go @@ -233,7 +233,7 @@ var getXIDMetricTemplate = sync.OnceValue(func() *template.Template { return template.Must(template.New("xidMetrics").Parse(xidMetricsFormat)) }) -func encodeXIDMetrics(w io.Writer, metrics map[Counter][]Metric) error { +func encodeXIDMetrics(w io.Writer, metrics MetricsByCounter) error { template := getXIDMetricTemplate() return template.Execute(w, metrics) } diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go index 430e9cd9..8bc1171a 100644 --- a/pkg/dcgmexporter/xid_collector_test.go +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -159,8 +159,8 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { // Now we check the metric rendering var b bytes.Buffer - 
encodeXIDMetrics(&b, metrics) - + err = encodeXIDMetrics(&b, metrics) + require.NoError(t, err) require.NotEmpty(t, b) var parser expfmt.TextParser diff --git a/tests/integration/start_read_test.go b/tests/integration/start_read_test.go index 750137fc..35c28363 100644 --- a/tests/integration/start_read_test.go +++ b/tests/integration/start_read_test.go @@ -42,7 +42,8 @@ func TestStartAndReadMetrics(t *testing.T) { args = append(args, fmt.Sprintf("-a=:%d", port)) ctx, cancel := context.WithCancel(context.Background()) go func(ctx context.Context) { - app.Run(args) + err := app.Run(args) + require.NoError(t, err) }(ctx) t.Log("The dcgm-exporter is running, we wait for 30 seconds to read metrics") diff --git a/tests/integration/start_with_tls_test.go b/tests/integration/start_with_tls_test.go index 8b710658..b39128ed 100644 --- a/tests/integration/start_with_tls_test.go +++ b/tests/integration/start_with_tls_test.go @@ -26,7 +26,8 @@ func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) { args = append(args, "--web-config-file=./testdata/web-config.yml") ctx, cancel := context.WithCancel(context.Background()) go func(ctx context.Context) { - app.Run(args) + err := app.Run(args) + require.NoError(t, err) }(ctx) t.Run("server returns 400 if request uses HTTP and TLS enabled on the server",