Skip to content

Commit

Permalink
Merge pull request #248 from NVIDIA/ability-control-modelName-value-f…
Browse files Browse the repository at this point in the history
…ormat

Added new command line parameter: replace-blanks-in-model-name
  • Loading branch information
nvvfedorov authored Feb 5, 2024
2 parents afd3f28 + a3d0469 commit 8c80c51
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 67 deletions.
77 changes: 43 additions & 34 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,23 @@ const (
)

// Long names of the exporter's command-line flags.
// The flag definition visible in NewApp uses one of these constants as the
// flag Name, and contextToConfig reads the parsed values back through the
// same constants.
// NOTE(review): this listing comes from a rendered diff, so every name appears
// twice (old alignment first, new alignment second); the only name that is
// actually new is CLIReplaceBlanksInModelName.
const (
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
// Boolean flag; its value ends up in Config.ReplaceBlanksInModelName.
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -182,6 +183,13 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Set time window size in milliseconds (ms) for counting active XID errors in DCGM Exporter.",
EnvVars: []string{"DCGM_EXPORTER_XID_COUNT_WINDOW_SIZE"},
},
&cli.BoolFlag{
Name: CLIReplaceBlanksInModelName,
Aliases: []string{"rbmn"},
Value: false,
Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.",
EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -399,23 +407,24 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
}

return &dcgmexporter.Config{
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
}, nil
}
40 changes: 32 additions & 8 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package dcgmexporter
import (
"fmt"
"os"
"strings"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
Expand All @@ -39,11 +40,12 @@ func NewDCGMCollector(c []Counter, config *Config, hostname string, entityType d
}

collector := &DCGMCollector{
Counters: c,
DeviceFields: deviceFields,
UseOldNamespace: config.UseOldNamespace,
SysInfo: *sysInfo,
Hostname: hostname,
Counters: c,
DeviceFields: deviceFields,
UseOldNamespace: config.UseOldNamespace,
SysInfo: *sysInfo,
Hostname: hostname,
ReplaceBlanksInModelName: config.ReplaceBlanksInModelName,
}

cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, *sysInfo, int64(config.CollectInterval)*1000)
Expand Down Expand Up @@ -116,7 +118,13 @@ func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) {
} else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE {
metrics = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
} else {
metrics = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname)
metrics = ToMetric(vals,
c.Counters,
mi.DeviceInfo,
mi.InstanceInfo,
c.UseOldNamespace,
c.Hostname,
c.ReplaceBlanksInModelName)
}
}

Expand Down Expand Up @@ -235,7 +243,14 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us
return metrics
}

func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string) map[Counter][]Metric {
func ToMetric(values []dcgm.FieldValue_v1,
c []Counter,
d dcgm.Device,
instanceInfo *GPUInstanceInfo,
useOld bool,
hostname string,
replaceBlanksInModelName bool,
) map[Counter][]Metric {
metrics := make(map[Counter][]Metric)
var labels = map[string]string{}

Expand All @@ -259,6 +274,15 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI
if useOld {
uuid = "uuid"
}

gpuModel := d.Identifiers.Model

if replaceBlanksInModelName {
parts := strings.Fields(gpuModel)
gpuModel = strings.Join(parts, " ")
gpuModel = strings.ReplaceAll(gpuModel, " ", "-")
}

m := Metric{
Counter: counter,
Value: v,
Expand All @@ -267,7 +291,7 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI
GPU: fmt.Sprintf("%d", d.GPU),
GPUUUID: d.UUID,
GPUDevice: fmt.Sprintf("nvidia%d", d.GPU),
GPUModelName: d.Identifiers.Model,
GPUModelName: gpuModel,
Hostname: hostname,

Labels: labels,
Expand Down
60 changes: 60 additions & 0 deletions pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ package dcgmexporter

import (
"fmt"
"reflect"
"testing"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -227,3 +229,61 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun

return c, cleanup
}

// TestToMetric checks how ToMetric fills Metric.GPUModelName: the device model
// is passed through untouched when replaceBlanksInModelName is false, and has
// its whitespace turned into dashes when the option is true.
func TestToMetric(t *testing.T) {
	// Raw field buffer whose first byte is 42; the assertions below expect it
	// to surface as the metric value "42".
	fieldValue := [4096]byte{}
	fieldValue[0] = 42
	values := []dcgm.FieldValue_v1{
		{
			FieldId:   150,
			FieldType: dcgm.DCGM_FT_INT64,
			Value:     fieldValue,
		},
	}

	c := []Counter{
		{
			FieldID:   150,
			FieldName: "DCGM_FI_DEV_GPU_TEMP",
			PromType:  "gauge",
			Help:      "Temperature Help info",
		},
	}

	// No MIG instance is attached to the fake device.
	var instanceInfo *GPUInstanceInfo

	type testCase struct {
		model                    string
		replaceBlanksInModelName bool
		expectedGPUModelName     string
	}

	testCases := []testCase{
		{
			model:                    "NVIDIA T400 4GB",
			replaceBlanksInModelName: true,
			expectedGPUModelName:     "NVIDIA-T400-4GB",
		},
		{
			model:                    "NVIDIA T400 4GB",
			replaceBlanksInModelName: false,
			expectedGPUModelName:     "NVIDIA T400 4GB",
		},
		{
			// ToMetric splits the model with strings.Fields before joining,
			// so a run of blanks must collapse into a single dash.
			model:                    "NVIDIA  T400   4GB",
			replaceBlanksInModelName: true,
			expectedGPUModelName:     "NVIDIA-T400-4GB",
		},
	}

	for _, tc := range testCases {
		name := fmt.Sprintf("When replaceBlanksInModelName is %t and model is %q",
			tc.replaceBlanksInModelName, tc.model)
		t.Run(name, func(t *testing.T) {
			d := dcgm.Device{
				UUID: "fake0",
				Identifiers: dcgm.DeviceIdentifiers{
					Model: tc.model,
				},
			}

			metrics := ToMetric(values, c, d, instanceInfo, false, "", tc.replaceBlanksInModelName)
			// require (not assert): the indexing below would panic on an
			// empty result instead of reporting a clean test failure.
			require.Len(t, metrics, 1)

			// The map has exactly one entry; fetch its key to read the values.
			key := reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)
			metricValues := metrics[key]
			require.NotEmpty(t, metricValues)

			assert.Equal(t, "42", metricValues[0].Value)
			assert.Equal(t, tc.expectedGPUModelName, metricValues[0].GPUModelName)
		})
	}
}
52 changes: 27 additions & 25 deletions pkg/dcgmexporter/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,25 +60,26 @@ type DeviceOptions struct {
}

// Config carries the exporter settings. contextToConfig builds one from the
// parsed command-line context, reading each value by its CLI flag name.
// NOTE(review): this listing comes from a rendered diff, so every field
// appears twice (old alignment first, new alignment second); the only field
// that is actually new is ReplaceBlanksInModelName.
type Config struct {
CollectorsFile string
Address string
CollectInterval int
Kubernetes bool
KubernetesGPUIdType KubernetesGPUIDType
CollectDCP bool
UseOldNamespace bool
UseRemoteHE bool
RemoteHEInfo string
GPUDevices DeviceOptions
SwitchDevices DeviceOptions
CPUDevices DeviceOptions
NoHostname bool
UseFakeGPUs bool
ConfigMapData string
MetricGroups []dcgm.MetricGroup
WebSystemdSocket bool
WebConfigFile string
XIDCountWindowSize int
CollectorsFile string
Address string
CollectInterval int
Kubernetes bool
KubernetesGPUIdType KubernetesGPUIDType
CollectDCP bool
UseOldNamespace bool
UseRemoteHE bool
RemoteHEInfo string
GPUDevices DeviceOptions
SwitchDevices DeviceOptions
CPUDevices DeviceOptions
NoHostname bool
UseFakeGPUs bool
ConfigMapData string
MetricGroups []dcgm.MetricGroup
WebSystemdSocket bool
WebConfigFile string
XIDCountWindowSize int
// ReplaceBlanksInModelName is set from the replace-blanks-in-model-name
// flag and copied into DCGMCollector by NewDCGMCollector.
ReplaceBlanksInModelName bool
}

type Transform interface {
Expand All @@ -105,12 +106,13 @@ type MetricsPipeline struct {
}

// DCGMCollector holds the state assembled by NewDCGMCollector: the counters
// to report, the device fields being watched, and the options taken from
// Config that GetMetrics forwards to the metric conversion helpers.
// NOTE(review): this listing comes from a rendered diff, so every field
// appears twice (old alignment first, new alignment second); the only field
// that is actually new is ReplaceBlanksInModelName.
type DCGMCollector struct {
Counters []Counter
DeviceFields []dcgm.Short
Cleanups []func()
UseOldNamespace bool
SysInfo SystemInfo
Hostname string
Counters []Counter
DeviceFields []dcgm.Short
Cleanups []func()
UseOldNamespace bool
SysInfo SystemInfo
Hostname string
// ReplaceBlanksInModelName is copied from Config and passed to ToMetric,
// which replaces whitespace in the GPU model name with dashes when true.
ReplaceBlanksInModelName bool
}

type Counter struct {
Expand Down

0 comments on commit 8c80c51

Please sign in to comment.