Skip to content

Commit

Permalink
Added new command line parameter: replace-blanks-in-model-name
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov committed Feb 2, 2024
1 parent 0518edc commit 0c4728a
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 64 deletions.
73 changes: 41 additions & 32 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,22 @@ const (
)

// Command-line flag names. The visible part of contextToConfig looks up most
// of the parsed values by these names when it builds the dcgmexporter.Config.
// NOTE(review): this is a diff view, so the pre-change entries are listed
// first and the post-change entries second; every name except the newly added
// CLIReplaceBlanksInModelName therefore appears twice.
var (
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
CLIUseOldNamespace = "use-old-namespace"
CLIRemoteHEInfo = "remote-hostengine-info"
CLIGPUDevices = "devices"
CLISwitchDevices = "switch-devices"
CLICPUDevices = "cpu-devices"
CLINoHostname = "no-hostname"
CLIUseFakeGPUs = "fake-gpus"
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
// CLIReplaceBlanksInModelName names the boolean flag registered in NewApp;
// contextToConfig copies its value into Config.ReplaceBlanksInModelName.
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -174,6 +175,13 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "TLS config file following webConfig spec.",
EnvVars: []string{"DCGM_EXPORTER_WEB_CONFIG_FILE"},
},
&cli.BoolFlag{
Name: CLIReplaceBlanksInModelName,
Aliases: []string{"rbmn"},
Value: false,
Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.",
EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -358,22 +366,23 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
}

return &dcgmexporter.Config{
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: dcgmexporter.KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
UseOldNamespace: c.Bool(CLIUseOldNamespace),
UseRemoteHE: c.IsSet(CLIRemoteHEInfo),
RemoteHEInfo: c.String(CLIRemoteHEInfo),
GPUDevices: gOpt,
SwitchDevices: sOpt,
CPUDevices: cOpt,
NoHostname: c.Bool(CLINoHostname),
UseFakeGPUs: c.Bool(CLIUseFakeGPUs),
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
}, nil
}
40 changes: 32 additions & 8 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package dcgmexporter
import (
"fmt"
"os"
"strings"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -51,11 +52,12 @@ func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_
}

collector := &DCGMCollector{
Counters: c,
DeviceFields: deviceFields,
UseOldNamespace: config.UseOldNamespace,
SysInfo: sysInfo,
Hostname: hostname,
Counters: c,
DeviceFields: deviceFields,
UseOldNamespace: config.UseOldNamespace,
SysInfo: sysInfo,
Hostname: hostname,
ReplaceBlanksInModelName: config.ReplaceBlanksInModelName,
}

cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo, int64(config.CollectInterval)*1000)
Expand Down Expand Up @@ -104,7 +106,13 @@ func (c *DCGMCollector) GetMetrics() ([][]Metric, error) {
} else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE {
metrics[i] = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname)
} else {
metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname)
metrics[i] = ToMetric(vals,
c.Counters,
mi.DeviceInfo,
mi.InstanceInfo,
c.UseOldNamespace,
c.Hostname,
c.ReplaceBlanksInModelName)
}
}

Expand Down Expand Up @@ -221,7 +229,14 @@ func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, us
return metrics
}

func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string) []Metric {
func ToMetric(values []dcgm.FieldValue_v1,
c []Counter,
d dcgm.Device,
instanceInfo *GPUInstanceInfo,
useOld bool,
hostname string,
replaceBlanksInModelName bool,
) []Metric {
var metrics []Metric
var labels = map[string]string{}

Expand All @@ -245,6 +260,15 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI
if useOld {
uuid = "uuid"
}

gpuModel := d.Identifiers.Model

if replaceBlanksInModelName {
parts := strings.Fields(gpuModel)
gpuModel = strings.Join(parts, " ")
gpuModel = strings.ReplaceAll(gpuModel, " ", "-")
}

m := Metric{
Counter: counter,
Value: v,
Expand All @@ -253,7 +277,7 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceI
GPU: fmt.Sprintf("%d", d.GPU),
GPUUUID: d.UUID,
GPUDevice: fmt.Sprintf("nvidia%d", d.GPU),
GPUModelName: d.Identifiers.Model,
GPUModelName: gpuModel,
Hostname: hostname,

Labels: &labels,
Expand Down
57 changes: 57 additions & 0 deletions pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"testing"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -223,3 +224,59 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun

return c, cleanup
}

// TestToMetric checks that ToMetric reports the GPU model name untouched when
// replaceBlanksInModelName is false and with dashes in place of blanks when it
// is true. Both cases also expect a single metric whose value is "42".
func TestToMetric(t *testing.T) {
	// Value buffer with only the first byte set; the assertions below expect
	// ToMetric to render it as "42".
	rawValue := [4096]byte{0: 42}

	fieldValues := []dcgm.FieldValue_v1{{
		FieldId:   150,
		FieldType: dcgm.DCGM_FT_INT64,
		Value:     rawValue,
	}}

	counters := []Counter{{
		FieldID:   150,
		FieldName: "DCGM_FI_DEV_GPU_TEMP",
		PromType:  "gauge",
		Help:      "Temperature Help info",
	}}

	device := dcgm.Device{
		UUID:        "fake0",
		Identifiers: dcgm.DeviceIdentifiers{Model: "NVIDIA T400 4GB"},
	}

	scenarios := []struct {
		replaceBlanks bool
		wantModel     string
	}{
		{replaceBlanks: true, wantModel: "NVIDIA-T400-4GB"},
		{replaceBlanks: false, wantModel: "NVIDIA T400 4GB"},
	}

	for _, sc := range scenarios {
		name := fmt.Sprintf("When replaceBlanksInModelName is %t", sc.replaceBlanks)
		t.Run(name, func(t *testing.T) {
			// Nil instance info, old namespace disabled, empty hostname.
			got := ToMetric(fieldValues, counters, device, nil, false, "", sc.replaceBlanks)

			assert.Len(t, got, 1)
			assert.Equal(t, "42", got[0].Value)
			assert.Equal(t, sc.wantModel, got[0].GPUModelName)
		})
	}
}
50 changes: 26 additions & 24 deletions pkg/dcgmexporter/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,25 @@ type DeviceOptions struct {
}

// Config carries the exporter settings. contextToConfig fills it from the
// parsed command-line context and NewDCGMCollector reads it when building a
// collector.
// NOTE(review): this is a diff view, so the pre-change field list is followed
// by the post-change list; every field except the newly added
// ReplaceBlanksInModelName therefore appears twice.
type Config struct {
CollectorsFile string
Address string
CollectInterval int
Kubernetes bool
KubernetesGPUIdType KubernetesGPUIDType
CollectDCP bool
UseOldNamespace bool
UseRemoteHE bool
RemoteHEInfo string
GPUDevices DeviceOptions
SwitchDevices DeviceOptions
CPUDevices DeviceOptions
NoHostname bool
UseFakeGPUs bool
ConfigMapData string
MetricGroups []dcgm.MetricGroup
WebSystemdSocket bool
WebConfigFile string
CollectorsFile string
Address string
CollectInterval int
Kubernetes bool
KubernetesGPUIdType KubernetesGPUIDType
CollectDCP bool
UseOldNamespace bool
UseRemoteHE bool
RemoteHEInfo string
GPUDevices DeviceOptions
SwitchDevices DeviceOptions
CPUDevices DeviceOptions
NoHostname bool
UseFakeGPUs bool
ConfigMapData string
MetricGroups []dcgm.MetricGroup
WebSystemdSocket bool
WebConfigFile string
// ReplaceBlanksInModelName is copied onto the DCGMCollector by
// NewDCGMCollector and handed to ToMetric, which then replaces blanks in the
// GPU model name with dashes.
ReplaceBlanksInModelName bool
}

type Transform interface {
Expand All @@ -104,12 +105,13 @@ type MetricsPipeline struct {
}

// DCGMCollector holds the state GetMetrics uses to turn DCGM field values into
// metrics: the counters, namespace choice, system info and hostname it passes
// to the ToMetric/ToCPUMetric converters.
// NOTE(review): this is a diff view, so the pre-change field list is followed
// by the post-change list; every field except the newly added
// ReplaceBlanksInModelName therefore appears twice.
type DCGMCollector struct {
Counters []Counter
DeviceFields []dcgm.Short
Cleanups []func()
UseOldNamespace bool
SysInfo SystemInfo
Hostname string
Counters []Counter
// DeviceFields is what NewDCGMCollector passes to SetupDcgmFieldsWatch.
DeviceFields []dcgm.Short
// presumably the cleanup callbacks returned by SetupDcgmFieldsWatch — TODO confirm
Cleanups []func()
UseOldNamespace bool
SysInfo SystemInfo
Hostname string
// ReplaceBlanksInModelName is set from Config.ReplaceBlanksInModelName and
// forwarded by GetMetrics to ToMetric.
ReplaceBlanksInModelName bool
}

type Counter struct {
Expand Down

0 comments on commit 0c4728a

Please sign in to comment.