From f67ffa9fd6304a5998bc1e0041888210f567c65b Mon Sep 17 00:00:00 2001 From: Douglas Wightman Date: Tue, 19 Dec 2023 09:54:17 -0700 Subject: [PATCH] Review feedback Signed-off-by: Douglas Wightman --- pkg/cmd/app.go | 8 ++--- pkg/dcgmexporter/gpu_collector.go | 6 ++-- pkg/dcgmexporter/gpu_collector_test.go | 18 +++++----- pkg/dcgmexporter/parser.go | 7 +++- pkg/dcgmexporter/system_info.go | 48 +++++++++++++------------- pkg/dcgmexporter/types.go | 2 +- 6 files changed, 47 insertions(+), 42 deletions(-) diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 31c582b3..50079723 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -55,7 +55,7 @@ var ( CLIRemoteHEInfo = "remote-hostengine-info" CLIGPUDevices = "devices" CLISwitchDevices = "switch-devices" - CLICpuDevices = "cpu-devices" + CLICPUDevices = "cpu-devices" CLINoHostname = "no-hostname" CLIUseFakeGpus = "fake-gpus" CLIConfigMapData = "configmap-data" @@ -114,7 +114,7 @@ func NewApp(buildVersion ...string) *cli.App { EnvVars: []string{"DCGM_EXPORTER_USE_OLD_NAMESPACE"}, }, &cli.StringFlag{ - Name: CLICpuDevices, + Name: CLICPUDevices, Aliases: []string{"p"}, Value: FlexKey, Usage: DeviceUsageStr, @@ -352,7 +352,7 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { return nil, err } - cOpt, err := parseDeviceOptions(c.String(CLICpuDevices)) + cOpt, err := parseDeviceOptions(c.String(CLICPUDevices)) if err != nil { return nil, err } @@ -369,7 +369,7 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { RemoteHEInfo: c.String(CLIRemoteHEInfo), GPUDevices: gOpt, SwitchDevices: sOpt, - CpuDevices: cOpt, + CPUDevices: cOpt, NoHostname: c.Bool(CLINoHostname), UseFakeGpus: c.Bool(CLIUseFakeGpus), ConfigMapData: c.String(CLIConfigMapData), diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index a17f52c2..f0c6007f 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -25,7 +25,7 @@ import ( ) func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { - sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CpuDevices, config.UseFakeGpus, entityType) + sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGpus, entityType) if err != nil { return nil, func() {}, err } @@ -100,7 +100,7 @@ func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { if c.SysInfo.InfoType == dcgm.FE_SWITCH || c.SysInfo.InfoType == dcgm.FE_LINK { metrics[i] = ToSwitchMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else if c.SysInfo.InfoType == dcgm.FE_CPU || c.SysInfo.InfoType == dcgm.FE_CPU_CORE { - metrics[i] = ToCpuMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) + metrics[i] = ToCPUMetric(vals, c.Counters, mi, c.UseOldNamespace, c.Hostname) } else { metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) } @@ -163,7 +163,7 @@ func ToSwitchMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, return metrics } -func ToCpuMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric { +func ToCPUMetric(values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) []Metric { var metrics []Metric var labels = map[string]string{} diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index 18e1628b..ad0eae6b 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -45,7 +45,7 @@ var expectedMetrics = map[string]bool{ "DCGM_FI_DEV_VGPU_LICENSE_STATUS": true, } -var expectedCpuMetrics = map[string]bool{ +var expectedCPUMetrics = map[string]bool{ "DCGM_FI_DEV_CPU_UTIL_TOTAL": true, } @@ -57,7 +57,7 @@ func TestDCGMCollector(t *testing.T) { _, cleanup = testDCGMGPUCollector(t, sampleCounters) cleanup() - _, cleanup = testDCGMCpuCollector(t, sampleCounters) + _, cleanup = testDCGMCPUCollector(t, sampleCounters) cleanup() } @@ -95,14 +95,14 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun } dcgmGetCpuHierarchy = func() (dcgm.CpuHierarchy_v1, error) { - Cpu := dcgm.CpuHierarchyCpu_v1{ + CPU := dcgm.CpuHierarchyCpu_v1{ CpuId: 0, OwnedCores: []uint64{0}, } hierarchy := dcgm.CpuHierarchy_v1{ Version: 0, NumCpus: 1, - Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{Cpu}, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{CPU}, } return hierarchy, nil @@ -146,10 +146,10 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun return g, cleanup } -func testDCGMCpuCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { +func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { dOpt := DeviceOptions{true, []int{-1}, []int{-1}} cfg := Config{ - CpuDevices: dOpt, + CPUDevices: dOpt, NoHostname: false, UseOldNamespace: false, UseFakeGpus: false, @@ -181,14 +181,14 @@ func testDCGMCpuCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun } dcgmGetCpuHierarchy = func() (dcgm.CpuHierarchy_v1, error) { - Cpu := dcgm.CpuHierarchyCpu_v1{ + CPU := dcgm.CpuHierarchyCpu_v1{ CpuId: 0, OwnedCores: []uint64{0, 1}, } hierarchy := dcgm.CpuHierarchy_v1{ Version: 0, NumCpus: 1, - Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{Cpu}, + Cpus: [dcgm.MAX_NUM_CPUS]dcgm.CpuHierarchyCpu_v1{CPU}, } return hierarchy, nil @@ -218,7 +218,7 @@ func testDCGMCpuCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun require.NotEmpty(t, metric.Value) require.NotEqual(t, metric.Value, FailedToConvert) } - require.Equal(t, seenMetrics, expectedCpuMetrics) + require.Equal(t, seenMetrics, expectedCPUMetrics) } return c, cleanup diff --git a/pkg/dcgmexporter/parser.go b/pkg/dcgmexporter/parser.go index b9a75892..67d8b7c8 100644 --- a/pkg/dcgmexporter/parser.go +++ b/pkg/dcgmexporter/parser.go @@ -32,6 +32,11 @@ import ( "k8s.io/client-go/rest" ) +const ( + CPU_FIELDS_START = 1100 + DCP_FIELDS_START = 1000 +) + func ExtractCounters(c *Config) ([]Counter, error) { var err error var records [][]string @@ -140,7 +145,7 @@ func extractCounters(records [][]string, c *Config) ([]Counter, error) { } func fieldIsSupported(fieldID uint, c *Config) bool { - if fieldID < 1000 || fieldID > 1099 { + if fieldID < DCP_FIELDS_START || fieldID >= CPU_FIELDS_START { return true } diff --git a/pkg/dcgmexporter/system_info.go b/pkg/dcgmexporter/system_info.go index 52da9489..95b4baa5 100644 --- a/pkg/dcgmexporter/system_info.go +++ b/pkg/dcgmexporter/system_info.go @@ -65,7 +65,7 @@ type SwitchInfo struct { NvLinks []dcgm.NvLinkStatus } -type CpuInfo struct { +type CPUInfo struct { EntityId uint Cores []uint } @@ -78,7 +78,7 @@ type SystemInfo struct { cOpt DeviceOptions InfoType dcgm.Field_Entity_Group Switches []SwitchInfo - Cpus []CpuInfo + CPUs []CPUInfo } type MonitoringInfo struct { @@ -155,8 +155,8 @@ func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool { return false } -func CpuIdExists(sysInfo *SystemInfo, cpuId int) bool { - for _, cpu := range sysInfo.Cpus { +func CPUIdExists(sysInfo *SystemInfo, cpuId int) bool { + for _, cpu := range sysInfo.CPUs { if cpu.EntityId == uint(cpuId) { return true } @@ -186,8 +186,8 @@ func LinkIdExists(sysInfo *SystemInfo, linkId int) bool { return false } -func CpuCoreIdExists(sysInfo *SystemInfo, coreId int) bool { - for _, cpu := range sysInfo.Cpus { +func CPUCoreIdExists(sysInfo *SystemInfo, coreId int) bool { + for _, cpu := range sysInfo.CPUs { for _, core := range cpu.Cores { if core == uint(coreId) { return true @@ -197,7 +197,7 @@ func CpuCoreIdExists(sysInfo *SystemInfo, coreId int) bool { return false } -func VerifyCpuDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { +func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { if sOpt.Flex { return nil } @@ -213,7 +213,7 @@ func VerifyCpuDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error { if len(sOpt.MinorRange) > 0 && sOpt.MinorRange[0] != -1 { for _, coreId := range sOpt.MinorRange { - if !CpuCoreIdExists(sysInfo, coreId) { + if !CPUCoreIdExists(sysInfo, coreId) { return fmt.Errorf("couldn't find requested cpu core %d", coreId) } } @@ -308,16 +308,16 @@ func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, erro for i := 0; i < int(hierarchy.NumCpus); i++ { cores := getCoreArray([]uint64(hierarchy.Cpus[i].OwnedCores)) - cpu := CpuInfo{ + cpu := CPUInfo{ hierarchy.Cpus[i].CpuId, cores, } - sysInfo.Cpus = append(sysInfo.Cpus, cpu) + sysInfo.CPUs = append(sysInfo.CPUs, cpu) } sysInfo.cOpt = sOpt - err = VerifyCpuDevicePresence(&sysInfo, sOpt) + err = VerifyCPUDevicePresence(&sysInfo, sOpt) return sysInfo, nil } @@ -454,8 +454,8 @@ func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []f var cleanups []func() /* Create per-switch link groups */ - for _, cpu := range sysInfo.Cpus { - if !IsCpuWatched(cpu.EntityId, sysInfo) { + for _, cpu := range sysInfo.CPUs { + if !IsCPUWatched(cpu.EntityId, sysInfo) { continue } @@ -660,7 +660,7 @@ func IsLinkWatched(linkId uint, switchId uint, sysInfo SystemInfo) bool { return false } -func IsCpuWatched(cpuId uint, sysInfo SystemInfo) bool { +func IsCPUWatched(cpuId uint, sysInfo SystemInfo) bool { if sysInfo.cOpt.Flex { return true } @@ -683,8 +683,8 @@ func IsCoreWatched(coreId uint, cpuId uint, sysInfo SystemInfo) bool { return true } - for _, cpu := range sysInfo.Cpus { - if !IsCpuWatched(cpu.EntityId, sysInfo) { + for _, cpu := range sysInfo.CPUs { + if !IsCPUWatched(cpu.EntityId, sysInfo) { return false } @@ -703,11 +703,11 @@ func IsCoreWatched(coreId uint, cpuId uint, sysInfo SystemInfo) bool { return false } -func AddAllCpus(sysInfo SystemInfo) []MonitoringInfo { +func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo { var monitoring []MonitoringInfo - for _, cpu := range sysInfo.Cpus { - if !IsCpuWatched(cpu.EntityId, sysInfo) { + for _, cpu := range sysInfo.CPUs { + if !IsCPUWatched(cpu.EntityId, sysInfo) { continue } @@ -728,12 +728,12 @@ func AddAllCpus(sysInfo SystemInfo) []MonitoringInfo { return monitoring } -func AddAllCpuCores(sysInfo SystemInfo) []MonitoringInfo { +func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo { var monitoring []MonitoringInfo - for _, cpu := range sysInfo.Cpus { + for _, cpu := range sysInfo.CPUs { for _, core := range cpu.Cores { - if !IsCpuWatched(cpu.EntityId, sysInfo) { + if !IsCPUWatched(cpu.EntityId, sysInfo) { continue } @@ -827,9 +827,9 @@ func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo { } else if sysInfo.InfoType == dcgm.FE_LINK { monitoring = AddAllLinks(sysInfo) } else if sysInfo.InfoType == dcgm.FE_CPU { - monitoring = AddAllCpus(sysInfo) + monitoring = AddAllCPUs(sysInfo) } else if sysInfo.InfoType == dcgm.FE_CPU_CORE { - monitoring = AddAllCpuCores(sysInfo) + monitoring = AddAllCPUCores(sysInfo) } else if sysInfo.gOpt.Flex == true { monitoring = AddAllGpuInstances(sysInfo, true) } else { diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index ab12c4a7..46375518 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -71,7 +71,7 @@ type Config struct { RemoteHEInfo string GPUDevices DeviceOptions SwitchDevices DeviceOptions - CpuDevices DeviceOptions + CPUDevices DeviceOptions NoHostname bool UseFakeGpus bool ConfigMapData string