Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix accelerator device map #1913

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .dockerignore
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding .dockerignore will lead to incorrect version computation; see #1809.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
./_output/
./ansible/
./enhancements/
./manifests/
3 changes: 3 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ func logBoolConfigs() {
klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", instance.Kepler.ExposeIdlePowerMetrics)
klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", instance.Kepler.BPFSampleRate)
klog.V(5).Infof("EXCLUDE_SWAPPER_PROCESS: %t", instance.Kepler.ExcludeSwapperProcess)
if instance.Kepler.EnabledGPU {
klog.V(5).Infof("DCGMHostEngineEndpoint %s", instance.DCGMHostEngineEndpoint)
}
}
}

Expand Down
6 changes: 5 additions & 1 deletion pkg/sensors/accelerator/accelerator.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package accelerator

//nolint:gci // The supported device imports are kept separate.
import (
"encoding/json"
"slices"
"sync"
"time"
Expand Down Expand Up @@ -130,7 +131,10 @@ func New(atype string, sleep bool) (Accelerator, error) {

// Init the available devices.

devs := devices.GetRegistry().GetAllDeviceTypes()
r := devices.GetRegistry()
j, _ := json.Marshal(r.GetAllDevices())
klog.V(5).Infof("Accelerator Registry AllDevices: %s", string(j))
devs := r.GetAllDeviceTypes()
numDevs := len(devs)
if numDevs == 0 || !slices.Contains(devs, atype) {
return nil, errors.New("no devices found")
Expand Down
13 changes: 9 additions & 4 deletions pkg/sensors/accelerator/devices/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,22 @@ func dcgmCheck(r *Registry) {
}

func dcgmDeviceStartup() Device {
a := dcgmAccImpl
klog.V(3).Infof("Attempting to startup DCGM")
d := dcgmAccImpl

if err := a.InitLib(); err != nil {
if err := d.InitLib(); err != nil {
klog.Errorf("Error initializing %s: %v", dcgmType.String(), err)
return nil
}

if err := a.Init(); err != nil {
if err := d.Init(); err != nil {
klog.Errorf("failed to StartupDevice: %v", err)
return nil
}

klog.Infof("Using %s to obtain gpu power", dcgmType.String())

return &a
return &d
}

func (d *gpuDcgm) Init() error {
Expand Down Expand Up @@ -138,6 +139,7 @@ func (d *gpuDcgm) InitLib() (err error) {
if err != nil {
klog.Infof("There is no DCGM daemon running in the host: %s", err)
// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
klog.Info("Attempting to inilialize dcgm in Embedded mode.")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: typo — "inilialize" should be "initialize".

cleanup, err = dcgm.Init(dcgm.Embedded)
if err != nil {
klog.Errorf("Could not start DCGM. Error: %s", err)
Expand All @@ -147,6 +149,8 @@ func (d *gpuDcgm) InitLib() (err error) {
return fmt.Errorf("not able to connect to DCGM: %s", err)
}
klog.Info("Started DCGM in the Embedded mode ")
} else {
klog.Info("Started DCGM in the Standalone mode ")
}
d.nvmlInited = false
d.devs = make(map[int]GPUDevice)
Expand All @@ -172,6 +176,7 @@ func (d *gpuDcgm) InitLib() (err error) {
}

func (d *gpuDcgm) loadDevices() error {
klog.V(5).Infof("Attempting to load dcgm devices.")
d.devs = map[int]GPUDevice{}
count, err := nvml.DeviceGetCount()
if err != nvml.SUCCESS {
Expand Down
21 changes: 19 additions & 2 deletions pkg/sensors/accelerator/devices/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,13 @@ func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStar
return
}
klog.V(5).Infof("Adding the device to the registry [%s][%s]", a, d.String())
r.Registry[a] = map[DeviceType]deviceStartupFunc{
d: deviceStartup,
m, ok := r.Registry[a]
if !ok {
r.Registry[a] = map[DeviceType]deviceStartupFunc{
d: deviceStartup,
}
} else {
m[d] = deviceStartup
}
}

Expand All @@ -143,6 +148,18 @@ func (r *Registry) GetAllDeviceTypes() []string {
return devices
}

// GetAllDevices returns a freshly built two-level view of the registry.
// The outer keys are the same string keys used by r.Registry. Each inner map
// acts as a set: its keys are the String() names of the device types
// registered under that outer key, and every value is an empty struct.
// The result is never nil; an empty registry yields an empty map.
func (r *Registry) GetAllDevices() map[string]map[string]interface{} {
	snapshot := make(map[string]map[string]interface{}, len(r.Registry))
	for accType, startups := range r.Registry {
		// Only the device type names are exposed; the startup funcs are dropped.
		names := make(map[string]interface{}, len(startups))
		for devType := range startups {
			names[devType.String()] = struct{}{}
		}
		snapshot[accType] = names
	}
	return snapshot
}

func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) error {
switch accType {
case config.GPU:
Expand Down
Loading