forked from BugRoger/nvidia-exporter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics.go
131 lines (109 loc) · 3.38 KB
/
metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package main
import (
"strconv"
"time"
"github.com/mindprince/gonvml"
"github.com/pkg/errors"
)
// averageDuration is the look-back window that collectMetrics hands to
// gonvml's AveragePowerUsage and AverageGPUUtilization calls.
var averageDuration = 10 * time.Second
// Metrics is the result of one collectMetrics call: the driver version plus
// one Device entry per GPU that was queried.
type Metrics struct {
	// Version is the string returned by gonvml.SystemDriverVersion.
	Version string
	// Devices holds one entry per device, appended in ascending index order.
	// It stays nil when no devices are reported.
	Devices []*Device
}
// Device is the per-GPU record that collectMetrics fills from a gonvml device
// handle. Numeric readings are the raw values returned by gonvml converted to
// float64; this file does not state their units.
// NOTE(review): units are presumably those documented by gonvml/NVML — confirm
// before relying on them.
type Device struct {
	// Identification. Index and MinorNumber are the decimal string forms of
	// the enumeration index and of the handle's MinorNumber() result.
	Index, MinorNumber string
	Name, UUID         string

	// Temperature and power readings. PowerUsageAverage comes from
	// AveragePowerUsage over the averageDuration window.
	Temperature                   float64
	PowerUsage, PowerUsageAverage float64
	// FanSpeed float64

	// Memory figures, as returned by MemoryInfo().
	MemoryTotal, MemoryUsed float64

	// Utilization figures. The first two come from UtilizationRates();
	// UtilizationGPUAverage comes from AverageGPUUtilization over the
	// averageDuration window.
	UtilizationMemory, UtilizationGPU float64
	UtilizationGPUAverage             float64
}
// collectMetrics takes a one-shot snapshot of the NVIDIA driver version and of
// every device that gonvml reports.
//
// NVML is initialized at the start of each call and gonvml.Shutdown is
// deferred until the call returns (its error result is discarded). Devices are
// visited in ascending index order, from 0 up to the value returned by
// gonvml.DeviceCount. The first failing gonvml call aborts the whole
// collection: the function returns a nil *Metrics together with the wrapped
// error, so a partially filled result is never handed back.
func collectMetrics() (*Metrics, error) {
	if err := gonvml.Initialize(); err != nil {
		return nil, errors.Wrap(err, "Initialize is failed")
	}
	defer gonvml.Shutdown()

	driver, err := gonvml.SystemDriverVersion()
	if err != nil {
		return nil, errors.Wrap(err, "SystemDriverVersion is failed")
	}

	count, err := gonvml.DeviceCount()
	if err != nil {
		return nil, errors.Wrap(err, "DeviceCount is failed")
	}

	snapshot := &Metrics{Version: driver}
	for i, limit := 0, int(count); i < limit; i++ {
		handle, err := gonvml.DeviceHandleByIndex(uint(i))
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d DeviceHandleByIndex is failed", i)
		}

		// Fill the record field by field; it is only appended to the
		// snapshot once every query below has succeeded.
		dev := &Device{Index: strconv.Itoa(i)}

		if dev.UUID, err = handle.UUID(); err != nil {
			return nil, errors.Wrapf(err, "index:%d UUID is failed", i)
		}
		if dev.Name, err = handle.Name(); err != nil {
			return nil, errors.Wrapf(err, "index:%d Name is failed", i)
		}

		minor, err := handle.MinorNumber()
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d MinorNumber is failed", i)
		}
		dev.MinorNumber = strconv.Itoa(int(minor))

		temp, err := handle.Temperature()
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d Temperature is failed", i)
		}
		dev.Temperature = float64(temp)

		power, err := handle.PowerUsage()
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d PowerUsage is failed", i)
		}
		dev.PowerUsage = float64(power)

		avgPower, err := handle.AveragePowerUsage(averageDuration)
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d AveragePowerUsage is failed", i)
		}
		dev.PowerUsageAverage = float64(avgPower)

		// Fan speed collection is currently commented out:
		// fanSpeed, err := handle.FanSpeed()
		// if err != nil {
		// 	return nil, errors.Wrapf(err, "index:%d FanSpeed is failed", i)
		// }
		// dev.FanSpeed = float64(fanSpeed)

		memTotal, memUsed, err := handle.MemoryInfo()
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d MemoryInfo is failed", i)
		}
		dev.MemoryTotal, dev.MemoryUsed = float64(memTotal), float64(memUsed)

		gpuUtil, memUtil, err := handle.UtilizationRates()
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d UtilizationRates is failed", i)
		}
		dev.UtilizationGPU, dev.UtilizationMemory = float64(gpuUtil), float64(memUtil)

		avgGPUUtil, err := handle.AverageGPUUtilization(averageDuration)
		if err != nil {
			return nil, errors.Wrapf(err, "index:%d AverageGPUUtilization is failed", i)
		}
		dev.UtilizationGPUAverage = float64(avgGPUUtil)

		snapshot.Devices = append(snapshot.Devices, dev)
	}

	return snapshot, nil
}