Skip to content

Commit

Permalink
pkg/dcgmexporter/gpu_collector.go: include a err_msg label in metric …
Browse files Browse the repository at this point in the history
…DCGM_FI_DEV_XID_ERRORS (#309)

* pkg/dcgmexporter/gpu_collector.go: include an err_msg label in metric DCGM_FI_DEV_XID_ERRORS

The DCGM_FI_DEV_XID_ERRORS metric reports the XID error code as its value. This commit adds an err_msg
label whose value is taken from this NVIDIA doc: https://docs.nvidia.com/deploy/xid-errors/#topic_4

pkg/dcgmexporter/gpu_collector.go: include err_code label in metrics for easy alert configs

pkg/dcgmexporter/gpu_collector.go: convert xidErrCodeToText to a slice and adjust the known value sanity check

Signed-off-by: Xiaofan Hu <[email protected]>

* Added unit tests

Signed-off-by: Vadym Fedorov <[email protected]>

* Simplified XID Error code mapping logic

Signed-off-by: Vadym Fedorov <[email protected]>

---------

Signed-off-by: Xiaofan Hu <[email protected]>
Signed-off-by: Vadym Fedorov <[email protected]>
Co-authored-by: Vadym Fedorov <[email protected]>
  • Loading branch information
bom-d-van and nvvfedorov authored May 9, 2024
1 parent 5121ded commit 7decfd2
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 8 deletions.
28 changes: 21 additions & 7 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"fmt"
"os"
"strconv"
"strings"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
Expand All @@ -31,8 +32,8 @@ type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupT
func NewDCGMCollector(c []Counter,
hostname string,
config *Config,
fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) {

fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem,
) (*DCGMCollector, func(), error) {
if fieldEntityGroupTypeSystemInfo.isEmpty() {
return nil, func() {}, errors.New("fieldEntityGroupTypeSystemInfo is empty")
}
Expand Down Expand Up @@ -163,7 +164,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
}

func ToSwitchMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
) {
labels := map[string]string{}

for _, val := range values {
Expand Down Expand Up @@ -206,8 +208,9 @@ func ToSwitchMetric(metrics MetricsByCounter,
}

func ToCPUMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
var labels = map[string]string{}
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
) {
labels := map[string]string{}

for _, val := range values {
v := ToString(val)
Expand Down Expand Up @@ -258,7 +261,7 @@ func ToMetric(
hostname string,
replaceBlanksInModelName bool,
) {
var labels = map[string]string{}
labels := map[string]string{}

for _, val := range values {
v := ToString(val)
Expand All @@ -283,6 +286,17 @@ func ToMetric(

gpuModel := getGPUModel(d, replaceBlanksInModelName)

attrs := map[string]string{}
if counter.FieldID == dcgm.DCGM_FI_DEV_XID_ERRORS {
errCode := int(val.Int64())
attrs["err_code"] = strconv.Itoa(errCode)
if 0 < errCode && errCode < len(xidErrCodeToText) {
attrs["err_msg"] = xidErrCodeToText[errCode]
} else {
attrs["err_msg"] = "Unknown Error"
}
}

m := Metric{
Counter: counter,
Value: v,
Expand All @@ -295,7 +309,7 @@ func ToMetric(
Hostname: hostname,

Labels: labels,
Attributes: map[string]string{},
Attributes: attrs,
}
if instanceInfo != nil {
m.MigProfile = instanceInfo.ProfileName
Expand Down
65 changes: 64 additions & 1 deletion pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,6 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
}

func TestToMetric(t *testing.T) {

fieldValue := [4096]byte{}
fieldValue[0] = 42
values := []dcgm.FieldValue_v1{
Expand Down Expand Up @@ -315,6 +314,70 @@ func TestToMetric(t *testing.T) {
}
}

// TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField verifies that ToMetric attaches
// the err_code and err_msg attributes to a DCGM_FI_DEV_XID_ERRORS metric, both
// for an XID code that has a description and for one that does not.
func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) {
	// The counter fixture describes the XID field itself, so that the metric
	// produced by ToMetric is labelled consistently with its FieldID.
	c := []Counter{
		{
			FieldID:   dcgm.DCGM_FI_DEV_XID_ERRORS,
			FieldName: "DCGM_FI_DEV_XID_ERRORS",
			PromType:  "gauge",
			Help:      "Value of the last XID error encountered",
		},
	}

	d := dcgm.Device{
		UUID: "fake0",
		Identifiers: dcgm.DeviceIdentifiers{
			Model: "NVIDIA T400 4GB",
		},
	}

	// No MIG instance is involved in this test; the nil zero value is passed on purpose.
	var instanceInfo *GPUInstanceInfo

	type testCase struct {
		name        string
		fieldValue  byte
		expectedErr string
	}

	testCases := []testCase{
		{
			name:        "when DCGM_FI_DEV_XID_ERRORS has known value",
			fieldValue:  42,
			expectedErr: "Video processor exception",
		},
		{
			name:        "when DCGM_FI_DEV_XID_ERRORS has unknown value",
			fieldValue:  255,
			expectedErr: "Unknown Error",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Only the first byte of the raw value buffer is set; the assertions
			// below expect the decoded metric value to equal that byte.
			fieldValue := [4096]byte{}
			fieldValue[0] = tc.fieldValue
			values := []dcgm.FieldValue_v1{
				{
					FieldId:   dcgm.DCGM_FI_DEV_XID_ERRORS,
					FieldType: dcgm.DCGM_FT_INT64,
					Value:     fieldValue,
				},
			}

			metrics := make(map[Counter][]Metric)
			ToMetric(metrics, values, c, d, instanceInfo, false, "", false)
			// assert.Len does not stop the test on failure; bail out explicitly so
			// the index expressions below cannot panic on an unexpected result.
			if !assert.Len(t, metrics, 1) {
				return
			}
			// We get metric value with 0 index
			metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
			if !assert.NotEmpty(t, metricValues) {
				return
			}

			wantCode := fmt.Sprint(tc.fieldValue)
			assert.Equal(t, wantCode, metricValues[0].Value)
			assert.Contains(t, metricValues[0].Attributes, "err_code")
			assert.Equal(t, wantCode, metricValues[0].Attributes["err_code"])
			assert.Contains(t, metricValues[0].Attributes, "err_msg")
			assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"])
		})
	}
}

func TestGPUCollector_GetMetrics(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)
Expand Down
165 changes: 165 additions & 0 deletions pkg/dcgmexporter/xid_errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgmexporter

// xidErrCodeToText maps an NVIDIA XID error code, used as the slice index, to
// a short human-readable description of that error.
//
// Based on this doc: https://docs.nvidia.com/deploy/xid-errors/#topic_4
//
// Index 0 is "No Error". Codes that have no description of their own carry the
// placeholder text "Unused" or "Reserved", so every index below
// len(xidErrCodeToText) holds a non-empty string. Callers must bounds-check a
// code before indexing with it.
var xidErrCodeToText = []string{
	0:   "No Error",
	1:   "Invalid or corrupted push buffer stream",
	2:   "Invalid or corrupted push buffer stream",
	3:   "Invalid or corrupted push buffer stream",
	4:   "Invalid or corrupted push buffer stream",
	5:   "Unused",
	6:   "Invalid or corrupted push buffer stream",
	7:   "Invalid or corrupted push buffer address",
	8:   "GPU stopped processing",
	9:   "Driver error programming GPU",
	10:  "Unused",
	11:  "Invalid or corrupted push buffer stream",
	12:  "Driver error handling GPU exception",
	13:  "Graphics Engine Exception",
	14:  "Unused",
	15:  "Unused",
	16:  "Display engine hung",
	17:  "Unused",
	18:  "Bus mastering disabled in PCI Config Space",
	19:  "Display Engine error",
	20:  "Invalid or corrupted Mpeg push buffer",
	21:  "Invalid or corrupted Motion Estimation push buffer",
	22:  "Invalid or corrupted Video Processor push buffer",
	23:  "Unused",
	24:  "GPU semaphore timeout",
	25:  "Invalid or illegal push buffer stream",
	26:  "Framebuffer timeout",
	27:  "Video processor exception",
	28:  "Video processor exception",
	29:  "Video processor exception",
	30:  "GPU semaphore access error",
	31:  "GPU memory page fault",
	32:  "Invalid or corrupted push buffer stream",
	33:  "Internal micro-controller error",
	34:  "Video processor exception",
	35:  "Video processor exception",
	36:  "Video processor exception",
	37:  "Driver firmware error",
	38:  "Driver firmware error",
	39:  "Unused",
	40:  "Unused",
	41:  "Unused",
	42:  "Video processor exception",
	43:  "GPU stopped processing",
	44:  "Graphics Engine fault during context switch",
	// The full sentence from the XID table; the previous text stopped mid-phrase.
	45:  "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE",
	46:  "GPU stopped processing",
	47:  "Video processor exception",
	48:  "Double Bit ECC Error",
	49:  "Unused",
	50:  "Unused",
	51:  "Unused",
	52:  "Unused",
	53:  "Unused",
	54:  "Auxiliary power is not connected to the GPU board",
	55:  "Unused",
	56:  "Display Engine error",
	57:  "Error programming video memory interface",
	58:  "Unstable video memory interface detected",
	59:  "Internal micro-controller error",
	60:  "Video processor exception",
	61:  "Internal micro-controller breakpoint/warning",
	62:  "Internal micro-controller halt",
	63:  "ECC page retirement or row remapping recording event",
	64:  "ECC page retirement or row remapper recording failure",
	65:  "Video processor exception",
	66:  "Illegal access by driver",
	67:  "Illegal access by driver",
	68:  "NVDEC0 Exception",
	69:  "Graphics Engine class error",
	70:  "CE3: Unknown Error",
	71:  "CE4: Unknown Error",
	72:  "CE5: Unknown Error",
	73:  "NVENC2 Error",
	74:  "NVLINK Error",
	75:  "CE6: Unknown Error",
	76:  "CE7: Unknown Error",
	77:  "CE8: Unknown Error",
	78:  "vGPU Start Error",
	79:  "GPU has fallen off the bus",
	80:  "Corrupted data sent to GPU",
	81:  "VGA Subsystem Error",
	82:  "NVJPG0 Error",
	83:  "NVDEC1 Error",
	84:  "NVDEC2 Error",
	85:  "CE9: Unknown Error",
	86:  "OFA Exception",
	87:  "Reserved",
	88:  "NVDEC3 Error",
	89:  "NVDEC4 Error",
	90:  "Reserved",
	91:  "Reserved",
	92:  "High single-bit ECC error rate",
	93:  "Non-fatal violation of provisioned InfoROM wear limit",
	94:  "Contained ECC error",
	95:  "Uncontained ECC error",
	96:  "NVDEC5 Error",
	97:  "NVDEC6 Error",
	98:  "NVDEC7 Error",
	99:  "NVJPG1 Error",
	100: "NVJPG2 Error",
	101: "NVJPG3 Error",
	102: "NVJPG4 Error",
	103: "NVJPG5 Error",
	104: "NVJPG6 Error",
	105: "NVJPG7 Error",
	106: "SMBPBI Test Message",
	107: "SMBPBI Test Message Silent",
	108: "Reserved",
	109: "Context Switch Timeout Error",
	110: "Security Fault Error",
	111: "Display Bundle Error Event",
	112: "Display Supervisor Error",
	113: "DP Link Training Error",
	114: "Display Pipeline Underflow Error",
	115: "Display Core Channel Error",
	116: "Display Window Channel Error",
	117: "Display Cursor Channel Error",
	118: "Display Pixel Pipeline Error",
	119: "GSP RPC Timeout",
	120: "GSP Error",
	121: "C2C Link Error",
	122: "SPI PMU RPC Read Failure",
	123: "SPI PMU RPC Write Failure",
	124: "SPI PMU RPC Erase Failure",
	125: "Inforom FS Failure",
	126: "Reserved",
	127: "Reserved",
	128: "Reserved",
	129: "Reserved",
	130: "Reserved",
	131: "Reserved",
	132: "Reserved",
	133: "Reserved",
	134: "Reserved",
	135: "Reserved",
	136: "Reserved",
	137: "Reserved",
	138: "Reserved",
	139: "Reserved",
	140: "Unrecovered ECC Error",
	141: "Reserved",
	142: "Reserved",
	143: "GPU Initialization Failure",
}

0 comments on commit 7decfd2

Please sign in to comment.