diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go
index 02a2ca30..c63f484e 100644
--- a/pkg/dcgmexporter/gpu_collector.go
+++ b/pkg/dcgmexporter/gpu_collector.go
@@ -20,6 +20,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"strconv"
 	"strings"
 
 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
@@ -31,8 +32,8 @@ type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupT
 func NewDCGMCollector(c []Counter,
 	hostname string,
 	config *Config,
-	fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) {
-
+	fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem,
+) (*DCGMCollector, func(), error) {
 	if fieldEntityGroupTypeSystemInfo.isEmpty() {
 		return nil, func() {}, errors.New("fieldEntityGroupTypeSystemInfo is empty")
 	}
@@ -163,7 +164,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
 }
 
 func ToSwitchMetric(metrics MetricsByCounter,
-	values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
+	values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
+) {
 	labels := map[string]string{}
 
 	for _, val := range values {
@@ -206,8 +208,9 @@ func ToSwitchMetric(metrics MetricsByCounter,
 }
 
 func ToCPUMetric(metrics MetricsByCounter,
-	values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
-	var labels = map[string]string{}
+	values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
+) {
+	labels := map[string]string{}
 
 	for _, val := range values {
 		v := ToString(val)
@@ -258,7 +261,7 @@ func ToMetric(
 	hostname string,
 	replaceBlanksInModelName bool,
 ) {
-	var labels = map[string]string{}
+	labels := map[string]string{}
 
 	for _, val := range values {
 		v := ToString(val)
@@ -283,6 +286,18 @@ func ToMetric(
 
 		gpuModel := getGPUModel(d, replaceBlanksInModelName)
 
+		attrs := map[string]string{}
+		if counter.FieldID == dcgm.DCGM_FI_DEV_XID_ERRORS {
+			errCode := int(val.Int64())
+			attrs["err_code"] = strconv.Itoa(errCode)
+			// The lower bound is inclusive: index 0 of xidErrCodeToText is "No Error".
+			if 0 <= errCode && errCode < len(xidErrCodeToText) {
+				attrs["err_msg"] = xidErrCodeToText[errCode]
+			} else {
+				attrs["err_msg"] = "Unknown Error"
+			}
+		}
+
 		m := Metric{
 			Counter: counter,
 			Value:   v,
@@ -295,7 +310,7 @@ func ToMetric(
 			Hostname:     hostname,
 
 			Labels:     labels,
-			Attributes: map[string]string{},
+			Attributes: attrs,
 		}
 		if instanceInfo != nil {
 			m.MigProfile = instanceInfo.ProfileName
diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go
index ddf82fc8..23a92929 100644
--- a/pkg/dcgmexporter/gpu_collector_test.go
+++ b/pkg/dcgmexporter/gpu_collector_test.go
@@ -257,7 +257,6 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
 }
 
 func TestToMetric(t *testing.T) {
-
 	fieldValue := [4096]byte{}
 	fieldValue[0] = 42
 	values := []dcgm.FieldValue_v1{
@@ -315,6 +314,76 @@ func TestToMetric(t *testing.T) {
 	}
 }
 
+// TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField checks the err_code and err_msg attributes ToMetric sets for XID error values.
+func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) {
+	c := []Counter{
+		{
+			FieldID:   dcgm.DCGM_FI_DEV_XID_ERRORS,
+			FieldName: "DCGM_FI_DEV_XID_ERRORS",
+			PromType:  "gauge",
+			Help:      "XID errors Help info",
+		},
+	}
+
+	d := dcgm.Device{
+		UUID: "fake0",
+		Identifiers: dcgm.DeviceIdentifiers{
+			Model: "NVIDIA T400 4GB",
+		},
+	}
+
+	var instanceInfo *GPUInstanceInfo = nil
+
+	type testCase struct {
+		name        string
+		fieldValue  byte
+		expectedErr string
+	}
+
+	testCases := []testCase{
+		{
+			name:        "when DCGM_FI_DEV_XID_ERRORS has no error value",
+			fieldValue:  0,
+			expectedErr: "No Error",
+		},
+		{
+			name:        "when DCGM_FI_DEV_XID_ERRORS has known value",
+			fieldValue:  42,
+			expectedErr: "Video processor exception",
+		},
+		{
+			name:        "when DCGM_FI_DEV_XID_ERRORS has unknown value",
+			fieldValue:  255,
+			expectedErr: "Unknown Error",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			fieldValue := [4096]byte{}
+			fieldValue[0] = tc.fieldValue
+			values := []dcgm.FieldValue_v1{
+				{
+					FieldId:   dcgm.DCGM_FI_DEV_XID_ERRORS,
+					FieldType: dcgm.DCGM_FT_INT64,
+					Value:     fieldValue,
+				},
+			}
+
+			metrics := make(map[Counter][]Metric)
+			ToMetric(metrics, values, c, d, instanceInfo, false, "", false)
+			assert.Len(t, metrics, 1)
+			// We get metric value with 0 index
+			metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
+			assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Value)
+			assert.Contains(t, metricValues[0].Attributes, "err_code")
+			assert.Equal(t, fmt.Sprint(tc.fieldValue), metricValues[0].Attributes["err_code"])
+			assert.Contains(t, metricValues[0].Attributes, "err_msg")
+			assert.Equal(t, tc.expectedErr, metricValues[0].Attributes["err_msg"])
+		})
+	}
+}
+
 func TestGPUCollector_GetMetrics(t *testing.T) {
 	teardownTest := setupTest(t)
 	defer teardownTest(t)
diff --git a/pkg/dcgmexporter/xid_errors.go b/pkg/dcgmexporter/xid_errors.go
new file mode 100644
index 00000000..64be5363
--- /dev/null
+++ b/pkg/dcgmexporter/xid_errors.go
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package dcgmexporter
+
+// xidErrCodeToText holds XID error descriptions indexed by error code, based on https://docs.nvidia.com/deploy/xid-errors/#topic_4
+var xidErrCodeToText = []string{
+	0:   "No Error",
+	1:   "Invalid or corrupted push buffer stream",
+	2:   "Invalid or corrupted push buffer stream",
+	3:   "Invalid or corrupted push buffer stream",
+	4:   "Invalid or corrupted push buffer stream",
+	5:   "Unused",
+	6:   "Invalid or corrupted push buffer stream",
+	7:   "Invalid or corrupted push buffer address",
+	8:   "GPU stopped processing",
+	9:   "Driver error programming GPU",
+	10:  "Unused",
+	11:  "Invalid or corrupted push buffer stream",
+	12:  "Driver error handling GPU exception",
+	13:  "Graphics Engine Exception",
+	14:  "Unused",
+	15:  "Unused",
+	16:  "Display engine hung",
+	17:  "Unused",
+	18:  "Bus mastering disabled in PCI Config Space",
+	19:  "Display Engine error",
+	20:  "Invalid or corrupted Mpeg push buffer",
+	21:  "Invalid or corrupted Motion Estimation push buffer",
+	22:  "Invalid or corrupted Video Processor push buffer",
+	23:  "Unused",
+	24:  "GPU semaphore timeout",
+	25:  "Invalid or illegal push buffer stream",
+	26:  "Framebuffer timeout",
+	27:  "Video processor exception",
+	28:  "Video processor exception",
+	29:  "Video processor exception",
+	30:  "GPU semaphore access error",
+	31:  "GPU memory page fault",
+	32:  "Invalid or corrupted push buffer stream",
+	33:  "Internal micro-controller error",
+	34:  "Video processor exception",
+	35:  "Video processor exception",
+	36:  "Video processor exception",
+	37:  "Driver firmware error",
+	38:  "Driver firmware error",
+	39:  "Unused",
+	40:  "Unused",
+	41:  "Unused",
+	42:  "Video processor exception",
+	43:  "GPU stopped processing",
+	44:  "Graphics Engine fault during context switch",
+	45:  "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE",
+	46:  "GPU stopped processing",
+	47:  "Video processor exception",
+	48:  "Double Bit ECC Error",
+	49:  "Unused",
+	50:  "Unused",
+	51:  "Unused",
+	52:  "Unused",
+	53:  "Unused",
+	54:  "Auxiliary power is not connected to the GPU board",
+	55:  "Unused",
+	56:  "Display Engine error",
+	57:  "Error programming video memory interface",
+	58:  "Unstable video memory interface detected",
+	59:  "Internal micro-controller error",
+	60:  "Video processor exception",
+	61:  "Internal micro-controller breakpoint/warning",
+	62:  "Internal micro-controller halt",
+	63:  "ECC page retirement or row remapping recording event",
+	64:  "ECC page retirement or row remapper recording failure",
+	65:  "Video processor exception",
+	66:  "Illegal access by driver",
+	67:  "Illegal access by driver",
+	68:  "NVDEC0 Exception",
+	69:  "Graphics Engine class error",
+	70:  "CE3: Unknown Error",
+	71:  "CE4: Unknown Error",
+	72:  "CE5: Unknown Error",
+	73:  "NVENC2 Error",
+	74:  "NVLINK Error",
+	75:  "CE6: Unknown Error",
+	76:  "CE7: Unknown Error",
+	77:  "CE8: Unknown Error",
+	78:  "vGPU Start Error",
+	79:  "GPU has fallen off the bus",
+	80:  "Corrupted data sent to GPU",
+	81:  "VGA Subsystem Error",
+	82:  "NVJPG0 Error",
+	83:  "NVDEC1 Error",
+	84:  "NVDEC2 Error",
+	85:  "CE9: Unknown Error",
+	86:  "OFA Exception",
+	87:  "Reserved",
+	88:  "NVDEC3 Error",
+	89:  "NVDEC4 Error",
+	90:  "Reserved",
+	91:  "Reserved",
+	92:  "High single-bit ECC error rate",
+	93:  "Non-fatal violation of provisioned InfoROM wear limit",
+	94:  "Contained ECC error",
+	95:  "Uncontained ECC error",
+	96:  "NVDEC5 Error",
+	97:  "NVDEC6 Error",
+	98:  "NVDEC7 Error",
+	99:  "NVJPG1 Error",
+	100: "NVJPG2 Error",
+	101: "NVJPG3 Error",
+	102: "NVJPG4 Error",
+	103: "NVJPG5 Error",
+	104: "NVJPG6 Error",
+	105: "NVJPG7 Error",
+	106: "SMBPBI Test Message",
+	107: "SMBPBI Test Message Silent",
+	108: "Reserved",
+	109: "Context Switch Timeout Error",
+	110: "Security Fault Error",
+	111: "Display Bundle Error Event",
+	112: "Display Supervisor Error",
+	113: "DP Link Training Error",
+	114: "Display Pipeline Underflow Error",
+	115: "Display Core Channel Error",
+	116: "Display Window Channel Error",
+	117: "Display Cursor Channel Error",
+	118: "Display Pixel Pipeline Error",
+	119: "GSP RPC Timeout",
+	120: "GSP Error",
+	121: "C2C Link Error",
+	122: "SPI PMU RPC Read Failure",
+	123: "SPI PMU RPC Write Failure",
+	124: "SPI PMU RPC Erase Failure",
+	125: "Inforom FS Failure",
+	126: "Reserved",
+	127: "Reserved",
+	128: "Reserved",
+	129: "Reserved",
+	130: "Reserved",
+	131: "Reserved",
+	132: "Reserved",
+	133: "Reserved",
+	134: "Reserved",
+	135: "Reserved",
+	136: "Reserved",
+	137: "Reserved",
+	138: "Reserved",
+	139: "Reserved",
+	140: "Unrecovered ECC Error",
+	141: "Reserved",
+	142: "Reserved",
+	143: "GPU Initialization Failure",
+}