diff --git a/pkg/dcgm/api.go b/pkg/dcgm/api.go index 6cef868..07f1774 100644 --- a/pkg/dcgm/api.go +++ b/pkg/dcgm/api.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "sync" + "time" ) var ( @@ -89,7 +90,7 @@ func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { // WatchPidFields lets DCGM start recording stats for GPU process // It needs to be called before calling GetProcessInfo func WatchPidFields() (GroupHandle, error) { - return watchPidFields(defaultUpdateFreq, defaultMaxKeepAge, defaultMaxKeepSamples) + return watchPidFields(time.Microsecond*time.Duration(defaultUpdateFreq), time.Second*time.Duration(defaultMaxKeepAge), defaultMaxKeepSamples) } // GetProcessInfo provides detailed per GPU stats for this process diff --git a/pkg/dcgm/const.go b/pkg/dcgm/const.go index d29eef4..96b63f5 100644 --- a/pkg/dcgm/const.go +++ b/pkg/dcgm/const.go @@ -148,6 +148,9 @@ const ( DCGM_FI_DEV_FB_USED = 252 DCGM_FI_DEV_FB_RESERVED = 253 DCGM_FI_DEV_FB_USED_PERCENT = 254 + DCGM_FI_DEV_C2C_LINK_COUNT = 285 + DCGM_FI_DEV_C2C_LINK_STATUS = 286 + DCGM_FI_DEV_C2C_MAX_BANDWIDTH = 287 DCGM_FI_DEV_ECC_CURRENT = 300 DCGM_FI_DEV_ECC_PENDING = 301 DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 @@ -311,82 +314,15 @@ const ( DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 DCGM_FI_DEV_VGPU_PCI_ID = 533 DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 - DCGM_FI_FIRST_VGPU_FIELD_ID = 520 - DCGM_FI_LAST_VGPU_FIELD_ID = 570 DCGM_FI_INTERNAL_FIELDS_0_START = 600 DCGM_FI_INTERNAL_FIELDS_0_END = 699 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 - 
DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 - 
DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 - DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 - DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 - DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 - DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 + DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT = 701 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ = 702 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV = 703 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD = 704 + DCGM_FI_DEV_NVSWITCH_POWER_VDD = 705 + DCGM_FI_DEV_NVSWITCH_POWER_DVDD = 706 + DCGM_FI_DEV_NVSWITCH_POWER_HVDD = 707 DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 @@ -447,8 +383,6 @@ const ( DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID = 876 DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID = 877 DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID = 878 - DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 - DCGM_FI_LAST_NVSWITCH_FIELD_ID = 899 DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 DCGM_FI_PROF_SM_ACTIVE = 1002 DCGM_FI_PROF_SM_OCCUPANCY = 1003 @@ -518,7 +452,20 @@ const ( DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 - DCGM_FI_MAX_FIELDS = 1076 + DCGM_FI_DEV_CPU_UTIL_TOTAL = 1100 + DCGM_FI_DEV_CPU_UTIL_USER = 1101 + DCGM_FI_DEV_CPU_UTIL_NICE = 1102 + DCGM_FI_DEV_CPU_UTIL_SYS = 1103 + DCGM_FI_DEV_CPU_UTIL_IRQ = 1104 + DCGM_FI_DEV_CPU_TEMP_CURRENT = 1110 + DCGM_FI_DEV_CPU_TEMP_WARNING = 1111 + 
DCGM_FI_DEV_CPU_TEMP_CRITICAL = 1112 + DCGM_FI_DEV_CPU_CLOCK_CURRENT = 1120 + DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT = 1130 + DCGM_FI_DEV_CPU_POWER_LIMIT = 1131 + DCGM_FI_DEV_CPU_VENDOR = 1140 + DCGM_FI_DEV_CPU_MODEL = 1141 + DCGM_FI_MAX_FIELDS = 1142 DCGM_ST_OK = 0 DCGM_ST_BADPARAM = -1 @@ -573,15 +520,18 @@ const ( DCGM_ST_NVVS_ISOLATE_ERROR = -51 DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 DCGM_ST_NVVS_KILLED = -53 + DCGM_ST_PAUSED = -54 + DCGM_ST_ALREADY_INITIALIZED = -55 ) var ( DCGM_FI = map[string]Short{ - "DCGM_FT_BINARY": Short('b'), - "DCGM_FT_DOUBLE": Short('d'), - "DCGM_FT_INT64": Short('i'), - "DCGM_FT_STRING": Short('s'), - "DCGM_FT_TIMESTAMP": Short('t'), + "DCGM_FT_BINARY": Short('b'), + "DCGM_FT_DOUBLE": Short('d'), + "DCGM_FT_INT64": Short('i'), + "DCGM_FT_STRING": Short('s'), + "DCGM_FT_TIMESTAMP": Short('t'), + "DCGM_FI_UNKNOWN": 0, "DCGM_FI_DRIVER_VERSION": 1, "DCGM_FI_NVML_VERSION": 2, @@ -682,6 +632,9 @@ var ( "DCGM_FI_DEV_FB_USED": 252, "DCGM_FI_DEV_FB_RESERVED": 253, "DCGM_FI_DEV_FB_USED_PERCENT": 254, + "DCGM_FI_DEV_C2C_LINK_COUNT": 285, + "DCGM_FI_DEV_C2C_LINK_STATUS": 286, + "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, "DCGM_FI_DEV_ECC_CURRENT": 300, "DCGM_FI_DEV_ECC_PENDING": 301, "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, @@ -845,82 +798,15 @@ var ( "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, "DCGM_FI_DEV_VGPU_PCI_ID": 533, "DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, - "DCGM_FI_FIRST_VGPU_FIELD_ID": 520, - "DCGM_FI_LAST_VGPU_FIELD_ID": 570, "DCGM_FI_INTERNAL_FIELDS_0_START": 600, "DCGM_FI_INTERNAL_FIELDS_0_END": 699, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00": 700, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00": 701, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00": 702, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00": 703, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01": 704, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01": 705, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01": 706, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01": 707, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02": 708, - 
"DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02": 709, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02": 710, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02": 711, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03": 712, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03": 713, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03": 714, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03": 715, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04": 716, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04": 717, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04": 718, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04": 719, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05": 720, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05": 721, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05": 722, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05": 723, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06": 724, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06": 725, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06": 726, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06": 727, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07": 728, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07": 729, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07": 730, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07": 731, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08": 732, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08": 733, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08": 734, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08": 735, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09": 736, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09": 737, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09": 738, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09": 739, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10": 740, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10": 741, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10": 742, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10": 743, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11": 744, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11": 745, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11": 746, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11": 747, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12": 748, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12": 749, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12": 750, - 
"DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12": 751, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13": 752, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13": 753, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13": 754, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13": 755, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14": 756, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14": 757, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14": 758, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14": 759, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15": 760, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15": 761, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15": 762, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15": 763, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16": 764, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16": 765, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16": 766, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16": 767, - "DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17": 768, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17": 769, - "DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17": 770, - "DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17": 771, + "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, + "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, + "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, + "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, @@ -981,8 +867,6 @@ var ( "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID": 878, - "DCGM_FI_FIRST_NVSWITCH_FIELD_ID": 700, - "DCGM_FI_LAST_NVSWITCH_FIELD_ID": 899, "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, "DCGM_FI_PROF_SM_ACTIVE": 1002, "DCGM_FI_PROF_SM_OCCUPANCY": 1003, @@ -1052,7 +936,20 @@ var ( "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, - "DCGM_FI_MAX_FIELDS": 1076, + 
"DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, + "DCGM_FI_DEV_CPU_UTIL_USER": 1101, + "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, + "DCGM_FI_DEV_CPU_UTIL_SYS": 1103, + "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, + "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, + "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, + "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, + "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, + "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, + "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, + "DCGM_FI_DEV_CPU_VENDOR": 1140, + "DCGM_FI_DEV_CPU_MODEL": 1141, + "DCGM_FI_MAX_FIELDS": 1142, } ) diff --git a/pkg/dcgm/dcgm_agent.h b/pkg/dcgm/dcgm_agent.h index 481beb8..d1eba62 100644 --- a/pkg/dcgm/dcgm_agent.h +++ b/pkg/dcgm/dcgm_agent.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,13 +17,13 @@ #ifndef DCGM_AGENT_H #define DCGM_AGENT_H +#define DCGM_PUBLIC_API #include "dcgm_structs.h" #ifdef __cplusplus extern "C" { #endif -#define DCGM_PUBLIC_API /***************************************************************************************************/ /** @defgroup DCGMAPI_Admin Administrative @@ -274,8 +274,8 @@ DCGM_PUBLIC_API dcgmReturn_t dcgmModuleIdToName(dcgmModuleId_t id, char const ** /***************************************************************************************************/ /** @defgroup DCGMAPI_SYS System * @{ - * This chapter describes the APIs used to identify set of GPUs on the node, grouping functions to - * provide mechanism to operate on a group of GPUs, and status management APIs in + * This chapter describes the APIs used to identify entities on the node, grouping functions to + * provide mechanism to operate on a group of entities, and status management APIs in * order to get individual statuses for each operation. 
The APIs in System module can be * broken down into following categories: */ @@ -405,6 +405,23 @@ dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle */ dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v3 *linkStatus); + +/** + * List supported CPUs and their cores present on the system + * + * This and other CPU APIs only support datacenter NVIDIA CPUs + * + * @param dcgmHandle IN: DCGM Handle + * @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_NOT_SUPPORTED if the device is unsupported + * - \ref DCGM_ST_MODULE_NOT_LOADED if the sysmon module could not be loaded + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v1 *cpuHierarchy); + /** @} */ /***************************************************************************************************/ diff --git a/pkg/dcgm/dcgm_errors.h b/pkg/dcgm/dcgm_errors.h index 813e429..02d15ab 100644 --- a/pkg/dcgm/dcgm_errors.h +++ b/pkg/dcgm/dcgm_errors.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,133 +16,172 @@ #ifndef DCGM_ERRORS_H #define DCGM_ERRORS_H -#include "dcgm_api_export.h" +#define DCGM_PUBLIC_API #include "dcgm_structs.h" +/***************************************************************************************************/ +/** @defgroup dcgmErrorEnums Error Codes + * @{ + */ +/***************************************************************************************************/ /* * Error codes for passive and active health checks. 
* New error codes must be added to end of enum to maintain backwards compatibility. */ typedef enum dcgmError_enum { - DCGM_FR_OK = 0, //!< No error - DCGM_FR_UNKNOWN = 1, //!< Unknown error code - DCGM_FR_UNRECOGNIZED = 2, //!< Unrecognized error code - DCGM_FR_PCI_REPLAY_RATE = 3, //!< Unacceptable rate of PCI errors - DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< Uncorrectable volatile double bit error - DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< Unacceptable rate of volatile single bit errors - DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< Pending page retirements detected - DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< Unacceptable total page retirements detected - DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< Unacceptable total page retirements due to uncorrectable errors - DCGM_FR_CORRUPT_INFOROM = 9, //!< Corrupt inforom found - DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< Clocks being throttled due to overheating - DCGM_FR_POWER_UNREADABLE = 11, //!< Cannot get a reading for power from NVML - DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< Clock being throttled due to power restrictions - DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< Unacceptable rate of NVLink errors - DCGM_FR_NVLINK_DOWN = 14, //!< NVLink is down - DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< Fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< Non-fatal errors on the NVSwitch - DCGM_FR_NVSWITCH_DOWN = 17, //!< NVSwitch is down - DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< Cannot access a file - DCGM_FR_NVML_API = 19, //!< Error occurred on an NVML API - DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< Disagreement in GPU count between /dev and NVML - DCGM_FR_BAD_PARAMETER = 21, //!< Bad parameter passed to API - DCGM_FR_CANNOT_OPEN_LIB = 22, //!< Cannot open a library that must be accessed - DCGM_FR_DENYLISTED_DRIVER = 23, //!< A driver on the denylist (nouveau) is active - DCGM_FR_NVML_LIB_BAD = 24, //!< The NVML library is missing expected functions - DCGM_FR_GRAPHICS_PROCESSES = 25, //!< Graphics processes are active on 
this GPU - DCGM_FR_HOSTENGINE_CONN = 26, //!< Unstable connection to nv-hostengine (daemonized DCGM) - DCGM_FR_FIELD_QUERY = 27, //!< Error querying a field from DCGM - DCGM_FR_BAD_CUDA_ENV = 28, //!< The environment has variables that hurt CUDA - DCGM_FR_PERSISTENCE_MODE = 29, //!< Persistence mode is disabled - DCGM_FR_LOW_BANDWIDTH = 30, //!< The bandwidth is unacceptably low - DCGM_FR_HIGH_LATENCY = 31, //!< Latency is too high - DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< Cannot find a tag for a field - DCGM_FR_FIELD_VIOLATION = 33, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD = 34, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< The value for the specified error field is above 0 - DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< The value for the specified field is above the threshold - DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< Field type cannot be supported - DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< The value for the specified field is above the threshold - DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< The value for the specified field is above the threshold - DCGM_FR_THERMAL_VIOLATIONS = 40, //!< Thermal violations detected - DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< Thermal violations detected with a timestamp - DCGM_FR_TEMP_VIOLATION = 42, //!< Temperature is too high - DCGM_FR_THROTTLING_VIOLATION = 43, //!< Non-benign clock throttling is occurring - DCGM_FR_INTERNAL = 44, //!< An internal error was detected - DCGM_FR_PCIE_GENERATION = 45, //!< PCIe generation is too low - DCGM_FR_PCIE_WIDTH = 46, //!< PCIe width is too low - DCGM_FR_ABORTED = 47, //!< Test was aborted by a user signal - DCGM_FR_TEST_DISABLED = 48, //!< This test is disabled for this GPU - DCGM_FR_CANNOT_GET_STAT = 49, //!< Cannot get telemetry for a needed value - DCGM_FR_STRESS_LEVEL = 50, //!< Stress level is too low (bad performance) - DCGM_FR_CUDA_API = 51, //!< Error calling the specified CUDA API - 
DCGM_FR_FAULTY_MEMORY = 52, //!< Faulty memory detected on this GPU - DCGM_FR_CANNOT_SET_WATCHES = 53, //!< Unable to set field watches in DCGM - DCGM_FR_CUDA_UNBOUND = 54, //!< CUDA context is no longer bound - DCGM_FR_ECC_DISABLED = 55, //!< ECC memory is disabled right now - DCGM_FR_MEMORY_ALLOC = 56, //!< Cannot allocate memory on the GPU - DCGM_FR_CUDA_DBE = 57, //!< CUDA detected unrecovable double-bit error - DCGM_FR_MEMORY_MISMATCH = 58, //!< Memory error detected - DCGM_FR_CUDA_DEVICE = 59, //!< No CUDA device discoverable for existing GPU - DCGM_FR_ECC_UNSUPPORTED = 60, //!< ECC memory is unsupported by this SKU - DCGM_FR_ECC_PENDING = 61, //!< ECC memory is in a pending state - DCGM_FR_MEMORY_BANDWIDTH = 62, //!< Memory bandwidth is too low - DCGM_FR_TARGET_POWER = 63, //!< Cannot hit the target power draw - DCGM_FR_API_FAIL = 64, //!< The specified API call failed - DCGM_FR_API_FAIL_GPU = 65, //!< The specified API call failed for the specified GPU - DCGM_FR_CUDA_CONTEXT = 66, //!< Cannot create a CUDA context on this GPU - DCGM_FR_DCGM_API = 67, //!< DCGM API failure - DCGM_FR_CONCURRENT_GPUS = 68, //!< Need multiple GPUs to run this test - DCGM_FR_TOO_MANY_ERRORS = 69, //!< More errors than fit in the return struct - DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< More than 100 CRC errors are happening per second - DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< NVLink error for a field that should always be 0 - DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< The enforced power limit is too low to hit the target - DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< Cannot allocate memory on the host - DCGM_FR_GPU_OP_MODE = 74, //!< Bad GPU operating mode for running plugin - DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< No memory clocks with the needed MHz were found - DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< No graphics clocks with the needed MHz were found - DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< Note that we had to restore a GPU's state - DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< L1TAG test is 
unsupported by this SKU - DCGM_FR_L1TAG_MISCOMPARE = 79, //!< L1TAG test failed on a miscompare - DCGM_FR_ROW_REMAP_FAILURE = 80, //!< Row remapping failed (Ampere or newer GPUs) - DCGM_FR_UNCONTAINED_ERROR = 81, //!< Uncontained error - XID 95 - DCGM_FR_EMPTY_GPU_LIST = 82, //!< No GPU information given to plugin - DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83, //!< Pending page retirements due to a DBE - DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84, //!< Uncorrectable row remapping - DCGM_FR_PENDING_ROW_REMAP = 85, //!< Row remapping is pending - DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86, //!< P2P copy test detected an error writing to this GPU - DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87, //!< P2P copy test detected an error writing from this GPU - DCGM_FR_NVSWITCH_NVLINK_DOWN = 88, //!< An NvLink is down for the specified NVSwitch - DCGM_FR_EUD_BINARY_PERMISSIONS = 89, //!< EUD binary permissions are incorrect - DCGM_FR_EUD_NON_ROOT_USER = 90, //!< EUD plugin is not running as root - DCGM_FR_EUD_SPAWN_FAILURE = 91, //!< EUD plugin failed to spawn the EUD binary - DCGM_FR_EUD_TIMEOUT = 92, //!< EUD plugin timed out - DCGM_FR_EUD_ZOMBIE = 93, //!< EUD process remains running after the plugin considers it finished - DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94, //!< EUD process exited with a non-zero exit code - DCGM_FR_EUD_TEST_FAILED = 95, //!< EUD test failed - DCGM_FR_FILE_CREATE_PERMISSIONS = 96, //!< We cannot create a file in this directory. 
- DCGM_FR_PAUSE_RESUME_FAILED = 97, //!< Pause/Resume failed - DCGM_FR_PCIE_REPLAYS = 98, //!< PCIe test caught correctable errors - DCGM_FR_GPU_EXPECTED_NVLINKS_UP = 99, //!< Expected nvlinks up per gpu - DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP = 100, //!< Expected nvlinks up per nvswitch - DCGM_FR_XID_ERROR = 101, //!< XID error detected - DCGM_FR_ERROR_SENTINEL = 102, //!< MUST BE THE LAST ERROR CODE + DCGM_FR_OK = 0, //!< 0 No error + DCGM_FR_UNKNOWN = 1, //!< 1 Unknown error code + DCGM_FR_UNRECOGNIZED = 2, //!< 2 Unrecognized error code + DCGM_FR_PCI_REPLAY_RATE = 3, //!< 3 Unacceptable rate of PCI errors + DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< 4 Uncorrectable volatile double bit error + DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< 5 Unacceptable rate of volatile single bit errors + DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< 6 Pending page retirements detected + DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< 7 Unacceptable total page retirements detected + DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< 8 Unacceptable total page retirements due to uncorrectable errors + DCGM_FR_CORRUPT_INFOROM = 9, //!< 9 Corrupt inforom found + DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< 10 Clocks being throttled due to overheating + DCGM_FR_POWER_UNREADABLE = 11, //!< 11 Cannot get a reading for power from NVML + DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< 12 Clock being throttled due to power restrictions + DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< 13 Unacceptable rate of NVLink errors + DCGM_FR_NVLINK_DOWN = 14, //!< 14 NVLink is down + DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< 15 Fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< 16 Non-fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_DOWN = 17, //!< 17 NVSwitch is down - NOT USED: DEPRECATED + DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< 18 Cannot access a file + DCGM_FR_NVML_API = 19, //!< 19 Error occurred on an NVML API - NOT USED: DEPRECATED + DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< 20 Disagreement in GPU count between /dev and 
NVML + DCGM_FR_BAD_PARAMETER = 21, //!< 21 Bad parameter passed to API + DCGM_FR_CANNOT_OPEN_LIB = 22, //!< 22 Cannot open a library that must be accessed + DCGM_FR_DENYLISTED_DRIVER = 23, //!< 23 A driver on the denylist (nouveau) is active + DCGM_FR_NVML_LIB_BAD = 24, //!< 24 NVML library is missing expected functions - NOT USED: DEPRECATED + DCGM_FR_GRAPHICS_PROCESSES = 25, //!< 25 Graphics processes are active on this GPU + DCGM_FR_HOSTENGINE_CONN = 26, //!< 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED + DCGM_FR_FIELD_QUERY = 27, //!< 27 Error querying a field from DCGM + DCGM_FR_BAD_CUDA_ENV = 28, //!< 28 The environment has variables that hurt CUDA + DCGM_FR_PERSISTENCE_MODE = 29, //!< 29 Persistence mode is disabled + DCGM_FR_LOW_BANDWIDTH = 30, //!< 30 The bandwidth is unacceptably low + DCGM_FR_HIGH_LATENCY = 31, //!< 31 Latency is too high + DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< 32 Cannot find a tag for a field + DCGM_FR_FIELD_VIOLATION = 33, //!< 33 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD = 34, //!< 34 The value for the specified field is above the threshold + DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< 35 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< 36 The value for the specified field is above the threshold + DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< 37 Field type cannot be supported + DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< 38 The value for the specified field is above the threshold + DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< 39 The value for the specified field is above the threshold + DCGM_FR_THERMAL_VIOLATIONS = 40, //!< 40 Thermal violations detected + DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< 41 Thermal violations detected with a timestamp + DCGM_FR_TEMP_VIOLATION = 42, //!< 42 Temperature is too high + DCGM_FR_THROTTLING_VIOLATION = 43, //!< 43 Non-benign clock throttling is occurring + DCGM_FR_INTERNAL = 44, //!< 44 An internal error was detected + 
DCGM_FR_PCIE_GENERATION = 45, //!< 45 PCIe generation is too low + DCGM_FR_PCIE_WIDTH = 46, //!< 46 PCIe width is too low + DCGM_FR_ABORTED = 47, //!< 47 Test was aborted by a user signal + DCGM_FR_TEST_DISABLED = 48, //!< 48 This test is disabled for this GPU + DCGM_FR_CANNOT_GET_STAT = 49, //!< 49 Cannot get telemetry for a needed value + DCGM_FR_STRESS_LEVEL = 50, //!< 50 Stress level is too low (bad performance) + DCGM_FR_CUDA_API = 51, //!< 51 Error calling the specified CUDA API + DCGM_FR_FAULTY_MEMORY = 52, //!< 52 Faulty memory detected on this GPU + DCGM_FR_CANNOT_SET_WATCHES = 53, //!< 53 Unable to set field watches in DCGM - NOT USED: DEPRECATED + DCGM_FR_CUDA_UNBOUND = 54, //!< 54 CUDA context is no longer bound + DCGM_FR_ECC_DISABLED = 55, //!< 55 ECC memory is disabled right now + DCGM_FR_MEMORY_ALLOC = 56, //!< 56 Cannot allocate memory on the GPU + DCGM_FR_CUDA_DBE = 57, //!< 57 CUDA detected unrecoverable double-bit error + DCGM_FR_MEMORY_MISMATCH = 58, //!< 58 Memory error detected + DCGM_FR_CUDA_DEVICE = 59, //!< 59 No CUDA device discoverable for existing GPU + DCGM_FR_ECC_UNSUPPORTED = 60, //!< 60 ECC memory is unsupported by this SKU + DCGM_FR_ECC_PENDING = 61, //!< 61 ECC memory is in a pending state - NOT USED: DEPRECATED + DCGM_FR_MEMORY_BANDWIDTH = 62, //!< 62 Memory bandwidth is too low + DCGM_FR_TARGET_POWER = 63, //!< 63 Cannot hit the target power draw + DCGM_FR_API_FAIL = 64, //!< 64 The specified API call failed + DCGM_FR_API_FAIL_GPU = 65, //!< 65 The specified API call failed for the specified GPU + DCGM_FR_CUDA_CONTEXT = 66, //!< 66 Cannot create a CUDA context on this GPU + DCGM_FR_DCGM_API = 67, //!< 67 DCGM API failure + DCGM_FR_CONCURRENT_GPUS = 68, //!< 68 Need multiple GPUs to run this test + DCGM_FR_TOO_MANY_ERRORS = 69, //!< 69 More errors than fit in the return struct - NOT USED: DEPRECATED + DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< 70 More than 100 CRC errors are happening per second + DCGM_FR_NVLINK_ERROR_CRITICAL = 
71, //!< 71 NVLink error for a field that should always be 0 + DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< 72 The enforced power limit is too low to hit the target + DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< 73 Cannot allocate memory on the host + DCGM_FR_GPU_OP_MODE = 74, //!< 74 Bad GPU operating mode for running plugin - NOT USED: DEPRECATED + DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< 75 No memory clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< 76 No graphics clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< 77 Note that we had to restore a GPU's state + DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< 78 L1TAG test is unsupported by this SKU + DCGM_FR_L1TAG_MISCOMPARE = 79, //!< 79 L1TAG test failed on a miscompare + DCGM_FR_ROW_REMAP_FAILURE = 80, //!< 80 Row remapping failed (Ampere or newer GPUs) + DCGM_FR_UNCONTAINED_ERROR = 81, //!< 81 Uncontained error - XID 95 + DCGM_FR_EMPTY_GPU_LIST = 82, //!< 82 No GPU information given to plugin + DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83, //!< 83 Pending page retirements due to a DBE + DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84, //!< 84 Uncorrectable row remapping + DCGM_FR_PENDING_ROW_REMAP = 85, //!< 85 Row remapping is pending + DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86, //!< 86 P2P copy test detected an error writing to this GPU + DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87, //!< 87 P2P copy test detected an error writing from this GPU + DCGM_FR_NVSWITCH_NVLINK_DOWN = 88, //!< 88 An NvLink is down for the specified NVSwitch - NOT USED: DEPRECATED + DCGM_FR_EUD_BINARY_PERMISSIONS = 89, //!< 89 EUD binary permissions are incorrect + DCGM_FR_EUD_NON_ROOT_USER = 90, //!< 90 EUD plugin is not running as root + DCGM_FR_EUD_SPAWN_FAILURE = 91, //!< 91 EUD plugin failed to spawn the EUD binary + DCGM_FR_EUD_TIMEOUT = 92, //!< 92 EUD plugin timed out + DCGM_FR_EUD_ZOMBIE = 93, //!< 93 EUD process remains running after the plugin considers it finished + 
DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94, //!< 94 EUD process exited with a non-zero exit code + DCGM_FR_EUD_TEST_FAILED = 95, //!< 95 EUD test failed + DCGM_FR_FILE_CREATE_PERMISSIONS = 96, //!< 96 We cannot create a file in this directory. + DCGM_FR_PAUSE_RESUME_FAILED = 97, //!< 97 Pause/Resume failed + DCGM_FR_PCIE_H_REPLAY_VIOLATION = 98, //!< 98 PCIe test caught correctable errors + DCGM_FR_GPU_EXPECTED_NVLINKS_UP = 99, //!< 99 Expected nvlinks up per gpu + DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP = 100, //!< 100 Expected nvlinks up per nvswitch + DCGM_FR_XID_ERROR = 101, //!< 101 XID error detected + DCGM_FR_SBE_VIOLATION = 102, //!< 102 Single bit error detected + DCGM_FR_DBE_VIOLATION = 103, //!< 103 Double bit error detected + DCGM_FR_PCIE_REPLAY_VIOLATION = 104, //!< 104 PCIe replay errors detected + DCGM_FR_SBE_THRESHOLD_VIOLATION = 105, //!< 105 SBE threshold violated + DCGM_FR_DBE_THRESHOLD_VIOLATION = 106, //!< 106 DBE threshold violated + DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION = 107, //!< 107 PCIE replay count violated + DCGM_FR_CUDA_FM_NOT_INITIALIZED = 108, //!< 108 The fabricmanager is not initialized + DCGM_FR_SXID_ERROR = 109, //!< 109 NvSwitch fatal error detected + DCGM_FR_ERROR_SENTINEL = 110, //!< 110 MUST BE THE LAST ERROR CODE } dcgmError_t; typedef enum dcgmErrorSeverity_enum { - DCGM_ERROR_MONITOR = 0, //!< Can perform workload, but needs to be monitored. - DCGM_ERROR_ISOLATE = 1, //!< Cannot perform workload. GPU should be isolated. - DCGM_ERROR_UNKNOWN = 2, //!< This error code is not recognized + DCGM_ERROR_NONE = 0, //!< 0 NONE + DCGM_ERROR_MONITOR = 1, //!< 1 Can perform workload, but needs to be monitored. + DCGM_ERROR_ISOLATE = 2, //!< 2 Cannot perform workload. GPU should be isolated. 
+ DCGM_ERROR_UNKNOWN = 3, //!< 3 This error code is not recognized + DCGM_ERROR_TRIAGE = 4, //!< 4 This error should be triaged + DCGM_ERROR_CONFIG = 5, //!< 5 This error can be configured + DCGM_ERROR_RESET = 6, //!< 6 Drain and reset GPU } dcgmErrorSeverity_t; +typedef enum dcgmErrorCategory_enum +{ + DCGM_FR_EC_NONE = 0, //!< 0 NONE + DCGM_FR_EC_PERF_THRESHOLD = 1, //!< 1 Performance Threshold + DCGM_FR_EC_PERF_VIOLATION = 2, //!< 2 Performance Violation + DCGM_FR_EC_SOFTWARE_CONFIG = 3, //!< 3 Software Configuration + DCGM_FR_EC_SOFTWARE_LIBRARY = 4, //!< 4 Software Library + DCGM_FR_EC_SOFTWARE_XID = 5, //!< 5 Software XID + DCGM_FR_EC_SOFTWARE_CUDA = 6, //!< 6 Software Cuda + DCGM_FR_EC_SOFTWARE_EUD = 7, //!< 7 Software EUD + DCGM_FR_EC_SOFTWARE_OTHER = 8, //!< 8 Software Other + DCGM_FR_EC_HARDWARE_THERMAL = 9, //!< 9 Hardware Thermal + DCGM_FR_EC_HARDWARE_MEMORY = 10, //!< 10 Hardware Memory + DCGM_FR_EC_HARDWARE_NVLINK = 11, //!< 11 Hardware NvLink + DCGM_FR_EC_HARDWARE_NVSWITCH = 12, //!< 12 Hardware NvSwitch + DCGM_FR_EC_HARDWARE_PCIE = 13, //!< 13 Hardware PCIe + DCGM_FR_EC_HARDWARE_POWER = 14, //!< 14 Hardware Power + DCGM_FR_EC_HARDWARE_OTHER = 15, //!< 15 Hardware Other + DCGM_FR_EC_INTERNAL_OTHER = 16, //!< 16 Internal Other +} dcgmErrorCategory_t; + typedef struct { dcgmError_t errorId; const char *msgFormat; const char *suggestion; int severity; + int category; } dcgm_error_meta_t; extern dcgm_error_meta_t dcgmErrorMeta[]; @@ -153,7 +192,9 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DEBUG_COOLING_MSG \ "Verify that the cooling on this machine is functional, including external, " \ "thermal material interface, fans, and any other components." -#define BUG_REPORT_MSG "Please capture an nvidia-bug-report and send it to NVIDIA." +#define BUG_REPORT_MSG "Please capture an nvidia-bug-report and send it to NVIDIA." +#define SYSTEM_TRIAGE_MSG "Check DCGM and system logs for errors. Reset GPU. Restart DCGM. Rerun diagnostics." 
+#define CONFIG_MSG "Check DCGM and system configuration. This error may be eliminated with an updated configuration." /* * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format @@ -190,8 +231,10 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ "threshold of %u" // gpu id, nvlink id -#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" -#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_MSG "Only %u NvLinks are up out of the expected %u" +#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" +// nvlinks up, expected nvlinks up +#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_MSG "Only %u NvLinks are up out of the expected %u" +// switch id, nvlinks up, expected nvlinks up #define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_MSG "NvSwitch %u - Only %u NvLinks are up out of the expected %u" // nvswitch id, nvlink id #define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" @@ -336,43 +379,70 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; "This API can only return up to four errors per system. " \ "Additional errors were found for this system that couldn't be " \ "communicated." +// error count, gpu id #define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG \ "%.1f %s NvLink errors found occuring per second on GPU %u, " \ "exceeding the limit of 100 per second." +// error count, field name, gpu id #define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" +// gpu id, power limit, power reached #define DCGM_FR_ENFORCED_POWER_LIMIT_MSG \ "Enforced power limit on GPU %u set to %.1f, which is too low to " \ "attempt to achieve target power %.1f" +// memory #define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" #define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." 
-#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." +// clock, count +#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." +// clock, count, clock #define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG \ "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." -#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" -#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." -#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." +// error detail +#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" +#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." +#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." +// gpu id #define DCGM_FR_ROW_REMAP_FAILURE_MSG "GPU %u had uncorrectable memory errors and row remapping failed." #define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)" #define DCGM_FR_EMPTY_GPU_LIST_MSG "No valid GPUs passed to plugin" #define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG "Pending page retirements together with a DBE were detected on GPU %u." -#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG "GPU %u had uncorrectable memory errors and %u rows were remapped" -#define DCGM_FR_PENDING_ROW_REMAP_MSG "GPU %u has uncorrectable memory errors and row remappings are pending" -#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to in a peer-to-peer test: %s" -#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s" -#define DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG "NVSwitch %u's NvLink %u is down." 
-#define DCGM_FR_EUD_BINARY_PERMISSIONS_MSG "" /* See message inplace */ -#define DCGM_FR_EUD_NON_ROOT_USER_MSG "" /* See message inplace */ -#define DCGM_FR_EUD_SPAWN_FAILURE_MSG "" /* See message inplace */ -#define DCGM_FR_EUD_TIMEOUT_MSG "" /* See message inplace */ -#define DCGM_FR_EUD_ZOMBIE_MSG "" /* See message inplace */ -#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_MSG "" /* See message inplace */ -#define DCGM_FR_EUD_TEST_FAILED_MSG "" /* See message inplace */ +// gpu id, rows remapped +#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG "GPU %u had uncorrectable memory errors and %u rows were remapped" +// gpu id +#define DCGM_FR_PENDING_ROW_REMAP_MSG "GPU %u had memory errors and row remappings are pending" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG "NVSwitch %u's NvLink %u is down." 
+#define DCGM_FR_EUD_BINARY_PERMISSIONS_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ROOT_USER_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_SPAWN_FAILURE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_TIMEOUT_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_ZOMBIE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_TEST_FAILED_MSG "" /* See message inplace */ #define DCGM_FR_FILE_CREATE_PERMISSIONS_MSG \ "The DCGM Diagnostic does not have permissions to create a file in directory '%s'" #define DCGM_FR_PAUSE_RESUME_FAILED_MSG "" /* See message inplace */ -#define DCGM_FR_PCIE_REPLAYS_MSG "GPU %u had PCIe replays, see dmesg for more information" -#define DCGM_FR_XID_ERROR_MSG "Detected XID %u for GPU %u" -#define DCGM_FR_ERROR_SENTINEL_MSG "" /* See message inplace */ +// gpu id +#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_MSG "GPU %u host-side PCIe replay violation, see dmesg for more information" +// xid error, gpu id +#define DCGM_FR_XID_ERROR_MSG "Detected XID %u for GPU %u" +// count, field, gpu id +#define DCGM_FR_SBE_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id +#define DCGM_FR_DBE_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id +#define DCGM_FR_PCIE_REPLAY_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id, threshold +#define DCGM_FR_SBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_DBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_MSG "" +#define DCGM_FR_SXID_ERROR_MSG "Detected fatal NvSwitch SXID %u" +#define DCGM_FR_ERROR_SENTINEL_MSG "" /* See message inplace */ /* * Suggestions for next steps for the corresponding error message @@ -391,13 
+461,13 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." #define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG -#define DCGM_FR_POWER_UNREADABLE_NEXT "" +#define DCGM_FR_POWER_UNREADABLE_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." #define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." -#define DCGM_FR_NVLINK_DOWN_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVLINK_DOWN_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." -#define DCGM_FR_NVSWITCH_DOWN_NEXT "" +#define DCGM_FR_NVSWITCH_DOWN_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." #define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_NEXT \ "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info." @@ -411,7 +481,7 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT \ "Check for the presence of cgroups, operating system blocks, and " \ "or unsupported / older cards" -#define DCGM_FR_BAD_PARAMETER_NEXT "" +#define DCGM_FR_BAD_PARAMETER_NEXT BUG_REPORT_MSG #define DCGM_FR_CANNOT_OPEN_LIB_NEXT \ "Check for the existence of the library and set LD_LIBRARY_PATH " \ "if needed." @@ -425,7 +495,7 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_HOSTENGINE_CONN_NEXT \ "If hostengine is run separately, please ensure that it is up " \ "and responsive." -#define DCGM_FR_FIELD_QUERY_NEXT "" +#define DCGM_FR_FIELD_QUERY_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." 
#define DCGM_FR_PERSISTENCE_MODE_NEXT \ "Enable persistence mode by running \"nvidia-smi -i -pm " \ @@ -439,53 +509,53 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; "the topology of each GPU. If so, and errors are consistent, " \ "please run a field diagnostic." #define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_NEXT "" -#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT "" -#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT "" -#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT "" +#define DCGM_FR_FIELD_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG #define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG #define DCGM_FR_TEMP_VIOLATION_NEXT \ "Verify that the user-specified temperature maximum is set " \ "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG -#define DCGM_FR_THROTTLING_VIOLATION_NEXT "" -#define DCGM_FR_INTERNAL_NEXT "" -#define DCGM_FR_PCIE_GENERATION_NEXT "" -#define DCGM_FR_PCIE_WIDTH_NEXT "" +#define DCGM_FR_THROTTLING_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_INTERNAL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_PCIE_GENERATION_NEXT CONFIG_MSG +#define DCGM_FR_PCIE_WIDTH_NEXT CONFIG_MSG #define DCGM_FR_ABORTED_NEXT "" -#define DCGM_FR_TEST_DISABLED_NEXT "" +#define DCGM_FR_TEST_DISABLED_NEXT CONFIG_MSG #define DCGM_FR_CANNOT_GET_STAT_NEXT \ "If running a standalone nv-hostengine, verify that it is up " \ "and responsive." 
-#define DCGM_FR_STRESS_LEVEL_NEXT "" -#define DCGM_FR_CUDA_API_NEXT "" +#define DCGM_FR_STRESS_LEVEL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_API_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_CANNOT_SET_WATCHES_NEXT "" -#define DCGM_FR_CUDA_UNBOUND_NEXT "" +#define DCGM_FR_CANNOT_SET_WATCHES_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_UNBOUND_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_ECC_DISABLED_NEXT \ "Enable ECC memory by running \"nvidia-smi -i -e 1\" " \ "to enable. This may require a GPU reset or reboot to take effect." -#define DCGM_FR_MEMORY_ALLOC_NEXT "" +#define DCGM_FR_MEMORY_ALLOC_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_CUDA_DEVICE_NEXT \ "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of " \ "this GPU. Also check if CUDA libraries are compatible and " \ "correctly installed." -#define DCGM_FR_ECC_UNSUPPORTED_NEXT "" +#define DCGM_FR_ECC_UNSUPPORTED_NEXT CONFIG_MSG #define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." -#define DCGM_FR_MEMORY_BANDWIDTH_NEXT "" +#define DCGM_FR_MEMORY_BANDWIDTH_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." -#define DCGM_FR_API_FAIL_NEXT "" -#define DCGM_FR_API_FAIL_GPU_NEXT "" +#define DCGM_FR_API_FAIL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_API_FAIL_GPU_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_CUDA_CONTEXT_NEXT \ "Please make sure the correct driver version is installed and " \ "verify that no conflicting libraries are present." 
-#define DCGM_FR_DCGM_API_NEXT "" -#define DCGM_FR_CONCURRENT_GPUS_NEXT "" +#define DCGM_FR_DCGM_API_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CONCURRENT_GPUS_NEXT CONFIG_MSG #define DCGM_FR_TOO_MANY_ERRORS_NEXT "" #define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG @@ -499,15 +569,15 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; "" #define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" #define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" -#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT "" -#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT "" +#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT CONFIG_MSG #define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG -#define DCGM_FR_ROW_REMAP_FAILURE_NEXT "Reset the GPU as soon as possible to restore operation." +#define DCGM_FR_ROW_REMAP_FAILURE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG #define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT #define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." -#define DCGM_FR_EMPTY_GPU_LIST_NEXT "" +#define DCGM_FR_EMPTY_GPU_LIST_NEXT CONFIG_MSG #define DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT "" -#define DCGM_FR_PENDING_ROW_REMAP_NEXT "" +#define DCGM_FR_PENDING_ROW_REMAP_NEXT SYSTEM_TRIAGE_MSG #define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT BUG_REPORT_MSG #define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT BUG_REPORT_MSG #define DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT \ @@ -523,21 +593,32 @@ extern dcgm_error_meta_t dcgmErrorMeta[]; #define DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT \ "Please restart the hostengine with parameter --home-dir to specify a different home directory for the " \ "diagnostic or change permissions in the current directory to allow the user to write files there." 
-#define DCGM_FR_PAUSE_RESUME_FAILED_NEXT "" /* See message inplace */ -#define DCGM_FR_PCIE_REPLAYS_NEXT "" /* See message inplace */ -#define DCGM_FR_XID_ERROR_NEXT "Please consult the documentation for details of this XID." -#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */ +#define DCGM_FR_PAUSE_RESUME_FAILED_NEXT "" /* See message inplace */ +#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_NEXT "" /* See message inplace */ +#define DCGM_FR_XID_ERROR_NEXT "Please consult the documentation for details of this XID." +#define DCGM_FR_SBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_DBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_PCIE_REPLAY_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_SBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_DBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_NEXT "Ensure that the FabricManager is running without errors." +#define DCGM_FR_SXID_ERROR_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */ #ifdef __cplusplus extern "C" { #endif DCGM_PUBLIC_API dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code); +DCGM_PUBLIC_API dcgmErrorCategory_t dcgmErrorGetCategoryByCode(unsigned int code); DCGM_PUBLIC_API const char *dcgmErrorGetFormatMsgByCode(unsigned int code); DCGM_PUBLIC_API const dcgm_error_meta_t *dcgmGetErrorMeta(dcgmError_t error); DCGM_PUBLIC_API const char *errorString(dcgmReturn_t result); +/** @} */ + #ifdef __cplusplus } // extern "C" #endif diff --git a/pkg/dcgm/dcgm_fields.h b/pkg/dcgm/dcgm_fields.h index b032007..19d1eae 100644 --- a/pkg/dcgm/dcgm_fields.h +++ b/pkg/dcgm/dcgm_fields.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -187,6 +187,8 @@ typedef enum dcgm_field_entity_group_t DCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */ DCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */ DCGM_FE_LINK, /*!< Field is associated with an NVLink */ + DCGM_FE_CPU, /*!< Field is associated with a CPU node */ + DCGM_FE_CPU_CORE, /*!< Field is associated with a CPU */ DCGM_FE_COUNT /*!< Number of elements in this enumeration. Keep this entry last */ } dcgm_field_entity_group_t; @@ -725,6 +727,24 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_FB_USED_PERCENT 254 +/** + * C2C Link Count + */ +#define DCGM_FI_DEV_C2C_LINK_COUNT 285 + +/** + * C2C Link Status + * The value of 0 the link is INACTIVE. + * The value of 1 the link is ACTIVE. + */ +#define DCGM_FI_DEV_C2C_LINK_STATUS 286 + +/** + * C2C Max Bandwidth + * The value indicates the link speed in MB/s. + */ +#define DCGM_FI_DEV_C2C_MAX_BANDWIDTH 287 + /** * Current ECC mode for the device */ @@ -1502,6 +1522,21 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD 704 +/** + * NvSwitch Power VDD in watts + */ +#define DCGM_FI_DEV_NVSWITCH_POWER_VDD 705 + +/** + * NvSwitch Power DVDD in watts + */ +#define DCGM_FI_DEV_NVSWITCH_POWER_DVDD 706 + +/** + * NvSwitch Power HVDD in watts + */ +#define DCGM_FI_DEV_NVSWITCH_POWER_HVDD 707 + /** *

NVSwitch Tx Throughput Counter for ports 0-17

*/ @@ -1995,10 +2030,75 @@ typedef unsigned int dcgm_field_eid_t; */ #define DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST DCGM_FI_PROF_NVLINK_L17_RX_BYTES +/** + * CPU Utilization, total + */ +#define DCGM_FI_DEV_CPU_UTIL_TOTAL 1100 + +/** + * CPU Utilization, user + */ +#define DCGM_FI_DEV_CPU_UTIL_USER 1101 + +/** + * CPU Utilization, nice + */ +#define DCGM_FI_DEV_CPU_UTIL_NICE 1102 + +/** + * CPU Utilization, system time + */ +#define DCGM_FI_DEV_CPU_UTIL_SYS 1103 + +/** + * CPU Utilization, interrupt servicing + */ +#define DCGM_FI_DEV_CPU_UTIL_IRQ 1104 + +/** + * CPU temperature + */ +#define DCGM_FI_DEV_CPU_TEMP_CURRENT 1110 + +/** + * CPU Warning Temperature + */ +#define DCGM_FI_DEV_CPU_TEMP_WARNING 1111 + +/** + * CPU Critical Temperature + */ +#define DCGM_FI_DEV_CPU_TEMP_CRITICAL 1112 + +/** + * CPU instantaneous clock speed + */ +#define DCGM_FI_DEV_CPU_CLOCK_CURRENT 1120 + +/** + * CPU power utilization + */ +#define DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT 1130 + +/** + * CPU power limit + */ +#define DCGM_FI_DEV_CPU_POWER_LIMIT 1131 + +/** + * CPU vendor name + */ +#define DCGM_FI_DEV_CPU_VENDOR 1140 + +/** + * CPU model name + */ +#define DCGM_FI_DEV_CPU_MODEL 1141 + /** * 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated */ -#define DCGM_FI_MAX_FIELDS 1076 +#define DCGM_FI_MAX_FIELDS 1142 /** @} */ diff --git a/pkg/dcgm/dcgm_structs.h b/pkg/dcgm/dcgm_structs.h index 4e88cdc..74bee6d 100644 --- a/pkg/dcgm/dcgm_structs.h +++ b/pkg/dcgm/dcgm_structs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #define DCGM_STRUCTS_H #include "dcgm_fields.h" +#include #include @@ -202,6 +203,16 @@ */ #define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32 +/** + * Max number of CPU nodes + **/ +#define DCGM_MAX_NUM_CPUS 8 + +/** + * Max number of CPUs + **/ +#define DCGM_MAX_NUM_CPU_CORES 1024 + /** * Max length of the DCGM string field */ @@ -350,6 +361,7 @@ typedef enum dcgmReturn_enum DCGM_ST_NVVS_BINARY_NOT_FOUND = -52, //!< The NVVS binary was not found in the specified location DCGM_ST_NVVS_KILLED = -53, //!< The NVVS process was killed by a signal DCGM_ST_PAUSED = -54, //!< The hostengine and all modules are paused + DCGM_ST_ALREADY_INITIALIZED = -55, //!< The object is already initialized } dcgmReturn_t; const char *errorString(dcgmReturn_t result); @@ -673,6 +685,41 @@ typedef struct #define dcgmMigHierarchy_version dcgmMigHierarchy_version2 +/** + * Bitmask indicating which cores are owned by this CPUs + */ +#define DCGM_CPU_CORE_BITMASK_COUNT_V1 (DCGM_MAX_NUM_CPU_CORES / sizeof(uint64_t) / CHAR_BIT) +typedef struct +{ + unsigned int version; + uint64_t bitmask[DCGM_CPU_CORE_BITMASK_COUNT_V1]; +} dcgmCpuHierarchyOwnedCores_v1; + +typedef dcgmCpuHierarchyOwnedCores_v1 dcgmCpuHierarchyOwnedCores_t; + +#define dcgmCpuHierarchyOwnedCores_version1 MAKE_DCGM_VERSION(dcgmCpuHierarchyOwnedCores_v1, 1) + +/** + * Hierarchy of CPUs and their cores + */ +typedef struct +{ + unsigned int version; + unsigned int numCpus; + struct dcgmCpuHierarchyCpu_v1 + { + unsigned int cpuId; + dcgmCpuHierarchyOwnedCores_v1 ownedCores; + } cpus[DCGM_MAX_NUM_CPUS]; +} dcgmCpuHierarchy_v1; + +typedef dcgmCpuHierarchy_v1 dcgmCpuHierarchy_t; + +/** + * Version 1 for dcgmCpuHierarchy_t + */ +#define dcgmCpuHierarchy_version1 MAKE_DCGM_VERSION(dcgmCpuHierarchy_v1, 1) + /** * Maximum number of field groups that can exist */ @@ -2036,6 +2083,21 @@ typedef struct unsigned int code; } dcgmDiagErrorDetail_t; +#define DCGM_ERR_MSG_LENGTH 512 +/** + * Error details + * + * Since DCGM 3.3 + */ 
+typedef struct +{ + char msg[DCGM_ERR_MSG_LENGTH]; + int gpuId; + unsigned int code; + unsigned int category; //!< See dcgmErrorCategory_t + unsigned int severity; //!< See dcgmErrorSeverity_t +} dcgmDiagErrorDetail_v2; + #define DCGM_HEALTH_WATCH_MAX_INCIDENTS DCGM_GROUP_MAX_ENTITIES typedef struct @@ -2348,6 +2410,13 @@ typedef struct char info[1024]; //!< Information details returned from the test, if any } dcgmDiagTestResult_v2; +#define DCGM_MAX_ERRORS 5 +typedef struct +{ + dcgmDiagResult_t status; //!< The result of the test + dcgmDiagErrorDetail_v2 error[DCGM_MAX_ERRORS]; //!< The error message and error code, if any + char info[DCGM_ERR_MSG_LENGTH]; //!< Information details returned from the test, if any +} dcgmDiagTestResult_v3; /** * Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[] @@ -2394,6 +2463,13 @@ typedef struct dcgmDiagTestResult_v2 results[DCGM_PER_GPU_TEST_COUNT_V8]; //!< Array with a result for each per-gpu test } dcgmDiagResponsePerGpu_v4; +typedef struct +{ + unsigned int gpuId; //!< ID for the GPU this information pertains + unsigned int hwDiagnosticReturn; //!< Per GPU hardware diagnostic test return code + dcgmDiagTestResult_v3 results[DCGM_PER_GPU_TEST_COUNT_V8]; //!< Array with a result for each per-gpu test +} dcgmDiagResponsePerGpu_v5; + /** * Per gpu response structure v3 * @@ -2427,6 +2503,27 @@ typedef enum dcgmSoftwareTest_enum #define DCGM_DEVICE_ID_LEN 5 #define DCGM_VERSION_LEN 12 +/** + * Global diagnostics result structure v9 + * + * Since DCGM 3.3 + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmDiagResult_version) + unsigned int gpuCount; //!< number of valid per GPU results + unsigned int levelOneTestCount; //!< number of valid levelOne results + + dcgmDiagTestResult_v3 levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. 
+ dcgmDiagResponsePerGpu_v5 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results + dcgmDiagErrorDetail_v2 systemError; //!< System-wide error reported from NVVS + char devIds[DCGM_MAX_NUM_DEVICES][DCGM_DEVICE_ID_LEN]; //!< The SKU device id for each GPU + char devSerials[DCGM_MAX_NUM_DEVICES][DCGM_MAX_STR_LENGTH]; //!< Serial for the device + char dcgmVersion[DCGM_VERSION_LEN]; //!< A string representing DCGM's version + char driverVersion[DCGM_MAX_STR_LENGTH]; //!< A string representing the driver version + char _unused[596]; //!< No longer used +} dcgmDiagResponse_v9; + /** * Global diagnostics result structure v8 * @@ -2465,9 +2562,14 @@ typedef struct } dcgmDiagResponse_v7; /** - * Typedef for \ref dcgmDiagResponse_v8 + * Typedef for \ref dcgmDiagResponse_v9 + */ +typedef dcgmDiagResponse_v9 dcgmDiagResponse_t; + +/** + * Version 9 for \ref dcgmDiagResponse_v9 */ -typedef dcgmDiagResponse_v8 dcgmDiagResponse_t; +#define dcgmDiagResponse_version9 MAKE_DCGM_VERSION(dcgmDiagResponse_v9, 9) /** * Version 8 for \ref dcgmDiagResponse_v8 @@ -2482,7 +2584,7 @@ typedef dcgmDiagResponse_v8 dcgmDiagResponse_t; /** * Latest version for \ref dcgmDiagResponse_t */ -#define dcgmDiagResponse_version dcgmDiagResponse_version8 +#define dcgmDiagResponse_version dcgmDiagResponse_version9 /** * Represents level relationships within a system between two GPUs @@ -2906,6 +3008,7 @@ typedef enum DcgmModuleIdConfig = 6, //!< Config Module DcgmModuleIdDiag = 7, //!< GPU Diagnostic Module DcgmModuleIdProfiling = 8, //!< Profiling Module + DcgmModuleIdSysmon = 9, //!< System Monitoring Module DcgmModuleIdCount //!< Always last. 1 greater than largest value above } dcgmModuleId_t; @@ -2959,14 +3062,13 @@ typedef dcgmModuleGetStatuses_v1 dcgmModuleGetStatuses_t; */ typedef struct { - unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version1 */ - dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. 
*/ - dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ - const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ - DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ - unsigned int denyListCount; /*!< IN: Number of modules in denyList[] */ - dcgmModuleId_t denyList[DcgmModuleIdCount]; /* IN: IDs of modules to add to the denylist */ - unsigned int unused; /*!< IN: Unused. Set to 0. Aligns structure to 8-bytes */ + unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version1 */ + dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. */ + dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ + const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ + DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ + unsigned int denyListCount; /*!< IN: Number of modules in denyList[] */ + unsigned int denyList[DcgmModuleIdCount]; /* IN: IDs of modules to add to the denylist */ } dcgmStartEmbeddedV2Params_v1; /** @@ -2981,15 +3083,14 @@ typedef struct */ typedef struct { - unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version2 */ - dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. */ - dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ - const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. 
'-' = stdout */ - DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ - unsigned int denyListCount; /*!< IN: Number of modules to be added to the denylist in denyList[] */ - const char *serviceAccount; /*!< IN: Service account for unprivileged processes */ - dcgmModuleId_t denyList[DcgmModuleIdCount]; /*!< IN: IDs of modules to be added to the denylist */ - char _padding[4]; /*!< IN: Unused. Aligns the struct to 8 bytes. */ + unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version2 */ + dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. */ + dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ + const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ + DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ + unsigned int denyListCount; /*!< IN: Number of modules to be added to the denylist in denyList[] */ + const char *serviceAccount; /*!< IN: Service account for unprivileged processes */ + unsigned int denyList[DcgmModuleIdCount]; /*!< IN: IDs of modules to be added to the denylist */ } dcgmStartEmbeddedV2Params_v2; /** diff --git a/pkg/dcgm/diag.go b/pkg/dcgm/diag.go index 2d53fb0..8212842 100644 --- a/pkg/dcgm/diag.go +++ b/pkg/dcgm/diag.go @@ -113,15 +113,15 @@ func gpuTestName(t int) string { return "" } -func newDiagResult(testResult C.dcgmDiagTestResult_v2, testName string) DiagResult { - msg := C.GoString((*C.char)(unsafe.Pointer(&testResult.error.msg))) +func newDiagResult(testResult C.dcgmDiagTestResult_v3, testName string) DiagResult { + msg := C.GoString((*C.char)(unsafe.Pointer(&testResult.error[0].msg))) info := C.GoString((*C.char)(unsafe.Pointer(&testResult.info))) return DiagResult{ Status: diagResultString(int(testResult.status)), TestName: testName, TestOutput: info, - ErrorCode: uint(testResult.error.code), + ErrorCode: 
uint(testResult.error[0].code), ErrorMessage: msg, } } @@ -141,10 +141,10 @@ func diagLevel(diagType DiagType) C.dcgmDiagnosticLevel_t { } func RunDiag(diagType DiagType, groupId GroupHandle) (DiagResults, error) { - var diagResults C.dcgmDiagResponse_v8 - diagResults.version = makeVersion8(unsafe.Sizeof(diagResults)) + var diagResults C.dcgmDiagResponse_v9 + diagResults.version = makeVersion9(unsafe.Sizeof(diagResults)) - result := C.dcgmRunDiagnostic(handle.handle, groupId.handle, diagLevel(diagType), (*C.dcgmDiagResponse_v8)(unsafe.Pointer(&diagResults))) + result := C.dcgmRunDiagnostic(handle.handle, groupId.handle, diagLevel(diagType), (*C.dcgmDiagResponse_v9)(unsafe.Pointer(&diagResults))) if err := errorString(result); err != nil { return DiagResults{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} } diff --git a/pkg/dcgm/policy.go b/pkg/dcgm/policy.go index 10d2680..692f480 100644 --- a/pkg/dcgm/policy.go +++ b/pkg/dcgm/policy.go @@ -160,6 +160,7 @@ func makePolicyParmsMap() { } // ViolationRegistration is a go callback function for dcgmPolicyRegister() wrapped in C.violationNotify() +// //export ViolationRegistration func ViolationRegistration(data unsafe.Pointer) int { var con policyCondition diff --git a/pkg/dcgm/utils.go b/pkg/dcgm/utils.go index fab9b4a..dd7170b 100644 --- a/pkg/dcgm/utils.go +++ b/pkg/dcgm/utils.go @@ -161,6 +161,11 @@ func makeVersion8(struct_type uintptr) C.uint { return version } +func makeVersion9(struct_type uintptr) C.uint { + version := C.uint(struct_type | 9<<24) + return version +} + func roundFloat(f *float64) *float64 { var val float64 if f != nil { diff --git a/samples/processInfo/main.go b/samples/processInfo/main.go index f038165..1a0dfd6 100644 --- a/samples/processInfo/main.go +++ b/samples/processInfo/main.go @@ -47,11 +47,11 @@ Avg Memory Utilization (%) : {{or .ProcessUtilization.MemUtil "N/A"}} var process = flag.Uint("pid", 0, "Provide pid to get this process information.") // NOTE: The 
"WatchPidFields()" function must be initially called (as root) BEFORE starting the process to be monitored: -// 1. Run as root, for enabling health watches -// sudo dcgmi stats -e -// 2. Start process to be monitored -// 3. Run processInfo. This is equivalent to "dcgmi stats --pid ENTERPID -v" -// go build && ./processInfo -pid PID +// 1. Run as root, for enabling health watches +// sudo dcgmi stats -e +// 2. Start process to be monitored +// 3. Run processInfo. This is equivalent to "dcgmi stats --pid ENTERPID -v" +// go build && ./processInfo -pid PID func main() { cleanup, err := dcgm.Init(dcgm.Embedded) if err != nil {