From 2879da8af1ddbc8297c541cb367073b2e738bff3 Mon Sep 17 00:00:00 2001
From: Adam DeConinck <adeconinck@nvidia.com>
Date: Fri, 14 May 2021 17:13:06 +0000
Subject: [PATCH] Add support for custom configuration to k8s monitoring stack

- Add a custom metrics file to cluster configuration
- Update the deploy script to create a configMap based on the cluster configuration
- Update dcgm-exporter definition to mount the configMap as a volume and use that configuration
- Update dcgm-exporter definition to add the SYS_ADMIN capability so we can use profiling metrics
---
 .../files/k8s-cluster/dcgm-custom-metrics.csv | 79 +++++++++++++++++++
 scripts/k8s/deploy_monitoring.sh              |  6 ++
 workloads/services/k8s/dcgm-exporter.yml      |  9 +++
 3 files changed, 94 insertions(+)
 create mode 100644 config.example/files/k8s-cluster/dcgm-custom-metrics.csv

diff --git a/config.example/files/k8s-cluster/dcgm-custom-metrics.csv b/config.example/files/k8s-cluster/dcgm-custom-metrics.csv
new file mode 100644
index 000000000..66875527f
--- /dev/null
+++ b/config.example/files/k8s-cluster/dcgm-custom-metrics.csv
@@ -0,0 +1,79 @@
+# Format,,
+# If a line starts with a '#' it is considered a comment,,
+# DCGM FIELD, Prometheus metric type, help message
+
+# Clocks,,
+DCGM_FI_DEV_SM_CLOCK,  gauge, SM clock frequency (in MHz).
+DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
+
+# Temperature,,
+DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
+DCGM_FI_DEV_GPU_TEMP,    gauge, GPU temperature (in C).
+
+# Power,,
+DCGM_FI_DEV_POWER_USAGE,              gauge, Power draw (in W).
+DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
+
+# PCIE,,
+DCGM_FI_DEV_PCIE_TX_THROUGHPUT,  counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
+DCGM_FI_DEV_PCIE_RX_THROUGHPUT,  counter, Total number of bytes received through PCIe RX (in KB) via NVML.
+DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
+
+# Utilization (the sample period varies depending on the product),,
+DCGM_FI_DEV_GPU_UTIL,      gauge, GPU utilization (in %).
+DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
+DCGM_FI_DEV_ENC_UTIL,      gauge, Encoder utilization (in %).
+DCGM_FI_DEV_DEC_UTIL,      gauge, Decoder utilization (in %).
+
+# Errors and violations,,
+DCGM_FI_DEV_XID_ERRORS,            gauge,   Value of the last XID error encountered.
+# DCGM_FI_DEV_POWER_VIOLATION,       counter, Throttling duration due to power constraints (in us).
+# DCGM_FI_DEV_THERMAL_VIOLATION,     counter, Throttling duration due to thermal constraints (in us).
+# DCGM_FI_DEV_SYNC_BOOST_VIOLATION,  counter, Throttling duration due to sync-boost constraints (in us).
+# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
+# DCGM_FI_DEV_LOW_UTIL_VIOLATION,    counter, Throttling duration due to low utilization (in us).
+# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
+
+# Memory usage,,
+DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
+DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
+
+# ECC,,
+# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
+# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
+# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
+# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
+
+# Retired pages,,
+# DCGM_FI_DEV_RETIRED_SBE,     counter, Total number of retired pages due to single-bit errors.
+# DCGM_FI_DEV_RETIRED_DBE,     counter, Total number of retired pages due to double-bit errors.
+# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
+
+# NVLink,,
+# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
+# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
+# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,   counter, Total number of NVLink retries.
+# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
+DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,            counter, Total number of NVLink bandwidth counters for all lanes.
+# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0,               counter, The number of bytes of active NVLink rx or tx data including both header and payload.
+
+# VGPU License status,,
+# DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
+
+# Remapped rows,,
+DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
+DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS,   counter, Number of remapped rows for correctable errors
+DCGM_FI_DEV_ROW_REMAP_FAILURE,           gauge,   Whether remapping of rows has failed
+
+# DCP metrics,,
+DCGM_FI_PROF_GR_ENGINE_ACTIVE,   gauge, Ratio of time the graphics engine is active (in %).
+DCGM_FI_PROF_SM_ACTIVE,          gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
+DCGM_FI_PROF_SM_OCCUPANCY,       gauge, The ratio of number of warps resident on an SM (in %).
+DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
+DCGM_FI_PROF_DRAM_ACTIVE,        gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
+DCGM_FI_PROF_PIPE_FP64_ACTIVE,   gauge, Ratio of cycles the fp64 pipes are active (in %).
+DCGM_FI_PROF_PIPE_FP32_ACTIVE,   gauge, Ratio of cycles the fp32 pipes are active (in %).
+DCGM_FI_PROF_PIPE_FP16_ACTIVE,   gauge, Ratio of cycles the fp16 pipes are active (in %).
+DCGM_FI_PROF_PCIE_TX_BYTES,      counter, The number of bytes of active pcie tx data including both header and payload.
+DCGM_FI_PROF_PCIE_RX_BYTES,      counter, The number of bytes of active pcie rx data including both header and payload.
+
diff --git a/scripts/k8s/deploy_monitoring.sh b/scripts/k8s/deploy_monitoring.sh
index d32a93ef7..9b359c253 100755
--- a/scripts/k8s/deploy_monitoring.sh
+++ b/scripts/k8s/deploy_monitoring.sh
@@ -26,6 +26,7 @@ ingress_name="ingress-nginx"
 
 PROMETHEUS_YAML_CONFIG="${PROMETHEUS_YAML_CONFIG:-${DEEPOPS_CONFIG_DIR}/helm/monitoring.yml}"
 PROMETHEUS_YAML_NO_PERSIST_CONFIG="${PROMETHEUS_YAML_NO_PERSIST_CONFIG:-${DEEPOPS_CONFIG_DIR}/helm/monitoring-no-persist.yml}"
+DCGM_CONFIG_CSV="${DCGM_CONFIG_CSV:-${DEEPOPS_CONFIG_DIR}/files/k8s-cluster/dcgm-custom-metrics.csv}"  # DCGM metrics CSV used to create the dcgm-custom-metrics configMap; can be overridden via the environment
 
 function help_me() {
     echo "This script installs the DCGM exporter, Prometheus, Grafana, and configures a GPU Grafana dashboard."
@@ -151,6 +152,11 @@ function setup_gpu_monitoring() {
         kubectl -n monitoring label configmap kube-prometheus-grafana-gpu grafana_dashboard=1
     fi
 
+    # Create DCGM metrics config map (only if absent); the key is pinned to dcgm-custom-metrics.csv, the file name dcgm-exporter.yml reads, even when DCGM_CONFIG_CSV points at a differently named file
+    if ! kubectl -n monitoring get configmap dcgm-custom-metrics >/dev/null 2>&1 ; then
+        kubectl create configmap dcgm-custom-metrics --from-file=dcgm-custom-metrics.csv="${DCGM_CONFIG_CSV}" -n monitoring
+    fi
+
     # Label GPU nodes
     for node in $(kubectl get node --no-headers -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\\.com\\/gpu | grep -v none | awk '{print $1}') ; do
         kubectl label nodes ${node} hardware-type=NVIDIAGPU --overwrite >/dev/null
diff --git a/workloads/services/k8s/dcgm-exporter.yml b/workloads/services/k8s/dcgm-exporter.yml
index a6152b467..401fe7fbc 100644
--- a/workloads/services/k8s/dcgm-exporter.yml
+++ b/workloads/services/k8s/dcgm-exporter.yml
@@ -25,6 +25,7 @@ spec:
       containers:
       - image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.1.8-2.4.0-rc.2-ubuntu20.04"
         name: nvidia-dcgm-exporter
+        command: ["/usr/bin/dcgm-exporter", "-f", "/etc/dcgm-config/dcgm-custom-metrics.csv"]  # use the metrics CSV from the dcgm-custom-metrics configMap mounted at /etc/dcgm-config
         env:
         - name: "DCGM_EXPORTER_LISTEN"
           value: ":9400"
@@ -33,14 +34,22 @@ spec:
         securityContext:
           runAsNonRoot: false
           runAsUser: 0
+          capabilities:
+            add: ["SYS_ADMIN"]  # added so dcgm-exporter can use the DCGM profiling metrics
         volumeMounts:
         - name: "pod-gpu-resources"
           readOnly: true
           mountPath: "/var/lib/kubelet/pod-resources"
+        - name: "dcgm-config"
+          readOnly: true
+          mountPath: "/etc/dcgm-config"
       volumes:
       - name: "pod-gpu-resources"
         hostPath:
           path: "/var/lib/kubelet/pod-resources"
+      - name: "dcgm-config"
+        configMap:
+          name: "dcgm-custom-metrics"
       tolerations:
         - effect: NoSchedule
           operator: Exists