From 2879da8af1ddbc8297c541cb367073b2e738bff3 Mon Sep 17 00:00:00 2001 From: Adam DeConinck Date: Fri, 14 May 2021 17:13:06 +0000 Subject: [PATCH] Add support for custom configuration to k8s monitoring stack - Add a custom metrics file to cluster configuration - Update the deploy script to create a configMap based on the cluster configuration - Update dcgm-exporter definition to mount the configMap as a volume and use that configuration - Update dcgm-exporter definition to add sys_admin capability so we can use profiling metrics --- .../files/k8s-cluster/dcgm-custom-metrics.csv | 79 +++++++++++++++++++ scripts/k8s/deploy_monitoring.sh | 6 ++ workloads/services/k8s/dcgm-exporter.yml | 9 +++ 3 files changed, 94 insertions(+) create mode 100644 config.example/files/k8s-cluster/dcgm-custom-metrics.csv diff --git a/config.example/files/k8s-cluster/dcgm-custom-metrics.csv b/config.example/files/k8s-cluster/dcgm-custom-metrics.csv new file mode 100644 index 000000000..66875527f --- /dev/null +++ b/config.example/files/k8s-cluster/dcgm-custom-metrics.csv @@ -0,0 +1,79 @@ +# Format,, +# If line starts with a '#' it is considered a comment,, +# DCGM FIELD, Prometheus metric type, help message + +# Clocks,, +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + +# Temperature,, +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power,, +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + +# PCIE,, +DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. +DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. +DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. 
+ +# Utilization (the sample period varies depending on the product),, +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %). + +# Errors and violations,, +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + +# Memory usage,, +DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). +DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + +# ECC,, +# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages,, +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink,, +# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
+# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + +# VGPU License status,, +# DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows,, +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# DCP metrics,, +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). +DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). +DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). +DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). +DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). +DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). +DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. +DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. 
+ diff --git a/scripts/k8s/deploy_monitoring.sh b/scripts/k8s/deploy_monitoring.sh index d32a93ef7..9b359c253 100755 --- a/scripts/k8s/deploy_monitoring.sh +++ b/scripts/k8s/deploy_monitoring.sh @@ -26,6 +26,7 @@ ingress_name="ingress-nginx" PROMETHEUS_YAML_CONFIG="${PROMETHEUS_YAML_CONFIG:-${DEEPOPS_CONFIG_DIR}/helm/monitoring.yml}" PROMETHEUS_YAML_NO_PERSIST_CONFIG="${PROMETHEUS_YAML_NO_PERSIST_CONFIG:-${DEEPOPS_CONFIG_DIR}/helm/monitoring-no-persist.yml}" +DCGM_CONFIG_CSV="${DCGM_CONFIG_CSV:-${DEEPOPS_CONFIG_DIR}/files/k8s-cluster/dcgm-custom-metrics.csv}" function help_me() { echo "This script installs the DCGM exporter, Prometheus, Grafana, and configures a GPU Grafana dashboard." @@ -151,6 +152,11 @@ function setup_gpu_monitoring() { kubectl -n monitoring label configmap kube-prometheus-grafana-gpu grafana_dashboard=1 fi + # Create DCGM metrics config map; the key is pinned to dcgm-custom-metrics.csv (the file name dcgm-exporter.yml passes to -f) so a DCGM_CONFIG_CSV override with a different file name still works + if ! kubectl -n monitoring get configmap dcgm-custom-metrics >/dev/null 2>&1 ; then + kubectl create configmap dcgm-custom-metrics --from-file=dcgm-custom-metrics.csv="${DCGM_CONFIG_CSV}" -n monitoring + fi + # Label GPU nodes for node in $(kubectl get node --no-headers -o custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\\.com\\/gpu | grep -v none | awk '{print $1}') ; do kubectl label nodes ${node} hardware-type=NVIDIAGPU --overwrite >/dev/null diff --git a/workloads/services/k8s/dcgm-exporter.yml b/workloads/services/k8s/dcgm-exporter.yml index a6152b467..401fe7fbc 100644 --- a/workloads/services/k8s/dcgm-exporter.yml +++ b/workloads/services/k8s/dcgm-exporter.yml @@ -25,6 +25,7 @@ spec: containers: - image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.1.8-2.4.0-rc.2-ubuntu20.04" name: nvidia-dcgm-exporter + command: ["/usr/bin/dcgm-exporter", "-f", "/etc/dcgm-config/dcgm-custom-metrics.csv"] env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -33,14 +34,22 @@ spec: securityContext: runAsNonRoot: false runAsUser: 0 + capabilities: + add: ["SYS_ADMIN"] volumeMounts: - name: "pod-gpu-resources" readOnly: true mountPath:
"/var/lib/kubelet/pod-resources" + - name: "dcgm-config" + readOnly: true + mountPath: "/etc/dcgm-config" volumes: - name: "pod-gpu-resources" hostPath: path: "/var/lib/kubelet/pod-resources" + - name: "dcgm-config" + configMap: + name: "dcgm-custom-metrics" tolerations: - effect: NoSchedule operator: Exists