From e9b8637497dd7d53aeb72627f838416ee0c74e86 Mon Sep 17 00:00:00 2001
From: Yi Yao
Date: Fri, 1 Nov 2024 11:33:09 +0800
Subject: [PATCH] Add setup guide of gaudi prometheus exporter (#186)

* Add setup guide of gaudi prometheus exporter
---
 evals/benchmark/grafana/README.md             | 33 ++++++-
 ...rter.yaml => prometheus_cpu_exporter.yaml} |  0
 .../grafana/prometheus_gaudi_exporter.yaml    | 90 +++++++++++++++++++
 3 files changed, 121 insertions(+), 2 deletions(-)
 rename evals/benchmark/grafana/{grafana_node_exporter.yaml => prometheus_cpu_exporter.yaml} (100%)
 create mode 100644 evals/benchmark/grafana/prometheus_gaudi_exporter.yaml

diff --git a/evals/benchmark/grafana/README.md b/evals/benchmark/grafana/README.md
index e4d5af19..6edb3645 100644
--- a/evals/benchmark/grafana/README.md
+++ b/evals/benchmark/grafana/README.md
@@ -38,14 +38,16 @@ You should now access `localhost:9090/targets?search=` to open the Prometheus UI
 
 ### 1.1 CPU Metrics (optional)
 
-The Prometheus Node Exporter is required for collecting CPU metrics. Install and run the Node Exporter via tarball by the [guide](https://prometheus.io/docs/guides/node-exporter/#installing-and-running-the-node-exporter).
+The Prometheus Node Exporter is required for collecting CPU metrics. Deploy the Node Exporter via tarball by the [guide](https://prometheus.io/docs/guides/node-exporter/#installing-and-running-the-node-exporter).
 
 Or install it in a K8S cluster by the following commands:
 
+Ensure namespace `monitoring` was created in your K8S environment.
+
 ```bash
 git clone https://github.com/opea-project/GenAIEval.git
 cd GenAIEval/evals/benchmark/grafana/
-kubectl apply -f grafana_node_exporter.yaml
+kubectl apply -f prometheus_cpu_exporter.yaml
 ```
 
 Add the following configuration to `prometheus.yml`:
@@ -58,6 +60,33 @@ scrape_configs:
       - targets: [":9100", ":9100", ...]
 ```
 
+### 1.2 Intel® Gaudi® Metrics (optional)
+
+The Intel Gaudi Prometheus Metrics Exporter is required for collecting Intel® Gaudi® AI accelerator metrics.
+
+Follow the [guide](https://docs.habana.ai/en/latest/Orchestration/Prometheus_Metric_Exporter.html#deploying-prometheus-metric-exporter-in-docker) to deploy the metrics exporter in Docker.
+
+Or install it in a K8S cluster by the following commands:
+
+Ensure namespace `monitoring` was created in your K8S environment.
+
+```bash
+git clone https://github.com/opea-project/GenAIEval.git
+cd GenAIEval/evals/benchmark/grafana/
+kubectl apply -f prometheus_gaudi_exporter.yaml
+```
+
+Add the following configuration to `prometheus.yml`:
+
+```yaml
+scrape_configs:
+  - job_name: "prometheus-gaudi-exporter"
+    scrape_interval: 15s
+    metrics_path: /metrics
+    static_configs:
+      - targets: [":41611", ":41611", ...]
+```
+
 Restart Prometheus after saving the changes.
 
 ## 2. Setup Grafana
diff --git a/evals/benchmark/grafana/grafana_node_exporter.yaml b/evals/benchmark/grafana/prometheus_cpu_exporter.yaml
similarity index 100%
rename from evals/benchmark/grafana/grafana_node_exporter.yaml
rename to evals/benchmark/grafana/prometheus_cpu_exporter.yaml
diff --git a/evals/benchmark/grafana/prometheus_gaudi_exporter.yaml b/evals/benchmark/grafana/prometheus_gaudi_exporter.yaml
new file mode 100644
index 00000000..a0568084
--- /dev/null
+++ b/evals/benchmark/grafana/prometheus_gaudi_exporter.yaml
@@ -0,0 +1,90 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: DaemonSet  # schedules one exporter pod on every eligible node
+metadata:
+  labels:
+    app.kubernetes.io/name: metric-exporter-ds
+    app.kubernetes.io/version: v0.0.1
+  name: metric-exporter-ds
+  namespace: monitoring  # must already exist; this manifest does not create it
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: metric-exporter-ds
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: metric-exporter-ds  # also the Service selector below
+        app.kubernetes.io/version: v0.0.1
+    spec:
+      priorityClassName: "system-node-critical"
+      imagePullSecrets: []
+      tolerations:
+        - key: "habana.ai/gaudi"  # tolerate NoSchedule taints carrying this key
+          operator: "Exists"
+          effect: "NoSchedule"
+      # Required for network monitoring
+      hostNetwork: true
+      containers:
+        - name: metric-exporter
+          image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.18.0-524
+          imagePullPolicy: Always
+          env:
+            - name: LD_LIBRARY_PATH
+              value: "/usr/lib/habanalabs"
+          securityContext:
+            privileged: true  # NOTE(review): presumably needed for Gaudi device access - confirm
+          volumeMounts:
+            - name: pod-resources  # hostPath volume declared under "volumes" below
+              mountPath: /var/lib/kubelet/pod-resources
+          ports:
+            - name: habana-metrics
+              containerPort: 41611  # same port the Service and the README scrape config use
+              protocol: TCP
+          resources:
+            limits:
+              cpu: 150m
+              memory: 120Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+      volumes:
+        - name: pod-resources
+          hostPath:
+            path: /var/lib/kubelet/pod-resources
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/name: metric-exporter
+    app.kubernetes.io/version: v0.0.1
+  name: metric-exporter
+  namespace: monitoring
+spec:
+  clusterIP: None  # headless Service (no virtual IP is allocated)
+  ports:
+    - name: habana-metrics
+      port: 41611
+  selector:
+    app.kubernetes.io/name: metric-exporter-ds  # pod label set by the DaemonSet template
+# Disabled: ServiceMonitor for Prometheus Operator users (the README uses static_configs instead).
+# ---
+# apiVersion: monitoring.coreos.com/v1
+# kind: ServiceMonitor
+# metadata:
+#   labels:
+#     app.kubernetes.io/name: metric-exporter
+#     app.kubernetes.io/version: v0.0.1
+#   name: metric-exporter
+#   namespace: monitoring
+# spec:
+#   selector:
+#     matchLabels:
+#       app.kubernetes.io/name: metric-exporter
+#   endpoints:
+#     - port: habana-metrics
+#       interval: 30s