From 38e69eb0885e1fadc1d6dd3aae713165ccc80b9d Mon Sep 17 00:00:00 2001 From: Yi Yao Date: Wed, 23 Oct 2024 15:12:24 +0800 Subject: [PATCH] Add CPU Grafana dashboard (#171) * Add CPU Grafana dashboard * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- evals/benchmark/grafana/README.md | 25 + evals/benchmark/grafana/cpu_grafana.json | 1529 +++++++++++++++++ .../grafana/grafana_node_exporter.yaml | 77 + 3 files changed, 1631 insertions(+) create mode 100644 evals/benchmark/grafana/cpu_grafana.json create mode 100644 evals/benchmark/grafana/grafana_node_exporter.yaml diff --git a/evals/benchmark/grafana/README.md b/evals/benchmark/grafana/README.md index 2bc9923a..e4d5af19 100644 --- a/evals/benchmark/grafana/README.md +++ b/evals/benchmark/grafana/README.md @@ -36,6 +36,30 @@ Next, run Prometheus server `nohup ./prometheus --config.file=./prometheus.yml & You should now access `localhost:9090/targets?search=` to open the Prometheus UI. +### 1.1 CPU Metrics (optional) + +The Prometheus Node Exporter is required for collecting CPU metrics. Install and run the Node Exporter via tarball by following the [guide](https://prometheus.io/docs/guides/node-exporter/#installing-and-running-the-node-exporter). + +Or install it in a K8S cluster with the following commands: + +```bash +git clone https://github.com/opea-project/GenAIEval.git +cd GenAIEval/evals/benchmark/grafana/ +kubectl apply -f grafana_node_exporter.yaml +``` + +Add the following configuration to `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: "prometheus-node-exporter" + metrics_path: /metrics + static_configs: + - targets: ["<node1-ip>:9100", "<node2-ip>:9100", ...] +``` + +Restart Prometheus after saving the changes. + ## 2. Setup Grafana Grafana provides numerous dashboards to visualize data from a data source. Here we introduce how to visualize TGI metrics. 
@@ -75,3 +99,4 @@ In this folder, we also provides some Grafana dashboard JSON files for your refe - `tgi_grafana.json`: A sample Grafana dashboard JSON file for visualizing TGI metrics. - `redis_grafana.json`: A sample Grafana dashboard JSON file for visualizing the Redis metrics. For importing the redis metrics, you need to add the new connection and Redis data source in Grafana. Please refer this [link](https://grafana.com/grafana/plugins/redis-datasource/?tab=installation) for more details. - `gaudi_grafana.json`: A sample Grafana dashboard JSON file for visualizing the Intel® Gaudi® AI accelerator metrics in a container cluster for compute workload. +- `cpu_grafana.json`: A sample Grafana dashboard JSON file for visualizing the CPU metrics. diff --git a/evals/benchmark/grafana/cpu_grafana.json b/evals/benchmark/grafana/cpu_grafana.json new file mode 100644 index 00000000..7b912c01 --- /dev/null +++ b/evals/benchmark/grafana/cpu_grafana.json @@ -0,0 +1,1529 @@ +{ + "__inputs": [ + { + "name": "DS_HOME_MAIN PROMETHEUS", + "label": "Home main Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.4.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "gnetId": 9617, + "graphTooltip": 1, + "id": null, + "iteration": 1547371615169, + "links": [], + "panels": [ + { + 
"alerting": {}, + "aliasColors": { + "load 15m": "#CCA300", + "load 1m": "#890F02", + "load 5m": "#C15C17" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "editable": true, + "error": false, + "fill": 3, + "grid": {}, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 147, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$host\"}", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "load 1m", + "refId": "A", + "step": 1200, + "target": "" + }, + { + "expr": "node_load5{instance=\"$host\"}", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "load 5m", + "refId": "B", + "step": 1200, + "target": "" + }, + { + "expr": "node_load15{instance=\"$host\"}", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "load 15m", + "refId": "C", + "step": 1200, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load average", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": 
false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 3, + "y": 0 + }, + "id": 25, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(60, 189, 31, 0.18)", + "full": true, + "lineColor": "rgb(0, 0, 0)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "scalar(count(count by (cpu)(node_cpu_seconds_total{instance='$host'})))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "2,4", + "title": "CPU cores", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "decimals": 3, + "editable": true, + "error": false, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 5, + "y": 0 + }, + "id": 125, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + 
}, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(63, 255, 0, 0.09)", + "full": true, + "lineColor": "rgba(126, 178, 109, 0)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "clamp_min((1 - avg(irate(node_cpu_seconds_total{instance=\"$host\", mode='idle'}[$interval])))*100, 0.0)", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "70,95", + "title": "Average cpu %", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "editable": true, + "error": false, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 8, + "y": 0 + }, + "id": 79, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(60, 189, 31, 0.18)", + "full": true, + "lineColor": "rgb(0, 0, 0)", + "show": false + }, + "tableColumn": "", + "targets": [ 
+ { + "expr": "min(node_cpu_frequency_min_hertz{instance='$host'})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Min frequency", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "min" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "editable": true, + "error": false, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 10, + "y": 0 + }, + "id": 78, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(60, 189, 31, 0.18)", + "full": true, + "lineColor": "rgb(0, 0, 0)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(node_cpu_frequency_max_hertz{instance='$host'})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Max frequency", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "max" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 
129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "decimals": 3, + "editable": true, + "error": false, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 103, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(63, 255, 0, 0.09)", + "full": true, + "lineColor": "rgba(126, 178, 109, 0)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "100 * (avg(node_cpu_scaling_frequency_hertz{instance='$host'})-min(node_cpu_frequency_min_hertz{instance='$host'})) / (max(node_cpu_frequency_max_hertz{instance='$host'})-min(node_cpu_frequency_min_hertz{instance='$host'}))", + "format": "time_series", + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "70,95", + "title": "Average frequency %", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "decimals": 0, + "editable": true, + "error": false, + "format": "short", + "gauge": { + "maxValue": 10, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 15, 
+ "y": 0 + }, + "id": 169, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(191, 27, 0, 0.42)", + "full": true, + "lineColor": "rgba(191, 27, 0, 0.42)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(node_cpu_core_throttles_total{instance='$host'})", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "2,3", + "title": "Throttles core", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + }, + { + "op": "=", + "text": "OK", + "value": "0" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "decimals": 0, + "editable": true, + "error": false, + "format": "short", + "gauge": { + "maxValue": 10, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 17, + "y": 0 + }, + "id": 191, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" 
+ } + ], + "sparkline": { + "fillColor": "rgba(191, 27, 0, 0.42)", + "full": true, + "lineColor": "rgba(191, 27, 0, 0.42)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(node_cpu_package_throttles_total{instance='$host'})", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "2,3", + "title": "Throttles package", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + }, + { + "op": "=", + "text": "OK", + "value": "0" + } + ], + "valueName": "current" + }, + { + "alerting": {}, + "aliasColors": { + "load 15m": "#CCA300", + "load 1m": "#890F02", + "load 5m": "#C15C17" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 4, + "w": 5, + "x": 19, + "y": 0 + }, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(node_cpu_core_throttles_total{instance=\"$host\"}[$interval]))", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "Core", + "refId": "A", + "step": 1200, + "target": "" + }, + { + "expr": "sum(irate(node_cpu_package_throttles_total{instance=\"$host\"}[$interval]))", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "Packages", + "refId": "B", + "step": 1200, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throttles history", + 
"tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "editable": true, + "error": false, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 8, + "y": 2 + }, + "id": 80, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "100%", + "prefix": "", + "prefixFontSize": "100%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(63, 255, 0, 0.09)", + "full": true, + "lineColor": "rgba(126, 178, 109, 0)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(node_cpu_scaling_frequency_hertz{instance='$host'})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Average frequency", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "collapsed": 
false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 55, + "panels": [], + "repeat": "cpu", + "title": "CPU $cpu", + "type": "row" + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "editable": true, + "error": false, + "fill": 5, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "minSpan": 6, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "user", + "color": "#629e51" + }, + { + "alias": "system", + "color": "#bf1b00" + }, + { + "alias": "nice", + "color": "#1f78c1" + }, + { + "alias": "irq", + "color": "#511749" + }, + { + "alias": "softirq", + "color": "#ba43a9" + }, + { + "alias": "steal", + "color": "#99440a" + }, + { + "alias": "iowait", + "color": "#f9934e" + }, + { + "alias": "idle", + "color": "#052b51" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='user', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "A", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='system', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "B", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='nice', cpu='$cpu'}[$interval])", + "format": "time_series", + 
"hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "C", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='idle', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "H", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='irq', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "D", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='softirq', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "E", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='iowait', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "F", + "step": 3000 + }, + { + "expr": "irate(node_cpu_seconds_total{instance=\"$host\", mode='steal', cpu='$cpu'}[$interval])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "refId": "G", + "step": 3000 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": "", + "logBase": 1, + "max": 
"1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alerting": {}, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 56, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 6, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "max", + "color": "#bf1b00", + "dashes": true + }, + { + "alias": "min", + "color": "#7eb26d", + "dashes": true + }, + { + "alias": "current", + "color": "#f9934e", + "lines": false, + "pointradius": 1, + "points": true + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_cpu_frequency_max_hertz{instance='$host', cpu='$cpu'}", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "max", + "refId": "A" + }, + { + "expr": "node_cpu_scaling_frequency_hertz{instance='$host', cpu='$cpu'}", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "current", + "refId": "B" + }, + { + "expr": "node_cpu_frequency_min_hertz{instance='$host', cpu='$cpu'}", + "format": "time_series", + "interval": "$step", + "intervalFactor": 1, + "legendFormat": "min", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Frequency", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + 
"transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "hertz", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "linux", + "cpu" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_HOME_MAIN PROMETHEUS}", + "definition": "label_values(node_boot_time_seconds, instance)", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "host", + "options": [], + "query": "label_values(node_boot_time_seconds, instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "$env", + "tags": [], + "tagsQuery": "instance", + "type": "query", + "useTags": false + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "options": [ + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + } + ], + "query": "30s,1m,5m,10m,1h,6h,12h", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "30s", + "value": "30s" + }, + "hide": 0, + "label": "step", + "name": "step", + "options": [ + { + "selected": 
true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + } + ], + "query": "30s,1m,5m,10m,1h,6h,12h", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_HOME_MAIN PROMETHEUS}", + "definition": "label_values(node_cpu_seconds_total, cpu)", + "hide": 0, + "includeAll": true, + "label": "cpu", + "multi": true, + "name": "cpu", + "options": [], + "query": "label_values(node_cpu_seconds_total, cpu)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "CPU", + "uid": "j3RfFN8mk", + "version": 64 + } diff --git a/evals/benchmark/grafana/grafana_node_exporter.yaml b/evals/benchmark/grafana/grafana_node_exporter.yaml new file mode 100644 index 00000000..b6abfcc0 --- /dev/null +++ b/evals/benchmark/grafana/grafana_node_exporter.yaml @@ -0,0 +1,77 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + name: node-exporter + namespace: monitoring +spec: + selector: + matchLabels: + app.kubernetes.io/component: exporter + 
app.kubernetes.io/name: node-exporter + template: + metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + spec: + containers: + - args: + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --no-collector.wifi + - --no-collector.hwmon + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + - --collector.netclass.ignored-devices=^(veth.*)$ + name: node-exporter + image: prom/node-exporter + ports: + - containerPort: 9100 + protocol: TCP + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + volumes: + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root + +--- +kind: Service +apiVersion: v1 +metadata: + name: node-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9100' +spec: + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + ports: + - name: node-exporter + protocol: TCP + port: 9100 + targetPort: 9100