From c3dbd87dc55c6d85df545005c1d16ea721c54221 Mon Sep 17 00:00:00 2001 From: Jerad C Date: Thu, 18 Nov 2021 09:46:24 -0600 Subject: [PATCH] add dashboards (#806) --- .../karpenter-capacity-glance.json | 569 +++++++++++++ .../karpenter-capacity-history.json | 749 ++++++++++++++++++ .../karpenter-controllers-allocation.json | 330 ++++++++ grafana-dashboards/karpenter-controllers.json | 446 +++++++++++ .../content/en/docs/getting-started/_index.md | 35 + .../docs/getting-started/grafana-values.yaml | 9 + .../getting-started/prometheus-values.yaml | 14 + 7 files changed, 2152 insertions(+) create mode 100644 grafana-dashboards/karpenter-capacity-glance.json create mode 100644 grafana-dashboards/karpenter-capacity-history.json create mode 100644 grafana-dashboards/karpenter-controllers-allocation.json create mode 100644 grafana-dashboards/karpenter-controllers.json create mode 100644 website/content/en/docs/getting-started/grafana-values.yaml create mode 100644 website/content/en/docs/getting-started/prometheus-values.yaml diff --git a/grafana-dashboards/karpenter-capacity-glance.json b/grafana-dashboards/karpenter-capacity-glance.json new file mode 100644 index 000000000000..59758989df04 --- /dev/null +++ b/grafana-dashboards/karpenter-capacity-glance.json @@ -0,0 +1,569 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.6" + }, + { + "type": "panel", + "id": "piechart", + "name": "Pie chart", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + 
"datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1636732245770, + "links": [], + "panels": [ + { + "datasource": null, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 39, + "options": { + "content": "Displays information about the Nodes associated with provisioner \"$provisioner\". The *Overview* row displays current counts across all zones. Rows with zone specific counts will automatically be added for each observed zone within the selected time range.\n\nSee the information icon of each panel for a description.\n\nSuggest improvements and additions [here](https://github.com/awslabs/karpenter/issues/new?labels=dashboard).", + "mode": "markdown" + }, + "pluginVersion": "8.1.6", + "title": "About this dashboard", + "type": "text" + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 29, + "title": "Overview", + "type": "row" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Percentage of capacity reporting as \"ready\".", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "noValue": "No provisioned capacity", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.1.6", + 
"targets": [ + { + "exemplar": true, + "expr": "sum(karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) / sum(karpenter_capacity_node_count{provisioner=\"$provisioner\"})", + "interval": "", + "legendFormat": "Ready", + "queryType": "randomWalk", + "refId": "Ready Node Count" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Reporting as ready", + "transformations": [], + "type": "gauge" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Compares capacity that is \"ready\" in each zone.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 4, + "options": { + "displayLabels": [ + "name" + ], + "legend": { + "displayMode": "hidden", + "placement": "right", + "values": [] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "7.5.0", + "targets": [ + { + "exemplar": true, + "expr": "sum by(zone) (karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) > 0", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{zone}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Ready capacity by zone", + "type": "piechart" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Compares capacity reporting as \"ready\" by instance type.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 12, + "options": { + "displayLabels": [ + "name" + ], + "legend": { + "displayMode": 
"hidden", + "placement": "right", + "values": [] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "7.5.0", + "targets": [ + { + "exemplar": true, + "expr": "sum by(instancetype) (karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\"}) > 0", + "interval": "", + "legendFormat": "{{instancetype}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Ready capacity by instance type", + "type": "piechart" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 14, + "options": { + "displayLabels": [ + "name" + ], + "legend": { + "displayMode": "hidden", + "placement": "right", + "values": [] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "7.5.0", + "targets": [ + { + "exemplar": true, + "expr": "sum by(arch) (karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\"}) > 0", + "interval": "", + "legendFormat": "{{arch}}", + "queryType": "randomWalk", + "refId": "Arch Count" + } + ], + "title": "Ready capacity by node architecture", + "type": "piechart" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 8, + "panels": [], + "repeat": "zone", + "title": "Zone $zone", + "type": "row" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Compares capacity reporting as \"ready\" by instance type.", 
+ "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "noValue": "No provisioned capacity" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 6, + "options": { + "displayLabels": [ + "name" + ], + "legend": { + "displayMode": "hidden", + "placement": "right", + "values": [] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "7.5.0", + "repeat": null, + "repeatDirection": "v", + "targets": [ + { + "exemplar": true, + "expr": "karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", + "instant": false, + "interval": "", + "legendFormat": "{{instancetype}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Ready capacity by instance type", + "type": "piechart" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "noValue": "No provisioned capacity" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 15, + "options": { + "displayLabels": [ + "name" + ], + "legend": { + "displayMode": "hidden", + "placement": "right", + "values": [] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "7.5.0", + "repeat": null, + "repeatDirection": "v", + "targets": [ + { + "exemplar": true, + "expr": 
"karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", + "interval": "", + "legendFormat": "{{arch}}", + "queryType": "randomWalk", + "refId": "Arch Count" + } + ], + "title": "Ready capacity by node architecture", + "type": "piechart" + } + ], + "refresh": "1m", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(karpenter_capacity_node_count, provisioner)", + "description": "Karpenter Provisioner", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Provisioner", + "multi": false, + "name": "provisioner", + "options": [], + "query": { + "query": "label_values(karpenter_capacity_node_count, provisioner)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(karpenter_capacity_ready_node_count, zone)", + "description": "Topology zone", + "error": null, + "hide": 2, + "includeAll": true, + "label": "Zone", + "multi": false, + "name": "zone", + "options": [], + "query": { + "query": "label_values(karpenter_capacity_ready_node_count, zone)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Capacity / At a Glance", + "uid": "ffLnEKv7k", + "version": 6 +} \ No newline at end of file diff --git a/grafana-dashboards/karpenter-capacity-history.json b/grafana-dashboards/karpenter-capacity-history.json new file mode 100644 index 000000000000..f6a006483008 --- /dev/null +++ 
b/grafana-dashboards/karpenter-capacity-history.json @@ -0,0 +1,749 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.6" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1636732344860, + "links": [], + "panels": [ + { + "datasource": null, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 39, + "options": { + "content": "Displays information about the Nodes associated with provisioner \"$provisioner\". The *Overview* row displays historic counts across all zones. 
Rows with zone-specific counts will automatically be added for each observed zone within the selected time range.\n\nSee the information icon of each panel for a description.\n\nSuggest improvements and additions [here](https://github.com/awslabs/karpenter/issues/new?labels=dashboard).", + "mode": "markdown" + }, + "pluginVersion": "8.1.6", + "title": "About this dashboard", + "type": "text" + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 29, + "title": "Overview", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Percentage of capacity reporting as \"ready\".", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 3, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) / sum(karpenter_capacity_node_count{provisioner=\"$provisioner\"})", + "interval": "", + "legendFormat": "Ready", + "queryType": "randomWalk", + "refId": "Ready Node Count" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reporting as ready", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1.0", + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Compares capacity that is \"ready\" in each zone.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 6 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(zone) (karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) > 0", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{zone}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ready capacity by zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Compares capacity reporting as \"ready\" by instance type.", + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(instancetype) (karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\"}) > 0", + "interval": "", + "legendFormat": "{{instancetype}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ready capacity by instance type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 
+ }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by(arch) (karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\"}) > 0", + "interval": "", + "legendFormat": "{{arch}}", + "queryType": "randomWalk", + "refId": "Arch Count" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ready capacity by node architecture", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 8, + "panels": [], + "repeat": "zone", + "title": "Zone $zone", + "type": "row" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Compares capacity reporting as \"ready\" by instance type.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": 
true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "v", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", + "instant": false, + "interval": "", + "legendFormat": "{{instancetype}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ready capacity by instance type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 25 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "repeat": null, + "repeatDirection": "v", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", + "interval": "", + "legendFormat": "{{arch}}", + "queryType": "randomWalk", + "refId": "Arch Count" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ready capacity by node architecture", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(karpenter_capacity_node_count, provisioner)", + "description": "Karpenter Provisioner", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Provisioner", + "multi": false, + "name": "provisioner", + "options": [], + "query": { + "query": "label_values(karpenter_capacity_node_count, provisioner)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(karpenter_capacity_ready_node_count, zone)", + "description": "Topology zone", + "error": null, + "hide": 2, + "includeAll": 
true, + "label": "Zone", + "multi": false, + "name": "zone", + "options": [], + "query": { + "query": "label_values(karpenter_capacity_ready_node_count, zone)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Capacity / History", + "uid": "ffLnEKv71", + "version": 4 +} \ No newline at end of file diff --git a/grafana-dashboards/karpenter-controllers-allocation.json b/grafana-dashboards/karpenter-controllers-allocation.json new file mode 100644 index 000000000000..971097b9b575 --- /dev/null +++ b/grafana-dashboards/karpenter-controllers-allocation.json @@ -0,0 +1,330 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.6" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1636732403925, + "links": [], + "panels": [ + { + "datasource": null, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "content": "Displays information about 
Allocation controller processes.\n\nSee the information icon of each panel for a description.\n\nSuggest improvements and additions [here](https://github.com/awslabs/karpenter/issues/new?labels=dashboard).", + "mode": "markdown" + }, + "pluginVersion": "8.1.6", + "targets": [ + { + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "About this dashboard", + "type": "text" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateMagma", + "exponent": 0.5, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_PROMETHEUS}", + "description": "Aggregates the duration of all bind operations of the Allocation controller.\n\nThe color of each \"bucket\" is a visual clue to the number of bind operations that completed within that duration range.\n\nMouse-over a bucket to display exact values.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 4 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 2, + "legend": { + "show": true + }, + "maxDataPoints": 25, + "pluginVersion": "7.5.0", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(karpenter_allocation_controller_bind_duration_seconds_bucket[$__interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "All Binds" + } + ], + "title": "Bind duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": "0", + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + 
"cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateMagma", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_PROMETHEUS}", + "description": "Aggregates the duration of all binpack operations of the Allocation controller.\n\nThe color of each \"bucket\" is a visual clue to the number of binpack operations that completed within that duration range.\n\nMouse-over a bucket to display exact values.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 12 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 7, + "legend": { + "show": true + }, + "maxDataPoints": 25, + "pluginVersion": "7.5.0", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(karpenter_allocation_controller_binpacking_duration_seconds_bucket[$__interval])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "Binpacking Duration" + } + ], + "title": "Binpack duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": "0", + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_PROMETHEUS}", + "description": "Aggregates the duration of all scheduling operations of the Allocation controller for provisioner $provisioner.\n\nThe color of each \"bucket\" is a visual clue to the number of scheduling operations that completed within that duration 
range.\n\nMouse-over a bucket to display exact values.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 9, + "legend": { + "show": true + }, + "maxDataPoints": 25, + "pluginVersion": "7.5.0", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(karpenter_allocation_controller_scheduling_duration_seconds_bucket{provisioner=\"$provisioner\"}[$__interval])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "Scheduling Duration" + } + ], + "title": "Scheduling duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "refresh": "1m", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(karpenter_allocation_controller_scheduling_duration_seconds_bucket, provisioner)", + "description": "Karpenter provisioner", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Provisioner", + "multi": false, + "name": "provisioner", + "options": [], + "query": { + "query": "label_values(karpenter_allocation_controller_scheduling_duration_seconds_bucket, provisioner)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Controllers / Allocation", + "uid": "fs47R-Dnz", + "version": 9 +} \ No newline at end of file diff --git 
a/grafana-dashboards/karpenter-controllers.json b/grafana-dashboards/karpenter-controllers.json new file mode 100644 index 000000000000..5413489ec324 --- /dev/null +++ b/grafana-dashboards/karpenter-controllers.json @@ -0,0 +1,446 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.1.6" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1636733515850, + "links": [], + "panels": [ + { + "datasource": null, + "description": "", + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 99, + "options": { + "content": "Displays information collected by the Kubernetes controller-runtime.\n\nSee the information icon of each panel for a description.\n\nSuggest improvements and additions [here](https://github.com/awslabs/karpenter/issues/new?labels=dashboard).", + "mode": "markdown" + }, + "pluginVersion": "8.1.6", + "title": "About this dashboard", + "type": "text" + }, + { + "aliasColors": { + "items": "light-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", 
+ "fill": 1, + "fillGradient": 2, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "workqueue_depth{name=\"$controller\"}", + "interval": "", + "legendFormat": "items", + "queryType": "randomWalk", + "refId": "Work Queue Depth" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Items in Work Queue", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "error": "red", + "requeue": "orange", + "requeue_after": "semi-dark-purple", + "success": "green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The rate of completed reconciliations per minute broken out by result status.", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 4 + }, + "hiddenSeries": false, + "id": 76, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + 
"lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(controller_runtime_reconcile_total{controller=\"$controller\"}[$trailing]) * 60", + "interval": "", + "legendFormat": "{{result}}", + "queryType": "randomWalk", + "refId": "Reconcilitions per minute" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reconcilitions per minute", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:566", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:567", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateMagma", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_PROMETHEUS}", + "description": "Aggregates the duration of the reconciliation process.\n\nThe color of each \"bucket\" is a visual clue to the number of reconciliations that completed within that time range.\n\nMouse-over a bucket to display exact values.", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 4 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 126, + "legend": { + "show": true + }, + "maxDataPoints": 25, + "pluginVersion": "8.1.6", + 
"reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(controller_runtime_reconcile_time_seconds_bucket{controller=\"$controller\"}[$__interval])) by (le)", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "p100" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Reconciliation duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": "0", + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "refresh": "1m", + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_errors_total, controller)", + "description": "Kubernetes controller", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Controller", + "multi": false, + "name": "controller", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_errors_total, controller)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "10m", + "value": "10m" + }, + "description": "Trailing aggregation window", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Trailing", + "multi": false, + "name": "trailing", + "options": [ + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + 
"selected": true, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "30s,1m,5m,10m,30m,1h", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Controllers", + "uid": "-Yw9ShDnz", + "version": 15 +} \ No newline at end of file diff --git a/website/content/en/docs/getting-started/_index.md b/website/content/en/docs/getting-started/_index.md index f6d881e0d5fd..14cd4bee8f8f 100644 --- a/website/content/en/docs/getting-started/_index.md +++ b/website/content/en/docs/getting-started/_index.md @@ -168,6 +168,41 @@ helm upgrade --install karpenter karpenter/karpenter --namespace karpenter \ ```sh kubectl patch configmap config-logging -n karpenter --patch '{"data":{"loglevel.controller":"debug"}}' ``` + +### Create Grafana dashboards (optional) + +The Karpenter repo contains multiple [importable dashboards](https://github.com/awslabs/karpenter/tree/main/grafana-dashboards) for an existing Grafana instance. See the Grafana documentation for [instructions](https://grafana.com/docs/grafana/latest/dashboards/export-import/#import-dashboard) to import a dashboard. + +#### Deploy a temporary Prometheus and Grafana stack (optional) + +The following commands will deploy a Prometheus and Grafana stack that is suitable for this guide but does not include persistent storage or other configurations that would be necessary for monitoring a production deployment of Karpenter. 
+ +```sh +helm repo add grafana-charts https://grafana.github.io/helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +kubectl create namespace monitoring + +curl -fsSL https://karpenter.sh/docs/getting-started/prometheus-values.yaml | tee prometheus-values.yaml +helm install --namespace monitoring prometheus prometheus-community/prometheus --values prometheus-values.yaml + +curl -fsSL https://karpenter.sh/docs/getting-started/grafana-values.yaml | tee grafana-values.yaml +helm install --namespace monitoring grafana grafana-charts/grafana --values grafana-values.yaml +``` + +The Grafana instance may be accessed using port forwarding. + +```sh +kubectl port-forward --namespace monitoring svc/grafana 3000:80 +``` + +The new stack has only one user, `admin`, and the password is stored in a secret. The following command will retrieve the password. + +```sh +kubectl get secret --namespace monitoring grafana -o jsonpath="{.data.admin-password}" | base64 --decode +``` + ### Provisioner A single Karpenter provisioner is capable of handling many different pod diff --git a/website/content/en/docs/getting-started/grafana-values.yaml b/website/content/en/docs/getting-started/grafana-values.yaml new file mode 100644 index 000000000000..708ae9a561c3 --- /dev/null +++ b/website/content/en/docs/getting-started/grafana-values.yaml @@ -0,0 +1,9 @@ +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + version: 1 + url: http://prometheus-server:80 + access: proxy diff --git a/website/content/en/docs/getting-started/prometheus-values.yaml b/website/content/en/docs/getting-started/prometheus-values.yaml new file mode 100644 index 000000000000..0eae2d3f882d --- /dev/null +++ b/website/content/en/docs/getting-started/prometheus-values.yaml @@ -0,0 +1,14 @@ +alertmanager: + persistentVolume: + enabled: false + +server: + fullnameOverride: prometheus-server + persistentVolume: + enabled: false + +extraScrapeConfigs: | + - 
job_name: karpenter + static_configs: + - targets: + - karpenter-metrics.karpenter:8080