diff --git a/grafana-dashboards/karpenter-capacity-glance.json b/grafana-dashboards/karpenter-capacity-glance.json deleted file mode 100644 index 071393be6174..000000000000 --- a/grafana-dashboards/karpenter-capacity-glance.json +++ /dev/null @@ -1,569 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "8.1.6" - }, - { - "type": "panel", - "id": "piechart", - "name": "Pie chart", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": null, - "iteration": 1636732245770, - "links": [], - "panels": [ - { - "datasource": null, - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 39, - "options": { - "content": "Displays information about the Nodes associated with provisioner \"$provisioner\". The *Overview* row displays current counts across all zones. 
Rows with zone specific counts will automatically be added for each observed zone within the selected time range.\n\nSee the information icon of each panel for a description.\n\nSuggest improvements and additions [here](https://github.com/aws/karpenter/issues/new?labels=dashboard).", - "mode": "markdown" - }, - "pluginVersion": "8.1.6", - "title": "About this dashboard", - "type": "text" - }, - { - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 29, - "title": "Overview", - "type": "row" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Percentage of capacity reporting as \"ready\".", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 1, - "min": 0, - "noValue": "No provisioned capacity", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 6 - }, - "id": 2, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} - }, - "pluginVersion": "8.1.6", - "targets": [ - { - "exemplar": true, - "expr": "sum(karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) / sum(karpenter_capacity_node_count{provisioner=\"$provisioner\"})", - "interval": "", - "legendFormat": "Ready", - "queryType": "randomWalk", - "refId": "Ready Node Count" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Reporting as ready", - "transformations": [], - "type": "gauge" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Compares capacity that is \"ready\" in each zone.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": 
false, - "viz": false - } - }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 6 - }, - "id": 4, - "options": { - "displayLabels": [ - "name" - ], - "legend": { - "displayMode": "hidden", - "placement": "right", - "values": [] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "7.5.0", - "targets": [ - { - "exemplar": true, - "expr": "sum by(zone) (karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) > 0", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{zone}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Ready capacity by zone", - "type": "piechart" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Compares capacity reporting as \"ready\" by instance type.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 15 - }, - "id": 12, - "options": { - "displayLabels": [ - "name" - ], - "legend": { - "displayMode": "hidden", - "placement": "right", - "values": [] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "7.5.0", - "targets": [ - { - "exemplar": true, - "expr": "sum by(instancetype) (karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\"}) > 0", - "interval": "", - "legendFormat": "{{instancetype}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Ready capacity by instance type", - "type": "piechart" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Comparison of capacity 
reporting as \"ready\" by hardware architecture.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 15 - }, - "id": 14, - "options": { - "displayLabels": [ - "name" - ], - "legend": { - "displayMode": "hidden", - "placement": "right", - "values": [] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "7.5.0", - "targets": [ - { - "exemplar": true, - "expr": "sum by(arch) (karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\"}) > 0", - "interval": "", - "legendFormat": "{{arch}}", - "queryType": "randomWalk", - "refId": "Arch Count" - } - ], - "title": "Ready capacity by node architecture", - "type": "piechart" - }, - { - "collapsed": false, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 24 - }, - "id": 8, - "panels": [], - "repeat": "zone", - "title": "Zone $zone", - "type": "row" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Compares capacity reporting as \"ready\" by instance type.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [], - "noValue": "No provisioned capacity" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 25 - }, - "id": 6, - "options": { - "displayLabels": [ - "name" - ], - "legend": { - "displayMode": "hidden", - "placement": "right", - "values": [] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "7.5.0", - 
"repeat": null, - "repeatDirection": "v", - "targets": [ - { - "exemplar": true, - "expr": "karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", - "instant": false, - "interval": "", - "legendFormat": "{{instancetype}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Ready capacity by instance type", - "type": "piechart" - }, - { - "datasource": "${DS_PROMETHEUS}", - "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [], - "noValue": "No provisioned capacity" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 25 - }, - "id": 15, - "options": { - "displayLabels": [ - "name" - ], - "legend": { - "displayMode": "hidden", - "placement": "right", - "values": [] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "7.5.0", - "repeat": null, - "repeatDirection": "v", - "targets": [ - { - "exemplar": true, - "expr": "karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", - "interval": "", - "legendFormat": "{{arch}}", - "queryType": "randomWalk", - "refId": "Arch Count" - } - ], - "title": "Ready capacity by node architecture", - "type": "piechart" - } - ], - "refresh": "1m", - "schemaVersion": 30, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": "", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(karpenter_capacity_node_count, provisioner)", - "description": "Karpenter Provisioner", - "error": null, - "hide": 0, - "includeAll": false, - "label": "Provisioner", - "multi": false, - "name": "provisioner", - 
"options": [], - "query": { - "query": "label_values(karpenter_capacity_node_count, provisioner)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": "", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(karpenter_capacity_ready_node_count, zone)", - "description": "Topology zone", - "error": null, - "hide": 2, - "includeAll": true, - "label": "Zone", - "multi": false, - "name": "zone", - "options": [], - "query": { - "query": "label_values(karpenter_capacity_ready_node_count, zone)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Capacity / At a Glance", - "uid": "ffLnEKv7k", - "version": 6 -} diff --git a/grafana-dashboards/karpenter-capacity-history.json b/grafana-dashboards/karpenter-capacity-history.json deleted file mode 100644 index 8ecd445bfcc4..000000000000 --- a/grafana-dashboards/karpenter-capacity-history.json +++ /dev/null @@ -1,749 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "8.1.6" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph (old)", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - 
"iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": null, - "iteration": 1636732344860, - "links": [], - "panels": [ - { - "datasource": null, - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 39, - "options": { - "content": "Displays information about the Nodes associated with provisioner \"$provisioner\". The *Overview* row displays historic counts across all zones. Rows with zone specific counts will automatically be added for each observed zone within the selected time range.\n\nSee the information icon of each panel for a description.\n\nSuggest improvements and additions [here](https://github.com/aws/karpenter/issues/new?labels=dashboard).", - "mode": "markdown" - }, - "pluginVersion": "8.1.6", - "title": "About this dashboard", - "type": "text" - }, - { - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 29, - "title": "Overview", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "Percentage of capacity reporting as \"ready\".", - "fieldConfig": { - "defaults": { - "unit": "percentunit" - }, - "overrides": [] - }, - "fill": 3, - "fillGradient": 3, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 6 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.1.6", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": 
[ - { - "exemplar": true, - "expr": "sum(karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) / sum(karpenter_capacity_node_count{provisioner=\"$provisioner\"})", - "interval": "", - "legendFormat": "Ready", - "queryType": "randomWalk", - "refId": "Ready Node Count" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Reporting as Ready", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transformations": [], - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "percentunit", - "label": null, - "logBase": 1, - "max": "1.0", - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "Compares capacity that is \"ready\" in each zone.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 6 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.1.6", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum by(zone) (karpenter_capacity_ready_node_count{provisioner=\"$provisioner\"}) > 0", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{zone}}", - "queryType": "randomWalk", - "refId": "A" - } - ], 
- "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ready capacity by zone", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "Compares capacity reporting as \"ready\" by instance type.", - "fill": 1, - "fillGradient": 3, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 15 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.1.6", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum by(instancetype) (karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\"}) > 0", - "interval": "", - "legendFormat": "{{instancetype}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ready capacity by instance type", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, 
- "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 15 - }, - "hiddenSeries": false, - "id": 14, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.1.6", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum by(arch) (karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\"}) > 0", - "interval": "", - "legendFormat": "{{arch}}", - "queryType": "randomWalk", - "refId": "Arch Count" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ready by node architecture", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": false, - 
"datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 24 - }, - "id": 8, - "panels": [], - "repeat": "zone", - "title": "Zone $zone", - "type": "row" - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "Compares capacity reporting as \"ready\" by instance type.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 25 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.1.6", - "pointradius": 2, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": "v", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "karpenter_capacity_ready_node_instancetype_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", - "instant": false, - "interval": "", - "legendFormat": "{{instancetype}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ready capacity by instance type", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - 
"datasource": "${DS_PROMETHEUS}", - "description": "Comparison of capacity reporting as \"ready\" by hardware architecture.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 25 - }, - "hiddenSeries": false, - "id": 15, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.1.6", - "pointradius": 2, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": "v", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "karpenter_capacity_ready_node_arch_count{provisioner=\"$provisioner\", zone=\"$zone\"} > 0", - "interval": "", - "legendFormat": "{{arch}}", - "queryType": "randomWalk", - "refId": "Arch Count" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ready capacity by node architecture", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": "1m", - "schemaVersion": 30, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": "", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(karpenter_capacity_node_count, provisioner)", - "description": "Karpenter Provisioner", - "error": null, - "hide": 0, - "includeAll": false, 
- "label": "Provisioner", - "multi": false, - "name": "provisioner", - "options": [], - "query": { - "query": "label_values(karpenter_capacity_node_count, provisioner)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": "", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(karpenter_capacity_ready_node_count, zone)", - "description": "Topology zone", - "error": null, - "hide": 2, - "includeAll": true, - "label": "Zone", - "multi": false, - "name": "zone", - "options": [], - "query": { - "query": "label_values(karpenter_capacity_ready_node_count, zone)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Capacity / History", - "uid": "ffLnEKv71", - "version": 4 -} diff --git a/grafana-dashboards/karpenter-node-metrics.json b/grafana-dashboards/karpenter-node-metrics.json deleted file mode 100644 index 239f4277863a..000000000000 --- a/grafana-dashboards/karpenter-node-metrics.json +++ /dev/null @@ -1,791 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "gnetId": null, - "graphTooltip": 0, - "id": 5, - "iteration": 1640028599664, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": "Prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { 
- "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "((karpenter_nodes_total_pod_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 + karpenter_nodes_total_daemon_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0) *100) / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "interval": "", - "legendFormat": "{{node_name}}", - "refId": "A" - } - ], - "title": "Node Utilization", - "type": "timeseries" - }, - { - "datasource": "Prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - 
"axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 8, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "targets": [ - { - "exemplar": true, - "expr": "((karpenter_nodes_total_pod_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 + karpenter_nodes_total_daemon_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 ) *100) / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "interval": "", - "legendFormat": "{{node_name}}", - "refId": "A" - } - ], - "title": "Node Overcommitment", - "type": "timeseries" - }, - { - "datasource": "Prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 3, - "mappings": [], - "thresholds": { - "mode": 
"absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 12 - }, - "id": 5, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} - }, - "pluginVersion": "8.2.5", - "targets": [ - { - "exemplar": true, - "expr": "sum by (resource_type)(karpenter_nodes_allocatable{resource_type!=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_total_overhead{resource_type!=\"pods\"}*0 or karpenter_nodes_total_requests{resource_type!=\"pods\"}*0)- sum by (resource_type)(karpenter_nodes_total_overhead{resource_type!=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable{resource_type!=\"pods\"}*0 or karpenter_nodes_total_requests{resource_type!=\"pods\"}*0) - sum by (resource_type)(karpenter_nodes_total_requests{resource_type!=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable{resource_type!=\"pods\"}*0 or karpenter_nodes_total_overhead{resource_type!=\"pods\"}*0)", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{resource_type}}", - "refId": "Residual" - }, - { - "exemplar": true, - "expr": "sum (karpenter_nodes_allocatable{resource_type=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}) - sum 
(karpenter_pods_state{})", - "hide": false, - "interval": "", - "legendFormat": "pods", - "refId": "Number of Pods" - } - ], - "title": "Cluster Residual Capacity", - "type": "gauge" - }, - { - "datasource": "Prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "center", - "displayMode": "auto" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "name" - }, - "properties": [ - { - "id": "custom.width", - "value": 323 - } - ] - } - ] - }, - "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 19 - }, - "id": 2, - "options": { - "showHeader": true, - "sortBy": [] - }, - "pluginVersion": "8.2.5", - "targets": [ - { - "exemplar": true, - "expr": "karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "Allocatable CPU", - "refId": "Allocatable" - }, - { - "exemplar": true, - "expr": "karpenter_nodes_total_pod_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "Requests CPU", - "refId": "Pod Requests" - }, - { - "exemplar": true, - "expr": "karpenter_nodes_total_daemon_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": 
false, - "instant": true, - "interval": "", - "legendFormat": "Requests CPU", - "refId": "Daemon Requests" - }, - { - "exemplar": true, - "expr": "karpenter_nodes_total_pod_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "Limits CPU", - "refId": "Pod Limits" - }, - { - "exemplar": true, - "expr": "karpenter_nodes_total_daemon_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "Limits CPU", - "refId": "Daemon Limits" - }, - { - "exemplar": false, - "expr": "karpenter_nodes_system_overhead{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "Overhead CPU", - "refId": "Overhead" - }, - { - "exemplar": true, - "expr": "((karpenter_nodes_total_pod_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 + karpenter_nodes_total_daemon_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0) *100) / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", 
capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "Utilization CPU", - "refId": "Utilization" - }, - { - "exemplar": true, - "expr": "((karpenter_nodes_total_pod_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 + karpenter_nodes_total_daemon_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 ) *100) / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "Overcommitment CPU", - "refId": "Overcommitment" - } - ], - "title": "Current Node Metrics", - "transformations": [ - { - "id": "seriesToColumns", - "options": { - "byField": "node_name" - } - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time 1": true, - "Time 2": true, - "Time 3": true, - "Time 4": true, - "Time 5": true, - "Time 6": true, - "Time 7": true, - "__name__ 1": true, - "__name__ 2": true, - "__name__ 3": true, - "__name__ 4": true, - "__name__ 5": true, - "arch 1": true, - "arch 2": true, - "arch 3": true, - "arch 4": true, - "arch 5": true, - "arch 6": true, - "arch 7": true, - "capacity_type 1": true, - "capacity_type 2": true, - "capacity_type 3": true, - "capacity_type 4": true, - "capacity_type 5": true, - "capacity_type 6": true, - "capacity_type 7": true, - 
"instance 1": true, - "instance 2": true, - "instance 3": true, - "instance 4": true, - "instance 5": true, - "instance 6": true, - "instance 7": true, - "instance_type 1": true, - "instance_type 2": true, - "instance_type 3": true, - "instance_type 4": true, - "instance_type 5": true, - "instance_type 6": true, - "instance_type 7": true, - "job 1": true, - "job 2": true, - "job 3": true, - "job 4": true, - "job 5": true, - "job 6": true, - "job 7": true, - "node_name": false, - "provisioner 1": true, - "provisioner 2": true, - "provisioner 3": true, - "provisioner 4": true, - "provisioner 5": true, - "provisioner 6": true, - "provisioner 7": true, - "resource_type 1": true, - "resource_type 2": true, - "resource_type 3": true, - "resource_type 4": true, - "resource_type 5": true, - "resource_type 6": true, - "resource_type 7": true, - "zone 1": true, - "zone 2": true, - "zone 3": true, - "zone 4": true, - "zone 5": true, - "zone 6": true, - "zone 7": true - }, - "indexByName": { - "Time 1": 7, - "Time 2": 17, - "Time 3": 27, - "Time 4": 37, - "Time 5": 47, - "Time 6": 56, - "Value #Allocatable": 1, - "Value #Overcommitment": 6, - "Value #Overhead": 4, - "Value #Pod Limits": 3, - "Value #Pod Requests": 2, - "Value #Utilization": 5, - "__name__ 1": 8, - "__name__ 2": 18, - "__name__ 3": 28, - "__name__ 4": 38, - "arch 1": 9, - "arch 2": 19, - "arch 3": 29, - "arch 4": 39, - "arch 5": 48, - "arch 6": 57, - "capacity_type 1": 10, - "capacity_type 2": 20, - "capacity_type 3": 30, - "capacity_type 4": 40, - "capacity_type 5": 49, - "capacity_type 6": 58, - "instance 1": 11, - "instance 2": 21, - "instance 3": 31, - "instance 4": 41, - "instance 5": 50, - "instance 6": 59, - "instance_type 1": 12, - "instance_type 2": 22, - "instance_type 3": 32, - "instance_type 4": 42, - "instance_type 5": 51, - "instance_type 6": 60, - "job 1": 13, - "job 2": 23, - "job 3": 33, - "job 4": 43, - "job 5": 52, - "job 6": 61, - "node_name": 0, - "provisioner 1": 14, - "provisioner 2": 24, 
- "provisioner 3": 34, - "provisioner 4": 44, - "provisioner 5": 53, - "provisioner 6": 62, - "resource_type 1": 15, - "resource_type 2": 25, - "resource_type 3": 35, - "resource_type 4": 45, - "resource_type 5": 54, - "resource_type 6": 63, - "zone 1": 16, - "zone 2": 26, - "zone 3": 36, - "zone 4": 46, - "zone 5": 55, - "zone 6": 64 - }, - "renameByName": { - "Value #Allocatable": "Allocatable", - "Value #Daemon Requests": "Requests from Daemon Set", - "Value #Overcommitment": "Overcommitment Rate %", - "Value #Overhead": "System Overhead", - "Value #Pod Limits": "Limits from Pods", - "Value #Pod Requests": "Requests from Pods", - "Value #Utilization": "Utilization Rate %", - "name": "Node Name" - } - } - } - ], - "type": "table" - } - ], - "refresh": "", - "schemaVersion": 32, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "amd64" - ], - "value": [ - "amd64" - ] - }, - "datasource": "Prometheus", - "definition": "label_values(arch)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "arch", - "options": [], - "query": { - "query": "label_values(arch)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "Prometheus", - "definition": "label_values(capacity_type)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "capacity_type", - "options": [], - "query": { - "query": "label_values(capacity_type)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - 
"$__all" - ] - }, - "datasource": "Prometheus", - "definition": "label_values(instance_type)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "instance_type", - "options": [], - "query": { - "query": "label_values(instance_type)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "Prometheus", - "definition": "label_values(provisioner)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "provisioner", - "options": [], - "query": { - "query": "label_values(provisioner)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "Prometheus", - "definition": "label_values(zone)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "zone", - "options": [], - "query": { - "query": "label_values(zone)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": true, - "text": "cpu", - "value": "cpu" - }, - "datasource": "Prometheus", - "definition": "label_values(resource_type)", - "description": null, - "error": null, - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "resource_type", - "options": [], - "query": { - "query": "label_values(resource_type)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - 
"allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "Prometheus", - "definition": "label_values(node_name)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "node_name", - "options": [], - "query": { - "query": "label_values(node_name)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - } - ] - }, - "time": { - "from": "now-5m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Cluster Capacity", - "uid": "GwdOTionz", - "version": 44 -} \ No newline at end of file diff --git a/pkg/controllers/metrics/common.go b/pkg/controllers/metrics/common.go deleted file mode 100644 index 6e266af8f244..000000000000 --- a/pkg/controllers/metrics/common.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package metrics - -import ( - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/metrics" - "github.com/prometheus/client_golang/prometheus" - v1 "k8s.io/api/core/v1" -) - -const ( - controllerName = "metrics" - - metricSubsystemCapacity = "capacity" - metricSubsystemPods = "pods" - - metricLabelArch = "arch" - metricLabelInstanceType = "instancetype" - metricLabelPhase = "phase" - metricLabelProvisioner = metrics.ProvisionerLabel - metricLabelZone = "zone" - - nodeLabelArch = v1.LabelArchStable - nodeLabelInstanceType = v1.LabelInstanceTypeStable - nodeLabelZone = v1.LabelTopologyZone - - nodeConditionTypeReady = v1.NodeReady -) - -var nodeLabelProvisioner = v1alpha5.ProvisionerNameLabelKey - -func publishCount(gaugeVec *prometheus.GaugeVec, labels prometheus.Labels, count int) error { - gauge, err := gaugeVec.GetMetricWith(labels) - if err != nil { - return err - } - gauge.Set(float64(count)) - return nil -} diff --git a/pkg/controllers/metrics/controller.go b/pkg/controllers/metrics/controller.go deleted file mode 100644 index f61efb740c3c..000000000000 --- a/pkg/controllers/metrics/controller.go +++ /dev/null @@ -1,165 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package metrics - -import ( - "context" - "time" - - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider" - "github.com/aws/karpenter/pkg/utils/injection" - "go.uber.org/multierr" - "go.uber.org/zap" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/util/workqueue" - "knative.dev/pkg/logging" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/reconcile" -) - -type Controller struct { - CloudProvider cloudprovider.CloudProvider - KubeClient client.Client -} - -func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) *Controller { - return &Controller{ - CloudProvider: cloudProvider, - KubeClient: kubeClient, - } -} - -func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - ctx = logging.WithLogger(ctx, zap.NewNop().Sugar()) - ctx = injection.WithControllerName(ctx, controllerName) - - // Does the provisioner exist? - provisioner := &v1alpha5.Provisioner{} - if err := c.KubeClient.Get(ctx, req.NamespacedName, provisioner); err != nil { - if !errors.IsNotFound(err) { - // Unable to determine existence of the provisioner, try again later. - return reconcile.Result{}, err - } - - // The provisioner has been deleted. - return reconcile.Result{}, nil - } - - // The provisioner does exist, so update counters. - if err := c.updateCounts(ctx, provisioner); err != nil { - return reconcile.Result{}, err - } - - // Schedule the next run. - return reconcile.Result{RequeueAfter: 10 * time.Second}, nil -} - -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime. - NewControllerManagedBy(m). - Named(controllerName). 
- For(&v1alpha5.Provisioner{}). - WithOptions(controller.Options{ - MaxConcurrentReconciles: 10, - }). - Complete(c) -} - -func (c *Controller) updateCounts(ctx context.Context, provisioner *v1alpha5.Provisioner) error { - updateCountFuncs := []func(context.Context, *v1alpha5.Provisioner) error{ - c.updateNodeCounts, - c.updatePodCounts, - } - updateCountFuncsLen := len(updateCountFuncs) - errors := make([]error, updateCountFuncsLen) - workqueue.ParallelizeUntil(ctx, updateCountFuncsLen, updateCountFuncsLen, func(index int) { - errors[index] = updateCountFuncs[index](ctx, provisioner) - }) - - return multierr.Combine(errors...) -} - -func (c *Controller) updateNodeCounts(ctx context.Context, provisioner *v1alpha5.Provisioner) error { - instanceTypes, err := c.CloudProvider.GetInstanceTypes(ctx, &provisioner.Spec.Constraints) - if err != nil { - return err - } - - archValues := sets.NewString() - instanceTypeValues := sets.NewString() - zoneValues := sets.NewString() - for _, instanceType := range instanceTypes { - archValues.Insert(instanceType.Architecture()) - instanceTypeValues.Insert(instanceType.Name()) - for _, offering := range instanceType.Offerings() { - zoneValues.Insert(offering.Zone) - } - } - knownValuesForNodeLabels := map[string]sets.String{ - nodeLabelArch: archValues, - nodeLabelInstanceType: instanceTypeValues, - nodeLabelZone: zoneValues, - } - - return publishNodeCounts(provisioner.Name, knownValuesForNodeLabels, func(matchingLabels client.MatchingLabels, consume nodeListConsumerFunc) error { - nodes := v1.NodeList{} - if err := c.KubeClient.List(ctx, &nodes, matchingLabels); err != nil { - return err - } - return consume(nodes.Items) - }) -} - -func (c *Controller) updatePodCounts(ctx context.Context, provisioner *v1alpha5.Provisioner) error { - podsForProvisioner, err := c.podsForProvisioner(ctx, provisioner) - if err != nil { - return err - } - - return publishPodCounts(provisioner.Name, podsForProvisioner) -} - -// podsForProvisioner returns 
a map of slices containing all pods scheduled to nodes in each zone. -func (c *Controller) podsForProvisioner(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]v1.Pod, error) { - // Karpenter does not apply a label, or other marker, to pods. - - results := []v1.Pod{} - - // 1. Fetch all nodes associated with the provisioner. - nodeList := v1.NodeList{} - withProvisionerName := client.MatchingLabels{nodeLabelProvisioner: provisioner.Name} - if err := c.KubeClient.List(ctx, &nodeList, withProvisionerName); err != nil { - return nil, err - } - - // 2. Get all the pods scheduled to each node. - for _, node := range nodeList.Items { - podList := v1.PodList{} - withNodeName := client.MatchingFields{"spec.nodeName": node.Name} - if err := c.KubeClient.List(ctx, &podList, withNodeName); err != nil { - return nil, err - } - - results = append(results, podList.Items...) - } - - return results, nil -} diff --git a/pkg/controllers/metrics/nodes.go b/pkg/controllers/metrics/nodes.go deleted file mode 100644 index 005d98b3ecca..000000000000 --- a/pkg/controllers/metrics/nodes.go +++ /dev/null @@ -1,187 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package metrics - -import ( - "github.com/aws/karpenter/pkg/metrics" - "github.com/prometheus/client_golang/prometheus" - "go.uber.org/multierr" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/sets" - "sigs.k8s.io/controller-runtime/pkg/client" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -type ( - nodeListConsumerFunc = func([]v1.Node) error - consumeNodesWithFunc = func(client.MatchingLabels, nodeListConsumerFunc) error -) - -var ( - nodeCountByProvisioner = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: metricSubsystemCapacity, - Name: "node_count", - Help: "Total node count by provisioner.", - }, - []string{ - metricLabelProvisioner, - }, - ) - - readyNodeCountByProvisionerZone = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: metricSubsystemCapacity, - Name: "ready_node_count", - Help: "Count of nodes that are ready by provisioner and zone.", - }, - []string{ - metricLabelProvisioner, - metricLabelZone, - }, - ) - - readyNodeCountByArchProvisionerZone = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: metricSubsystemCapacity, - Name: "ready_node_arch_count", - Help: "Count of nodes that are ready by architecture, provisioner, and zone.", - }, - []string{ - metricLabelArch, - metricLabelProvisioner, - metricLabelZone, - }, - ) - - readyNodeCountByInstancetypeProvisionerZone = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: metricSubsystemCapacity, - Name: "ready_node_instancetype_count", - Help: "Count of nodes that are ready by instance type, provisioner, and zone.", - }, - []string{ - metricLabelInstanceType, - metricLabelProvisioner, - metricLabelZone, - }, - ) - - readyNodeCountByOsProvisionerZone = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: metricSubsystemCapacity, - Name: "ready_node_os_count", - 
Help: "Count of nodes that are ready by provisioner, and zone.", - }, - []string{ - metricLabelProvisioner, - metricLabelZone, - }, - ) -) - -func init() { - crmetrics.Registry.MustRegister(nodeCountByProvisioner) - crmetrics.Registry.MustRegister(readyNodeCountByProvisionerZone) - crmetrics.Registry.MustRegister(readyNodeCountByArchProvisionerZone) - crmetrics.Registry.MustRegister(readyNodeCountByInstancetypeProvisionerZone) - crmetrics.Registry.MustRegister(readyNodeCountByOsProvisionerZone) -} - -func publishNodeCounts(provisioner string, knownValuesForNodeLabels map[string]sets.String, consumeNodesWith consumeNodesWithFunc) error { - archValues := knownValuesForNodeLabels[nodeLabelArch] - instanceTypeValues := knownValuesForNodeLabels[nodeLabelInstanceType] - zoneValues := knownValuesForNodeLabels[nodeLabelZone] - - errors := make([]error, 0, len(archValues)*len(instanceTypeValues)*len(zoneValues)) - - nodeLabels := client.MatchingLabels{nodeLabelProvisioner: provisioner} - errors = append(errors, consumeNodesWith(nodeLabels, func(nodes []v1.Node) error { - return publishCount(nodeCountByProvisioner, metricLabelsFrom(nodeLabels), len(nodes)) - })) - - for zone := range zoneValues { - nodeLabels = client.MatchingLabels{ - nodeLabelProvisioner: provisioner, - nodeLabelZone: zone, - } - errors = append(errors, consumeNodesWith(nodeLabels, filterReadyNodes(func(readyNodes []v1.Node) error { - return publishCount(readyNodeCountByProvisionerZone, metricLabelsFrom(nodeLabels), len(readyNodes)) - }))) - - for arch := range archValues { - nodeLabels := client.MatchingLabels{ - nodeLabelArch: arch, - nodeLabelProvisioner: provisioner, - nodeLabelZone: zone, - } - errors = append(errors, consumeNodesWith(nodeLabels, filterReadyNodes(func(readyNodes []v1.Node) error { - return publishCount(readyNodeCountByArchProvisionerZone, metricLabelsFrom(nodeLabels), len(readyNodes)) - }))) - } - - for instanceType := range instanceTypeValues { - nodeLabels := client.MatchingLabels{ 
- nodeLabelInstanceType: instanceType, - nodeLabelProvisioner: provisioner, - nodeLabelZone: zone, - } - errors = append(errors, consumeNodesWith(nodeLabels, filterReadyNodes(func(readyNodes []v1.Node) error { - return publishCount(readyNodeCountByInstancetypeProvisionerZone, metricLabelsFrom(nodeLabels), len(readyNodes)) - }))) - } - } - - return multierr.Combine(errors...) -} - -// filterReadyNodes returns a new function that will filter "ready" nodes to pass on -// to `consume`, and returns the result. -func filterReadyNodes(consume nodeListConsumerFunc) nodeListConsumerFunc { - return func(nodes []v1.Node) error { - readyNodes := make([]v1.Node, 0, len(nodes)) - for _, node := range nodes { - for _, condition := range node.Status.Conditions { - if condition.Type == nodeConditionTypeReady && condition.Status == v1.ConditionTrue { - readyNodes = append(readyNodes, node) - } - } - } - return consume(readyNodes) - } -} - -func metricLabelsFrom(nodeLabels map[string]string) prometheus.Labels { - metricLabels := prometheus.Labels{} - // Exclude node label values that not present or are empty strings. - if arch := nodeLabels[nodeLabelArch]; arch != "" { - metricLabels[metricLabelArch] = arch - } - if instanceType := nodeLabels[nodeLabelInstanceType]; instanceType != "" { - metricLabels[metricLabelInstanceType] = instanceType - } - if provisioner := nodeLabels[nodeLabelProvisioner]; provisioner != "" { - metricLabels[metricLabelProvisioner] = provisioner - } - if zone := nodeLabels[nodeLabelZone]; zone != "" { - metricLabels[metricLabelZone] = zone - } - return metricLabels -} diff --git a/pkg/controllers/metrics/pods.go b/pkg/controllers/metrics/pods.go deleted file mode 100644 index 565beeef7bcb..000000000000 --- a/pkg/controllers/metrics/pods.go +++ /dev/null @@ -1,72 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "strings" - - "github.com/aws/karpenter/pkg/metrics" - "github.com/prometheus/client_golang/prometheus" - "go.uber.org/multierr" - v1 "k8s.io/api/core/v1" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -var ( - phaseValues = []v1.PodPhase{ - v1.PodFailed, - v1.PodPending, - v1.PodRunning, - v1.PodSucceeded, - v1.PodUnknown, - } - - podCountByPhaseProvisioner = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: metricSubsystemPods, - Name: "count", - Help: "Total pod count by phase and provisioner.", - }, - []string{ - metricLabelPhase, - metricLabelProvisioner, - }, - ) -) - -func init() { - crmetrics.Registry.MustRegister(podCountByPhaseProvisioner) -} - -func publishPodCounts(provisioner string, podList []v1.Pod) error { - countByPhase := make(map[v1.PodPhase]int, len(phaseValues)) - - for _, pod := range podList { - countByPhase[pod.Status.Phase]++ - } - - errors := make([]error, 0, len(phaseValues)) - - for _, phase := range phaseValues { - metricLabels := prometheus.Labels{ - metricLabelPhase: strings.ToLower(string(phase)), - metricLabelProvisioner: provisioner, - } - errors = append(errors, publishCount(podCountByPhaseProvisioner, metricLabels, countByPhase[phase])) - } - - return multierr.Combine(errors...) 
-} diff --git a/website/content/en/preview/getting-started/grafana-values.yaml b/website/content/en/preview/getting-started/grafana-values.yaml index 708ae9a561c3..3c0b01dcab5d 100644 --- a/website/content/en/preview/getting-started/grafana-values.yaml +++ b/website/content/en/preview/getting-started/grafana-values.yaml @@ -7,3 +7,21 @@ datasources: version: 1 url: http://prometheus-server:80 access: proxy +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default +dashboards: + default: + pod-dashboard: + url: https://karpenter.sh/preview/getting-started/karpenter-pod-metrics.json + node-dashboard: + url: https://karpenter.sh/preview/getting-started/karpenter-node-metrics.json diff --git a/grafana-dashboards/karpenter-controllers-allocation.json b/website/content/en/preview/getting-started/karpenter-controllers-allocation.json similarity index 100% rename from grafana-dashboards/karpenter-controllers-allocation.json rename to website/content/en/preview/getting-started/karpenter-controllers-allocation.json diff --git a/grafana-dashboards/karpenter-controllers.json b/website/content/en/preview/getting-started/karpenter-controllers.json similarity index 100% rename from grafana-dashboards/karpenter-controllers.json rename to website/content/en/preview/getting-started/karpenter-controllers.json diff --git a/website/content/en/preview/getting-started/karpenter-node-metrics.json b/website/content/en/preview/getting-started/karpenter-node-metrics.json new file mode 100644 index 000000000000..11783d5a9efc --- /dev/null +++ b/website/content/en/preview/getting-started/karpenter-node-metrics.json @@ -0,0 +1,791 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", 
+ "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": null, + "graphTooltip": 0, + "id": 5, + "iteration": 1640028599664, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "((karpenter_nodes_total_pod_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 + karpenter_nodes_total_daemon_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0) *100) / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", 
capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "interval": "", + "legendFormat": "{{node_name}}", + "refId": "A" + } + ], + "title": "Node Utilization", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "((karpenter_nodes_total_pod_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 + karpenter_nodes_total_daemon_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0 ) *100) / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", 
instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "interval": "", + "legendFormat": "{{node_name}}", + "refId": "A" + } + ], + "title": "Node Overcommitment", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.2.5", + "targets": [ + { + "exemplar": true, + "expr": "sum by (resource_type)(karpenter_nodes_allocatable{resource_type!=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_total_overhead{resource_type!=\"pods\"}*0 or karpenter_nodes_total_requests{resource_type!=\"pods\"}*0)- sum by (resource_type)(karpenter_nodes_total_overhead{resource_type!=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable{resource_type!=\"pods\"}*0 or karpenter_nodes_total_requests{resource_type!=\"pods\"}*0) - sum by (resource_type)(karpenter_nodes_total_requests{resource_type!=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable{resource_type!=\"pods\"}*0 or 
karpenter_nodes_total_overhead{resource_type!=\"pods\"}*0)", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{resource_type}}", + "refId": "Residual" + }, + { + "exemplar": true, + "expr": "sum (karpenter_nodes_allocatable{resource_type=\"pods\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}) - sum (karpenter_pods_state{})", + "hide": false, + "interval": "", + "legendFormat": "pods", + "refId": "Number of Pods" + } + ], + "title": "Cluster Residual Capacity", + "type": "gauge" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "custom.width", + "value": 323 + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 2, + "options": { + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.2.5", + "targets": [ + { + "exemplar": true, + "expr": "karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Allocatable CPU", + "refId": "Allocatable" + }, + { + "exemplar": true, + "expr": "karpenter_nodes_total_pod_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", 
node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Requests CPU", + "refId": "Pod Requests" + }, + { + "exemplar": true, + "expr": "karpenter_nodes_total_daemon_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Requests CPU", + "refId": "Daemon Requests" + }, + { + "exemplar": true, + "expr": "karpenter_nodes_total_pod_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Limits CPU", + "refId": "Pod Limits" + }, + { + "exemplar": true, + "expr": "karpenter_nodes_total_daemon_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Limits CPU", + "refId": "Daemon Limits" + }, + { + "exemplar": false, + "expr": "karpenter_nodes_system_overhead{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Overhead CPU", + "refId": "Overhead" + }, + { + "exemplar": true, + "expr": "((karpenter_nodes_total_pod_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", 
provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0) + (karpenter_nodes_total_daemon_requests{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0)) *100 / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Utilization CPU", + "refId": "Utilization" + }, + { + "exemplar": true, + "expr": "((karpenter_nodes_total_pod_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0) + (karpenter_nodes_total_daemon_limits{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"} or karpenter_nodes_allocatable*0)) *100 / karpenter_nodes_allocatable{resource_type=\"$resource_type\", arch=~\"$arch\", capacity_type=~\"$capacity_type\", instance_type=~\"$instance_type\", provisioner=~\"$provisioner\", zone=~\"$zone\", node_name=~\"$node_name\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Overcommitment CPU", + "refId": "Overcommitment" + } + ], + "title": "Current Node Metrics", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "node_name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + 
"Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "arch 1": true, + "arch 2": true, + "arch 3": true, + "arch 4": true, + "arch 5": true, + "arch 6": true, + "arch 7": true, + "capacity_type 1": true, + "capacity_type 2": true, + "capacity_type 3": true, + "capacity_type 4": true, + "capacity_type 5": true, + "capacity_type 6": true, + "capacity_type 7": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_type 1": true, + "instance_type 2": true, + "instance_type 3": true, + "instance_type 4": true, + "instance_type 5": true, + "instance_type 6": true, + "instance_type 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true, + "node_name": false, + "provisioner 1": true, + "provisioner 2": true, + "provisioner 3": true, + "provisioner 4": true, + "provisioner 5": true, + "provisioner 6": true, + "provisioner 7": true, + "resource_type 1": true, + "resource_type 2": true, + "resource_type 3": true, + "resource_type 4": true, + "resource_type 5": true, + "resource_type 6": true, + "resource_type 7": true, + "zone 1": true, + "zone 2": true, + "zone 3": true, + "zone 4": true, + "zone 5": true, + "zone 6": true, + "zone 7": true + }, + "indexByName": { + "Time 1": 7, + "Time 2": 17, + "Time 3": 27, + "Time 4": 37, + "Time 5": 47, + "Time 6": 56, + "Value #Allocatable": 1, + "Value #Overcommitment": 6, + "Value #Overhead": 4, + "Value #Pod Limits": 3, + "Value #Pod Requests": 2, + "Value #Utilization": 5, + "__name__ 1": 8, + "__name__ 2": 18, + "__name__ 3": 28, + "__name__ 4": 38, + "arch 1": 9, + "arch 2": 19, + "arch 3": 29, + "arch 4": 39, + "arch 5": 48, + "arch 6": 57, + "capacity_type 1": 10, + "capacity_type 2": 20, + "capacity_type 3": 30, + "capacity_type 4": 40, + 
"capacity_type 5": 49, + "capacity_type 6": 58, + "instance 1": 11, + "instance 2": 21, + "instance 3": 31, + "instance 4": 41, + "instance 5": 50, + "instance 6": 59, + "instance_type 1": 12, + "instance_type 2": 22, + "instance_type 3": 32, + "instance_type 4": 42, + "instance_type 5": 51, + "instance_type 6": 60, + "job 1": 13, + "job 2": 23, + "job 3": 33, + "job 4": 43, + "job 5": 52, + "job 6": 61, + "node_name": 0, + "provisioner 1": 14, + "provisioner 2": 24, + "provisioner 3": 34, + "provisioner 4": 44, + "provisioner 5": 53, + "provisioner 6": 62, + "resource_type 1": 15, + "resource_type 2": 25, + "resource_type 3": 35, + "resource_type 4": 45, + "resource_type 5": 54, + "resource_type 6": 63, + "zone 1": 16, + "zone 2": 26, + "zone 3": 36, + "zone 4": 46, + "zone 5": 55, + "zone 6": 64 + }, + "renameByName": { + "Value #Allocatable": "Allocatable", + "Value #Daemon Requests": "Requests from Daemon Set", + "Value #Overcommitment": "Overcommitment Rate %", + "Value #Overhead": "System Overhead", + "Value #Pod Limits": "Limits from Pods", + "Value #Pod Requests": "Requests from Pods", + "Value #Utilization": "Utilization Rate %", + "name": "Node Name" + } + } + } + ], + "type": "table" + } + ], + "refresh": "", + "schemaVersion": 32, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "amd64" + ], + "value": [ + "amd64" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(arch)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "arch", + "options": [], + "query": { + "query": "label_values(arch)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": 
"label_values(capacity_type)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "capacity_type", + "options": [], + "query": { + "query": "label_values(capacity_type)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(instance_type)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance_type", + "options": [], + "query": { + "query": "label_values(instance_type)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(provisioner)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "provisioner", + "options": [], + "query": { + "query": "label_values(provisioner)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(zone)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "zone", + "options": [], + "query": { + "query": "label_values(zone)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": true, + 
"text": "cpu", + "value": "cpu" + }, + "datasource": "Prometheus", + "definition": "label_values(resource_type)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "resource_type", + "options": [], + "query": { + "query": "label_values(resource_type)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(node_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "node_name", + "options": [], + "query": { + "query": "label_values(node_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Cluster Capacity", + "uid": "GwdOTionz", + "version": 44 +} diff --git a/grafana-dashboards/karpenter-pod-metrics.json b/website/content/en/preview/getting-started/karpenter-pod-metrics.json similarity index 100% rename from grafana-dashboards/karpenter-pod-metrics.json rename to website/content/en/preview/getting-started/karpenter-pod-metrics.json