From 37a0eb92db61c84c2b31471f6bae165e06f5f78c Mon Sep 17 00:00:00 2001 From: Jan Horstmann Date: Sat, 9 Nov 2024 15:52:02 +0100 Subject: [PATCH] Add blackbox-exporter mixin Signed-off-by: Jan Horstmann --- .src/jsonnetfile.json | 8 + .src/jsonnetfile.lock.json | 10 + .src/mixins/infrastructure/blackbox.libsonnet | 7 + .../infrastructure/blackbox-exporter.json | 1001 +++++++++++++++++ prometheus/blackbox.rec.rules | 1 + prometheus/blackbox.rules | 33 + 6 files changed, 1060 insertions(+) create mode 100644 .src/mixins/infrastructure/blackbox.libsonnet create mode 100644 grafana/dashboards/infrastructure/blackbox-exporter.json create mode 100644 prometheus/blackbox.rec.rules create mode 100644 prometheus/blackbox.rules diff --git a/.src/jsonnetfile.json b/.src/jsonnetfile.json index 148f54c..d8c0a2d 100644 --- a/.src/jsonnetfile.json +++ b/.src/jsonnetfile.json @@ -10,6 +10,14 @@ }, "version": "main" }, + { + "source": { + "git": { + "remote": "https://github.com/adinhodovic/blackbox-exporter-mixin.git" + } + }, + "version": "main" + }, { "source": { "git": { diff --git a/.src/jsonnetfile.lock.json b/.src/jsonnetfile.lock.json index 9cbfdd9..cd420c0 100644 --- a/.src/jsonnetfile.lock.json +++ b/.src/jsonnetfile.lock.json @@ -1,6 +1,16 @@ { "version": 1, "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/adinhodovic/blackbox-exporter-mixin.git", + "subdir": "" + } + }, + "version": "bbc578d63631d320d161bc1968b2362bdc49e339", + "sum": "xtZC6Wm4L5ks4fyiDAc2or//vnFs/sartBRYucFeY4s=" + }, { "source": { "git": { diff --git a/.src/mixins/infrastructure/blackbox.libsonnet b/.src/mixins/infrastructure/blackbox.libsonnet new file mode 100644 index 0000000..af06b63 --- /dev/null +++ b/.src/mixins/infrastructure/blackbox.libsonnet @@ -0,0 +1,7 @@ +local blackbox = import "blackbox-exporter-mixin/mixin.libsonnet"; + +blackbox { + prometheusRules+: {}, + prometheusAlerts+: {}, + grafanaDashboards+: {} +} diff --git 
a/grafana/dashboards/infrastructure/blackbox-exporter.json b/grafana/dashboards/infrastructure/blackbox-exporter.json new file mode 100644 index 0000000..dd74894 --- /dev/null +++ b/grafana/dashboards/infrastructure/blackbox-exporter.json @@ -0,0 +1,1001 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors the Blackbox-exporter. It is created using the [blackbox-exporter-mixin](https://github.com/adinhodovic/blackbox-exporter-mixin) for the [blackbox-exporter](https://github.com/prometheus/blackbox_exporter).", + "editable": true, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "links": [ + { + "targetBlank": true, + "title": "Go To Probe", + "type": "link", + "url": "d/blackbox-exporter-j4da/blackbox-exporter?var-instance=${__field.labels.instance}&var-job=${__field.labels.job}" + } + ], + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "Down" + }, + "1": { + "color": "green", + "text": "Up" + } + }, + "type": "value" + } + ], + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "text": { + "titleSize": 18, + "valueSize": 18 + }, + "textMode": "value_and_name" + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_success{\n job=~\"$job\"\n}\n", + "legendFormat": "{{instance}}" + } + ], + "title": "Status Map", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + 
"value": 0.001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n probe_success{\n job=~\"$job\"\n }\n)\n" + } + ], + "title": "Probes", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 0.98999999999999999 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 6 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "(\n count(\n probe_success{\n job=~\"$job\"\n } == 1\n )\n OR vector(0)\n) /\ncount(\n probe_success{\n job=~\"$job\"\n }\n)\n" + } + ], + "title": "Probes Success", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 6 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n probe_http_ssl{\n job=~\"$job\"\n } == 1\n) /\ncount(\n probe_http_version{\n job=~\"$job\"\n }\n)\n" + } + ], + "title": "Probes SSL", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": 
"-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 6 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "avg(\n probe_duration_seconds{\n job=~\"$job\"\n }\n)\n" + } + ], + "title": "Probe Average Duration", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 7, + "repeat": "instance", + "title": "$instance", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 0.98999999999999999 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 8, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "mean" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_success{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 0.98999999999999999 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 9, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "mean" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "avg_over_time(\n probe_success{\n job=~\"$job\",\n instance=~\"$instance\"\n }[30d]\n)\n" + } + ], + "title": "Uptime 30d", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "No" + }, + "1": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + } + ], + "unit": "short" + } + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 15 + }, + "id": 10, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_success{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n", + "instant": true + } + ], + "title": "Probe Success", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "blue", + "value": 300 + }, + { + "color": "yellow", + "value": 400 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 15 + }, + "id": 11, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_http_status_code{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n", + "instant": true + } + ], + "title": "Latest Response Code", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "No" + }, + "1": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + } + ], + "unit": 
"short" + } + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 17 + }, + "id": 12, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_http_ssl{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n", + "instant": true + } + ], + "title": "SSL", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 17 + }, + "id": 13, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "name" + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_tls_version_info{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n", + "instant": true, + "legendFormat": "{{version}}" + } + ], + "title": "SSL Version", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1814400 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 19 + }, + "id": 14, + "options": { + "colorMode": "background", + "graphMode": "none" + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_ssl_earliest_cert_expiry{\n job=~\"$job\",\n instance=~\"$instance\"\n} - time()\n" + } + ], + "title": "SSL Certificate Expiry", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { 
+ "mappings": [ + { + "options": { + "0": { + "color": "green", + "text": "No" + }, + "1": { + "color": "blue", + "text": "Yes" + } + }, + "type": "value" + } + ], + "unit": "short" + } + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 22 + }, + "id": 15, + "options": { + "colorMode": "background", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_http_redirects{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n", + "instant": true + } + ], + "title": "Redirects", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 22 + }, + "id": 16, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_http_version{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n", + "instant": true, + "legendFormat": "{{version}}" + } + ], + "title": "HTTP Version", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 25 + }, + "id": 17, + "options": { + "reduceOptions": { + "calcs": [ + "mean" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_duration_seconds{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n" + } + ], + "title": "Average Latency", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 25 + }, + "id": 18, + "options": { + 
"reduceOptions": { + "calcs": [ + "mean" + ] + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "probe_dns_lookup_time_seconds{\n job=~\"$job\",\n instance=~\"$instance\"\n}\n" + } + ], + "title": "Average Latency", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 10, + "w": 18, + "x": 6, + "y": 11 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n probe_http_duration_seconds{\n job=~\"$job\",\n instance=~\"$instance\"\n }\n) by (instance)\n", + "legendFormat": "HTTP duration" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n probe_duration_seconds{\n job=~\"$job\",\n instance=~\"$instance\"\n }\n) by (instance)\n", + "legendFormat": "Total probe duration" + } + ], + "title": "Probe Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "percent" + } + }, + "unit": "s" + } + }, + "gridPos": { + "h": 10, + "w": 18, + "x": 6, + "y": 21 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n probe_http_duration_seconds{\n job=~\"$job\",\n instance=~\"$instance\"\n }\n) by (phase)\n", + "legendFormat": "{{ phase }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n probe_icmp_duration_seconds{\n job=~\"$job\",\n instance=~\"$instance\"\n }\n) by (phase)\n", + "legendFormat": "{{ phase }}" + } + ], + "title": "Probe Phases", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "blackbox-exporter", + "blackbox-exporter-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values(probe_success{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Instance", + "multi": false, + "name": "instance", + "query": "label_values(probe_success{job=~\"$job\"}, instance)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timezone": "utc", + "title": "Blackbox Exporter", + "uid": "blackbox-exporter-j4da" +} diff --git a/prometheus/blackbox.rec.rules b/prometheus/blackbox.rec.rules new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/prometheus/blackbox.rec.rules @@ -0,0 +1 @@ +{} diff --git a/prometheus/blackbox.rules b/prometheus/blackbox.rules new file mode 100644 index 0000000..481b6fc --- /dev/null +++ b/prometheus/blackbox.rules @@ -0,0 +1,33 @@ +"groups": +- "name": "blackbox-exporter.rules" + "rules": + - "alert": "BlackboxProbeFailed" + "annotations": + "dashboard_url": "https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?instance={{ $labels.instance }}" 
+ "description": "The probe failed for the instance {{ $labels.instance }}." + "summary": "Probe has failed for the past 1m interval." + "expr": | + probe_success{job="blackbox-exporter"} == 0 + "for": "1m" + "labels": + "severity": "critical" + - "alert": "BlackboxLowUptime30d" + "annotations": + "dashboard_url": "https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?instance={{ $labels.instance }}" + "description": "The probe has a lower uptime than 99.9% the last 30 days for the instance {{ $labels.instance }}." + "summary": "Probe uptime is lower than 99.9% for the last 30 days." + "expr": | + avg_over_time(probe_success{job="blackbox-exporter"}[30d]) * 100 < 99.900000000000006 + "labels": + "severity": "info" + - "alert": "BlackboxSslCertificateWillExpireSoon" + "annotations": + "dashboard_url": "https://grafana.com/d/blackbox-exporter-j4da/blackbox-exporter?instance={{ $labels.instance }}" + "description": | + The SSL certificate of the instance {{ $labels.instance }} is expiring within 21 days. + Actual time left: {{ $value | humanizeDuration }}. + "summary": "SSL certificate will expire soon." + "expr": | + probe_ssl_earliest_cert_expiry{job="blackbox-exporter"} - time() < 21 * 24 * 3600 + "labels": + "severity": "warning"