diff --git a/observability/grafana/dashboards/rabbitmq-queue.yml b/observability/grafana/dashboards/rabbitmq-queue.yml
new file mode 100644
index 000000000..82fac265d
--- /dev/null
+++ b/observability/grafana/dashboards/rabbitmq-queue.yml
@@ -0,0 +1,304 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: rabbitmq-queue-grafana-dashboard
+  labels:
+    grafana_dashboard: "1"  # NOTE(review): presumably the label a Grafana dashboard sidecar selects on - confirm against your Grafana deployment
+data:
+  rabbitmq-queue-grafana-dashboard.json: |-  # Grafana dashboard JSON model; the JSON body itself cannot carry comments
+    {
+      "__inputs": [
+        {
+          "name": "DS_PROMETHEUS",
+          "label": "Prometheus",
+          "description": "",
+          "type": "datasource",
+          "pluginId": "prometheus",
+          "pluginName": "Prometheus"
+        }
+      ],
+      "__requires": [
+        {
+          "type": "grafana",
+          "id": "grafana",
+          "name": "Grafana",
+          "version": "7.5.3"
+        },
+        {
+          "type": "datasource",
+          "id": "prometheus",
+          "name": "Prometheus",
+          "version": "1.0.0"
+        },
+        {
+          "type": "panel",
+          "id": "timeseries",
+          "name": "Time series",
+          "version": ""
+        }
+      ],
+      "annotations": {
+        "list": [
+          {
+            "builtIn": 1,
+            "datasource": "-- Grafana --",
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+          }
+        ]
+      },
+      "editable": true,
+      "gnetId": null,
+      "graphTooltip": 0,
+      "id": null,
+      "iteration": 1633508002435,
+      "links": [],
+      "panels": [
+        {
+          "datasource": "${DS_PROMETHEUS}",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "custom": {
+                "axisLabel": "Messages",
+                "axisPlacement": "left",
+                "axisSoftMin": 0,
+                "barAlignment": 0,
+                "drawStyle": "line",
+                "fillOpacity": 0,
+                "gradientMode": "none",
+                "hideFrom": {
+                  "graph": false,
+                  "legend": false,
+                  "tooltip": false,
+                  "viz": false
+                },
+                "lineInterpolation": "linear",
+                "lineWidth": 1,
+                "pointSize": 5,
+                "scaleDistribution": {
+                  "type": "linear"
+                },
+                "showPoints": "auto",
+                "spanNulls": false
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 80
+                  }
+                ]
+              }
+            },
+            "overrides": [
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "Consumers"
+                },
+                "properties": [
+                  {
+                    "id": "custom.axisPlacement",
+                    "value": "right"
+                  },
+                  {
+                    "id": "unit",
+                    "value": "prefix:"
+                  },
+                  {
+                    "id": "custom.axisLabel",
+                    "value": "Consumers"
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "Messages"
+                },
+                "properties": [
+                  {
+                    "id": "custom.drawStyle",
+                    "value": "line"
+                  },
+                  {
+                    "id": "custom.fillOpacity",
+                    "value": 0
+                  }
+                ]
+              }
+            ]
+          },
+          "gridPos": {
+            "h": 17,
+            "w": 11,
+            "x": 0,
+            "y": 0
+          },
+          "id": 2,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "single"
+            },
+            "tooltipOptions": {
+              "mode": "single"
+            }
+          },
+          "targets": [
+            {
+              "exemplar": true,
+              "expr": "(rabbitmq_detailed_queue_messages{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"})",
+              "interval": "",
+              "legendFormat": "Messages",
+              "refId": "A"
+            },
+            {
+              "exemplar": true,
+              "expr": "rabbitmq_detailed_queue_consumers{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"}",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Consumers",
+              "refId": "B"
+            }
+          ],
+          "title": "Queue messages and consumers",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": false,
+      "schemaVersion": 27,
+      "style": "dark",
+      "tags": [],
+      "templating": {
+        "list": [
+          {
+            "current": {
+              "selected": false,
+              "text": "Prometheus",
+              "value": "Prometheus"
+            },
+            "description": null,
+            "error": null,
+            "hide": 2,
+            "includeAll": false,
+            "label": "datasource",
+            "multi": false,
+            "name": "DS_PROMETHEUS",
+            "options": [],
+            "query": "prometheus",
+            "refresh": 1,
+            "regex": "",
+            "skipUrlSync": false,
+            "type": "datasource"
+          },
+          {
+            "allValue": null,
+            "current": {},
+            "datasource": "${DS_PROMETHEUS}",
+            "definition": "label_values(rabbitmq_identity_info, namespace)",
+            "description": null,
+            "error": null,
+            "hide": 0,
+            "includeAll": false,
+            "label": "Namespace",
+            "multi": false,
+            "name": "namespace",
+            "options": [],
+            "query": {
+              "query": "label_values(rabbitmq_identity_info, namespace)",
+              "refId": "StandardVariableQuery"
+            },
+            "refresh": 2,
+            "regex": "",
+            "skipUrlSync": false,
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+          },
+          {
+            "allValue": null,
+            "current": {},
+            "datasource": "${DS_PROMETHEUS}",
+            "definition": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
+            "description": null,
+            "error": null,
+            "hide": 0,
+            "includeAll": false,
+            "label": "RabbitMQ Cluster",
+            "multi": false,
+            "name": "rabbitmq_cluster",
+            "options": [],
+            "query": {
+              "query": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
+              "refId": "StandardVariableQuery"
+            },
+            "refresh": 2,
+            "regex": "",
+            "skipUrlSync": false,
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+          },
+          {
+            "allValue": null,
+            "current": {},
+            "datasource": "${DS_PROMETHEUS}",
+            "definition": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
+            "description": null,
+            "error": null,
+            "hide": 0,
+            "includeAll": false,
+            "label": "Queue",
+            "multi": false,
+            "name": "queue",
+            "options": [],
+            "query": {
+              "query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
+              "refId": "StandardVariableQuery"
+            },
+            "refresh": 2,
+            "regex": "/.*queue=\"([^\"]+)\".*/",
+            "skipUrlSync": false,
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+          }
+        ]
+      },
+      "time": {
+        "from": "now-6h",
+        "to": "now"
+      },
+      "timepicker": {},
+      "timezone": "",
+      "title": "RabbitMQ-Queue",
+      "uid": "j9t8vwH7k",
+      "version": 1
+    }
diff --git a/observability/prometheus/monitors/rabbitmq-servicemonitor.yml b/observability/prometheus/monitors/rabbitmq-servicemonitor.yml
index 5019de974..b4ba4aa09 100644
--- a/observability/prometheus/monitors/rabbitmq-servicemonitor.yml
+++ b/observability/prometheus/monitors/rabbitmq-servicemonitor.yml
@@ -16,6 +16,26 @@ spec:
     scrapeTimeout: 14s
     tlsConfig:
       insecureSkipVerify: true
+  - port: prometheus
+    scheme: http
+    path: /metrics/detailed  # NOTE(review): presumably the endpoint that serves the rabbitmq_detailed_* series used by the queue dashboard and per-object rules - confirm
+    params:  # sent as URL query parameters on every scrape (family=queue_coarse_metrics&family=queue_metrics)
+      family:
+      - queue_coarse_metrics
+      - queue_metrics
+    interval: 15s
+    scrapeTimeout: 14s
+  - port: prometheus-tls  # same detailed scrape as above, over https
+    scheme: https
+    path: /metrics/detailed
+    params:
+      family:
+      - queue_coarse_metrics
+      - queue_metrics
+    interval: 15s
+    scrapeTimeout: 14s
+    tlsConfig:
+      insecureSkipVerify: true
   selector:
     matchLabels:
       app.kubernetes.io/component: rabbitmq
diff --git a/observability/prometheus/rules/rabbitmq-per-object/README.md b/observability/prometheus/rules/rabbitmq-per-object/README.md
new file mode 100644
index 000000000..c4f9b7136
--- /dev/null
+++ b/observability/prometheus/rules/rabbitmq-per-object/README.md
@@ -0,0 +1,5 @@
+# RabbitMQ per-object rules
+
+RabbitMQ >= 3.9.8 is required for these alerts to function.
+
+They are also highly opinionated and will probably require some tuning before being applied, e.g. filtering by specific queue names.
diff --git a/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml b/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml
new file mode 100644
index 000000000..3f95b099d
--- /dev/null
+++ b/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml
@@ -0,0 +1,29 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: rabbitmq-queue-has-no-consumers
+  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: rabbitmq
+    rules:
+    - alert: QueueHasNoConsumers
+      expr: |  # queues in vhost "/" with 0 consumers and > 0 messages, joined with rabbitmq_identity_info to add the rabbitmq_cluster label; queue=~".*" matches every queue and is the place to narrow the alert
+        (
+          ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0
+        ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
+      for: 10m  # the expression must keep returning the series for 10 minutes before the alert fires
+      annotations:
+        description: |
+          Over the last 10 minutes, non-empty queue `{{ $labels.queue }}` with {{ $value }} messages
+          in virtual host `{{ $labels.vhost }}` didn't have any consumers in
+          RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
+        summary: |
+          Messages are sitting idle in the queue, without any processing.
+          This alert is highly application specific (and e.g. doesn't make sense for stream queues).
+      labels:
+        rulesgroup: rabbitmq
+        severity: warning
diff --git a/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml b/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml
new file mode 100644
index 000000000..accbaf48b
--- /dev/null
+++ b/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml
@@ -0,0 +1,29 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: rabbitmq-queue-is-growing
+  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: rabbitmq
+    rules:
+    - alert: QueueIsGrowing
+      # Compares the 10m average of queue messages with the same average taken 1m earlier; the threshold is `> 1` (not `> 0`) because of floating point rounding errors
+      expr: |
+        (
+          avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1
+        ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
+      for: 10m  # the expression must keep returning the series for 10 minutes before the alert fires
+      annotations:
+        description: |
+          Over the last 10 minutes, queue `{{ $labels.queue }}` in virtual host `{{ $labels.vhost }}`
+          was growing. 10 minute moving average has grown by {{ $value }}.
+          This happens in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
+        summary: |
+          Queue size is steadily growing over time.
+      labels:
+        rulesgroup: rabbitmq
+        severity: warning