Skip to content

Commit

Permalink
Add some examples of per-queue alerts/dashboards (#865)
Browse files Browse the repository at this point in the history
* Add some examples of per-queue alerts/dashboards
* Use cleanly exported grafana dashboard definition

Co-authored-by: Alexey Lebedeff <[email protected]>
  • Loading branch information
binarin and binarin authored Oct 14, 2021
1 parent d4312c7 commit 2446575
Show file tree
Hide file tree
Showing 5 changed files with 387 additions and 0 deletions.
304 changes: 304 additions & 0 deletions observability/grafana/dashboards/rabbitmq-queue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: rabbitmq-queue-grafana-dashboard
labels:
grafana_dashboard: "1"
data:
rabbitmq-queue-grafana-dashboard.json: |-
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "7.5.3"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1633508002435,
"links": [],
"panels": [
{
"datasource": null,
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "Messages",
"axisPlacement": "left",
"axisSoftMin": 0,
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"graph": false,
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Consumers"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "unit",
"value": "prefix:"
},
{
"id": "custom.axisLabel",
"value": "Consumers"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Messages"
},
"properties": [
{
"id": "custom.drawStyle",
"value": "line"
},
{
"id": "custom.fillOpacity",
"value": 0
}
]
}
]
},
"gridPos": {
"h": 17,
"w": 11,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
},
"tooltipOptions": {
"mode": "single"
}
},
"targets": [
{
"exemplar": true,
"expr": "(rabbitmq_detailed_queue_messages{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"interval": "",
"legendFormat": "Messages",
"refId": "A"
},
{
"exemplar": true,
"expr": "rabbitmq_detailed_queue_consumers{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"hide": false,
"interval": "",
"legendFormat": "Consumers",
"refId": "B"
}
],
"title": "Queue messages and consumers",
"type": "timeseries"
}
],
"refresh": false,
"schemaVersion": 27,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Prometheus",
"value": "Prometheus"
},
"description": null,
"error": null,
"hide": 2,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values(rabbitmq_identity_info, namespace)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(rabbitmq_identity_info, namespace)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "RabbitMQ Cluster",
"multi": false,
"name": "rabbitmq_cluster",
"options": [],
"query": {
"query": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "Queue",
"multi": false,
"name": "queue",
"options": [],
"query": {
"query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "/.*queue=\"([^\"]+)\".*/",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "RabbitMQ-Queue",
"uid": "j9t8vwH7k",
"version": 1
}
20 changes: 20 additions & 0 deletions observability/prometheus/monitors/rabbitmq-servicemonitor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@ spec:
scrapeTimeout: 14s
tlsConfig:
insecureSkipVerify: true
- port: prometheus
scheme: http
path: /metrics/detailed
params:
family:
- queue_coarse_metrics
- queue_metrics
interval: 15s
scrapeTimeout: 14s
- port: prometheus-tls
scheme: https
path: /metrics/detailed
params:
family:
- queue_coarse_metrics
- queue_metrics
interval: 15s
scrapeTimeout: 14s
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: rabbitmq
Expand Down
5 changes: 5 additions & 0 deletions observability/prometheus/rules/rabbitmq-per-object/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# RabbitMQ per-object rules

RabbitMQ >= 3.9.8 is required for these alerts to function.

These alerts are also highly opinionated and will probably need some tuning before you apply them, e.g. filtering by specific queue names.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: rabbitmq-queue-has-no-consumers
# If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
labels:
role: alert-rules
spec:
groups:
- name: rabbitmq
rules:
- alert: QueueHasNoConsumers
expr: |
(
((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0
) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
for: 10m
annotations:
description: |
Over the last 10 minutes, non-empty queue `{{ $labels.queue }}` with {{ $value }} messages
in virtual host `{{ $labels.vhost }}` didn't have any consumers in
RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
summary: |
Messages are sitting idle in the queue, without any processing.
This alert is highly application specific (and e.g. doesn't make sense for stream queues).
labels:
rulesgroup: rabbitmq
severity: warning
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: rabbitmq-queue-is-growing
# If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
labels:
role: alert-rules
spec:
groups:
- name: rabbitmq
rules:
- alert: QueueIsGrowing
# The threshold is `> 1` rather than `> 0` to tolerate floating point rounding errors in the averaged values
expr: |
(
avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1
) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
for: 10m
annotations:
description: |
Over the last 10 minutes, queue `{{ $labels.queue }}` in virtual host `{{ $labels.vhost }}`
was growing. 10 minute moving average has grown by {{ $value }}.
This happens in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
summary: |
Queue size is steadily growing over time.
labels:
rulesgroup: rabbitmq
severity: warning

0 comments on commit 2446575

Please sign in to comment.