diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
index 14367defa7..41e135b24c 100644
--- a/examples/alerts/alerts.md
+++ b/examples/alerts/alerts.md
@@ -64,39 +64,84 @@ rules:
 
 For Thanos ruler we run some alerts in local Prometheus, to make sure that Thanos Rule is working:
 
-[//]: # "TODO(kakkoyun): Generate rule rules using thanos-mixin."
-
+[embedmd]:# (../tmp/thanos-ruler.rules.yaml yaml)
 ```yaml
-- alert: ThanosRuleIsDown
-  expr: up{app="thanos-ruler"} == 0 or absent(up{app="thanos-ruler"})
+name: thanos-ruler.rules
+rules:
+- alert: ThanosRulerQueueIsDroppingAlerts
+  annotations:
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
+  expr: |
+    sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
   for: 5m
   labels:
-    team: TEAM
+    severity: critical
+- alert: ThanosRulerSenderIsFailingAlerts
   annotations:
-    summary: Thanos Rule is down
-    impact: Alerts are not working
-    action: 'check {{ $labels.kubernetes_pod_name }} pod in {{ $labels.kubernetes_namespace}} namespace'
-    dashboard: RULE_DASHBOARD
-- alert: ThanosRuleIsDroppingAlerts
-  expr: rate(thanos_alert_queue_alerts_dropped_total{app="thanos-ruler"}[5m]) > 0
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts
+      to alertmanager.
+  expr: |
+    sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
   for: 5m
   labels:
-    team: TEAM
+    severity: critical
+- alert: ThanosRulerHighRuleEvaluationFailures
   annotations:
-    summary: Thanos Rule is dropping alerts
-    impact: Alerts are not working
-    action: 'check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace'
-    dashboard: RULE_DASHBOARD
-- alert: ThanosRuleGrpcErrorRate
-  expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",app="thanos-ruler"}[5m]) > 0
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
+  expr: |
+    (
+      sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-ruler.*"}[5m]))
+    /
+      sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-ruler.*"}[5m]))
+    * 100 > 5
+    )
   for: 5m
   labels:
-    team: TEAM
+    severity: warning
+- alert: ThanosRulerHighRuleEvaluationWarnings
   annotations:
-    summary: Thanos Rule is returning Internal/Unavailable errors
-    impact: Recording Rules are not working
-    action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace
-    dashboard: RULE_DASHBOARD
+    message: Thanos Ruler {{$labels.job}} {{$labels.pod}} has a high number of evaluation
+      warnings.
+  expr: |
+    sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0
+  for: 15m
+  labels:
+    severity: warning
+- alert: ThanosRulerRuleEvaluationLatencyHigh
+  annotations:
+    message: Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
+      than interval for {{$labels.rule_group}}.
+  expr: |
+    (
+      sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
+    >
+      sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
+    )
+  for: 5m
+  labels:
+    severity: warning
+- alert: ThanosRulerGrpcErrorRate
+  annotations:
+    message: Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize
+      }}% of requests.
+  expr: |
+    (
+      sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-ruler.*"}[5m]))
+    /
+      sum by (job) (rate(grpc_server_started_total{job=~"thanos-ruler.*"}[5m]))
+    * 100 > 5
+    )
+  for: 5m
+  labels:
+    severity: warning
+- alert: ThanosRulerConfigReloadFailure
+  annotations:
+    message: Thanos Ruler {{$labels.job}} has not been able to reload its configuration.
+  expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-ruler.*"}) by (job)
+    != 1
+  for: 5m
+  labels:
+    severity: warning
 ```
 
 ## Store Gateway
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
index 23f10b655f..0eb4fda1d1 100644
--- a/examples/alerts/alerts.yaml
+++ b/examples/alerts/alerts.yaml
@@ -277,6 +277,83 @@ groups:
     for: 10m
     labels:
       severity: warning
+- name: thanos-ruler.rules
+  rules:
+  - alert: ThanosRulerQueueIsDroppingAlerts
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
+    expr: |
+      sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: ThanosRulerSenderIsFailingAlerts
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts
+        to alertmanager.
+    expr: |
+      sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-ruler.*"}[5m])) > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: ThanosRulerHighRuleEvaluationFailures
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate
+        rules.
+    expr: |
+      (
+        sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-ruler.*"}[5m]))
+      /
+        sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-ruler.*"}[5m]))
+      * 100 > 5
+      )
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerHighRuleEvaluationWarnings
+    annotations:
+      message: Thanos Ruler {{$labels.job}} {{$labels.pod}} has a high number of evaluation
+        warnings.
+    expr: |
+      sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-ruler.*"}[5m])) > 0
+    for: 15m
+    labels:
+      severity: warning
+  - alert: ThanosRulerRuleEvaluationLatencyHigh
+    annotations:
+      message: Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation
+        latency than interval for {{$labels.rule_group}}.
+    expr: |
+      (
+        sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-ruler.*"})
+      >
+        sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-ruler.*"})
+      )
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerGrpcErrorRate
+    annotations:
+      message: Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize
+        }}% of requests.
+    expr: |
+      (
+        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-ruler.*"}[5m]))
+      /
+        sum by (job) (rate(grpc_server_started_total{job=~"thanos-ruler.*"}[5m]))
+      * 100 > 5
+      )
+    for: 5m
+    labels:
+      severity: warning
+  - alert: ThanosRulerConfigReloadFailure
+    annotations:
+      message: Thanos Ruler {{$labels.job}} has not been able to reload its configuration.
+    expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-ruler.*"}) by
+      (job) != 1
+    for: 5m
+    labels:
+      severity: warning
 - name: thanos-component-absent.rules
   rules:
   - alert: ThanosCompactorIsDown
diff --git a/examples/dashboards/ruler.json b/examples/dashboards/ruler.json
index fa2596da4a..6dc5f85a25 100644
--- a/examples/dashboards/ruler.json
+++ b/examples/dashboards/ruler.json
@@ -347,6 +347,174 @@
          "title": "Alert Sent",
          "titleSize": "h6"
       },
+      {
+         "collapse": false,
+         "height": "250px",
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "description": "Shows rate of queued alerts.",
+               "fill": 1,
+               "id": 5,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum(rate(thanos_alert_queue_alerts_pushed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, pod)",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{job}} {{pod}}",
+                     "legendLink": null,
+                     "step": 10
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Push Rate",
+               "tooltip": {
+                  "shared": false,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            },
+            {
+               "aliasColors": {
+                  "error": "#E24D42"
+               },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "description": "Shows ratio of dropped alerts compared to the total number of queued alerts.",
+               "fill": 10,
+               "id": 6,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum(rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_alert_queue_alerts_pushed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "error",
+                     "refId": "A",
+                     "step": 10
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Drop Ratio",
+               "tooltip": {
+                  "shared": false,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
"showTitle": true, + "title": "Alert Queue", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -378,7 +546,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests.", "fill": 10, - "id": 5, + "id": 7, "legend": { "avg": false, "current": false, @@ -457,7 +625,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 6, + "id": 8, "legend": { "avg": false, "current": false, @@ -534,7 +702,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles.", "fill": 1, - "id": 7, + "id": 9, "legend": { "avg": false, "current": false, @@ -639,7 +807,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests.", "fill": 10, - "id": 8, + "id": 10, "legend": { "avg": false, "current": false, @@ -716,7 +884,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 9, + "id": 11, "legend": { "avg": false, "current": false, @@ -793,7 +961,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles.", "fill": 1, - "id": 10, + "id": 12, "legend": { "avg": false, "current": false, @@ -917,7 +1085,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests.", "fill": 10, - "id": 11, + "id": 13, "legend": { "avg": false, "current": false, @@ -996,7 +1164,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 12, + "id": 14, "legend": { "avg": false, "current": false, @@ -1073,7 +1241,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles", "fill": 1, - "id": 13, + "id": 15, "legend": { "avg": false, "current": false, @@ -1178,7 +1346,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests.", "fill": 10, - "id": 14, + "id": 16, "legend": { "avg": false, "current": false, @@ -1255,7 +1423,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 15, + "id": 17, "legend": { "avg": false, "current": false, @@ -1332,7 +1500,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles", "fill": 1, - "id": 16, + "id": 18, "legend": { "avg": false, "current": false, @@ -1436,7 +1604,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 17, + "id": 19, "legend": { "avg": false, "current": false, @@ -1552,7 +1720,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 20, "legend": { "avg": false, "current": false, @@ -1628,7 +1796,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 19, + "id": 21, "legend": { "avg": false, "current": false, diff --git a/mixin/thanos/alerts/alerts.libsonnet b/mixin/thanos/alerts/alerts.libsonnet index 0eb63dc98d..e3fa004090 100644 --- a/mixin/thanos/alerts/alerts.libsonnet +++ b/mixin/thanos/alerts/alerts.libsonnet @@ -3,4 +3,5 @@ (import 'receiver.libsonnet') + (import 'sidecar.libsonnet') + (import 'store.libsonnet') + +(import 'ruler.libsonnet') + (import 'absent.libsonnet') diff --git a/mixin/thanos/alerts/ruler.libsonnet b/mixin/thanos/alerts/ruler.libsonnet new file mode 100644 index 0000000000..08de10a23b --- /dev/null 
+++ b/mixin/thanos/alerts/ruler.libsonnet
@@ -0,0 +1,121 @@
+{
+  local thanos = self,
+  ruler+:: {
+    jobPrefix: error 'must provide job prefix for Thanos Ruler alerts',
+    selector: error 'must provide selector for Thanos Ruler alerts',
+  },
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'thanos-ruler.rules',
+        rules: [
+          {
+            alert: 'ThanosRulerQueueIsDroppingAlerts',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to queue alerts.',
+            },
+            expr: |||
+              sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+          },
+          {
+            alert: 'ThanosRulerSenderIsFailingAlerts',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.',
+            },
+            expr: |||
+              sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+          },
+          {
+            alert: 'ThanosRulerHighRuleEvaluationFailures',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.',
+            },
+            expr: |||
+              (
+                sum by (job) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m]))
+              /
+                sum by (job) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m]))
+              * 100 > 5
+              )
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerHighRuleEvaluationWarnings',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} {{$labels.pod}} has a high number of evaluation warnings.',
+            },
+            expr: |||
+              sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0
+            ||| % thanos.ruler,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerRuleEvaluationLatencyHigh',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.',
+            },
+            expr: |||
+              (
+                sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
+              >
+                sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
+              )
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerGrpcErrorRate',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.',
+            },
+            expr: |||
+              (
+                sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
+              /
+                sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m]))
+              * 100 > 5
+              )
+            ||| % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+          {
+            alert: 'ThanosRulerConfigReloadFailure',
+            annotations: {
+              message: 'Thanos Ruler {{$labels.job}} has not been able to reload its configuration.',
+            },
+            expr: 'avg(thanos_rule_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.ruler,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/mixin/thanos/dashboards/ruler.libsonnet b/mixin/thanos/dashboards/ruler.libsonnet
index 941c38b417..067e4d1af2 100644
--- a/mixin/thanos/dashboards/ruler.libsonnet
+++ b/mixin/thanos/dashboards/ruler.libsonnet
@@ -39,6 +39,23 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
           g.latencyPanel('thanos_alert_sender_latency_seconds', 'namespace="$namespace",job=~"$job"'),
'namespace="$namespace",job=~"$job"'), ) ) + .addRow( + g.row('Alert Queue') + .addPanel( + g.panel('Push Rate', 'Shows rate of queued alerts.') + + g.queryPanel( + 'sum(rate(thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, pod)', + '{{job}} {{pod}}' + ) + ) + .addPanel( + g.panel('Drop Ratio', 'Shows ratio of dropped alerts compared to the total number of queued alerts.') + + g.qpsErrTotalPanel( + 'thanos_alert_queue_alerts_dropped_total{namespace="$namespace",job=~"$job"}', + 'thanos_alert_queue_alerts_pushed_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) .addRow( g.row('gRPC (Unary)') .addPanel(