Skip to content

Commit

Permalink
Merge pull request grafana/cortex-jsonnet#387 from stevesg/am-notifications-rules
Browse files Browse the repository at this point in the history

Add recording rules for Alertmanager dashboard.
  • Loading branch information
stevesg authored Sep 22, 2021
2 parents 699334b + 1033b9d commit c0d2408
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 19 deletions.
38 changes: 19 additions & 19 deletions jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
})
.addPanel(
$.panel('Total Alerts') +
$.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short')
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
)
.addPanel(
$.panel('Total Silences') +
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short')
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
)
.addPanel(
$.panel('Tenants') +
Expand All @@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel(
[
|||
sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval]))
sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s})
-
sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))
sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
],
['success', 'failed']
)
Expand All @@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel(
[
|||
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval]))
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s})
-
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
],
['success', 'failed']
)
Expand All @@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
[
|||
(
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration)
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration)
-
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)
) > 0
or on () vector(0)
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'),
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'),
],
['success - {{ integration }}', 'failed - {{ integration }}']
)
Expand Down Expand Up @@ -104,15 +104,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.panel('Per %s Alerts' % $._config.per_instance_label) +
$.queryPanel(
'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
)
.addPanel(
$.panel('Per %s Silences' % $._config.per_instance_label) +
$.queryPanel(
'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
Expand Down Expand Up @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel(
[
|||
sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval]))
sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s})
-
sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))
sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
],
['success', 'failed']
)
Expand All @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel(
[
|||
sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval]))
sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s})
-
sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))
sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
],
['success', 'failed']
)
Expand Down
66 changes: 66 additions & 0 deletions jsonnet/mimir-mixin/recording_rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,72 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
],
},
// Rule group 'cortex_alertmanager_rules': recording rules that pre-aggregate
// Alertmanager metrics. In every rule the record-name prefix (e.g. `cluster_job`)
// mirrors the labels kept by the `sum by (...)` clause, and the suffix (`:sum`,
// `:rate5m`) names the operation applied in `expr`.
{
name: 'cortex_alertmanager_rules',
rules: [
// Aggregations of per-user Alertmanager metrics used in dashboards.
// Current alert count, summed directly (no rate()) by cluster, job and the
// configured per-instance label. $._config.per_instance_label is substituted
// into both the record name and the `by` clause, so the two stay in sync.
{
record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label,
expr: |||
sum by (cluster, job, %s) (cortex_alertmanager_alerts)
||| % $._config.per_instance_label,
},
// Current silence count, aggregated the same way as the alerts rule above.
{
record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label,
expr: |||
sum by (cluster, job, %s) (cortex_alertmanager_silences)
||| % $._config.per_instance_label,
},
// Rate of received alerts over a fixed 5m window, summed by cluster and job.
{
record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m',
expr: |||
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
|||,
},
// Rate of invalid alerts over a fixed 5m window, summed by cluster and job.
{
record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m',
expr: |||
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
|||,
},
// Rate of notifications over a fixed 5m window. Unlike the other rate rules,
// this one also keeps the `integration` label.
{
record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m',
expr: |||
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
|||,
},
// Rate of failed notifications over a fixed 5m window, also keeping `integration`.
{
record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m',
expr: |||
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
|||,
},
// Rate of state replication attempts over a fixed 5m window, summed by cluster and job.
{
record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m',
expr: |||
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
|||,
},
// Rate of failed state replications over a fixed 5m window, summed by cluster and job.
{
record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m',
expr: |||
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
|||,
},
// Rate of partial state merges over a fixed 5m window, summed by cluster and job.
{
record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m',
expr: |||
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
|||,
},
// Rate of failed partial state merges over a fixed 5m window, summed by cluster and job.
{
record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m',
expr: |||
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
|||,
},
],
},
],
},
}

0 comments on commit c0d2408

Please sign in to comment.