diff --git a/production/helm/loki/src/alerts.yaml.tpl b/production/helm/loki/src/alerts.yaml.tpl
index 144e263f7061f..0aa37b708b523 100644
--- a/production/helm/loki/src/alerts.yaml.tpl
+++ b/production/helm/loki/src/alerts.yaml.tpl
@@ -52,7 +52,7 @@ groups:
           message: |
             {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
         expr: |
-          sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+          sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
         for: "5m"
         labels:
           severity: "warning"
diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml
index 7c0825d8580d6..09b9b6f543412 100644
--- a/production/loki-mixin-compiled-ssd/alerts.yaml
+++ b/production/loki-mixin-compiled-ssd/alerts.yaml
@@ -4,12 +4,12 @@ groups:
   - alert: LokiRequestErrors
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
       summary: Loki request error rate is high.
     expr: |
-      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
         /
-      sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+      sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
         > 10
     for: 15m
     labels:
@@ -17,16 +17,16 @@ groups:
   - alert: LokiRequestPanics
     annotations:
       description: |
-        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+        {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
       summary: Loki requests are causing code panics.
     expr: |
-      sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+      sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
     labels:
       severity: critical
   - alert: LokiRequestLatency
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
       summary: Loki request error latency is high.
     expr: |
       cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
@@ -39,7 +39,7 @@ groups:
         {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
       summary: Loki deployment is running more than one compactor.
     expr: |
-      sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+      sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
     for: 5m
     labels:
       severity: warning
diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml
index 7c0825d8580d6..09b9b6f543412 100644
--- a/production/loki-mixin-compiled/alerts.yaml
+++ b/production/loki-mixin-compiled/alerts.yaml
@@ -4,12 +4,12 @@ groups:
   - alert: LokiRequestErrors
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
       summary: Loki request error rate is high.
     expr: |
-      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
         /
-      sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+      sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
         > 10
     for: 15m
     labels:
@@ -17,16 +17,16 @@ groups:
   - alert: LokiRequestPanics
     annotations:
       description: |
-        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+        {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
      summary: Loki requests are causing code panics.
     expr: |
-      sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+      sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
     labels:
       severity: critical
   - alert: LokiRequestLatency
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
       summary: Loki request error latency is high.
     expr: |
       cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
@@ -39,7 +39,7 @@ groups:
         {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
       summary: Loki deployment is running more than one compactor.
     expr: |
-      sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+      sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
     for: 5m
     labels:
       severity: warning
diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet
index 5bff18e72c6e5..9261dbccecf99 100644
--- a/production/loki-mixin/alerts.libsonnet
+++ b/production/loki-mixin/alerts.libsonnet
@@ -6,36 +6,36 @@
         rules: [
           {
             alert: 'LokiRequestErrors',
-            expr: |||
-              100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+            expr: std.strReplace(|||
+              100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
                 /
-              sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+              sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
                 > 10
-            |||,
+            |||, 'cluster', $._config.per_cluster_label),
             'for': '15m',
             labels: {
               severity: 'critical',
             },
             annotations: {
               summary: 'Loki request error rate is high.',
-              description: |||
-                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
-              |||,
+              description: std.strReplace(|||
+                {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||, 'cluster', $._config.per_cluster_label),
             },
           },
           {
             alert: 'LokiRequestPanics',
             expr: |||
-              sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
-            |||,
+              sum(increase(loki_panic_total[10m])) by (%s, namespace, job) > 0
+            ||| % $._config.per_cluster_label,
             labels: {
               severity: 'critical',
             },
             annotations: {
               summary: 'Loki requests are causing code panics.',
-              description: |||
-                {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
-              |||,
+              description: std.strReplace(|||
+                {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+              |||, 'cluster', $._config.per_cluster_label),
             },
           },
           {
@@ -49,15 +49,15 @@
             },
             annotations: {
               summary: 'Loki request error latency is high.',
-              description: |||
-                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
-              |||,
+              description: std.strReplace(|||
+                {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||, 'cluster', $._config.per_cluster_label),
             },
           },
           {
             alert: 'LokiTooManyCompactorsRunning',
             expr: |||
-              sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1
+              sum(loki_boltdb_shipper_compactor_running) by (%s, namespace) > 1
             ||| % $._config.per_cluster_label,
             'for': '5m',
             labels: {
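
Note on the alerts.libsonnet hunks above (commentary, not part of the patch): the mixin keeps the cluster label name configurable through $._config.per_cluster_label, and the patch injects it in two different ways. Below is a minimal, self-contained Jsonnet sketch of both idioms; the `config` local and the `k8s_cluster` value are hypothetical stand-ins for illustration (the real mixin defaults the label to `cluster`). Evaluate with `jsonnet substitution.jsonnet`.

    // substitution.jsonnet -- a sketch of the two label-substitution idioms.
    local config = { per_cluster_label: 'k8s_cluster' };  // hypothetical override

    {
      // Idiom 1 (LokiRequestPanics, LokiTooManyCompactorsRunning): %-formatting
      // splices the label name into the PromQL grouping via a %s placeholder.
      compactor_expr: |||
        sum(loki_boltdb_shipper_compactor_running) by (%s, namespace) > 1
      ||| % config.per_cluster_label,

      // Idiom 2 (LokiRequestErrors and the description strings): std.strReplace
      // rewrites every literal 'cluster' in one pass, covering both the
      // `by (cluster, ...)` grouping clause and the `{{ $labels.cluster }}`
      // template variable in the same text block.
      errors_expr: std.strReplace(|||
        sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
      |||, 'cluster', config.per_cluster_label),
    }

One caveat: std.strReplace rewrites any substring 'cluster', so it is only safe here because every occurrence in the wrapped templates really is the label name. That is presumably why the LokiRequestLatency expression, which references the recording rule cluster_namespace_job_route:loki_request_duration_seconds:99quantile, is left outside the wrapper and only its description is rewritten.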