From 901a699aeaf0e7df629c484ca5ab417a99197354 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Fri, 15 Oct 2021 15:28:26 -0500 Subject: [PATCH 1/5] add rule for critical distributor inflight push request alert --- cortex-mixin/alerts/alerts.libsonnet | 24 ++++++++++++++++++++++++ cortex-mixin/docs/playbooks.md | 3 +++ 2 files changed, 27 insertions(+) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 59022dd..f915f0d 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -352,6 +352,30 @@ }, ], }, + { + name: 'cortex_distributor_inflight_push_request_alert', + rules: [ + { + alert: 'CortexDistributorReachingInflightPushRequestLimits', + expr: ||| + ( + (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) + and ignoring (limit) + (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) + ) > 0.9 + |||, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + |||, + }, + }, + ], + }, { name: 'cortex_wal_alerts', rules: [ diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 180ed50..536ab8d 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -108,6 +108,9 @@ How to **fix**: 1. Ensure shuffle-sharding is enabled in the Cortex cluster 1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. 
However, the effect of this change will be visible only after `-blocks-storage.tsdb.close-idle-tsdb-timeout` period so you may have to temporarily increase the limit +### CortexDistributorReachingInflightPushRequestLimits + _TODO: this playbook has not been written yet._ + ### CortexRequestLatency This alert fires when a specific Cortex route is experiencing an high latency. From 0ed69e84f42c07b4730cd8786709afca9ef39a56 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Mon, 18 Oct 2021 17:53:27 -0500 Subject: [PATCH 2/5] Reduce threshold to .8 and update message to match alert --- cortex-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index f915f0d..35b77f7 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -362,7 +362,7 @@ (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) and ignoring (limit) (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) - ) > 0.9 + ) > 0.8 |||, 'for': '5m', labels: { @@ -370,7 +370,7 @@ }, annotations: { message: ||| - Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. 
|||, }, }, From 9c4da69fe6b0ea225753cf86fa47184f4d490343 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 19 Oct 2021 16:49:13 -0500 Subject: [PATCH 3/5] Add playbook for inflight push limit --- cortex-mixin/docs/playbooks.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 536ab8d..3c31a6a 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -109,7 +109,35 @@ How to **fix**: 1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after `-blocks-storage.tsdb.close-idle-tsdb-timeout` period so you may have to temporarily increase the limit ### CortexDistributorReachingInflightPushRequestLimits - _TODO: this playbook has not been written yet._ + +This alert fires when the `cortex_distributor_inflight_push_requests` per distributor instance limit is enabled and the actual number of inflight push requests is approaching the set limit. Once the limit is reached, push requests to the distributor will fail (5xx) for new requests, while existing inflight push requests will continue to succeed. + +In case of **emergency**: +- If the actual number of inflight push requests is very close to or already at the set limit, then you can increase the limit via runtime config to gain some time +- Increasing the limit will increase the distributor' memory utilization. 
Please monitor the distributors' memory utilization via the `Cortex / Writes Resources` dashboard + +How the limit is **configured**: +- The limit can be configured either on CLI (`-distributor.instance-limits.max-inflight-push-requests`) or in the runtime config: + ``` + distributor_instance_limits: + max_inflight_push_requests: + ``` +- The mixin configures the limit in the runtime config and can be fine-tuned via: + ``` + _config+:: { + distributor_instance_limits+:: { + max_inflight_push_requests: + } + } + ``` +- When configured in the runtime config, changes are applied live without requiring an distributor restart +- The configured limit can be queried via `cortex_distributor_instance_limits{limit="max_inflight_push_requests"})` + +How to **fix**: +1. **Temporarily increase the limit**
+ If the actual number of inflight push requests is very close to, or has already hit, the limit.
+ Scaling up distributors will lower the number of inflight push requests per distributor. ### CortexRequestLatency From 6448f4a15a785808b548e7e84f7c913a23ecfeeb Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 21 Oct 2021 17:37:32 -0500 Subject: [PATCH 4/5] Update runbook to reflect it's a cli flag --- cortex-mixin/alerts/alerts.libsonnet | 2 +- cortex-mixin/docs/playbooks.md | 21 +++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 35b77f7..c3e23f8 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -356,7 +356,7 @@ name: 'cortex_distributor_inflight_push_request_alert', rules: [ { - alert: 'CortexDistributorReachingInflightPushRequestLimits', + alert: 'CortexDistributorReachingInflightPushRequestLimit', expr: ||| ( (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 3c31a6a..03a0b3a 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -108,29 +108,22 @@ How to **fix**: 1. Ensure shuffle-sharding is enabled in the Cortex cluster 1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after `-blocks-storage.tsdb.close-idle-tsdb-timeout` period so you may have to temporarily increase the limit -### CortexDistributorReachingInflightPushRequestLimits +### CortexDistributorReachingInflightPushRequestLimit This alert fires when the `cortex_distributor_inflight_push_requests` per distributor instance limit is enabled and the actual number of inflight push requests is approaching the set limit. 
Once the limit is reached, push requests to the distributor will fail (5xx) for new requests, while existing inflight push requests will continue to succeed. In case of **emergency**: -- If the actual number of inflight push requests is very close to or already at the set limit, then you can increase the limit via runtime config to gain some time -- Increasing the limit will increase the distributor' memory utilization. Please monitor the distributors' memory utilization via the `Cortex / Writes Resources` dashboard +- If the actual number of inflight push requests is very close to or already at the set limit, then you can increase the limit via CLI flag or config to gain some time +- Increasing the limit will increase the the number of inflight push requests which will increase distributors' memory utilization. Please monitor the distributors' memory utilization via the `Cortex / Writes Resources` dashboard How the limit is **configured**: -- The limit can be configured either on CLI (`-distributor.instance-limits.max-inflight-push-requests`) or in the runtime config: +- The limit can be configured either by the CLI flag (`-distributor.instance-limits.max-inflight-push-requests`) or in the config: ``` - distributor_instance_limits: - max_inflight_push_requests: - ``` -- The mixin configures the limit in the runtime config and can be fine-tuned via: - ``` - _config+:: { - distributor_instance_limits+:: { + distributor: + instance_limits: max_inflight_push_requests: - } - } ``` -- When configured in the runtime config, changes are applied live without requiring an distributor restart +- When configured the via the CLI flag or in the config these changes are applied with a distributor restart. 
- The configured limit can be queried via `cortex_distributor_instance_limits{limit="max_inflight_push_requests"})` How to **fix**: From a091553f04d07e2ddf8cf54e019bee2d648533fd Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Fri, 22 Oct 2021 14:41:47 -0500 Subject: [PATCH 5/5] remove group for single alert, update runbook, add changelog entry --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 5 ----- cortex-mixin/docs/playbooks.md | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 698d38c..06d550f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,7 @@ * [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387 * [ENHANCEMENT] Add `CortexRolloutStuck` alert. #405 * [ENHANCEMENT] Added `CortexKVStoreFailure` alert. #406 +* [ENHANCEMENT] Added `CortexDistributorReachingInflightPushRequestLimit` alert. #408 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. 
#335 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index c3e23f8..08c15f3 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -350,11 +350,6 @@ |||, }, }, - ], - }, - { - name: 'cortex_distributor_inflight_push_request_alert', - rules: [ { alert: 'CortexDistributorReachingInflightPushRequestLimit', expr: ||| diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 03a0b3a..0e98a89 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -114,7 +114,7 @@ This alert fires when the `cortex_distributor_inflight_push_requests` per distri In case of **emergency**: - If the actual number of inflight push requests is very close to or already at the set limit, then you can increase the limit via CLI flag or config to gain some time -- Increasing the limit will increase the the number of inflight push requests which will increase distributors' memory utilization. Please monitor the distributors' memory utilization via the `Cortex / Writes Resources` dashboard +- Increasing the limit will increase the number of inflight push requests which will increase distributors' memory utilization. Please monitor the distributors' memory utilization via the `Cortex / Writes Resources` dashboard How the limit is **configured**: - The limit can be configured either by the CLI flag (`-distributor.instance-limits.max-inflight-push-requests`) or in the config: @@ -123,7 +123,7 @@ How the limit is **configured**: instance_limits: max_inflight_push_requests: ``` -- When configured the via the CLI flag or in the config these changes are applied with a distributor restart. +- These changes are applied with a distributor restart. - The configured limit can be queried via `cortex_distributor_instance_limits{limit="max_inflight_push_requests"})` How to **fix**: