diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index be3de8c0c2..1f28a7e54a 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -47,6 +47,19 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', }, }, + { + // Alert if compactor failed to run 2 consecutive compactions. + alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', + expr: ||| + increase(cortex_compactor_runs_failed_total[2h]) >= 2 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.', + }, + }, { // Alert if the compactor has not uploaded anything in the last 24h. alert: 'CortexCompactorHasNotUploadedBlocks', @@ -65,7 +78,7 @@ }, { // Alert if the compactor has not uploaded anything since its start. - alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', + alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '24h', expr: ||| thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0 @@ -77,21 +90,6 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, - { - // Alert if compactor fails. - alert: 'CortexCompactorRunFailed', - expr: ||| - increase(cortex_compactor_runs_failed_total[2h]) >= 2 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to run compaction. 
- |||, - }, - }, ], }, ], diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index cc3a3ad928..704b649282 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -272,11 +272,21 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. How to **investigate**: -- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first +- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs +### CortexCompactorHasNotSuccessfullyRunCompaction + +This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants). + +When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, the compaction of other blocks is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. 
+ +How to **investigate**: +- Look for any error in the compactor logs + - Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found) + #### Compactor is failing because of `not healthy index found` The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: @@ -301,18 +311,6 @@ To rename a block stored on GCS you can use the `gsutil` CLI: gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` -### CortexCompactorHasNotUploadedBlocksSinceStart - -Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). - -### CortexCompactorHasNotSuccessfullyRunCompaction - -_TODO: this playbook has not been written yet._ - -### CortexCompactorRunFailed - -_TODO: this playbook has not been written yet._ - ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store.