From 9dd980bc10cc1e3606a03ce291d1e929cc897fc6 Mon Sep 17 00:00:00 2001 From: Ayoub Nasr Date: Tue, 16 May 2023 11:58:23 +0200 Subject: [PATCH] Disable etcdDatabaseHighFragmentationRatio --- charts/drop-prometheus-rules.yaml | 4 ++ .../prometheus-operator/deployed/chart.sls | 14 ------ .../lib-alert-tree/metalk8s/platform/core.py | 1 - tools/rule_extractor/alerting_rules.json | 8 +--- tools/rule_extractor/rules.json | 43 ------------------- 5 files changed, 5 insertions(+), 65 deletions(-) diff --git a/charts/drop-prometheus-rules.yaml b/charts/drop-prometheus-rules.yaml index 4c9b1b1403..7cfe06e586 100644 --- a/charts/drop-prometheus-rules.yaml +++ b/charts/drop-prometheus-rules.yaml @@ -14,3 +14,7 @@ node-exporter: - NodeRAIDDiskFailure - NodeTextFileCollectorScrapeError - NodeFileDescriptorLimit +# workaround: this fires upon install +# revert the entire commit after the fix is merged +etcd: + - etcdDatabaseHighFragmentationRatio diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls index 458d4700f8..9ec7013b5e 100644 --- a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls +++ b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls @@ -68514,20 +68514,6 @@ spec: for: 10m labels: severity: warning - - alert: etcdDatabaseHighFragmentationRatio - annotations: - description: 'etcd cluster "{{ $labels.job }}": database size in use on instance - {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual - allocated disk space, please run defragmentation (e.g. etcdctl defrag) to - retrieve the unused fragmented disk space.' - runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation - summary: etcd database size in use is less than 50% of the actual allocated - storage. - expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) - < 0.5 - for: 10m - labels: - severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule diff --git a/tools/lib-alert-tree/metalk8s/platform/core.py b/tools/lib-alert-tree/metalk8s/platform/core.py index 26c343e1dd..ddfc097d57 100644 --- a/tools/lib-alert-tree/metalk8s/platform/core.py +++ b/tools/lib-alert-tree/metalk8s/platform/core.py @@ -16,7 +16,6 @@ Existing.warning("etcdHighNumberOfFailedProposals"), Existing.warning("etcdHighNumberOfLeaderChanges"), Existing.warning("etcdMemberCommunicationSlow"), - Existing.warning("etcdDatabaseHighFragmentationRatio"), Existing.warning("etcdExcessiveDatabaseGrowth"), Existing.warning("KubeCPUOvercommit"), Existing.warning("KubeCPUQuotaOvercommit"), diff --git a/tools/rule_extractor/alerting_rules.json b/tools/rule_extractor/alerting_rules.json index 8e63a6aabd..26cb9cd999 100644 --- a/tools/rule_extractor/alerting_rules.json +++ b/tools/rule_extractor/alerting_rules.json @@ -110,7 +110,7 @@ { "message": "The Kubernetes control plane is degraded.", "name": "KubernetesControlPlaneDegraded", - "query": "sum(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeAPITerminatedRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdDatabaseHighFragmentationRatio\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdExcessiveDatabaseGrowth\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "query": "sum(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeAPITerminatedRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdExcessiveDatabaseGrowth\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", "severity": "warning" }, { @@ -215,12 +215,6 @@ "query": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0", "severity": "warning" }, - { - "message": "etcd database size in use is less than 50% of the actual allocated storage.", - "name": "etcdDatabaseHighFragmentationRatio", - "query": "(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5", - "severity": "warning" - }, { "message": "etcd cluster database is running full.", "name": "etcdDatabaseQuotaLowSpace", diff --git a/tools/rule_extractor/rules.json b/tools/rule_extractor/rules.json index 20c45a6465..2543f24a66 100644 --- a/tools/rule_extractor/rules.json +++ b/tools/rule_extractor/rules.json @@ -774,49 +774,6 @@ "name": "etcd", "partialResponseStrategy": "ABORT", "rules": [ - { - "alerts": [ - { - "activeAt": "2023-03-08T09:49:35.541577744Z", - "annotations": { - "description": "etcd cluster \"kube-etcd\": database size in use on instance 192.168.1.100:2381 is 34.8% of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.", - "runbook_url": "https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation", - "summary": "etcd database size in use is less than 50% of the actual allocated storage." - }, - "labels": { - "alertname": "etcdDatabaseHighFragmentationRatio", - "container": "etcd", - "endpoint": "http-metrics", - "instance": "192.168.1.100:2381", - "job": "kube-etcd", - "namespace": "kube-system", - "pod": "etcd-bootstrap", - "service": "prometheus-operator-kube-etcd", - "severity": "warning" - }, - "partialResponseStrategy": "WARN", - "state": "firing", - "value": "3.480380775481774e-01" - } - ], - "annotations": { - "description": "etcd cluster \"{{ $labels.job }}\": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.", - "runbook_url": "https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation", - "summary": "etcd database size in use is less than 50% of the actual allocated storage." - }, - "duration": 600, - "evaluationTime": 0.000454219, - "health": "ok", - "labels": { - "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", - "severity": "warning" - }, - "lastEvaluation": "2023-03-08T13:38:35.560377703Z", - "name": "etcdDatabaseHighFragmentationRatio", - "query": "(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5", - "state": "firing", - "type": "alerting" - }, { "alerts": [], "annotations": {