From 9dd980bc10cc1e3606a03ce291d1e929cc897fc6 Mon Sep 17 00:00:00 2001
From: Ayoub Nasr <ayoub.nasr@scality.com>
Date: Tue, 16 May 2023 11:58:23 +0200
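Subject: [PATCH] Disable etcdDatabaseHighFragmentationRatio

Workaround: the alert fires upon install, so drop the rule for now.
Revert this entire commit once the fix is merged.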
---
 charts/drop-prometheus-rules.yaml             |  4 ++
 .../prometheus-operator/deployed/chart.sls    | 14 ------
 .../lib-alert-tree/metalk8s/platform/core.py  |  1 -
 tools/rule_extractor/alerting_rules.json      |  8 +---
 tools/rule_extractor/rules.json               | 43 -------------------
 5 files changed, 5 insertions(+), 65 deletions(-)

diff --git a/charts/drop-prometheus-rules.yaml b/charts/drop-prometheus-rules.yaml
index 4c9b1b1403..7cfe06e586 100644
--- a/charts/drop-prometheus-rules.yaml
+++ b/charts/drop-prometheus-rules.yaml
@@ -14,3 +14,7 @@ node-exporter:
   - NodeRAIDDiskFailure
   - NodeTextFileCollectorScrapeError
   - NodeFileDescriptorLimit
+# Workaround: the etcdDatabaseHighFragmentationRatio alert fires upon install.
+# Revert this entire commit once the fix is merged.
+etcd:
+  - etcdDatabaseHighFragmentationRatio
diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls
index 458d4700f8..9ec7013b5e 100644
--- a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls
+++ b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls
@@ -68514,20 +68514,6 @@ spec:
       for: 10m
       labels:
         severity: warning
-    - alert: etcdDatabaseHighFragmentationRatio
-      annotations:
-        description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
-          {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
-          allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
-          retrieve the unused fragmented disk space.'
-        runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
-        summary: etcd database size in use is less than 50% of the actual allocated
-          storage.
-      expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]))
-        < 0.5
-      for: 10m
-      labels:
-        severity: warning
 ---
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
diff --git a/tools/lib-alert-tree/metalk8s/platform/core.py b/tools/lib-alert-tree/metalk8s/platform/core.py
index 26c343e1dd..ddfc097d57 100644
--- a/tools/lib-alert-tree/metalk8s/platform/core.py
+++ b/tools/lib-alert-tree/metalk8s/platform/core.py
@@ -16,7 +16,6 @@
         Existing.warning("etcdHighNumberOfFailedProposals"),
         Existing.warning("etcdHighNumberOfLeaderChanges"),
         Existing.warning("etcdMemberCommunicationSlow"),
-        Existing.warning("etcdDatabaseHighFragmentationRatio"),
         Existing.warning("etcdExcessiveDatabaseGrowth"),
         Existing.warning("KubeCPUOvercommit"),
         Existing.warning("KubeCPUQuotaOvercommit"),
diff --git a/tools/rule_extractor/alerting_rules.json b/tools/rule_extractor/alerting_rules.json
index 8e63a6aabd..26cb9cd999 100644
--- a/tools/rule_extractor/alerting_rules.json
+++ b/tools/rule_extractor/alerting_rules.json
@@ -110,7 +110,7 @@
     {
         "message": "The Kubernetes control plane is degraded.",
         "name": "KubernetesControlPlaneDegraded",
-        "query": "sum(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeAPITerminatedRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdDatabaseHighFragmentationRatio\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdExcessiveDatabaseGrowth\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1",
+        "query": "sum(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeAPITerminatedRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdExcessiveDatabaseGrowth\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1",
         "severity": "warning"
     },
     {
@@ -215,12 +215,6 @@
         "query": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0",
         "severity": "warning"
     },
-    {
-        "message": "etcd database size in use is less than 50% of the actual allocated storage.",
-        "name": "etcdDatabaseHighFragmentationRatio",
-        "query": "(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5",
-        "severity": "warning"
-    },
     {
         "message": "etcd cluster database is running full.",
         "name": "etcdDatabaseQuotaLowSpace",
diff --git a/tools/rule_extractor/rules.json b/tools/rule_extractor/rules.json
index 20c45a6465..2543f24a66 100644
--- a/tools/rule_extractor/rules.json
+++ b/tools/rule_extractor/rules.json
@@ -774,49 +774,6 @@
                 "name": "etcd",
                 "partialResponseStrategy": "ABORT",
                 "rules": [
-                    {
-                        "alerts": [
-                            {
-                                "activeAt": "2023-03-08T09:49:35.541577744Z",
-                                "annotations": {
-                                    "description": "etcd cluster \"kube-etcd\": database size in use on instance 192.168.1.100:2381 is 34.8% of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.",
-                                    "runbook_url": "https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation",
-                                    "summary": "etcd database size in use is less than 50% of the actual allocated storage."
-                                },
-                                "labels": {
-                                    "alertname": "etcdDatabaseHighFragmentationRatio",
-                                    "container": "etcd",
-                                    "endpoint": "http-metrics",
-                                    "instance": "192.168.1.100:2381",
-                                    "job": "kube-etcd",
-                                    "namespace": "kube-system",
-                                    "pod": "etcd-bootstrap",
-                                    "service": "prometheus-operator-kube-etcd",
-                                    "severity": "warning"
-                                },
-                                "partialResponseStrategy": "WARN",
-                                "state": "firing",
-                                "value": "3.480380775481774e-01"
-                            }
-                        ],
-                        "annotations": {
-                            "description": "etcd cluster \"{{ $labels.job }}\": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.",
-                            "runbook_url": "https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation",
-                            "summary": "etcd database size in use is less than 50% of the actual allocated storage."
-                        },
-                        "duration": 600,
-                        "evaluationTime": 0.000454219,
-                        "health": "ok",
-                        "labels": {
-                            "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus",
-                            "severity": "warning"
-                        },
-                        "lastEvaluation": "2023-03-08T13:38:35.560377703Z",
-                        "name": "etcdDatabaseHighFragmentationRatio",
-                        "query": "(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5",
-                        "state": "firing",
-                        "type": "alerting"
-                    },
                     {
                         "alerts": [],
                         "annotations": {