config+schema
lunkan93 committed Dec 17, 2024
1 parent 40179dd commit 111ace2
Showing 4 changed files with 26 additions and 41 deletions.
config/common-config.yaml — 15 changes: 4 additions & 11 deletions
@@ -415,17 +415,10 @@ prometheus:
     nodeCpuLimit1h: 95
     nodeMemoryLimit1h: 85
     ## for each cpu and memory add the node pattern for which you want to create an alert
-    requestlimit:
-      cpu:
-        - name: worker
-          limit: 80
-        - name: elastisys
-          limit: 80
-      memory:
-        - name: worker
-          limit: 80
-        - name: elastisys
-          limit: 80
+    requestLimit:
+      cpu: 80
+      memory: 80
+    nodeGroupRequestsExcludePattern: ""
     diskAlerts:
       storage:
         predictLinear:
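In practice, the flattened shape replaces the old per-node-group list with one percentage per resource; node groups are now opted out via a regex instead of being listed individually. A minimal before/after sketch, assuming an override under prometheus.capacityManagementAlerts (the nesting used by the helmfile values template further down; indentation approximate):

# Before this commit: one entry per node group, matched on the elastisys.io/node-group label
prometheus:
  capacityManagementAlerts:
    requestlimit:
      cpu:
        - name: worker
          limit: 80
      memory:
        - name: worker
          limit: 80

# After this commit: a single percentage per resource, plus an optional exclude pattern
prometheus:
  capacityManagementAlerts:
    requestLimit:
      cpu: 80
      memory: 80
    nodeGroupRequestsExcludePattern: ".*redis.*|.*postgres.*"  # example pattern from the schema; the default is ""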
config/schemas/config.yaml — 40 changes: 16 additions & 24 deletions
@@ -3985,38 +3985,28 @@ properties:
         description: Alert when a disk's usage reaches the limit in percent.
         type: number
         default: 75
-      requestlimit:
+      requestLimit:
         title: Capacity Management Alerts Request Limit
         description: Alert when a node's resource requests reaches the limits in percent.
         type: object
         additionalProperties: false
         properties:
           cpu:
             title: Capacity Management Alerts CPU Request Limit
-            $ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimitList"
+            description: Configure a CPU request percentage limit to alert for.
+            type: number
+            default: 80
           memory:
             title: Capacity Management Alerts Memory Request Limit
-            $ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimitList"
-      requestlimitList:
-        title: Capacity Management Alerts Request Limit List
-        description: Configure a list of node groups and request limits to alert for.
-        type: array
-        items:
-          $ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimitTerm"
-      requestlimitTerm:
-        title: Capacity Management Alerts Request Limit Term
-        description: Configure a node group and the request limit to alert for. Nodes within that group must have a matching `elastisys.io/node-group` label.
-        type: object
-        additionalProperties: false
-        properties:
-          name:
-            type: string
-            examples:
-              - worker
-          limit:
+            description: Configure a memory request percentage limit to alert for.
             type: number
-            examples:
-              - 80
+            default: 80
+      nodeGroupRequestsExcludePattern:
+        title: Capacity Management Alerts Request Exclude Pattern
+        description: Configure a pattern of node groups to exclude from the resource request alerts. This can be used to exclude certain node groups from request alerts, while still getting usage alerts for those node groups.
+        type: string
+        default: ""
+        examples: ".*redis.*|.*postgres.*"
       diskAlerts:
         title: Disk Alerts
         description: Definitions for disk alerts.
@@ -4326,8 +4316,10 @@ properties:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/persistentVolume"
disklimit:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/disklimit"
requestlimit:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimit"
requestLimit:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestLimit"
nodeGroupRequestsExcludePattern:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/nodeGroupRequestsExcludePattern"
usagelimit:
default: 95
type: number
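Read together, the added lines above replace the old requestlimitList/requestlimitTerm array definitions with two plain number properties and a separate exclude pattern. Assembled into one piece (indentation approximate), the new fragment under the capacityManagementAlerts definitions reads roughly:

requestLimit:
  title: Capacity Management Alerts Request Limit
  description: Alert when a node's resource requests reaches the limits in percent.
  type: object
  additionalProperties: false
  properties:
    cpu:
      title: Capacity Management Alerts CPU Request Limit
      description: Configure a CPU request percentage limit to alert for.
      type: number
      default: 80
    memory:
      title: Capacity Management Alerts Memory Request Limit
      description: Configure a memory request percentage limit to alert for.
      type: number
      default: 80
nodeGroupRequestsExcludePattern:
  title: Capacity Management Alerts Request Exclude Pattern
  description: Configure a pattern of node groups to exclude from the resource request alerts.
  type: string
  default: ""
  examples: ".*redis.*|.*postgres.*"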
8 changes: 4 additions & 4 deletions — Prometheus capacity management alert rules template
@@ -93,28 +93,28 @@ spec:
           for: 5m
           labels:
             severity: warning
-        - alert: NodeGroupCpuRequest80Percent
+        - alert: NodeGroupCpuRequest{{ .Values.capacityManagementAlertsCpuRequestLimit }}Percent
           annotations:
             message: Average CPU requests is over {{ .Values.capacityManagementAlertsCpuRequestLimit }}% in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: avg by (label_elastisys_io_node_group,cluster) (sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)")) >= 20/100
           for: 5m
           labels:
             severity: warning
-        - alert: NodeGroupMemoryRequest80Percent
+        - alert: NodeGroupMemoryRequest{{ .Values.capacityManagementAlertsMemoryRequestLimit }}Percent
           annotations:
             message: Average memory requests is over {{ .Values.capacityManagementAlertsMemoryRequestLimit }}% in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: avg by (label_elastisys_io_node_group,cluster) (sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="memory"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="memory"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)")) >= 20/100
           for: 5m
           labels:
             severity: warning
-        - alert: NodeCpuRequest80Percent
+        - alert: NodeCpuRequest{{ .Values.capacityManagementAlertsCpuRequestLimit }}Percent
           annotations:
             message: CPU requests is over {{ .Values.capacityManagementAlertsCpuRequestLimit }}% for the Node {{`{{ $labels.node }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)") >= {{ .Values.capacityManagementAlertsCpuRequestLimit }}/100
           for: 5m
           labels:
             severity: warning
-        - alert: NodeMemoryRequest80Percent
+        - alert: NodeMemoryRequest{{ .Values.capacityManagementAlertsMemoryRequestLimit }}Percent
           annotations:
             message: Memory requests is over {{ .Values.capacityManagementAlertsMemoryRequestLimit }}% for the Node {{`{{ $labels.node}}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="memory"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="memory"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)") >= {{ .Values.capacityManagementAlertsMemoryRequestLimit }}/100
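For orientation, with the schema default of 80 for requestLimit.cpu and an empty exclude pattern, the node-level CPU rule above would render roughly as follows (a sketch of the expected Helm output, not taken from the repository):

- alert: NodeCpuRequest80Percent
  annotations:
    message: CPU requests is over 80% for the Node {{ $labels.node }} in Cluster {{ $labels.cluster }}.
  expr: sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~''}, "instance", "$1", "node", "(.*)") >= 80/100
  for: 5m
  labels:
    severity: warning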
helmfile.d/values/prometheus-user-alerts-wc.yaml.gotmpl — 4 changes: 2 additions & 2 deletions
@@ -32,9 +32,9 @@ capacityManagementAlertsDiskLimit: {{ .Values.prometheus.capacityManagementAlert
 capacityManagementAlertsPredictUsage: {{ .Values.prometheus.capacityManagementAlerts.predictUsage }}
 capacityManagementAlertsUsageLimit: {{ .Values.prometheus.capacityManagementAlerts.usagelimit }}
 capacityManagementAlertsRequestLimit:
-{{ toYaml .Values.prometheus.capacityManagementAlerts.requestlimit.cpu | indent 2 }}
+{{ toYaml .Values.prometheus.capacityManagementAlerts.requestLimit.cpu | indent 2 }}
 capacityManagementAlertsMemoryRequestLimit:
-{{ toYaml .Values.prometheus.capacityManagementAlerts.requestlimit.memory | nindent 2 }}
+{{ toYaml .Values.prometheus.capacityManagementAlerts.requestLimit.memory | nindent 2 }}
 capacityManagementAlertsRequestsExcludePattern: {{ .Values.prometheus.capacityManagementAlerts.nodeGroupRequestsExcludePattern }}
 capacityManagementAlertsNodeGroupCpuLimit24h: {{ .Values.prometheus.capacityManagementAlerts.nodeGroupCpuLimit24h }}
 capacityManagementAlertsNodeGroupMemoryLimit24h: {{ .Values.prometheus.capacityManagementAlerts.nodeGroupMemoryLimit24h }}
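Assuming the schema defaults of 80 above, the two templated lines now emit plain numbers rather than the former per-node-group lists, roughly (whitespace may differ slightly between indent and nindent):

# toYaml on a number prints the bare scalar, so each key is parsed as the value 80
capacityManagementAlertsRequestLimit:
  80
capacityManagementAlertsMemoryRequestLimit:
  80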
