config+schema
lunkan93 committed Dec 17, 2024
1 parent 40179dd commit 111ace2
Showing 4 changed files with 26 additions and 41 deletions.
config/common-config.yaml — 15 changes: 4 additions & 11 deletions
@@ -415,17 +415,10 @@ prometheus:
     nodeCpuLimit1h: 95
     nodeMemoryLimit1h: 85
     ## for each cpu and memory add the node pattern for which you want to create an alert
-    requestlimit:
-      cpu:
-        - name: worker
-          limit: 80
-        - name: elastisys
-          limit: 80
-      memory:
-        - name: worker
-          limit: 80
-        - name: elastisys
-          limit: 80
+    requestLimit:
+      cpu: 80
+      memory: 80
+    nodeGroupRequestsExcludePattern: ""
     diskAlerts:
       storage:
         predictLinear:
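In practice, the flattened shape replaces the old per-node-group list with one percentage per resource; node groups are now opted out via a regex instead of being listed individually. A minimal before/after sketch, assuming an override under prometheus.capacityManagementAlerts (the nesting used by the helmfile values template further down; indentation approximate):

# Before this commit: one entry per node group, matched on the elastisys.io/node-group label
prometheus:
  capacityManagementAlerts:
    requestlimit:
      cpu:
        - name: worker
          limit: 80
      memory:
        - name: worker
          limit: 80

# After this commit: a single percentage per resource, plus an optional exclude pattern
prometheus:
  capacityManagementAlerts:
    requestLimit:
      cpu: 80
      memory: 80
    nodeGroupRequestsExcludePattern: ".*redis.*|.*postgres.*"  # example pattern from the schema; the default is ""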
config/schemas/config.yaml — 40 changes: 16 additions & 24 deletions
@@ -3985,38 +3985,28 @@ properties:
         description: Alert when a disk's usage reaches the limit in percent.
         type: number
         default: 75
-      requestlimit:
+      requestLimit:
         title: Capacity Management Alerts Request Limit
         description: Alert when a node's resource requests reaches the limits in percent.
         type: object
         additionalProperties: false
         properties:
           cpu:
             title: Capacity Management Alerts CPU Request Limit
-            $ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimitList"
+            description: Configure a CPU request percentage limit to alert for.
+            type: number
+            default: 80
           memory:
             title: Capacity Management Alerts Memory Request Limit
-            $ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimitList"
-      requestlimitList:
-        title: Capacity Management Alerts Request Limit List
-        description: Configure a list of node groups and request limits to alert for.
-        type: array
-        items:
-          $ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimitTerm"
-      requestlimitTerm:
-        title: Capacity Management Alerts Request Limit Term
-        description: Configure a node group and the request limit to alert for. Nodes within that group must have a matching `elastisys.io/node-group` label.
-        type: object
-        additionalProperties: false
-        properties:
-          name:
-            type: string
-            examples:
-              - worker
-          limit:
+            description: Configure a memory request percentage limit to alert for.
             type: number
-            examples:
-              - 80
+            default: 80
+      nodeGroupRequestsExcludePattern:
+        title: Capacity Management Alerts Request Exclude Pattern
+        description: Configure a pattern of node groups to exclude from the resource request alerts. This can be used to exclude certain node groups from request alerts, while still getting usage alerts for those node groups.
+        type: string
+        default: ""
+        examples: ".*redis.*|.*postgres.*"
       diskAlerts:
         title: Disk Alerts
         description: Definitions for disk alerts.
@@ -4326,8 +4316,10 @@ properties:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/persistentVolume"
disklimit:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/disklimit"
requestlimit:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestlimit"
requestLimit:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/requestLimit"
nodeGroupRequestsExcludePattern:
$ref: "#/properties/prometheus/$defs/capacityManagementAlerts/properties/nodeGroupRequestsExcludePattern"
usagelimit:
default: 95
type: number
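Read together, the added lines above replace the old requestlimitList/requestlimitTerm array definitions with two plain number properties and a separate exclude pattern. Assembled into one piece (indentation approximate), the new fragment under the capacityManagementAlerts definitions reads roughly:

requestLimit:
  title: Capacity Management Alerts Request Limit
  description: Alert when a node's resource requests reaches the limits in percent.
  type: object
  additionalProperties: false
  properties:
    cpu:
      title: Capacity Management Alerts CPU Request Limit
      description: Configure a CPU request percentage limit to alert for.
      type: number
      default: 80
    memory:
      title: Capacity Management Alerts Memory Request Limit
      description: Configure a memory request percentage limit to alert for.
      type: number
      default: 80
nodeGroupRequestsExcludePattern:
  title: Capacity Management Alerts Request Exclude Pattern
  description: Configure a pattern of node groups to exclude from the resource request alerts.
  type: string
  default: ""
  examples: ".*redis.*|.*postgres.*"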
8 changes: 4 additions & 4 deletions — Prometheus capacity management alert rules template
@@ -93,28 +93,28 @@ spec:
           for: 5m
           labels:
             severity: warning
-        - alert: NodeGroupCpuRequest80Percent
+        - alert: NodeGroupCpuRequest{{ .Values.capacityManagementAlertsCpuRequestLimit }}Percent
           annotations:
             message: Average CPU requests is over {{ .Values.capacityManagementAlertsCpuRequestLimit }}% in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: avg by (label_elastisys_io_node_group,cluster) (sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)")) >= 20/100
           for: 5m
           labels:
             severity: warning
-        - alert: NodeGroupMemoryRequest80Percent
+        - alert: NodeGroupMemoryRequest{{ .Values.capacityManagementAlertsMemoryRequestLimit }}Percent
           annotations:
             message: Average memory requests is over {{ .Values.capacityManagementAlertsMemoryRequestLimit }}% in the Node Group {{`{{ $labels.label_elastisys_io_node_group }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: avg by (label_elastisys_io_node_group,cluster) (sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="memory"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="memory"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)")) >= 20/100
           for: 5m
           labels:
             severity: warning
-        - alert: NodeCpuRequest80Percent
+        - alert: NodeCpuRequest{{ .Values.capacityManagementAlertsCpuRequestLimit }}Percent
           annotations:
             message: CPU requests is over {{ .Values.capacityManagementAlertsCpuRequestLimit }}% for the Node {{`{{ $labels.node }}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)") >= {{ .Values.capacityManagementAlertsCpuRequestLimit }}/100
           for: 5m
           labels:
             severity: warning
-        - alert: NodeMemoryRequest80Percent
+        - alert: NodeMemoryRequest{{ .Values.capacityManagementAlertsMemoryRequestLimit }}Percent
           annotations:
             message: Memory requests is over {{ .Values.capacityManagementAlertsMemoryRequestLimit }}% for the Node {{`{{ $labels.node}}`}} in Cluster {{`{{ $labels.cluster }}`}}.
           expr: sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="memory"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="memory"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~'{{ .Values.capacityManagementAlertsRequestsExcludePattern }}'}, "instance", "$1", "node", "(.*)") >= {{ .Values.capacityManagementAlertsMemoryRequestLimit }}/100
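For orientation, with the schema default of 80 for requestLimit.cpu and an empty exclude pattern, the node-level CPU rule above would render roughly as follows (a sketch of the expected Helm output, not taken from the repository):

- alert: NodeCpuRequest80Percent
  annotations:
    message: CPU requests is over 80% for the Node {{ $labels.node }} in Cluster {{ $labels.cluster }}.
  expr: sum by (node,cluster) (kube_pod_container_resource_requests{cluster=~".*",namespace=~".*",resource="cpu"} and on(pod, namespace, cluster) kube_pod_status_phase{cluster=~".*",namespace=~".*",phase="Running"} == 1) / (sum by(node,cluster) (kube_node_status_allocatable{cluster=~".*",resource="cpu"})) * on (node) group_left (label_elastisys_io_node_group) label_replace(kube_node_labels{label_elastisys_io_node_group!~''}, "instance", "$1", "node", "(.*)") >= 80/100
  for: 5m
  labels:
    severity: warning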
helmfile.d/values/prometheus-user-alerts-wc.yaml.gotmpl — 4 changes: 2 additions & 2 deletions
@@ -32,9 +32,9 @@ capacityManagementAlertsDiskLimit: {{ .Values.prometheus.capacityManagementAlert
 capacityManagementAlertsPredictUsage: {{ .Values.prometheus.capacityManagementAlerts.predictUsage }}
 capacityManagementAlertsUsageLimit: {{ .Values.prometheus.capacityManagementAlerts.usagelimit }}
 capacityManagementAlertsRequestLimit:
-{{ toYaml .Values.prometheus.capacityManagementAlerts.requestlimit.cpu | indent 2 }}
+{{ toYaml .Values.prometheus.capacityManagementAlerts.requestLimit.cpu | indent 2 }}
 capacityManagementAlertsMemoryRequestLimit:
-{{ toYaml .Values.prometheus.capacityManagementAlerts.requestlimit.memory | nindent 2 }}
+{{ toYaml .Values.prometheus.capacityManagementAlerts.requestLimit.memory | nindent 2 }}
 capacityManagementAlertsRequestsExcludePattern: {{ .Values.prometheus.capacityManagementAlerts.nodeGroupRequestsExcludePattern }}
 capacityManagementAlertsNodeGroupCpuLimit24h: {{ .Values.prometheus.capacityManagementAlerts.nodeGroupCpuLimit24h }}
 capacityManagementAlertsNodeGroupMemoryLimit24h: {{ .Values.prometheus.capacityManagementAlerts.nodeGroupMemoryLimit24h }}
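Assuming the schema defaults of 80 above, the two templated lines now emit plain numbers rather than the former per-node-group lists, roughly (whitespace may differ slightly between indent and nindent):

# toYaml on a number prints the bare scalar, so each key is parsed as the value 80
capacityManagementAlertsRequestLimit:
  80
capacityManagementAlertsMemoryRequestLimit:
  80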
