diff --git a/docs/user/alerts.md b/docs/user/alerts.md index 5e5b5973f..78ed179c7 100644 --- a/docs/user/alerts.md +++ b/docs/user/alerts.md @@ -69,7 +69,7 @@ cluster autoscaler (default 320000 cores). ### Query ``` # for: 15m -cluster_autoscaler_cluster_cpu_current_cores >= cluster_autoscaler_cpu_limits_cores{direction="maximum"} +increase(cluster_autoscaler_skipped_scale_events_count{direction="up",reason="CpuResourceLimit"}[15m]) > 0 ``` ### Possible Causes @@ -95,7 +95,7 @@ for the cluster autoscaler (default 6400000 gigabytes). ### Query ``` # for: 15m -cluster_autoscaler_cluster_memory_current_bytes >= cluster_autoscaler_memory_limits_bytes{direction="maximum"} +increase(cluster_autoscaler_skipped_scale_events_count{direction="up",reason="MemoryResourceLimit"}[15m]) > 0 ``` ### Possible Causes diff --git a/pkg/controller/clusterautoscaler/monitoring.go b/pkg/controller/clusterautoscaler/monitoring.go index 3c2754fe0..af2185f36 100644 --- a/pkg/controller/clusterautoscaler/monitoring.go +++ b/pkg/controller/clusterautoscaler/monitoring.go @@ -199,31 +199,31 @@ true then the cluster autoscaler will enter an unsafe to scale state until the c }, { Alert: "ClusterAutoscalerUnableToScaleCPULimitReached", - Expr: intstr.FromString("cluster_autoscaler_cluster_cpu_current_cores >= cluster_autoscaler_cpu_limits_cores{direction=\"maximum\"}"), + Expr: intstr.FromString("increase(cluster_autoscaler_skipped_scale_events_count{direction=\"up\",reason=\"CpuResourceLimit\"}[15m]) > 0"), For: "15m", Labels: map[string]string{ "severity": "info", }, Annotations: map[string]string{ - "summary": "Cluster Autoscaler has reached its CPU core limit and is unable to scale out", + "summary": "Cluster Autoscaler has reached its maximum CPU core limit and is unable to scale out", "description": `The number of total cores in the cluster has exceeded the maximum number set on the cluster autoscaler. 
This is calculated by summing the cpu capacity for all nodes in the cluster and comparing that number against the maximum cores value set for the -cluster autoscaler (default 320000 cores).`, +cluster autoscaler (default 320000 cores). Limits can be adjusted by modifying the ClusterAutoscaler resource.`, }, }, { Alert: "ClusterAutoscalerUnableToScaleMemoryLimitReached", - Expr: intstr.FromString("cluster_autoscaler_cluster_memory_current_bytes >= cluster_autoscaler_memory_limits_bytes{direction=\"maximum\"}"), + Expr: intstr.FromString("increase(cluster_autoscaler_skipped_scale_events_count{direction=\"up\",reason=\"MemoryResourceLimit\"}[15m]) > 0"), For: "15m", Labels: map[string]string{ "severity": "info", }, Annotations: map[string]string{ - "summary": "Cluster Autoscaler has reached its Memory bytes limit and is unable to scale out", + "summary": "Cluster Autoscaler has reached its maximum Memory bytes limit and is unable to scale out", "description": `The number of total bytes of RAM in the cluster has exceeded the maximum number set on the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set -for the cluster autoscaler (default 6400000 gigabytes).`, +for the cluster autoscaler (default 6400000 gigabytes). Limits can be adjusted by modifying the ClusterAutoscaler resource.`, }, }, },