diff --git a/cluster-autoscaler/core/scale_up.go b/cluster-autoscaler/core/scale_up.go
index 74f8c9570904..ad2e12579cdc 100644
--- a/cluster-autoscaler/core/scale_up.go
+++ b/cluster-autoscaler/core/scale_up.go
@@ -431,6 +431,16 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 		if checkResult.exceeded {
 			klog.V(4).Infof("Skipping node group %s; maximal limit exceeded for %v", nodeGroup.Id(), checkResult.exceededResources)
 			skippedNodeGroups[nodeGroup.Id()] = maxResourceLimitReached(checkResult.exceededResources)
+			for _, resource := range checkResult.exceededResources {
+				switch resource {
+				case cloudprovider.ResourceNameCores:
+					metrics.RegisterSkippedScaleUpCPU()
+				case cloudprovider.ResourceNameMemory:
+					metrics.RegisterSkippedScaleUpMemory()
+				default:
+					continue
+				}
+			}
 			continue
 		}
 
diff --git a/cluster-autoscaler/core/scaledown/legacy/legacy.go b/cluster-autoscaler/core/scaledown/legacy/legacy.go
index 330dac3fd7c9..b891cba93fcc 100644
--- a/cluster-autoscaler/core/scaledown/legacy/legacy.go
+++ b/cluster-autoscaler/core/scaledown/legacy/legacy.go
@@ -706,6 +706,16 @@ func (sd *ScaleDown) NodesToDelete(currentTime time.Time, pdbs []*policyv1.PodDi
 		if checkResult.exceeded {
 			klog.V(4).Infof("Skipping %s - minimal limit exceeded for %v", node.Name, checkResult.exceededResources)
 			sd.unremovableNodes.AddReason(node, simulator.MinimalResourceLimitExceeded)
+			for _, resource := range checkResult.exceededResources {
+				switch resource {
+				case cloudprovider.ResourceNameCores:
+					metrics.RegisterSkippedScaleDownCPU()
+				case cloudprovider.ResourceNameMemory:
+					metrics.RegisterSkippedScaleDownMemory()
+				default:
+					continue
+				}
+			}
 			continue
 		}
 
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 23ea180ca44b..0d9736549a33 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -66,6 +66,16 @@ const (
 	// Timeout was encountered when trying to scale-up
 	Timeout FailedScaleUpReason = "timeout"
 
+	// DirectionScaleDown is the direction label value for a skipped scaling event when scaling in (shrinking)
+	DirectionScaleDown string = "down"
+	// DirectionScaleUp is the direction label value for a skipped scaling event when scaling out (growing)
+	DirectionScaleUp string = "up"
+
+	// CpuResourceLimit means a CPU resource limit (minimum or maximum) was reached; check the direction label to determine min or max
+	CpuResourceLimit string = "CpuResourceLimit"
+	// MemoryResourceLimit means a memory resource limit (minimum or maximum) was reached; check the direction label to determine min or max
+	MemoryResourceLimit string = "MemoryResourceLimit"
+
 	// autoscaledGroup is managed by CA
 	autoscaledGroup NodeGroupType = "autoscaled"
 	// autoprovisionedGroup have been created by CA (Node Autoprovisioning),
@@ -312,6 +322,15 @@
 		},
 	)
 
+	skippedScaleEventsCount = k8smetrics.NewCounterVec(
+		&k8smetrics.CounterOpts{
+			Namespace: caNamespace,
+			Name:      "skipped_scale_events_count",
+			Help:      "Count of scaling events that the CA has chosen to skip.",
+		},
+		[]string{"direction", "reason"},
+	)
+
 	/**** Metrics related to NodeAutoprovisioning ****/
 	napEnabled = k8smetrics.NewGauge(
 		&k8smetrics.GaugeOpts{
@@ -364,6 +383,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
 	legacyregistry.MustRegister(scaleDownInCooldown)
 	legacyregistry.MustRegister(oldUnregisteredNodesRemovedCount)
 	legacyregistry.MustRegister(overflowingControllersCount)
+	legacyregistry.MustRegister(skippedScaleEventsCount)
 	legacyregistry.MustRegister(napEnabled)
 	legacyregistry.MustRegister(nodeGroupCreationCount)
 	legacyregistry.MustRegister(nodeGroupDeletionCount)
@@ -547,3 +567,23 @@ func RegisterOldUnregisteredNodesRemoved(nodesCount int) {
 func UpdateOverflowingControllers(count int) {
 	overflowingControllersCount.Set(float64(count))
 }
+
+// RegisterSkippedScaleDownCPU increases the count of skipped scale downs because of CPU resource limits
+func RegisterSkippedScaleDownCPU() {
+	skippedScaleEventsCount.WithLabelValues(DirectionScaleDown, CpuResourceLimit).Add(1.0)
+}
+
+// RegisterSkippedScaleDownMemory increases the count of skipped scale downs because of memory resource limits
+func RegisterSkippedScaleDownMemory() {
+	skippedScaleEventsCount.WithLabelValues(DirectionScaleDown, MemoryResourceLimit).Add(1.0)
+}
+
+// RegisterSkippedScaleUpCPU increases the count of skipped scale ups because of CPU resource limits
+func RegisterSkippedScaleUpCPU() {
+	skippedScaleEventsCount.WithLabelValues(DirectionScaleUp, CpuResourceLimit).Add(1.0)
+}
+
+// RegisterSkippedScaleUpMemory increases the count of skipped scale ups because of memory resource limits
+func RegisterSkippedScaleUpMemory() {
+	skippedScaleEventsCount.WithLabelValues(DirectionScaleUp, MemoryResourceLimit).Add(1.0)
+}
diff --git a/cluster-autoscaler/proposals/metrics.md b/cluster-autoscaler/proposals/metrics.md
index d21eb1a2f642..28855b5e3598 100644
--- a/cluster-autoscaler/proposals/metrics.md
+++ b/cluster-autoscaler/proposals/metrics.md
@@ -85,6 +85,7 @@ This metrics describe internal state and actions taken by Cluster Autoscaler.
 | evicted_pods_total | Counter | | Number of pods evicted by CA. |
 | unneeded_nodes_count | Gauge | | Number of nodes currently considered unneeded by CA. |
 | old_unregistered_nodes_removed_count | Counter | | Number of unregistered nodes removed by CA. |
+| skipped_scale_events_count | Counter | `direction`=<scaling-direction>, `reason`=<skipped-scale-reason> | Number of times scaling has been skipped because a resource limit was reached or a similar event occurred. |
 
 * `errors_total` counter increases every time main CA loop encounters an error.
 * Growing `errors_total` count signifies an internal error in CA or a problem
@@ -120,6 +121,12 @@
 scale down reasons are `empty`, `underutilized`, `unready`.
 * `scaled_down_gpu_nodes_total` counts the number of nodes removed by CA. Scale
 down reasons are identical to `scaled_down_nodes_total`, `gpu_name` to
 `scaled_up_gpu_nodes_total`.
+* `skipped_scale_events_count` counts the number of times that the autoscaler
+has declined to scale a node group because a resource limit was reached or a
+similar internal event occurred. The scale direction can be either `up` or `down`,
+and the reason explains why the scaling was skipped (e.g. `CpuResourceLimit`,
+`MemoryResourceLimit`). This differs from failed scaling events in that the
+autoscaler is choosing not to perform a scaling action.
 
 ### Node Autoprovisioning operations
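
For reviewers who want to see the label scheme in isolation, here is a minimal, self-contained sketch that mirrors the `skipped_scale_events_count` definition using the upstream `prometheus/client_golang` API directly, rather than the `k8s.io/component-base/metrics` wrappers and `legacyregistry` used in the diff. The standalone registry, the `main` package, and the printed output are assumptions made only for this example; the metric name, help text, and label values come from the diff, and `caNamespace` is assumed to be `cluster_autoscaler`, matching the other Cluster Autoscaler metrics.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	// Standalone counter vector mirroring skippedScaleEventsCount in metrics.go.
	skipped := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "cluster_autoscaler", // assumed value of caNamespace
			Name:      "skipped_scale_events_count",
			Help:      "Count of scaling events that the CA has chosen to skip.",
		},
		[]string{"direction", "reason"},
	)

	// Use a private registry instead of the component-base legacyregistry.
	reg := prometheus.NewRegistry()
	reg.MustRegister(skipped)

	// Roughly what RegisterSkippedScaleUpCPU and RegisterSkippedScaleDownMemory do.
	skipped.WithLabelValues("up", "CpuResourceLimit").Add(1.0)
	skipped.WithLabelValues("down", "MemoryResourceLimit").Add(1.0)

	// Each distinct (direction, reason) pair is exported as its own time series.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			fmt.Println(mf.GetName(), labels(m), m.GetCounter().GetValue())
		}
	}
}

// labels renders a metric's label pairs as key="value" text.
func labels(m *dto.Metric) string {
	out := ""
	for _, lp := range m.GetLabel() {
		out += fmt.Sprintf("%s=%q ", lp.GetName(), lp.GetValue())
	}
	return out
}
```

Running the sketch prints two `cluster_autoscaler_skipped_scale_events_count` series, one per label combination, which is the shape documented in the `proposals/metrics.md` table entry above; the direction/reason label split lets a single counter cover both the min-limit scale-down skips and the max-limit scale-up skips registered in the core code.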