Merge pull request #5059 from elmiko/skipped-scale-metric
add metric for skipped scaling events
k8s-ci-robot authored Aug 8, 2022
2 parents 68638b1 + da9d307 commit 3e25023
Showing 4 changed files with 67 additions and 0 deletions.
10 changes: 10 additions & 0 deletions cluster-autoscaler/core/scale_up.go
@@ -431,6 +431,16 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
if checkResult.exceeded {
	klog.V(4).Infof("Skipping node group %s; maximal limit exceeded for %v", nodeGroup.Id(), checkResult.exceededResources)
	skippedNodeGroups[nodeGroup.Id()] = maxResourceLimitReached(checkResult.exceededResources)
	for _, resource := range checkResult.exceededResources {
		switch resource {
		case cloudprovider.ResourceNameCores:
			metrics.RegisterSkippedScaleUpCPU()
		case cloudprovider.ResourceNameMemory:
			metrics.RegisterSkippedScaleUpMemory()
		default:
			continue
		}
	}
	continue
}

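For readers less familiar with the dispatch above, here is a minimal standalone sketch of the same pattern. It uses the plain Prometheus Go client rather than `k8s.io/component-base/metrics`, the `cluster_autoscaler` namespace and resource name strings are assumptions, and `recordSkippedScaleUp` is a hypothetical helper, not part of this change.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// skippedScaleEvents mirrors the shape of the new skipped_scale_events_count
// metric: one counter per (direction, reason) label pair.
var skippedScaleEvents = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "cluster_autoscaler", // assumed value of caNamespace
		Name:      "skipped_scale_events_count",
		Help:      "Count of scaling events that the CA has chosen to skip.",
	},
	[]string{"direction", "reason"},
)

// recordSkippedScaleUp is a hypothetical helper that maps exceeded resource
// names to the reason label, mirroring the switch in the ScaleUp hunk above.
func recordSkippedScaleUp(exceededResources []string) {
	for _, resource := range exceededResources {
		switch resource {
		case "cpu": // stand-in for cloudprovider.ResourceNameCores
			skippedScaleEvents.WithLabelValues("up", "CpuResourceLimit").Inc()
		case "memory": // stand-in for cloudprovider.ResourceNameMemory
			skippedScaleEvents.WithLabelValues("up", "MemoryResourceLimit").Inc()
		default:
			// Other exceeded resources (e.g. custom or GPU limits) are not counted by this metric.
		}
	}
}

func main() {
	recordSkippedScaleUp([]string{"cpu", "memory", "example.com/gpu"})
	fmt.Println("recorded skipped scale-up events for cpu and memory")
}
```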
10 changes: 10 additions & 0 deletions cluster-autoscaler/core/scaledown/legacy/legacy.go
@@ -706,6 +706,16 @@ func (sd *ScaleDown) NodesToDelete(currentTime time.Time, pdbs []*policyv1.PodDi
if checkResult.exceeded {
	klog.V(4).Infof("Skipping %s - minimal limit exceeded for %v", node.Name, checkResult.exceededResources)
	sd.unremovableNodes.AddReason(node, simulator.MinimalResourceLimitExceeded)
	for _, resource := range checkResult.exceededResources {
		switch resource {
		case cloudprovider.ResourceNameCores:
			metrics.RegisterSkippedScaleDownCPU()
		case cloudprovider.ResourceNameMemory:
			metrics.RegisterSkippedScaleDownMemory()
		default:
			continue
		}
	}
	continue
}

40 changes: 40 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
@@ -66,6 +66,16 @@ const (
	// Timeout was encountered when trying to scale-up
	Timeout FailedScaleUpReason = "timeout"

	// DirectionScaleDown is the direction label of a skipped scaling event when scaling in (shrinking)
	DirectionScaleDown string = "down"
	// DirectionScaleUp is the direction label of a skipped scaling event when scaling out (growing)
	DirectionScaleUp string = "up"

	// CpuResourceLimit indicates a CPU resource limit (minimum or maximum) was reached; check the direction label to determine which
	CpuResourceLimit string = "CpuResourceLimit"
	// MemoryResourceLimit indicates a memory resource limit (minimum or maximum) was reached; check the direction label to determine which
	MemoryResourceLimit string = "MemoryResourceLimit"

	// autoscaledGroup is managed by CA
	autoscaledGroup NodeGroupType = "autoscaled"
	// autoprovisionedGroup have been created by CA (Node Autoprovisioning),
@@ -312,6 +322,15 @@ var (
		},
	)

	skippedScaleEventsCount = k8smetrics.NewCounterVec(
		&k8smetrics.CounterOpts{
			Namespace: caNamespace,
			Name:      "skipped_scale_events_count",
			Help:      "Count of scaling events that the CA has chosen to skip.",
		},
		[]string{"direction", "reason"},
	)

	/**** Metrics related to NodeAutoprovisioning ****/
	napEnabled = k8smetrics.NewGauge(
		&k8smetrics.GaugeOpts{
@@ -364,6 +383,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
	legacyregistry.MustRegister(scaleDownInCooldown)
	legacyregistry.MustRegister(oldUnregisteredNodesRemovedCount)
	legacyregistry.MustRegister(overflowingControllersCount)
	legacyregistry.MustRegister(skippedScaleEventsCount)
	legacyregistry.MustRegister(napEnabled)
	legacyregistry.MustRegister(nodeGroupCreationCount)
	legacyregistry.MustRegister(nodeGroupDeletionCount)
@@ -547,3 +567,23 @@ func RegisterOldUnregisteredNodesRemoved(nodesCount int) {
func UpdateOverflowingControllers(count int) {
	overflowingControllersCount.Set(float64(count))
}

// RegisterSkippedScaleDownCPU increases the count of skipped scale downs because of CPU resource limits
func RegisterSkippedScaleDownCPU() {
	skippedScaleEventsCount.WithLabelValues(DirectionScaleDown, CpuResourceLimit).Add(1.0)
}

// RegisterSkippedScaleDownMemory increases the count of skipped scale downs because of memory resource limits
func RegisterSkippedScaleDownMemory() {
	skippedScaleEventsCount.WithLabelValues(DirectionScaleDown, MemoryResourceLimit).Add(1.0)
}

// RegisterSkippedScaleUpCPU increases the count of skipped scale ups because of CPU resource limits
func RegisterSkippedScaleUpCPU() {
	skippedScaleEventsCount.WithLabelValues(DirectionScaleUp, CpuResourceLimit).Add(1.0)
}

// RegisterSkippedScaleUpMemory increases the count of skipped scale ups because of memory resource limits
func RegisterSkippedScaleUpMemory() {
	skippedScaleEventsCount.WithLabelValues(DirectionScaleUp, MemoryResourceLimit).Add(1.0)
}
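As a rough sketch of how helpers like these could be verified, the test below re-creates an equivalent counter vector with the plain Prometheus Go client and checks it with `testutil.ToFloat64`. The real helpers register through `k8s.io/component-base/metrics`, so the package name, namespace, and label values here are illustrative assumptions, not the autoscaler's actual test code.

```go
package metrics_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// skippedScaleEvents re-creates the shape of skippedScaleEventsCount for this sketch.
var skippedScaleEvents = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "cluster_autoscaler", // assumed value of caNamespace
		Name:      "skipped_scale_events_count",
		Help:      "Count of scaling events that the CA has chosen to skip.",
	},
	[]string{"direction", "reason"},
)

func TestSkippedScaleUpCPUIncrements(t *testing.T) {
	// Simulate two skipped scale-ups caused by the CPU limit, as the helpers above would.
	skippedScaleEvents.WithLabelValues("up", "CpuResourceLimit").Inc()
	skippedScaleEvents.WithLabelValues("up", "CpuResourceLimit").Inc()

	got := testutil.ToFloat64(skippedScaleEvents.WithLabelValues("up", "CpuResourceLimit"))
	if got != 2 {
		t.Fatalf("expected 2 skipped scale-up events, got %v", got)
	}
}
```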
7 changes: 7 additions & 0 deletions cluster-autoscaler/proposals/metrics.md
@@ -85,6 +85,7 @@ These metrics describe internal state and actions taken by Cluster Autoscaler.
| evicted_pods_total | Counter | | Number of pods evicted by CA. |
| unneeded_nodes_count | Gauge | | Number of nodes currently considered unneeded by CA. |
| old_unregistered_nodes_removed_count | Counter | | Number of unregistered nodes removed by CA. |
| skipped_scale_events_count | Counter | `direction`=<scaling-direction>, `reason`=<skipped-scale-reason> | Number of times scaling was skipped because a resource limit was reached or a similar internal event occurred. |

* `errors_total` counter increases every time main CA loop encounters an error.
* Growing `errors_total` count signifies an internal error in CA or a problem
@@ -120,6 +121,12 @@ scale down reasons are `empty`, `underutilized`, `unready`.
* `scaled_down_gpu_nodes_total` counts the number of nodes removed by CA. Scale
down reasons are identical to `scaled_down_nodes_total`, `gpu_name` to
`scaled_up_gpu_nodes_total`.
* `skipped_scale_events_count` counts the number of times the
autoscaler has declined to scale a node group because a resource limit was reached or a
similar internal event occurred. The `direction` label is either `up` or `down`, and the `reason`
label explains why the scaling was skipped (e.g. `CpuResourceLimit`, `MemoryResourceLimit`).
This differs from failed scaling events in that the autoscaler is choosing not to perform
a scaling action; a minimal sketch of the resulting series appears below.
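For illustration only, the sketch below shows what the exposed series could look like when scraped. It re-creates an equivalent counter with the plain Prometheus Go client under an assumed `cluster_autoscaler` namespace, so the port, registry wiring, and sample output are hypothetical rather than the autoscaler's actual setup.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var skippedScaleEvents = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "cluster_autoscaler", // assumed value of caNamespace
		Name:      "skipped_scale_events_count",
		Help:      "Count of scaling events that the CA has chosen to skip.",
	},
	[]string{"direction", "reason"},
)

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(skippedScaleEvents)

	// Simulate one skipped scale-up (CPU max reached) and one skipped scale-down (memory min reached).
	skippedScaleEvents.WithLabelValues("up", "CpuResourceLimit").Inc()
	skippedScaleEvents.WithLabelValues("down", "MemoryResourceLimit").Inc()

	// Scraping http://localhost:8080/metrics would then include lines such as:
	//   cluster_autoscaler_skipped_scale_events_count{direction="up",reason="CpuResourceLimit"} 1
	//   cluster_autoscaler_skipped_scale_events_count{direction="down",reason="MemoryResourceLimit"} 1
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```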

### Node Autoprovisioning operations

