Skip to content

Commit

Permalink
Merge pull request #5756 from wllbo/keep-backoff-out-of-resources
Browse files (browse the repository at this point in the history)
add option to keep node group backoff on OutOfResource error
  • Loading branch information
k8s-ci-robot authored Feb 13, 2024
2 parents abc077e + 4477707 commit 7031519
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
4 changes: 1 addition & 3 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -261,10 +261,8 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {

for nodeGroupName, scaleUpRequest := range csr.scaleUpRequests {
if !csr.areThereUpcomingNodesInNodeGroup(nodeGroupName) {
// scale-out finished successfully
// remove it and reset node group backoff
// scale up finished successfully, remove request
delete(csr.scaleUpRequests, nodeGroupName)
csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
klog.V(4).Infof("Scale up in group %v finished successfully in %v",
nodeGroupName, currentTime.Sub(scaleUpRequest.Time))
continue
Expand Down
23 changes: 20 additions & 3 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -854,7 +854,7 @@ func TestScaleUpBackoff(t *testing.T) {
},
}, clusterstate.NodeGroupScaleUpSafety(ng1, now))

// The backoff should be cleared after a successful scale-up
// After successful scale-up, node group should still be backed off
clusterstate.RegisterScaleUp(provider.GetNodeGroup("ng1"), 1, now)
ng1_4 := BuildTestNode("ng1-4", 1000, 1000)
SetNodeReadyState(ng1_4, true, now.Add(-1*time.Minute))
Expand All @@ -863,8 +863,25 @@ func TestScaleUpBackoff(t *testing.T) {
assert.NoError(t, err)
assert.True(t, clusterstate.IsClusterHealthy())
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1"))
assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.NodeGroupScaleUpSafety(ng1, now))
assert.Equal(t, backoff.Status{IsBackedOff: false}, clusterstate.backoff.BackoffStatus(ng1, nil, now))
assert.Equal(t, NodeGroupScalingSafety{
SafeToScale: false,
Healthy: true,
BackoffStatus: backoff.Status{
IsBackedOff: true,
ErrorInfo: cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s",
},
},
}, clusterstate.NodeGroupScaleUpSafety(ng1, now))
assert.Equal(t, backoff.Status{
IsBackedOff: true,
ErrorInfo: cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: "timeout",
ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s",
}}, clusterstate.backoff.BackoffStatus(ng1, nil, now))
}

func TestGetClusterSize(t *testing.T) {
Expand Down

0 comments on commit 7031519

Please sign in to comment.