diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index cb5a8c7464a6..fc564783be6f 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -337,7 +337,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError return nil } - a.deleteCreatedNodesWithErrors() + if a.deleteCreatedNodesWithErrors() { + klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration") + return nil + } // Check if there has been a constant difference between the number of nodes in k8s and // the number of nodes on the cloud provider side. @@ -624,7 +627,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod return removedAny, nil } -func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { +func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors() @@ -645,6 +648,8 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node) } + deletedAny := false + for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId { var err error klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId) @@ -660,8 +665,11 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err) } + deletedAny = deletedAny || err == nil a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup) } + + return deletedAny } func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup { diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 50aaa1817612..93dd8a4903cf 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1038,7 +1038,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - autoscaler.deleteCreatedNodesWithErrors() + assert.True(t, autoscaler.deleteCreatedNodesWithErrors()) // check delete was called on correct nodes nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1062,7 +1062,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - autoscaler.deleteCreatedNodesWithErrors() + assert.True(t, autoscaler.deleteCreatedNodesWithErrors()) // nodes should be deleted again nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1125,7 +1125,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - autoscaler.deleteCreatedNodesWithErrors() + assert.False(t, autoscaler.deleteCreatedNodesWithErrors()) // we expect no more Delete Nodes nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)