Skip to content

Commit

Permalink
Merge pull request kubernetes#4143 from BigDarkClown/master
Browse files Browse the repository at this point in the history
Skip iteration loop if node creation failed
  • Loading branch information
k8s-ci-robot authored and Evan Sheng committed Sep 8, 2022
1 parent 5307f4b commit b80a3eb
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
12 changes: 10 additions & 2 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}

a.deleteCreatedNodesWithErrors()
if a.deleteCreatedNodesWithErrors() {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}

// Check if there has been a constant difference between the number of nodes in k8s and
// the number of nodes on the cloud provider side.
Expand Down Expand Up @@ -624,7 +627,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
return removedAny, nil
}

func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
// We always schedule deleting of incoming errornous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()
Expand All @@ -645,6 +648,8 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
}

deletedAny := false

for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
var err error
klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId)
Expand All @@ -660,8 +665,11 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
}

deletedAny = deletedAny || err == nil
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

return deletedAny
}

func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
Expand Down
6 changes: 3 additions & 3 deletions cluster-autoscaler/core/static_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1038,7 +1038,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, autoscaler.deleteCreatedNodesWithErrors())

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand All @@ -1062,7 +1062,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, autoscaler.deleteCreatedNodesWithErrors())

// nodes should be deleted again
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand Down Expand Up @@ -1125,7 +1125,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, autoscaler.deleteCreatedNodesWithErrors())

// we expect no more Delete Nodes
nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
Expand Down

0 comments on commit b80a3eb

Please sign in to comment.