Skip to content

Commit

Permalink
Skip iteration loop if node creation failed
Browse files Browse the repository at this point in the history
  • Loading branch information
BigDarkClown committed Jun 16, 2021
1 parent f57b8f9 commit 5076047
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
12 changes: 10 additions & 2 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}

a.deleteCreatedNodesWithErrors()
if a.deleteCreatedNodesWithErrors() {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}

// Check if there has been a constant difference between the number of nodes in k8s and
// the number of nodes on the cloud provider side.
Expand Down Expand Up @@ -635,7 +638,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
return removedAny, nil
}

func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
// We always schedule deleting of incoming errornous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()
Expand All @@ -656,6 +659,8 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
}

deletedAny := false

for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
var err error
klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId)
Expand All @@ -671,8 +676,11 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
}

deletedAny = deletedAny || err == nil
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

return deletedAny
}

func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
Expand Down
6 changes: 3 additions & 3 deletions cluster-autoscaler/core/static_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1058,7 +1058,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, autoscaler.deleteCreatedNodesWithErrors())

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand All @@ -1082,7 +1082,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, autoscaler.deleteCreatedNodesWithErrors())

// nodes should be deleted again
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand Down Expand Up @@ -1145,7 +1145,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, autoscaler.deleteCreatedNodesWithErrors())

// we expect no more Delete Nodes
nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
Expand Down

0 comments on commit 5076047

Please sign in to comment.