From 5076047bf8bbe4df72549cf01ba570a90ef6d1b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Wr=C3=B3blewski?= Date: Tue, 15 Jun 2021 14:14:34 +0000 Subject: [PATCH] Skip iteration loop if node creation failed --- cluster-autoscaler/core/static_autoscaler.go | 12 ++++++++++-- cluster-autoscaler/core/static_autoscaler_test.go | 6 +++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index cfbf2c249f54..a0f54f960dd8 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -346,7 +346,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError return nil } - a.deleteCreatedNodesWithErrors() + if a.deleteCreatedNodesWithErrors() { + klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration") + return nil + } // Check if there has been a constant difference between the number of nodes in k8s and // the number of nodes on the cloud provider side. @@ -635,7 +638,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod return removedAny, nil } -func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { +func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors() @@ -656,6 +659,8 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node) } + deletedAny := false + for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId { var err error klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId) @@ -671,8 +676,11 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err) } + deletedAny = deletedAny || err == nil a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup) } + + return deletedAny } func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup { diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 8809fed019d1..e51c433fd5b3 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1058,7 +1058,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - autoscaler.deleteCreatedNodesWithErrors() + assert.True(t, autoscaler.deleteCreatedNodesWithErrors()) // check delete was called on correct nodes nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1082,7 +1082,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - autoscaler.deleteCreatedNodesWithErrors() + assert.True(t, autoscaler.deleteCreatedNodesWithErrors()) // nodes should be deleted again nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1145,7 +1145,7 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - autoscaler.deleteCreatedNodesWithErrors() + assert.False(t, autoscaler.deleteCreatedNodesWithErrors()) // we expect no more Delete Nodes nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)