diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index b8417c85d63c..a7e31932780d 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -424,7 +424,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr unregisteredNodes := a.clusterStateRegistry.GetUnregisteredNodes() if len(unregisteredNodes) > 0 { klog.V(1).Infof("%d unregistered nodes present", len(unregisteredNodes)) - removedAny, err := a.removeOldUnregisteredNodes(unregisteredNodes, autoscalingContext, + removedAny, err := a.removeOldUnregisteredNodes(unregisteredNodes, a.clusterStateRegistry, currentTime, autoscalingContext.LogRecorder) // There was a problem with removing unregistered nodes. Retry in the next loop. if err != nil { @@ -752,39 +752,22 @@ func fixNodeGroupSize(context *context.AutoscalingContext, clusterStateRegistry return fixed, nil } -// Removes unregistered nodes if needed. Returns true if anything was removed and error if such occurred. -func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clusterstate.UnregisteredNode, context *context.AutoscalingContext, +// removeOldUnregisteredNodes removes unregistered nodes if needed. Returns true +// if anything was removed and error if such occurred. +func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clusterstate.UnregisteredNode, csr *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) { - nodeGroups := a.nodeGroupsById() - nodesToDeleteByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode) - for _, unregisteredNode := range allUnregisteredNodes { - nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node) - if err != nil { - klog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err) - continue - } - if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() { - klog.Warningf("No node group for node %s, skipping", unregisteredNode.Node.Name) - continue - } - - maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup) - if err != nil { - return false, fmt.Errorf("failed to retrieve maxNodeProvisionTime for node %s in nodeGroup %s", unregisteredNode.Node.Name, nodeGroup.Id()) - } - - if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) { - klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name) - nodesToDeleteByNodeGroupId[nodeGroup.Id()] = append(nodesToDeleteByNodeGroupId[nodeGroup.Id()], unregisteredNode) - } + unregisteredNodesToRemove, err := a.oldUnregisteredNodesToRemove(allUnregisteredNodes, csr, currentTime) + if err != nil { + return false, err } + nodeGroups := a.nodeGroupsById() removedAny := false - for nodeGroupId, unregisteredNodesToDelete := range nodesToDeleteByNodeGroupId { + for nodeGroupId, nodesToRemove := range unregisteredNodesToRemove { nodeGroup := nodeGroups[nodeGroupId] - klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(unregisteredNodesToDelete), nodeGroupId) + klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(nodesToRemove), nodeGroupId) if !a.ForceDeleteLongUnregisteredNodes { size, err := nodeGroup.TargetSize() if err != nil { @@ -793,16 +776,16 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu } possibleToDelete := size - nodeGroup.MinSize() if possibleToDelete <= 0 { - klog.Warningf("Node group %s min size reached, skipping removal of %v unregistered nodes", nodeGroupId, len(unregisteredNodesToDelete)) + klog.Warningf("Node group %s min size reached, skipping removal of %v unregistered nodes", nodeGroupId, len(nodesToRemove)) continue } - if len(unregisteredNodesToDelete) > possibleToDelete { - klog.Warningf("Capping node group %s unregistered node removal to %d nodes, removing all %d would exceed min size constaint", nodeGroupId, possibleToDelete, len(unregisteredNodesToDelete)) - unregisteredNodesToDelete = unregisteredNodesToDelete[:possibleToDelete] + if len(nodesToRemove) > possibleToDelete { + klog.Warningf("Capping node group %s unregistered node removal to %d nodes, removing all %d would exceed min size constaint", nodeGroupId, possibleToDelete, len(nodesToRemove)) + nodesToRemove = nodesToRemove[:possibleToDelete] } } - nodesToDelete := toNodes(unregisteredNodesToDelete) + nodesToDelete := toNodes(nodesToRemove) nodesToDelete, err := overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete) if err != nil { klog.Warningf("Failed to remove unregistered nodes from node group %s: %v", nodeGroupId, err) @@ -836,6 +819,34 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu return removedAny, nil } +// oldUnregisteredNodes returns old unregistered nodes grouped by their node group id. +func (a *StaticAutoscaler) oldUnregisteredNodes(allUnregisteredNodes []clusterstate.UnregisteredNode, csr *clusterstate.ClusterStateRegistry, currentTime time.Time) (map[string][]clusterstate.UnregisteredNode, error) { + nodesByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode) + for _, unregisteredNode := range allUnregisteredNodes { + nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node) + if err != nil { + klog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err) + continue + } + if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() { + klog.Warningf("No node group for node %s, skipping", unregisteredNode.Node.Name) + continue + } + + maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup) + if err != nil { + return nil, fmt.Errorf("failed to retrieve maxNodeProvisionTime for node %s in nodeGroup %s", unregisteredNode.Node.Name, nodeGroup.Id()) + } + + if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) { + klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name) + nodesByNodeGroupId[nodeGroup.Id()] = append(nodesByNodeGroupId[nodeGroup.Id()], unregisteredNode) + } + } + + return nodesByNodeGroupId, nil +} + func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node { nodes := []*apiv1.Node{} for _, n := range unregisteredNodes { diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 45162841033e..2df10d4b7355 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -2257,12 +2257,12 @@ func TestRemoveOldUnregisteredNodes(t *testing.T) { } // Nothing should be removed. The unregistered node is not old enough. - removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now.Add(-50*time.Minute), fakeLogRecorder) + removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now.Add(-50*time.Minute), fakeLogRecorder) assert.NoError(t, err) assert.False(t, removed) // ng1_2 should be removed. - removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now, fakeLogRecorder) + removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now, fakeLogRecorder) assert.NoError(t, err) assert.True(t, removed) deletedNode := core_utils.GetStringFromChan(deletedNodes) @@ -2317,12 +2317,12 @@ func TestRemoveOldUnregisteredNodesAtomic(t *testing.T) { } // Nothing should be removed. The unregistered node is not old enough. - removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now.Add(-50*time.Minute), fakeLogRecorder) + removed, err := autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now.Add(-50*time.Minute), fakeLogRecorder) assert.NoError(t, err) assert.False(t, removed) // unregNode is long unregistered, so all of the nodes should be removed due to ZeroOrMaxNodeScaling option - removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, context, clusterState, now, fakeLogRecorder) + removed, err = autoscaler.removeOldUnregisteredNodes(unregisteredNodes, clusterState, now, fakeLogRecorder) assert.NoError(t, err) assert.True(t, removed) wantNames, deletedNames := []string{}, []string{}