From aa1af0386254f91b4755f46e3f3d6a2edaded11c Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Thu, 11 May 2023 19:04:27 -0700 Subject: [PATCH 1/5] add option to keep node group backoff on OutOfResource error --- cluster-autoscaler/clusterstate/clusterstate.go | 8 +++++++- cluster-autoscaler/config/autoscaling_options.go | 2 ++ cluster-autoscaler/main.go | 2 ++ cluster-autoscaler/utils/backoff/backoff.go | 2 ++ cluster-autoscaler/utils/backoff/exponential_backoff.go | 8 ++++++++ 5 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 2240d3f22883..c51b287a8cc5 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -86,6 +86,8 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. OkTotalUnreadyCount int + // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. + NodeGroupKeepBackoffOutOfResources bool } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -264,7 +266,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { // scale-out finished successfully // remove it and reset node group backoff delete(csr.scaleUpRequests, nodeGroupName) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) + shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup) + if !shouldKeepBackoff { + klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id()) + csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) + } klog.V(4).Infof("Scale up in group %v finished successfully in %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)) continue diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index b335c05dcb28..3f12020bb8b4 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -249,6 +249,8 @@ type AutoscalingOptions struct { MaxNodeGroupBackoffDuration time.Duration // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout time.Duration + // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. + NodeGroupKeepBackoffOutOfResources bool // MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel. MaxScaleDownParallelism int // MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
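The gate this patch adds to updateScaleRequests reduces to a couple of lines. Below is a minimal, self-contained sketch of that decision; backoffTracker and onScaleUpSuccess are hypothetical stand-ins for the real cloudprovider.NodeGroup and utils/backoff types, not the actual API. A successful scale-out clears the backoff unless the new option is enabled and the active backoff was caused by an out-of-resources error.

```go
package main

import "fmt"

// backoffTracker is a hypothetical stand-in for the Backoff interface; the
// real implementation records an errorClass on each backoff entry.
type backoffTracker struct {
	outOfResources map[string]bool // node group ID -> backoff caused by OutOfResources
}

func (b *backoffTracker) isOutOfResources(id string) bool { return b.outOfResources[id] }

func (b *backoffTracker) remove(id string) {
	delete(b.outOfResources, id)
	fmt.Printf("removed backoff for %s\n", id)
}

// onScaleUpSuccess mirrors the new branch in updateScaleRequests: keep the
// backoff only when the option is on and the failure was out-of-resources.
func onScaleUpSuccess(b *backoffTracker, id string, keepBackoffOutOfResources bool) {
	shouldKeep := keepBackoffOutOfResources && b.isOutOfResources(id)
	if !shouldKeep {
		b.remove(id)
	}
}

func main() {
	b := &backoffTracker{outOfResources: map[string]bool{"ng1": true}}
	onScaleUpSuccess(b, "ng1", true)  // backoff kept: OutOfResources and option enabled
	onScaleUpSuccess(b, "ng1", false) // backoff removed: option disabled
}
```

In the patch itself, the equivalent state is the errorClass that Backoff() records on the node group's exponentialBackoffInfo entry, which IsNodeGroupOutOfResources compares against cloudprovider.OutOfResourcesErrorClass.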
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index d0480eb1f99a..8464cda7290e 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -231,6 +231,7 @@ var ( "maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.") nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour, "nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.") + nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.") maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") @@ -406,6 +407,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, + NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources, MaxScaleDownParallelism: *maxScaleDownParallelismFlag, MaxDrainParallelism: *maxDrainParallelismFlag, RecordDuplicatedEvents: *recordDuplicatedEvents, diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go index a4409d2f99ca..85da73556cd0 100644 --- a/cluster-autoscaler/utils/backoff/backoff.go +++ b/cluster-autoscaler/utils/backoff/backoff.go @@ -39,4 +39,6 @@ type Backoff interface { RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) // RemoveStaleBackoffData removes stale backoff data. RemoveStaleBackoffData(currentTime time.Time) + // IsNodeGroupOutOfResources returns true if the given node group is out of resources. + IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool } diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index a65b9c323dd1..eafca64e55e1 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -38,6 +38,7 @@ type exponentialBackoffInfo struct { backoffUntil time.Time lastFailedExecution time.Time errorInfo cloudprovider.InstanceErrorInfo + errorClass cloudprovider.InstanceErrorClass } // NewExponentialBackoff creates an instance of exponential backoff. @@ -89,6 +90,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo backoffUntil: backoffUntil, lastFailedExecution: currentTime, errorInfo: errorInfo, + errorClass: errorClass, } return backoffUntil } @@ -118,3 +120,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) { } } } + +// IsNodeGroupOutOfResources returns true if the given node group is out of resources. 
+func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool { + backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)] + return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass +} From 8a2cae379578b235a239e5ff03eb312a3e7b8547 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:14:02 -0700 Subject: [PATCH 2/5] remove changes to backoff interface --- cluster-autoscaler/utils/backoff/backoff.go | 2 -- cluster-autoscaler/utils/backoff/exponential_backoff.go | 6 ------ 2 files changed, 8 deletions(-) diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go index 85da73556cd0..a4409d2f99ca 100644 --- a/cluster-autoscaler/utils/backoff/backoff.go +++ b/cluster-autoscaler/utils/backoff/backoff.go @@ -39,6 +39,4 @@ type Backoff interface { RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) // RemoveStaleBackoffData removes stale backoff data. RemoveStaleBackoffData(currentTime time.Time) - // IsNodeGroupOutOfResources returns true if the given node group is out of resources. - IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool } diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index eafca64e55e1..0631878a98a0 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -120,9 +120,3 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) { } } } - -// IsNodeGroupOutOfResources returns true if the given node group is out of resources. -func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool { - backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)] - return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass -} From 00fd3a802ccb4a49a9a9e4ce07e1e64af7d51789 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:44:33 -0700 Subject: [PATCH 3/5] attach errors to scale-up request and add comments --- .../clusterstate/clusterstate.go | 32 +++++-- .../clusterstate/clusterstate_test.go | 89 +++++++++++++++++++ .../config/autoscaling_options.go | 4 +- cluster-autoscaler/core/static_autoscaler.go | 5 +- cluster-autoscaler/main.go | 84 ++++++++--------- 5 files changed, 162 insertions(+), 52 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index c51b287a8cc5..27c71470e85b 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -65,6 +65,8 @@ type ScaleUpRequest struct { ExpectedAddTime time.Time // How much the node group is increased. Increase int + // ErrorClasses is a set of the classes of errors encountered during a scale-up, if any. + ErrorClasses map[cloudprovider.InstanceErrorClass]struct{} } // ScaleDownRequest contains information about the requested node deletion. @@ -86,8 +88,11 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. 
OkTotalUnreadyCount int - // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. - NodeGroupKeepBackoffOutOfResources bool + // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when + // a scale-up partially fails due to a likely persistent error. + // If true (default), the backoff will be removed early regardless of the error. + // If false and the backoff is due to a likely persistent error, e.g. OutOfResourcesError, it will not be removed early. + NodeGroupRemovePersistentErrorBackoffEarly bool } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -217,6 +222,7 @@ func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudpr Increase: delta, Time: currentTime, ExpectedAddTime: currentTime.Add(maxNodeProvisionTime), + ErrorClasses: make(map[cloudprovider.InstanceErrorClass]struct{}), } csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest return @@ -266,9 +272,16 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { // scale-out finished successfully // remove it and reset node group backoff delete(csr.scaleUpRequests, nodeGroupName) - shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup) - if !shouldKeepBackoff { - klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id()) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) - } + // If a node group is backed off during a scale-up due to instance creation errors but partially succeeds, + // the backoff could be removed early, allowing the CA to retry scaling the same node group. + // Optionally, the backoff can be retained for persistent errors given the risk of recurrence. + // The backoff will be removed early if either of the following conditions are true: + // 1. NodeGroupRemovePersistentErrorBackoffEarly is enabled (default) + // 2.
There is no persistent error class attached to the scale-up request + _, persistentError := scaleUpRequest.ErrorClasses[cloudprovider.OutOfResourcesErrorClass] + shouldRemoveBackoffEarly := csr.config.NodeGroupRemovePersistentErrorBackoffEarly || !persistentError + if shouldRemoveBackoffEarly { + klog.V(4).Infof("Removing backoff early for node group %v", scaleUpRequest.NodeGroup.Id()) csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) } klog.V(4).Infof("Scale up in group %v finished successfully in %v", @@ -337,6 +350,13 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime}) metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType) csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime) + // attach the error class to the scale-up request if it exists + // it will be used to determine whether to remove the backoff early when updating scale-up requests + scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()] + if found { + scaleUpRequest.ErrorClasses[errorClass] = struct{}{} + } + csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime) } // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats @@ -1111,7 +1131,7 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup( // If node group is scaling up and there are new node-create requests which cannot be satisfied because of // out-of-resources errors we: // - emit event - // - alter the scale-up + // - alter the scale-up and attach the error class // - increase scale-up failure metric // - backoff the node group for errorCode, instances := range currentErrorCodeToInstance { diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 52f2952d6807..124932dcbea1 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -484,6 +484,95 @@ func TestRegisterScaleDown(t *testing.T) { assert.Empty(t, clusterstate.GetScaleUpFailures()) } +func TestRemovePersistentErrorBackoffEarlyEnabled(t *testing.T) { + ng1_1 := BuildTestNode("ng1-1", 1000, 1000) + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNode("ng1", ng1_1) + assert.NotNil(t, provider) + + fakeClient := &fake.Clientset{} + fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + NodeGroupRemovePersistentErrorBackoffEarly: true, + }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) + + now := time.Now() + + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(4) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 3, now) + assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) + + // Fail two nodes with a persistent and a non-persistent error + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, 
cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -2, now) + assert.Equal(t, 2, len(clusterstate.scaleUpRequests["ng1"].ErrorClasses)) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Reduce the target size to original value to complete the scale-up and trigger the backoff early removal + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) + assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) +} + +func TestRemovePersistentErrorBackoffEarlyDisabled(t *testing.T) { + ng1_1 := BuildTestNode("ng1-1", 1000, 1000) + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNode("ng1", ng1_1) + assert.NotNil(t, provider) + + fakeClient := &fake.Clientset{} + fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + NodeGroupRemovePersistentErrorBackoffEarly: false, + }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) + + now := time.Now() + + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) + assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) + + // Fail one node with a persistent error + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Confirm the persistent error backoff is not removed early + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Remove the backoff and scale up again + clusterstate.backoff.RemoveBackoff(provider.GetNodeGroup("ng1"), nil) + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) + assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) + + // Fail one node with a non-persistent error + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) + 
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Complete the scale-up and confirm the backoff is removed early + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) + assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) +} + func TestUpcomingNodes(t *testing.T) { provider := testprovider.NewTestCloudProvider(nil, nil) now := time.Now() diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 3f12020bb8b4..68fdc2b6d4fc 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -249,8 +249,8 @@ type AutoscalingOptions struct { MaxNodeGroupBackoffDuration time.Duration // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout time.Duration - // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. - NodeGroupKeepBackoffOutOfResources bool + // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when a scale-up partially fails due to a likely persistent error. + NodeGroupRemovePersistentErrorBackoffEarly bool // MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel. MaxScaleDownParallelism int // MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
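Taken together, the clusterstate changes in this patch do two things: record the class of every scale-up error on the request itself, and decide at completion whether the backoff may be removed early. A minimal sketch of that bookkeeping follows; errorClass and scaleUpRequest are simplified local stand-ins for cloudprovider.InstanceErrorClass and the registry's ScaleUpRequest, not the real types.

```go
package main

import "fmt"

type errorClass int

const (
	otherError          errorClass = iota
	outOfResourcesError            // treated as likely persistent
)

type scaleUpRequest struct {
	// set of error classes seen while this scale-up was in flight
	errorClasses map[errorClass]struct{}
}

func (r *scaleUpRequest) recordFailure(c errorClass) { r.errorClasses[c] = struct{}{} }

// shouldRemoveBackoffEarly mirrors the completion-time decision: remove early
// when the option allows it, or when no persistent error was recorded.
func shouldRemoveBackoffEarly(r *scaleUpRequest, removePersistentEarly bool) bool {
	_, persistent := r.errorClasses[outOfResourcesError]
	return removePersistentEarly || !persistent
}

func main() {
	r := &scaleUpRequest{errorClasses: map[errorClass]struct{}{}}
	r.recordFailure(outOfResourcesError)
	r.recordFailure(otherError)
	fmt.Println(shouldRemoveBackoffEarly(r, true))  // true: default removes early regardless
	fmt.Println(shouldRemoveBackoffEarly(r, false)) // false: persistent error keeps the backoff
}
```

Keying the set by error class keeps the per-failure bookkeeping O(1) and makes the completion-time check a single map lookup, regardless of how many instances failed.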
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 471a341077f2..7466e2577b0b 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -147,8 +147,9 @@ func NewStaticAutoscaler( drainabilityRules rules.Rules) *StaticAutoscaler { clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, - OkTotalUnreadyCount: opts.OkTotalUnreadyCount, + MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, + OkTotalUnreadyCount: opts.OkTotalUnreadyCount, + NodeGroupRemovePersistentErrorBackoffEarly: opts.NodeGroupRemovePersistentErrorBackoffEarly, } clusterStateRegistry := clusterstate.NewClusterStateRegistry(cloudProvider, clusterStateConfig, autoscalingKubeClients.LogRecorder, backoff, processors.NodeGroupConfigProcessor) processorCallbacks := newStaticAutoscalerProcessorCallbacks() diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 8464cda7290e..fe6f7e053890 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -231,26 +231,26 @@ var ( "maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.") nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour, "nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.") - nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.") - maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") - maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") - recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") - maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") - maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") - skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") - skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. 
EmptyDir or HostPath") - skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") - minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") - nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") - scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") - parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") - maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") - maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") - maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.") - forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") - dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") - bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) - drainPriorityConfig = flag.String("drain-priority-config", "", + nodeGroupRemovePersistentErrorBackoffEarly = flag.Bool("node-group-remove-persistent-error-backoff-early", true, "Allows early removal of backoff before expiration when a scale-up partially fails due to a likely to be persistent error, e.g. cloud provider being out of resources.") + maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") + maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") + recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") + maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. 
This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") + maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") + skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") + skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath") + skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") + minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") + nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") + scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") + parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") + maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") + maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") + maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.") + forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") + dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") + bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) + drainPriorityConfig = flag.String("drain-priority-config", "", "List of ',' separated pairs (priority:terminationGracePeriodSeconds) of integers separated by ':' enables priority evictor. Priority evictor groups pods into priority groups based on pod priority and evict pods in the ascending order of group priorities"+ "--max-graceful-termination-sec flag should not be set when this flag is set. 
Not setting this flag will use unordered evictor by default."+ "Priority evictor reuses the concepts of drain logic in kubelet(https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature)."+ @@ -399,28 +399,28 @@ func createAutoscalingOptions() config.AutoscalingOptions { ConcurrentRefreshes: *concurrentGceRefreshes, MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime, }, - ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, - CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, - DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, - DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, - UserAgent: *userAgent, - InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, - MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, - NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, - NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources, - MaxScaleDownParallelism: *maxScaleDownParallelismFlag, - MaxDrainParallelism: *maxDrainParallelismFlag, - RecordDuplicatedEvents: *recordDuplicatedEvents, - MaxNodesPerScaleUp: *maxNodesPerScaleUp, - MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration, - NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, - SkipNodesWithSystemPods: *skipNodesWithSystemPods, - SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, - MinReplicaCount: *minReplicaCount, - NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, - ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, - ParallelDrain: *parallelDrain, - SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, + ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, + CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, + DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, + DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, + UserAgent: *userAgent, + InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, + MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, + NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, + NodeGroupRemovePersistentErrorBackoffEarly: *nodeGroupRemovePersistentErrorBackoffEarly, + MaxScaleDownParallelism: *maxScaleDownParallelismFlag, + MaxDrainParallelism: *maxDrainParallelismFlag, + RecordDuplicatedEvents: *recordDuplicatedEvents, + MaxNodesPerScaleUp: *maxNodesPerScaleUp, + MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration, + NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, + SkipNodesWithSystemPods: *skipNodesWithSystemPods, + SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, + MinReplicaCount: *minReplicaCount, + NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, + ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, + ParallelDrain: *parallelDrain, + SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, NodeGroupSetRatios: config.NodeGroupDifferenceRatios{ MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio, MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio, From 8e867f66c5ccedaee35142112c7de36a237c6878 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Tue, 13 Feb 2024 07:09:44 -0800 Subject: [PATCH 4/5] revert optionally keeping node group backoff --- .../clusterstate/clusterstate.go | 30 +------ .../clusterstate/clusterstate_test.go | 
89 ------------------- .../config/autoscaling_options.go | 2 - cluster-autoscaler/core/static_autoscaler.go | 5 +- cluster-autoscaler/main.go | 82 +++++++++-------- .../utils/backoff/exponential_backoff.go | 2 - 6 files changed, 44 insertions(+), 166 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 27c71470e85b..2240d3f22883 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -65,8 +65,6 @@ type ScaleUpRequest struct { ExpectedAddTime time.Time // How much the node group is increased. Increase int - // ErrorClasses is a set of the classes of errors encountered during a scale-up, if any. - ErrorClasses map[cloudprovider.InstanceErrorClass]struct{} } // ScaleDownRequest contains information about the requested node deletion. @@ -88,11 +86,6 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. OkTotalUnreadyCount int - // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when - // a scale-up partially fails due to a likely persistent error. - // If true (default), the backoff will be removed early regardless of the error. - // If false and the backoff is due to a likely persistent error, e.g. OutOfResourcesError, it will not be removed early. - NodeGroupRemovePersistentErrorBackoffEarly bool } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -222,7 +215,6 @@ func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudpr Increase: delta, Time: currentTime, ExpectedAddTime: currentTime.Add(maxNodeProvisionTime), - ErrorClasses: make(map[cloudprovider.InstanceErrorClass]struct{}), } csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest return @@ -272,18 +264,7 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { // scale-out finished successfully // remove it and reset node group backoff delete(csr.scaleUpRequests, nodeGroupName) - // If a node group is backed off during a scale-up due to instance creation errors but partially succeeds, - // the backoff could be removed early, allowing the CA to retry scaling the same node group. - // Optionally, the backoff can be retained for persistent errors given the risk of recurrence. - // The backoff will be removed early if either of the following conditions are true: - // 1. NodeGroupRemovePersistentErrorBackoffEarly is enabled (default) - // 2. 
There is no persistent error class attached to the scale-up request - _, persistentError := scaleUpRequest.ErrorClasses[cloudprovider.OutOfResourcesErrorClass] - shouldRemoveBackoffEarly := csr.config.NodeGroupRemovePersistentErrorBackoffEarly || !persistentError - if shouldRemoveBackoffEarly { - klog.V(4).Infof("Removing backoff early for node group %v", scaleUpRequest.NodeGroup.Id()) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) - } + csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) klog.V(4).Infof("Scale up in group %v finished successfully in %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)) continue @@ -350,13 +331,6 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime}) metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType) csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime) - // attach the error class to the scale-up request if it exists - // it will be used to determine whether to remove the backoff early when updating scale-up requests - scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()] - if found { - scaleUpRequest.ErrorClasses[errorClass] = struct{}{} - } - csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime) } // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats @@ -1131,7 +1105,7 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup( // If node group is scaling up and there are new node-create requests which cannot be satisfied because of // out-of-resources errors we: // - emit event - // - alter the scale-up and attach the error class + // - alter the scale-up // - increase scale-up failure metric // - backoff the node group for errorCode, instances := range currentErrorCodeToInstance { diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 124932dcbea1..52f2952d6807 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -484,95 +484,6 @@ func TestRegisterScaleDown(t *testing.T) { assert.Empty(t, clusterstate.GetScaleUpFailures()) } -func TestRemovePersistentErrorBackoffEarlyEnabled(t *testing.T) { - ng1_1 := BuildTestNode("ng1-1", 1000, 1000) - provider := testprovider.NewTestCloudProvider(nil, nil) - provider.AddNodeGroup("ng1", 1, 10, 1) - provider.AddNode("ng1", ng1_1) - assert.NotNil(t, provider) - - fakeClient := &fake.Clientset{} - fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: 10, - OkTotalUnreadyCount: 1, - NodeGroupRemovePersistentErrorBackoffEarly: true, - }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) - - now := time.Now() - - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(4) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 3, now) - assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) - - // 
Fail two nodes with a persistent and a non-persistent error - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -2, now) - assert.Equal(t, 2, len(clusterstate.scaleUpRequests["ng1"].ErrorClasses)) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Reduce the target size to original value to complete the scale-up and trigger the backoff early removal - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) - assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) -} - -func TestRemovePersistentErrorBackoffEarlyDisabled(t *testing.T) { - ng1_1 := BuildTestNode("ng1-1", 1000, 1000) - provider := testprovider.NewTestCloudProvider(nil, nil) - provider.AddNodeGroup("ng1", 1, 10, 1) - provider.AddNode("ng1", ng1_1) - assert.NotNil(t, provider) - - fakeClient := &fake.Clientset{} - fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: 10, - OkTotalUnreadyCount: 1, - NodeGroupRemovePersistentErrorBackoffEarly: false, - }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) - - now := time.Now() - - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) - assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) - - // Fail one node with a persistent error - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Confirm the persistent error backoff is not removed early - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Remove the backoff and scale up again - clusterstate.backoff.RemoveBackoff(provider.GetNodeGroup("ng1"), nil) - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) - assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) - - // Fail one node with a non-persistent error - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), 
metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Complete the scale-up and confirm the backoff is removed early - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) - assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) -} - func TestUpcomingNodes(t *testing.T) { provider := testprovider.NewTestCloudProvider(nil, nil) now := time.Now() diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 68fdc2b6d4fc..b335c05dcb28 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -249,8 +249,6 @@ type AutoscalingOptions struct { MaxNodeGroupBackoffDuration time.Duration // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout time.Duration - // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when a scale-up partially fails due to a likely persistent error. - NodeGroupRemovePersistentErrorBackoffEarly bool // MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel. MaxScaleDownParallelism int // MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel. diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 7466e2577b0b..471a341077f2 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -147,9 +147,8 @@ func NewStaticAutoscaler( drainabilityRules rules.Rules) *StaticAutoscaler { clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, - OkTotalUnreadyCount: opts.OkTotalUnreadyCount, - NodeGroupRemovePersistentErrorBackoffEarly: opts.NodeGroupRemovePersistentErrorBackoffEarly, + MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, + OkTotalUnreadyCount: opts.OkTotalUnreadyCount, } clusterStateRegistry := clusterstate.NewClusterStateRegistry(cloudProvider, clusterStateConfig, autoscalingKubeClients.LogRecorder, backoff, processors.NodeGroupConfigProcessor) processorCallbacks := newStaticAutoscalerProcessorCallbacks() diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index fe6f7e053890..d0480eb1f99a 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -231,26 +231,25 @@ var ( "maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.") nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour, "nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.") - nodeGroupRemovePersistentErrorBackoffEarly = flag.Bool("node-group-remove-persistent-error-backoff-early", true, "Allows early removal of backoff before expiration when a scale-up partially fails due to a likely to be persistent error, e.g. 
cloud provider being out of resources.") - maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") - maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") - recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") - maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") - maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") - skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") - skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath") - skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") - minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") - nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") - scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") - parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") - maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") - maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") - maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.") - forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") - dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") - bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. 
If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) - drainPriorityConfig = flag.String("drain-priority-config", "", + maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") + maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") + recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") + maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") + maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") + skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") + skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath") + skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") + minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") + nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") + scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") + parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") + maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") + maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") + maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. 
Value is a ratio of the smaller node group's allocatable resource.") + forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") + dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") + bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) + drainPriorityConfig = flag.String("drain-priority-config", "", "List of ',' separated pairs (priority:terminationGracePeriodSeconds) of integers separated by ':' enables priority evictor. Priority evictor groups pods into priority groups based on pod priority and evict pods in the ascending order of group priorities"+ "--max-graceful-termination-sec flag should not be set when this flag is set. Not setting this flag will use unordered evictor by default."+ "Priority evictor reuses the concepts of drain logic in kubelet(https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature)."+ @@ -399,28 +398,27 @@ func createAutoscalingOptions() config.AutoscalingOptions { ConcurrentRefreshes: *concurrentGceRefreshes, MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime, }, - ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, - CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, - DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, - DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, - UserAgent: *userAgent, - InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, - MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, - NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, - NodeGroupRemovePersistentErrorBackoffEarly: *nodeGroupRemovePersistentErrorBackoffEarly, - MaxScaleDownParallelism: *maxScaleDownParallelismFlag, - MaxDrainParallelism: *maxDrainParallelismFlag, - RecordDuplicatedEvents: *recordDuplicatedEvents, - MaxNodesPerScaleUp: *maxNodesPerScaleUp, - MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration, - NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, - SkipNodesWithSystemPods: *skipNodesWithSystemPods, - SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, - MinReplicaCount: *minReplicaCount, - NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, - ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, - ParallelDrain: *parallelDrain, - SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, + ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, + CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, + DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, + DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, + UserAgent: *userAgent, + InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, + MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, + NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, + MaxScaleDownParallelism: *maxScaleDownParallelismFlag, + MaxDrainParallelism: *maxDrainParallelismFlag, + RecordDuplicatedEvents: *recordDuplicatedEvents, + MaxNodesPerScaleUp: *maxNodesPerScaleUp, + MaxNodeGroupBinpackingDuration: 
*maxNodeGroupBinpackingDuration, + NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, + SkipNodesWithSystemPods: *skipNodesWithSystemPods, + SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, + MinReplicaCount: *minReplicaCount, + NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, + ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, + ParallelDrain: *parallelDrain, + SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, NodeGroupSetRatios: config.NodeGroupDifferenceRatios{ MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio, MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio, diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index 0631878a98a0..a65b9c323dd1 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -38,7 +38,6 @@ type exponentialBackoffInfo struct { backoffUntil time.Time lastFailedExecution time.Time errorInfo cloudprovider.InstanceErrorInfo - errorClass cloudprovider.InstanceErrorClass } // NewExponentialBackoff creates an instance of exponential backoff. @@ -90,7 +89,6 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo backoffUntil: backoffUntil, lastFailedExecution: currentTime, errorInfo: errorInfo, - errorClass: errorClass, } return backoffUntil } From 4477707256f3fb011e39887f9d302f2c1f1733b4 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Tue, 13 Feb 2024 07:12:40 -0800 Subject: [PATCH 5/5] remove RemoveBackoff from updateScaleRequests --- .../clusterstate/clusterstate.go | 4 +--- .../clusterstate/clusterstate_test.go | 23 ++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 2240d3f22883..1a28e1d37c50 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -261,10 +261,8 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { for nodeGroupName, scaleUpRequest := range csr.scaleUpRequests { if !csr.areThereUpcomingNodesInNodeGroup(nodeGroupName) { - // scale-out finished successfully - // remove it and reset node group backoff + // scale up finished successfully, remove request delete(csr.scaleUpRequests, nodeGroupName) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) klog.V(4).Infof("Scale up in group %v finished successfully in %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)) continue diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 52f2952d6807..7b4fb3d6c736 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -854,7 +854,7 @@ func TestScaleUpBackoff(t *testing.T) { }, }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) - // The backoff should be cleared after a successful scale-up + // After successful scale-up, node group should still be backed off clusterstate.RegisterScaleUp(provider.GetNodeGroup("ng1"), 1, now) ng1_4 := BuildTestNode("ng1-4", 1000, 1000) SetNodeReadyState(ng1_4, true, now.Add(-1*time.Minute)) @@ -863,8 +863,25 @@ func TestScaleUpBackoff(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) 
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.NodeGroupScaleUpSafety(ng1, now)) - assert.Equal(t, backoff.Status{IsBackedOff: false}, clusterstate.backoff.BackoffStatus(ng1, nil, now)) + assert.Equal(t, NodeGroupScalingSafety{ + SafeToScale: false, + Healthy: true, + BackoffStatus: backoff.Status{ + IsBackedOff: true, + ErrorInfo: cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }, + }, + }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) + assert.Equal(t, backoff.Status{ + IsBackedOff: true, + ErrorInfo: cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }}, clusterstate.backoff.BackoffStatus(ng1, nil, now)) } func TestGetClusterSize(t *testing.T) {
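With this last patch applied, a completed scale-up no longer clears a node group's backoff at all: a backed-off group becomes schedulable again only when its window expires (or the backoff data is reset by other paths, such as the reset timeout). The sketch below is a rough model of that expiry-only behavior; the 5-minute initial and 30-minute maximum are illustrative values only, and the real utils/backoff/exponential_backoff.go additionally tracks lastFailedExecution against a reset timeout, which this omits.

```go
package main

import (
	"fmt"
	"time"
)

type backoffEntry struct {
	duration     time.Duration // current window size, grows exponentially
	backoffUntil time.Time     // the group is ineligible until this instant
}

// registerFailure grows the window: start at initial, then double up to max.
func registerFailure(e *backoffEntry, now time.Time, initial, max time.Duration) {
	if e.duration == 0 {
		e.duration = initial
	} else {
		e.duration *= 2
		if e.duration > max {
			e.duration = max
		}
	}
	e.backoffUntil = now.Add(e.duration)
}

// backedOff reports whether the group is still waiting out its window; after
// this patch, only the passage of time can flip it back to false.
func backedOff(e *backoffEntry, now time.Time) bool {
	return now.Before(e.backoffUntil)
}

func main() {
	var e backoffEntry
	now := time.Now()
	for i := 0; i < 4; i++ {
		registerFailure(&e, now, 5*time.Minute, 30*time.Minute)
		fmt.Println("window:", e.duration) // 5m0s, 10m0s, 20m0s, 30m0s
	}
	fmt.Println(backedOff(&e, now))                     // true: still inside the window
	fmt.Println(backedOff(&e, now.Add(31*time.Minute))) // false: the window has expired
}
```

That is also why the updated TestScaleUpBackoff above now expects IsBackedOff to stay true, with the timeout ErrorInfo intact, even after the scale-up completes successfully.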