From aa1af0386254f91b4755f46e3f3d6a2edaded11c Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Thu, 11 May 2023 19:04:27 -0700 Subject: [PATCH 1/5] add option to keep node group backoff on OutOfResource error --- cluster-autoscaler/clusterstate/clusterstate.go | 8 +++++++- cluster-autoscaler/config/autoscaling_options.go | 2 ++ cluster-autoscaler/main.go | 2 ++ cluster-autoscaler/utils/backoff/backoff.go | 2 ++ cluster-autoscaler/utils/backoff/exponential_backoff.go | 8 ++++++++ 5 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 2240d3f22883..c51b287a8cc5 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -86,6 +86,8 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. OkTotalUnreadyCount int + // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. + NodeGroupKeepBackoffOutOfResources bool } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -264,7 +266,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { // scale-out finished successfully // remove it and reset node group backoff delete(csr.scaleUpRequests, nodeGroupName) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) + shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup) + if !shouldKeepBackoff { + klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id()) + csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) + } klog.V(4).Infof("Scale up in group %v finished successfully in %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)) continue diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index b335c05dcb28..3f12020bb8b4 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -249,6 +249,8 @@ type AutoscalingOptions struct { MaxNodeGroupBackoffDuration time.Duration // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout time.Duration + // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. + NodeGroupKeepBackoffOutOfResources bool // MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel. MaxScaleDownParallelism int // MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
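The gate this patch adds to updateScaleRequests reduces to a couple of lines. Below is a minimal, self-contained sketch of that decision; backoffTracker and onScaleUpSuccess are hypothetical stand-ins for the real cloudprovider.NodeGroup and utils/backoff types, not the actual API. A successful scale-out clears the backoff unless the new option is enabled and the active backoff was caused by an out-of-resources error.

```go
package main

import "fmt"

// backoffTracker is a hypothetical stand-in for the Backoff interface; the
// real implementation records an errorClass on each backoff entry.
type backoffTracker struct {
	outOfResources map[string]bool // node group ID -> backoff caused by OutOfResources
}

func (b *backoffTracker) isOutOfResources(id string) bool { return b.outOfResources[id] }

func (b *backoffTracker) remove(id string) {
	delete(b.outOfResources, id)
	fmt.Printf("removed backoff for %s\n", id)
}

// onScaleUpSuccess mirrors the new branch in updateScaleRequests: keep the
// backoff only when the option is on and the failure was out-of-resources.
func onScaleUpSuccess(b *backoffTracker, id string, keepBackoffOutOfResources bool) {
	shouldKeep := keepBackoffOutOfResources && b.isOutOfResources(id)
	if !shouldKeep {
		b.remove(id)
	}
}

func main() {
	b := &backoffTracker{outOfResources: map[string]bool{"ng1": true}}
	onScaleUpSuccess(b, "ng1", true)  // backoff kept: OutOfResources and option enabled
	onScaleUpSuccess(b, "ng1", false) // backoff removed: option disabled
}
```

In the patch itself, the equivalent state is the errorClass that Backoff() records on the node group's exponentialBackoffInfo entry, which IsNodeGroupOutOfResources compares against cloudprovider.OutOfResourcesErrorClass.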
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index d0480eb1f99a..8464cda7290e 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -231,6 +231,7 @@ var ( "maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.") nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour, "nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.") + nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.") maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") @@ -406,6 +407,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, + NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources, MaxScaleDownParallelism: *maxScaleDownParallelismFlag, MaxDrainParallelism: *maxDrainParallelismFlag, RecordDuplicatedEvents: *recordDuplicatedEvents, diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go index a4409d2f99ca..85da73556cd0 100644 --- a/cluster-autoscaler/utils/backoff/backoff.go +++ b/cluster-autoscaler/utils/backoff/backoff.go @@ -39,4 +39,6 @@ type Backoff interface { RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) // RemoveStaleBackoffData removes stale backoff data. RemoveStaleBackoffData(currentTime time.Time) + // IsNodeGroupOutOfResources returns true if the given node group is out of resources. + IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool } diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index a65b9c323dd1..eafca64e55e1 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -38,6 +38,7 @@ type exponentialBackoffInfo struct { backoffUntil time.Time lastFailedExecution time.Time errorInfo cloudprovider.InstanceErrorInfo + errorClass cloudprovider.InstanceErrorClass } // NewExponentialBackoff creates an instance of exponential backoff. @@ -89,6 +90,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo backoffUntil: backoffUntil, lastFailedExecution: currentTime, errorInfo: errorInfo, + errorClass: errorClass, } return backoffUntil } @@ -118,3 +120,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) { } } } + +// IsNodeGroupOutOfResources returns true if the given node group is out of resources. 
+func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool { + backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)] + return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass +} From 8a2cae379578b235a239e5ff03eb312a3e7b8547 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:14:02 -0700 Subject: [PATCH 2/5] remove changes to backoff interface --- cluster-autoscaler/utils/backoff/backoff.go | 2 -- cluster-autoscaler/utils/backoff/exponential_backoff.go | 6 ------ 2 files changed, 8 deletions(-) diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go index 85da73556cd0..a4409d2f99ca 100644 --- a/cluster-autoscaler/utils/backoff/backoff.go +++ b/cluster-autoscaler/utils/backoff/backoff.go @@ -39,6 +39,4 @@ type Backoff interface { RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) // RemoveStaleBackoffData removes stale backoff data. RemoveStaleBackoffData(currentTime time.Time) - // IsNodeGroupOutOfResources returns true if the given node group is out of resources. - IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool } diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index eafca64e55e1..0631878a98a0 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -120,9 +120,3 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) { } } } - -// IsNodeGroupOutOfResources returns true if the given node group is out of resources. -func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool { - backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)] - return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass -} From 00fd3a802ccb4a49a9a9e4ce07e1e64af7d51789 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:44:33 -0700 Subject: [PATCH 3/5] attach errors to scale-up request and add comments --- .../clusterstate/clusterstate.go | 32 +++++-- .../clusterstate/clusterstate_test.go | 89 +++++++++++++++++++ .../config/autoscaling_options.go | 4 +- cluster-autoscaler/core/static_autoscaler.go | 5 +- cluster-autoscaler/main.go | 84 ++++++++--------- 5 files changed, 162 insertions(+), 52 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index c51b287a8cc5..27c71470e85b 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -65,6 +65,8 @@ type ScaleUpRequest struct { ExpectedAddTime time.Time // How much the node group is increased. Increase int + // ErrorClasses is a set of the classes of errors encountered during a scale-up, if any. + ErrorClasses map[cloudprovider.InstanceErrorClass]struct{} } // ScaleDownRequest contains information about the requested node deletion. @@ -86,8 +88,11 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. 
OkTotalUnreadyCount int - // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. - NodeGroupKeepBackoffOutOfResources bool + // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when + // a scale-up partially fails due to a likely persistent error. + // If true (default), the backoff will be removed early regardless of the error. + // If false and the backoff is due to a likely persistent error, e.g. OutOfResourcesError, it will not be removed early. + NodeGroupRemovePersistentErrorBackoffEarly bool } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -217,6 +222,7 @@ func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudpr Increase: delta, Time: currentTime, ExpectedAddTime: currentTime.Add(maxNodeProvisionTime), + ErrorClasses: make(map[cloudprovider.InstanceErrorClass]struct{}), } csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest return @@ -266,9 +272,16 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { // scale-out finished successfully // remove it and reset node group backoff delete(csr.scaleUpRequests, nodeGroupName) - shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup) - if !shouldKeepBackoff { - klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id()) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) - } + // If a node group is backed off during a scale-up due to instance creation errors but partially succeeds, + // the backoff could be removed early, allowing the CA to retry scaling the same node group. + // Optionally, the backoff can be retained for persistent errors given the risk of recurrence. + // The backoff will be removed early if either of the following conditions are true: + // 1. NodeGroupRemovePersistentErrorBackoffEarly is enabled (default) + // 2.
There is no persistent error class attached to the scale-up request + _, persistentError := scaleUpRequest.ErrorClasses[cloudprovider.OutOfResourcesErrorClass] + shouldRemoveBackoffEarly := csr.config.NodeGroupRemovePersistentErrorBackoffEarly || !persistentError + if shouldRemoveBackoffEarly { + klog.V(4).Infof("Removing backoff early for node group %v", scaleUpRequest.NodeGroup.Id()) csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) } klog.V(4).Infof("Scale up in group %v finished successfully in %v", @@ -337,6 +350,13 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime}) metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType) csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime) + // attach the error class to the scale-up request if it exists + // it will be used to determine whether to remove the backoff early when updating scale-up requests + scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()] + if found { + scaleUpRequest.ErrorClasses[errorClass] = struct{}{} + } + csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime) } // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats @@ -1111,7 +1131,7 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup( // If node group is scaling up and there are new node-create requests which cannot be satisfied because of // out-of-resources errors we: // - emit event - // - alter the scale-up + // - alter the scale-up and attach the error class // - increase scale-up failure metric // - backoff the node group for errorCode, instances := range currentErrorCodeToInstance { diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 52f2952d6807..124932dcbea1 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -484,6 +484,95 @@ func TestRegisterScaleDown(t *testing.T) { assert.Empty(t, clusterstate.GetScaleUpFailures()) } +func TestRemovePersistentErrorBackoffEarlyEnabled(t *testing.T) { + ng1_1 := BuildTestNode("ng1-1", 1000, 1000) + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNode("ng1", ng1_1) + assert.NotNil(t, provider) + + fakeClient := &fake.Clientset{} + fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + NodeGroupRemovePersistentErrorBackoffEarly: true, + }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) + + now := time.Now() + + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(4) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 3, now) + assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) + + // Fail two nodes with a persistent and a non-persistent error + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, 
cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -2, now) + assert.Equal(t, 2, len(clusterstate.scaleUpRequests["ng1"].ErrorClasses)) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Reduce the target size to original value to complete the scale-up and trigger the backoff early removal + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) + assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) +} + +func TestRemovePersistentErrorBackoffEarlyDisabled(t *testing.T) { + ng1_1 := BuildTestNode("ng1-1", 1000, 1000) + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup("ng1", 1, 10, 1) + provider.AddNode("ng1", ng1_1) + assert.NotNil(t, provider) + + fakeClient := &fake.Clientset{} + fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") + clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ + MaxTotalUnreadyPercentage: 10, + OkTotalUnreadyCount: 1, + NodeGroupRemovePersistentErrorBackoffEarly: false, + }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) + + now := time.Now() + + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) + assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) + + // Fail one node with a persistent error + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Confirm the persistent error backoff is not removed early + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Remove the backoff and scale up again + clusterstate.backoff.RemoveBackoff(provider.GetNodeGroup("ng1"), nil) + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) + assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) + + // Fail one node with a non-persistent error + clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) + 
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) + assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) + + // Complete the scale-up and confirm the backoff is removed early + provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) + clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) + assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) + assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) +} + func TestUpcomingNodes(t *testing.T) { provider := testprovider.NewTestCloudProvider(nil, nil) now := time.Now() diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 3f12020bb8b4..68fdc2b6d4fc 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -249,8 +249,8 @@ type AutoscalingOptions struct { MaxNodeGroupBackoffDuration time.Duration // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout time.Duration - // NodeGroupKeepBackoffOutOfResources is whether the backoff is kept until expiration when a scale-up fails due to the cloud provider being out of resources. - NodeGroupKeepBackoffOutOfResources bool + // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when a scale-up partially fails due to a likely persistent error. + NodeGroupRemovePersistentErrorBackoffEarly bool // MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel. MaxScaleDownParallelism int // MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
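Taken together, the clusterstate changes in this patch do two things: record the class of every scale-up error on the request itself, and decide at completion whether the backoff may be removed early. A minimal sketch of that bookkeeping follows; errorClass and scaleUpRequest are simplified local stand-ins for cloudprovider.InstanceErrorClass and the registry's ScaleUpRequest, not the real types.

```go
package main

import "fmt"

type errorClass int

const (
	otherError          errorClass = iota
	outOfResourcesError            // treated as likely persistent
)

type scaleUpRequest struct {
	// set of error classes seen while this scale-up was in flight
	errorClasses map[errorClass]struct{}
}

func (r *scaleUpRequest) recordFailure(c errorClass) { r.errorClasses[c] = struct{}{} }

// shouldRemoveBackoffEarly mirrors the completion-time decision: remove early
// when the option allows it, or when no persistent error was recorded.
func shouldRemoveBackoffEarly(r *scaleUpRequest, removePersistentEarly bool) bool {
	_, persistent := r.errorClasses[outOfResourcesError]
	return removePersistentEarly || !persistent
}

func main() {
	r := &scaleUpRequest{errorClasses: map[errorClass]struct{}{}}
	r.recordFailure(outOfResourcesError)
	r.recordFailure(otherError)
	fmt.Println(shouldRemoveBackoffEarly(r, true))  // true: default removes early regardless
	fmt.Println(shouldRemoveBackoffEarly(r, false)) // false: persistent error keeps the backoff
}
```

Keying the set by error class keeps the per-failure bookkeeping O(1) and makes the completion-time check a single map lookup, regardless of how many instances failed.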
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 471a341077f2..7466e2577b0b 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -147,8 +147,9 @@ func NewStaticAutoscaler( drainabilityRules rules.Rules) *StaticAutoscaler { clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, - OkTotalUnreadyCount: opts.OkTotalUnreadyCount, + MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, + OkTotalUnreadyCount: opts.OkTotalUnreadyCount, + NodeGroupRemovePersistentErrorBackoffEarly: opts.NodeGroupRemovePersistentErrorBackoffEarly, } clusterStateRegistry := clusterstate.NewClusterStateRegistry(cloudProvider, clusterStateConfig, autoscalingKubeClients.LogRecorder, backoff, processors.NodeGroupConfigProcessor) processorCallbacks := newStaticAutoscalerProcessorCallbacks() diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 8464cda7290e..fe6f7e053890 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -231,26 +231,26 @@ var ( "maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.") nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour, "nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.") - nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.") - maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") - maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") - recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") - maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") - maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") - skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") - skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. 
EmptyDir or HostPath") - skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") - minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") - nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") - scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") - parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") - maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") - maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") - maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.") - forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") - dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") - bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) - drainPriorityConfig = flag.String("drain-priority-config", "", + nodeGroupRemovePersistentErrorBackoffEarly = flag.Bool("node-group-remove-persistent-error-backoff-early", true, "Allows early removal of backoff before expiration when a scale-up partially fails due to a likely to be persistent error, e.g. cloud provider being out of resources.") + maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") + maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") + recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") + maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. 
This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") + maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") + skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") + skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath") + skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") + minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") + nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") + scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") + parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") + maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") + maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") + maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.") + forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") + dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") + bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) + drainPriorityConfig = flag.String("drain-priority-config", "", "List of ',' separated pairs (priority:terminationGracePeriodSeconds) of integers separated by ':' enables priority evictor. Priority evictor groups pods into priority groups based on pod priority and evict pods in the ascending order of group priorities"+ "--max-graceful-termination-sec flag should not be set when this flag is set. 
Not setting this flag will use unordered evictor by default."+ "Priority evictor reuses the concepts of drain logic in kubelet(https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature)."+ @@ -399,28 +399,28 @@ func createAutoscalingOptions() config.AutoscalingOptions { ConcurrentRefreshes: *concurrentGceRefreshes, MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime, }, - ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, - CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, - DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, - DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, - UserAgent: *userAgent, - InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, - MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, - NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, - NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources, - MaxScaleDownParallelism: *maxScaleDownParallelismFlag, - MaxDrainParallelism: *maxDrainParallelismFlag, - RecordDuplicatedEvents: *recordDuplicatedEvents, - MaxNodesPerScaleUp: *maxNodesPerScaleUp, - MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration, - NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, - SkipNodesWithSystemPods: *skipNodesWithSystemPods, - SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, - MinReplicaCount: *minReplicaCount, - NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, - ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, - ParallelDrain: *parallelDrain, - SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, + ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, + CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, + DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, + DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, + UserAgent: *userAgent, + InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, + MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, + NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, + NodeGroupRemovePersistentErrorBackoffEarly: *nodeGroupRemovePersistentErrorBackoffEarly, + MaxScaleDownParallelism: *maxScaleDownParallelismFlag, + MaxDrainParallelism: *maxDrainParallelismFlag, + RecordDuplicatedEvents: *recordDuplicatedEvents, + MaxNodesPerScaleUp: *maxNodesPerScaleUp, + MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration, + NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, + SkipNodesWithSystemPods: *skipNodesWithSystemPods, + SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, + MinReplicaCount: *minReplicaCount, + NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, + ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, + ParallelDrain: *parallelDrain, + SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, NodeGroupSetRatios: config.NodeGroupDifferenceRatios{ MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio, MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio, From 8e867f66c5ccedaee35142112c7de36a237c6878 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Tue, 13 Feb 2024 07:09:44 -0800 Subject: [PATCH 4/5] revert optionally keeping node group backoff --- .../clusterstate/clusterstate.go | 30 +------ .../clusterstate/clusterstate_test.go | 
89 ------------------- .../config/autoscaling_options.go | 2 - cluster-autoscaler/core/static_autoscaler.go | 5 +- cluster-autoscaler/main.go | 82 +++++++++-------- .../utils/backoff/exponential_backoff.go | 2 - 6 files changed, 44 insertions(+), 166 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 27c71470e85b..2240d3f22883 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -65,8 +65,6 @@ type ScaleUpRequest struct { ExpectedAddTime time.Time // How much the node group is increased. Increase int - // ErrorClasses is a set of the classes of errors encountered during a scale-up, if any. - ErrorClasses map[cloudprovider.InstanceErrorClass]struct{} } // ScaleDownRequest contains information about the requested node deletion. @@ -88,11 +86,6 @@ type ClusterStateRegistryConfig struct { // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. OkTotalUnreadyCount int - // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when - // a scale-up partially fails due to a likely persistent error. - // If true (default), the backoff will be removed early regardless of the error. - // If false and the backoff is due to a likely persistent error, e.g. OutOfResourcesError, it will not be removed early. - NodeGroupRemovePersistentErrorBackoffEarly bool } // IncorrectNodeGroupSize contains information about how much the current size of the node group @@ -222,7 +215,6 @@ func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudpr Increase: delta, Time: currentTime, ExpectedAddTime: currentTime.Add(maxNodeProvisionTime), - ErrorClasses: make(map[cloudprovider.InstanceErrorClass]struct{}), } csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest return @@ -272,18 +264,7 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { // scale-out finished successfully // remove it and reset node group backoff delete(csr.scaleUpRequests, nodeGroupName) - // If a node group is backed off during a scale-up due to instance creation errors but partially succeeds, - // the backoff could be removed early, allowing the CA to retry scaling the same node group. - // Optionally, the backoff can be retained for persistent errors given the risk of recurrence. - // The backoff will be removed early if either of the following conditions are true: - // 1. NodeGroupRemovePersistentErrorBackoffEarly is enabled (default) - // 2. 
There is no persistent error class attached to the scale-up request - _, persistentError := scaleUpRequest.ErrorClasses[cloudprovider.OutOfResourcesErrorClass] - shouldRemoveBackoffEarly := csr.config.NodeGroupRemovePersistentErrorBackoffEarly || !persistentError - if shouldRemoveBackoffEarly { - klog.V(4).Infof("Removing backoff early for node group %v", scaleUpRequest.NodeGroup.Id()) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) - } + csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) klog.V(4).Infof("Scale up in group %v finished successfully in %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)) continue @@ -350,13 +331,6 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime}) metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType) csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime) - // attach the error class to the scale-up request if it exists - // it will be used to determine whether to remove the backoff early when updating scale-up requests - scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()] - if found { - scaleUpRequest.ErrorClasses[errorClass] = struct{}{} - } - csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime) } // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats @@ -1131,7 +1105,7 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup( // If node group is scaling up and there are new node-create requests which cannot be satisfied because of // out-of-resources errors we: // - emit event - // - alter the scale-up and attach the error class + // - alter the scale-up // - increase scale-up failure metric // - backoff the node group for errorCode, instances := range currentErrorCodeToInstance { diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 124932dcbea1..52f2952d6807 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -484,95 +484,6 @@ func TestRegisterScaleDown(t *testing.T) { assert.Empty(t, clusterstate.GetScaleUpFailures()) } -func TestRemovePersistentErrorBackoffEarlyEnabled(t *testing.T) { - ng1_1 := BuildTestNode("ng1-1", 1000, 1000) - provider := testprovider.NewTestCloudProvider(nil, nil) - provider.AddNodeGroup("ng1", 1, 10, 1) - provider.AddNode("ng1", ng1_1) - assert.NotNil(t, provider) - - fakeClient := &fake.Clientset{} - fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: 10, - OkTotalUnreadyCount: 1, - NodeGroupRemovePersistentErrorBackoffEarly: true, - }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) - - now := time.Now() - - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(4) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 3, now) - assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) - - // 
Fail two nodes with a persistent and a non-persistent error - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -2, now) - assert.Equal(t, 2, len(clusterstate.scaleUpRequests["ng1"].ErrorClasses)) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Reduce the target size to original value to complete the scale-up and trigger the backoff early removal - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) - assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) -} - -func TestRemovePersistentErrorBackoffEarlyDisabled(t *testing.T) { - ng1_1 := BuildTestNode("ng1-1", 1000, 1000) - provider := testprovider.NewTestCloudProvider(nil, nil) - provider.AddNodeGroup("ng1", 1, 10, 1) - provider.AddNode("ng1", ng1_1) - assert.NotNil(t, provider) - - fakeClient := &fake.Clientset{} - fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") - clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: 10, - OkTotalUnreadyCount: 1, - NodeGroupRemovePersistentErrorBackoffEarly: false, - }, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute})) - - now := time.Now() - - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) - assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) - - // Fail one node with a persistent error - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Confirm the persistent error backoff is not removed early - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Remove the backoff and scale up again - clusterstate.backoff.RemoveBackoff(provider.GetNodeGroup("ng1"), nil) - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now) - assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1")) - - // Fail one node with a non-persistent error - clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), 
metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now) - clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now) - assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) - - // Complete the scale-up and confirm the backoff is removed early - provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1) - clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now) - assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1")) - assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now)) -} - func TestUpcomingNodes(t *testing.T) { provider := testprovider.NewTestCloudProvider(nil, nil) now := time.Now() diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 68fdc2b6d4fc..b335c05dcb28 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -249,8 +249,6 @@ type AutoscalingOptions struct { MaxNodeGroupBackoffDuration time.Duration // NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset. NodeGroupBackoffResetTimeout time.Duration - // NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when a scale-up partially fails due to a likely persistent error. - NodeGroupRemovePersistentErrorBackoffEarly bool // MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel. MaxScaleDownParallelism int // MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel. diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 7466e2577b0b..471a341077f2 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -147,9 +147,8 @@ func NewStaticAutoscaler( drainabilityRules rules.Rules) *StaticAutoscaler { clusterStateConfig := clusterstate.ClusterStateRegistryConfig{ - MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, - OkTotalUnreadyCount: opts.OkTotalUnreadyCount, - NodeGroupRemovePersistentErrorBackoffEarly: opts.NodeGroupRemovePersistentErrorBackoffEarly, + MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage, + OkTotalUnreadyCount: opts.OkTotalUnreadyCount, } clusterStateRegistry := clusterstate.NewClusterStateRegistry(cloudProvider, clusterStateConfig, autoscalingKubeClients.LogRecorder, backoff, processors.NodeGroupConfigProcessor) processorCallbacks := newStaticAutoscalerProcessorCallbacks() diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index fe6f7e053890..d0480eb1f99a 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -231,26 +231,25 @@ var ( "maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.") nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour, "nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.") - nodeGroupRemovePersistentErrorBackoffEarly = flag.Bool("node-group-remove-persistent-error-backoff-early", true, "Allows early removal of backoff before expiration when a scale-up partially fails due to a likely to be persistent error, e.g. 
cloud provider being out of resources.") - maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") - maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") - recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") - maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") - maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") - skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") - skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath") - skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") - minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") - nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") - scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") - parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") - maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") - maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") - maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.") - forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") - dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") - bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. 
If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) - drainPriorityConfig = flag.String("drain-priority-config", "", + maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.") + maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.") + recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.") + maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.") + maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.") + skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)") + skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath") + skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers") + minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down") + nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it") + scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.") + parallelDrain = flag.Bool("parallel-drain", true, "Whether to allow parallel drain of nodes. This flag is deprecated and will be removed in future releases.") + maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.") + maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.") + maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. 
Value is a ratio of the smaller node group's allocatable resource.") + forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") + dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") + bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, fmt.Sprintf("Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.")) + drainPriorityConfig = flag.String("drain-priority-config", "", "List of ',' separated pairs (priority:terminationGracePeriodSeconds) of integers separated by ':' enables priority evictor. Priority evictor groups pods into priority groups based on pod priority and evict pods in the ascending order of group priorities"+ "--max-graceful-termination-sec flag should not be set when this flag is set. Not setting this flag will use unordered evictor by default."+ "Priority evictor reuses the concepts of drain logic in kubelet(https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature)."+ @@ -399,28 +398,27 @@ func createAutoscalingOptions() config.AutoscalingOptions { ConcurrentRefreshes: *concurrentGceRefreshes, MigInstancesMinRefreshWaitTime: *gceMigInstancesMinRefreshWaitTime, }, - ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, - CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, - DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, - DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, - UserAgent: *userAgent, - InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, - MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, - NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, - NodeGroupRemovePersistentErrorBackoffEarly: *nodeGroupRemovePersistentErrorBackoffEarly, - MaxScaleDownParallelism: *maxScaleDownParallelismFlag, - MaxDrainParallelism: *maxDrainParallelismFlag, - RecordDuplicatedEvents: *recordDuplicatedEvents, - MaxNodesPerScaleUp: *maxNodesPerScaleUp, - MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration, - NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, - SkipNodesWithSystemPods: *skipNodesWithSystemPods, - SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, - MinReplicaCount: *minReplicaCount, - NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, - ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, - ParallelDrain: *parallelDrain, - SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, + ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative, + CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate, + DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes, + DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes, + UserAgent: *userAgent, + InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration, + MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration, + NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout, + MaxScaleDownParallelism: *maxScaleDownParallelismFlag, + MaxDrainParallelism: *maxDrainParallelismFlag, + RecordDuplicatedEvents: *recordDuplicatedEvents, + MaxNodesPerScaleUp: *maxNodesPerScaleUp, + MaxNodeGroupBinpackingDuration: 
*maxNodeGroupBinpackingDuration, + NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval, + SkipNodesWithSystemPods: *skipNodesWithSystemPods, + SkipNodesWithLocalStorage: *skipNodesWithLocalStorage, + MinReplicaCount: *minReplicaCount, + NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint, + ScaleDownSimulationTimeout: *scaleDownSimulationTimeout, + ParallelDrain: *parallelDrain, + SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods, NodeGroupSetRatios: config.NodeGroupDifferenceRatios{ MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio, MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio, diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go index 0631878a98a0..a65b9c323dd1 100644 --- a/cluster-autoscaler/utils/backoff/exponential_backoff.go +++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go @@ -38,7 +38,6 @@ type exponentialBackoffInfo struct { backoffUntil time.Time lastFailedExecution time.Time errorInfo cloudprovider.InstanceErrorInfo - errorClass cloudprovider.InstanceErrorClass } // NewExponentialBackoff creates an instance of exponential backoff. @@ -90,7 +89,6 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo backoffUntil: backoffUntil, lastFailedExecution: currentTime, errorInfo: errorInfo, - errorClass: errorClass, } return backoffUntil } From 4477707256f3fb011e39887f9d302f2c1f1733b4 Mon Sep 17 00:00:00 2001 From: Will Bowers <22203232+wllbo@users.noreply.github.com> Date: Tue, 13 Feb 2024 07:12:40 -0800 Subject: [PATCH 5/5] remove RemoveBackoff from updateScaleRequests --- .../clusterstate/clusterstate.go | 4 +--- .../clusterstate/clusterstate_test.go | 23 ++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 2240d3f22883..1a28e1d37c50 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -261,10 +261,8 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) { for nodeGroupName, scaleUpRequest := range csr.scaleUpRequests { if !csr.areThereUpcomingNodesInNodeGroup(nodeGroupName) { - // scale-out finished successfully - // remove it and reset node group backoff + // scale up finished successfully, remove request delete(csr.scaleUpRequests, nodeGroupName) - csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()]) klog.V(4).Infof("Scale up in group %v finished successfully in %v", nodeGroupName, currentTime.Sub(scaleUpRequest.Time)) continue diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 52f2952d6807..7b4fb3d6c736 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -854,7 +854,7 @@ func TestScaleUpBackoff(t *testing.T) { }, }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) - // The backoff should be cleared after a successful scale-up + // After successful scale-up, node group should still be backed off clusterstate.RegisterScaleUp(provider.GetNodeGroup("ng1"), 1, now) ng1_4 := BuildTestNode("ng1-4", 1000, 1000) SetNodeReadyState(ng1_4, true, now.Add(-1*time.Minute)) @@ -863,8 +863,25 @@ func TestScaleUpBackoff(t *testing.T) { assert.NoError(t, err) assert.True(t, clusterstate.IsClusterHealthy()) 
assert.True(t, clusterstate.IsNodeGroupHealthy("ng1")) - assert.Equal(t, NodeGroupScalingSafety{SafeToScale: true, Healthy: true}, clusterstate.NodeGroupScaleUpSafety(ng1, now)) - assert.Equal(t, backoff.Status{IsBackedOff: false}, clusterstate.backoff.BackoffStatus(ng1, nil, now)) + assert.Equal(t, NodeGroupScalingSafety{ + SafeToScale: false, + Healthy: true, + BackoffStatus: backoff.Status{ + IsBackedOff: true, + ErrorInfo: cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }, + }, + }, clusterstate.NodeGroupScaleUpSafety(ng1, now)) + assert.Equal(t, backoff.Status{ + IsBackedOff: true, + ErrorInfo: cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: "timeout", + ErrorMessage: "Scale-up timed out for node group ng1 after 2m1s", + }}, clusterstate.backoff.BackoffStatus(ng1, nil, now)) } func TestGetClusterSize(t *testing.T) {
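With this last patch applied, a completed scale-up no longer clears a node group's backoff at all: a backed-off group becomes schedulable again only when its window expires (or the backoff data is reset by other paths, such as the reset timeout). The sketch below is a rough model of that expiry-only behavior; the 5-minute initial and 30-minute maximum are illustrative values only, and the real utils/backoff/exponential_backoff.go additionally tracks lastFailedExecution against a reset timeout, which this omits.

```go
package main

import (
	"fmt"
	"time"
)

type backoffEntry struct {
	duration     time.Duration // current window size, grows exponentially
	backoffUntil time.Time     // the group is ineligible until this instant
}

// registerFailure grows the window: start at initial, then double up to max.
func registerFailure(e *backoffEntry, now time.Time, initial, max time.Duration) {
	if e.duration == 0 {
		e.duration = initial
	} else {
		e.duration *= 2
		if e.duration > max {
			e.duration = max
		}
	}
	e.backoffUntil = now.Add(e.duration)
}

// backedOff reports whether the group is still waiting out its window; after
// this patch, only the passage of time can flip it back to false.
func backedOff(e *backoffEntry, now time.Time) bool {
	return now.Before(e.backoffUntil)
}

func main() {
	var e backoffEntry
	now := time.Now()
	for i := 0; i < 4; i++ {
		registerFailure(&e, now, 5*time.Minute, 30*time.Minute)
		fmt.Println("window:", e.duration) // 5m0s, 10m0s, 20m0s, 30m0s
	}
	fmt.Println(backedOff(&e, now))                     // true: still inside the window
	fmt.Println(backedOff(&e, now.Add(31*time.Minute))) // false: the window has expired
}
```

That is also why the updated TestScaleUpBackoff above now expects IsBackedOff to stay true, with the timeout ErrorInfo intact, even after the scale-up completes successfully.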