add option to keep node group backoff on OutOfResource error
wllbo committed May 12, 2023
1 parent ee59c74 · commit 70e1991
Showing 6 changed files with 40 additions and 19 deletions.
8 changes: 7 additions & 1 deletion cluster-autoscaler/clusterstate/clusterstate.go
@@ -81,6 +81,8 @@ type ClusterStateRegistryConfig struct {
	// Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply.
	// This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling.
	OkTotalUnreadyCount int
	// NodeGroupKeepBackoffOutOfResources is whether a node group's backoff should be kept until it expires, instead of being removed after a successful scale-up, when the backoff was caused by the cloud provider being out of resources.
	NodeGroupKeepBackoffOutOfResources bool
}

// IncorrectNodeGroupSize contains information about how much the current size of the node group
@@ -254,7 +256,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
			// scale-out finished successfully
			// remove it and reset node group backoff
			delete(csr.scaleUpRequests, nodeGroupName)
			shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup)
			if !shouldKeepBackoff {
				klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id())
				csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
			}
			klog.V(4).Infof("Scale up in group %v finished successfully in %v",
				nodeGroupName, currentTime.Sub(scaleUpRequest.Time))
			continue
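Distilled to its core, the new check is a simple conjunction: a backoff survives a successful scale-up only when the option is enabled and the node group's backoff came from an out-of-resources error. A minimal, runnable sketch of just that decision (shouldKeepBackoff and its two boolean parameters are illustrative stand-ins for the config field and the backoff query, not code from this commit):

package main

import "fmt"

// shouldKeepBackoff mirrors the condition added to updateScaleRequests:
// keep the backoff only if the option is on AND the node group's backoff
// was caused by an out-of-resources error.
func shouldKeepBackoff(keepOnOutOfResources, backoffIsOutOfResources bool) bool {
	return keepOnOutOfResources && backoffIsOutOfResources
}

func main() {
	// Walk the full truth table of the decision.
	for _, c := range []struct{ option, outOfResources bool }{
		{false, false}, {false, true}, {true, false}, {true, true},
	} {
		fmt.Printf("option=%-5v outOfResources=%-5v -> keep backoff: %v\n",
			c.option, c.outOfResources, shouldKeepBackoff(c.option, c.outOfResources))
	}
}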
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -237,6 +237,8 @@ type AutoscalingOptions struct {
	MaxNodeGroupBackoffDuration time.Duration
	// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
	NodeGroupBackoffResetTimeout time.Duration
	// NodeGroupKeepBackoffOutOfResources is whether a node group's backoff should be kept until it expires, instead of being removed after a successful scale-up, when the backoff was caused by the cloud provider being out of resources.
	NodeGroupKeepBackoffOutOfResources bool
	// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
	MaxScaleDownParallelism int
	// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
5 changes: 3 additions & 2 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -156,8 +156,9 @@ func NewStaticAutoscaler(
remainingPdbTracker)

	clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
		MaxTotalUnreadyPercentage:          opts.MaxTotalUnreadyPercentage,
		OkTotalUnreadyCount:                opts.OkTotalUnreadyCount,
		NodeGroupKeepBackoffOutOfResources: opts.NodeGroupKeepBackoffOutOfResources,
	}

taintConfig := taints.NewTaintConfig(opts)
34 changes: 18 additions & 16 deletions cluster-autoscaler/main.go
@@ -214,22 +214,23 @@ var (
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
	nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.")
	maxScaleDownParallelismFlag        = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
	maxDrainParallelismFlag            = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
	recordDuplicatedEvents             = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
	maxNodesPerScaleUp                 = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
	maxNodeGroupBinpackingDuration     = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
	skipNodesWithSystemPods            = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
	skipNodesWithLocalStorage          = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
	skipNodesWithCustomControllerPods  = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
	minReplicaCount                    = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
	nodeDeleteDelayAfterTaint          = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
	scaleDownSimulationTimeout         = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
	parallelDrain                      = flag.Bool("parallel-drain", false, "Whether to allow parallel drain of nodes.")
	maxCapacityMemoryDifferenceRatio   = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
	maxFreeDifferenceRatio             = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.")
	maxAllocatableDifferenceRatio      = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
	forceDaemonSets                    = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
)

func createAutoscalingOptions() config.AutoscalingOptions {
@@ -326,6 +327,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
		InitialNodeGroupBackoffDuration:    *initialNodeGroupBackoffDuration,
		MaxNodeGroupBackoffDuration:        *maxNodeGroupBackoffDuration,
		NodeGroupBackoffResetTimeout:       *nodeGroupBackoffResetTimeout,
		NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources,
		MaxScaleDownParallelism:            *maxScaleDownParallelismFlag,
		MaxDrainParallelism:                *maxDrainParallelismFlag,
		RecordDuplicatedEvents:             *recordDuplicatedEvents,
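The flag defaults to false, so existing deployments keep today's remove-backoff-on-success behavior unless they opt in. A self-contained sketch of just this flag, using only the standard flag package (the real binary defines it inside the large var block above and copies the parsed value into AutoscalingOptions):

package main

import (
	"flag"
	"fmt"
)

// Same name, default, and help text as the flag added in this commit.
var nodeGroupKeepBackoffOutOfResources = flag.Bool(
	"node-group-keep-backoff-out-of-resources", false,
	"Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.")

func main() {
	flag.Parse() // e.g. run with --node-group-keep-backoff-out-of-resources=true
	fmt.Println("keep backoff on out-of-resources:", *nodeGroupKeepBackoffOutOfResources)
}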
2 changes: 2 additions & 0 deletions cluster-autoscaler/utils/backoff/backoff.go
@@ -33,4 +33,6 @@ type Backoff interface {
	RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo)
	// RemoveStaleBackoffData removes stale backoff data.
	RemoveStaleBackoffData(currentTime time.Time)
	// IsNodeGroupOutOfResources returns true if the given node group is currently backed off because the cloud provider ran out of resources.
	IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool
}
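Every Backoff implementation now has to answer this query. A hypothetical caller-side helper showing how the two interface methods combine, mirroring the clusterstate change above (maybeClearBackoff and the package name are illustrative, not part of this commit; import paths assume the autoscaler repo layout):

package example // hypothetical package, not part of the commit

import (
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// maybeClearBackoff clears a node group's backoff after a successful
// scale-up, unless the keep-on-out-of-resources option is set and the
// backoff was caused by an out-of-resources error, in which case the
// backoff is left to expire on its own.
func maybeClearBackoff(b backoff.Backoff, ng cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, keepOnOutOfResources bool) {
	if keepOnOutOfResources && b.IsNodeGroupOutOfResources(ng) {
		return // keep the backoff until it expires naturally
	}
	b.RemoveBackoff(ng, nodeInfo)
}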
8 changes: 8 additions & 0 deletions cluster-autoscaler/utils/backoff/exponential_backoff.go
@@ -37,6 +37,7 @@ type exponentialBackoffInfo struct {
	duration            time.Duration
	backoffUntil        time.Time
	lastFailedExecution time.Time
	errorClass          cloudprovider.InstanceErrorClass
}

// NewExponentialBackoff creates an instance of exponential backoff.
@@ -87,6 +88,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo
		duration:            duration,
		backoffUntil:        backoffUntil,
		lastFailedExecution: currentTime,
		errorClass:          errorClass,
	}
	return backoffUntil
}
@@ -110,3 +112,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) {
		}
	}
}

// IsNodeGroupOutOfResources returns true if the given node group is currently backed off because the cloud provider ran out of resources.
func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool {
	backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)]
	return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass
}
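End to end, the bookkeeping is: record the error class when a backoff is created, report it through the new query, and drop it along with the rest of the backoff data. A miniature self-contained model of that flow (miniBackoff and its plain string keys are illustrative simplifications of exponentialBackoff and its node-group keys, not the real types):

package main

import (
	"fmt"
	"time"
)

type errorClass int

const (
	otherErrorClass errorClass = iota
	outOfResourcesErrorClass
)

type backoffInfo struct {
	backoffUntil time.Time
	errorClass   errorClass
}

type miniBackoff struct{ info map[string]backoffInfo }

// backoff records a backoff for a group, remembering why it happened.
func (b *miniBackoff) backoff(group string, class errorClass, now time.Time) {
	b.info[group] = backoffInfo{backoffUntil: now.Add(5 * time.Minute), errorClass: class}
}

// removeBackoff drops all backoff data for a group.
func (b *miniBackoff) removeBackoff(group string) {
	delete(b.info, group)
}

// isOutOfResources mirrors IsNodeGroupOutOfResources: true only when a
// backoff exists and was caused by an out-of-resources error.
func (b *miniBackoff) isOutOfResources(group string) bool {
	info, found := b.info[group]
	return found && info.errorClass == outOfResourcesErrorClass
}

func main() {
	b := &miniBackoff{info: map[string]backoffInfo{}}
	now := time.Now()
	b.backoff("ng-1", outOfResourcesErrorClass, now)
	b.backoff("ng-2", otherErrorClass, now)
	fmt.Println(b.isOutOfResources("ng-1")) // true: backed off due to out-of-resources
	fmt.Println(b.isOutOfResources("ng-2")) // false: backed off for another reason
	b.removeBackoff("ng-1")
	fmt.Println(b.isOutOfResources("ng-1")) // false: backoff data removed
}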
