diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 9665d83a431e..a32b1eb8772c 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -81,6 +81,8 @@ type ClusterStateRegistryConfig struct {
 	// Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply.
 	// This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling.
 	OkTotalUnreadyCount int
+	// NodeGroupKeepBackoffOutOfResources is whether a backoff caused by the cloud provider running out of resources should be kept until it expires, rather than being removed when a subsequent scale-up succeeds.
+	NodeGroupKeepBackoffOutOfResources bool
 }

 // IncorrectNodeGroupSize contains information about how much the current size of the node group
@@ -254,7 +256,11 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
 			// scale-out finished successfully
 			// remove it and reset node group backoff
 			delete(csr.scaleUpRequests, nodeGroupName)
-			csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
+			shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup)
+			if !shouldKeepBackoff {
+				klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id())
+				csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
+			}
 			klog.V(4).Infof("Scale up in group %v finished successfully in %v",
 				nodeGroupName, currentTime.Sub(scaleUpRequest.Time))
 			continue
diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 62b708ec3674..e9e6fb279a55 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -237,6 +237,8 @@ type AutoscalingOptions struct {
 	MaxNodeGroupBackoffDuration time.Duration
 	// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
 	NodeGroupBackoffResetTimeout time.Duration
+	// NodeGroupKeepBackoffOutOfResources is whether a backoff caused by the cloud provider running out of resources should be kept until it expires, rather than being removed when a subsequent scale-up succeeds.
+	NodeGroupKeepBackoffOutOfResources bool
 	// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
 	MaxScaleDownParallelism int
 	// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 3499bd6f863f..748007b40919 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -156,8 +156,9 @@ func NewStaticAutoscaler(
 		remainingPdbTracker)

 	clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
-		MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage,
-		OkTotalUnreadyCount:       opts.OkTotalUnreadyCount,
+		MaxTotalUnreadyPercentage:          opts.MaxTotalUnreadyPercentage,
+		OkTotalUnreadyCount:                opts.OkTotalUnreadyCount,
+		NodeGroupKeepBackoffOutOfResources: opts.NodeGroupKeepBackoffOutOfResources,
 	}

 	taintConfig := taints.NewTaintConfig(opts)
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index 073e3fdf1bf3..bb19c43f0bb4 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -214,22 +214,23 @@ var (
 		"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
 	nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
 		"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
-	maxScaleDownParallelismFlag       = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
-	maxDrainParallelismFlag           = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
-	recordDuplicatedEvents            = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
-	maxNodesPerScaleUp                = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
-	maxNodeGroupBinpackingDuration    = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
-	skipNodesWithSystemPods           = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
-	skipNodesWithLocalStorage         = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
-	skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
-	minReplicaCount                   = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
-	nodeDeleteDelayAfterTaint         = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
-	scaleDownSimulationTimeout        = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
-	parallelDrain                     = flag.Bool("parallel-drain", false, "Whether to allow parallel drain of nodes.")
-	maxCapacityMemoryDifferenceRatio  = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
-	maxFreeDifferenceRatio            = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.")
-	maxAllocatableDifferenceRatio     = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
-	forceDaemonSets                   = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
+	nodeGroupKeepBackoffOutOfResources = flag.Bool("node-group-keep-backoff-out-of-resources", false, "Prevents removal of backoff before expiration when a scale-up fails due to the cloud provider being out of resources.")
+	maxScaleDownParallelismFlag        = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
+	maxDrainParallelismFlag            = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
+	recordDuplicatedEvents             = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
+	maxNodesPerScaleUp                 = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
+	maxNodeGroupBinpackingDuration     = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
+	skipNodesWithSystemPods            = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
+	skipNodesWithLocalStorage          = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
+	skipNodesWithCustomControllerPods  = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
+	minReplicaCount                    = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
+	nodeDeleteDelayAfterTaint          = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
+	scaleDownSimulationTimeout         = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
+	parallelDrain                      = flag.Bool("parallel-drain", false, "Whether to allow parallel drain of nodes.")
+	maxCapacityMemoryDifferenceRatio   = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
+	maxFreeDifferenceRatio             = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.")
+	maxAllocatableDifferenceRatio      = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
+	forceDaemonSets                    = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
 )

 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -326,6 +327,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		InitialNodeGroupBackoffDuration:    *initialNodeGroupBackoffDuration,
 		MaxNodeGroupBackoffDuration:        *maxNodeGroupBackoffDuration,
 		NodeGroupBackoffResetTimeout:       *nodeGroupBackoffResetTimeout,
+		NodeGroupKeepBackoffOutOfResources: *nodeGroupKeepBackoffOutOfResources,
 		MaxScaleDownParallelism:            *maxScaleDownParallelismFlag,
 		MaxDrainParallelism:                *maxDrainParallelismFlag,
 		RecordDuplicatedEvents:             *recordDuplicatedEvents,
diff --git a/cluster-autoscaler/utils/backoff/backoff.go b/cluster-autoscaler/utils/backoff/backoff.go
index d42df155eabf..b4967b033c2c 100644
--- a/cluster-autoscaler/utils/backoff/backoff.go
+++ b/cluster-autoscaler/utils/backoff/backoff.go
@@ -33,4 +33,6 @@ type Backoff interface {
 	RemoveBackoff(nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo)
 	// RemoveStaleBackoffData removes stale backoff data.
 	RemoveStaleBackoffData(currentTime time.Time)
+	// IsNodeGroupOutOfResources returns true if the given node group is out of resources.
+	IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool
 }
diff --git a/cluster-autoscaler/utils/backoff/exponential_backoff.go b/cluster-autoscaler/utils/backoff/exponential_backoff.go
index 013273bc0587..7b555d9c45a8 100644
--- a/cluster-autoscaler/utils/backoff/exponential_backoff.go
+++ b/cluster-autoscaler/utils/backoff/exponential_backoff.go
@@ -37,6 +37,7 @@ type exponentialBackoffInfo struct {
 	duration            time.Duration
 	backoffUntil        time.Time
 	lastFailedExecution time.Time
+	errorClass          cloudprovider.InstanceErrorClass
 }

 // NewExponentialBackoff creates an instance of exponential backoff.
@@ -87,6 +88,7 @@ func (b *exponentialBackoff) Backoff(nodeGroup cloudprovider.NodeGroup, nodeInfo
 		duration:            duration,
 		backoffUntil:        backoffUntil,
 		lastFailedExecution: currentTime,
+		errorClass:          errorClass,
 	}
 	return backoffUntil
 }
@@ -110,3 +112,9 @@ func (b *exponentialBackoff) RemoveStaleBackoffData(currentTime time.Time) {
 		}
 	}
 }
+
+// IsNodeGroupOutOfResources returns true if the given node group is out of resources.
+func (b *exponentialBackoff) IsNodeGroupOutOfResources(nodeGroup cloudprovider.NodeGroup) bool {
+	backoffInfo, found := b.backoffInfo[b.nodeGroupKey(nodeGroup)]
+	return found && backoffInfo.errorClass == cloudprovider.OutOfResourcesErrorClass
+}