feat: add flag to allow scale down on custom controller pods
- set to `false` by default
- the default will be changed to `true` in a future release
- for now, we want to preserve backwards compatibility and make the feature available only when the flag is explicitly set to `true`
- TODO: add unit tests for this code
Signed-off-by: vadasambar <[email protected]>
vadasambar committed Mar 6, 2023
1 parent 0ed6aba commit 1acb6b2
Showing 6 changed files with 212 additions and 100 deletions.
3 changes: 3 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -247,4 +247,7 @@ type AutoscalingOptions struct {
ParallelDrain bool
// NodeGroupSetRatios is a collection of ratios used by CA to make scaling decisions.
NodeGroupSetRatios NodeGroupDifferenceRatios
// AllowScaleDownOnCustomControllerOwnedPods, if true, allows a node to be scaled down even when a pod
// owned by a custom controller is running on it
AllowScaleDownOnCustomControllerOwnedPods bool
}
7 changes: 4 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -167,9 +167,10 @@ func NewStaticAutoscaler(
processors.ScaleDownCandidatesNotifier.Register(clusterStateRegistry)

deleteOptions := simulator.NodeDeleteOptions{
SkipNodesWithSystemPods: opts.SkipNodesWithSystemPods,
SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
MinReplicaCount: opts.MinReplicaCount,
AllowScaleDownOnCustomControllerOwnedPods: opts.AllowScaleDownOnCustomControllerOwnedPods,
}

// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
182 changes: 92 additions & 90 deletions cluster-autoscaler/main.go
@@ -205,22 +205,23 @@
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
gceExpanderEphemeralStorageSupport = flag.Bool("gce-expander-ephemeral-storage-support", false, "Whether scale-up takes ephemeral storage resources into account for GCE cloud provider")
recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number of replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 5*time.Minute, "How long should we run scale down simulation.")
parallelDrain = flag.Bool("parallel-drain", false, "Whether to allow parallel drain of nodes.")
maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.")
maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
allowScaleDownOnCustomControllerOwnedPods = flag.Bool("allow-scale-down-on-custom-controller-owned-pods", false, "Don't block node scale-down if a pod owned by a custom controller is running on the node.")
)

func createAutoscalingOptions() config.AutoscalingOptions {
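The new flag is registered alongside the existing ones above. As a rough, standalone illustration (not part of this commit) of how such a flag parses and defaults with Go's flag package — the flag name, default, and description are taken from the diff, while the surrounding `main` is purely illustrative:

```go
package main

import (
	"flag"
	"fmt"
)

// Same name, default, and description as the flag added in this commit.
var allowScaleDownOnCustomControllerOwnedPods = flag.Bool(
	"allow-scale-down-on-custom-controller-owned-pods", false,
	"Don't block node scale-down if a pod owned by a custom controller is running on the node.")

func main() {
	flag.Parse()
	// With no arguments the value stays false, preserving today's behavior;
	// passing --allow-scale-down-on-custom-controller-owned-pods=true opts in.
	fmt.Println("allow scale-down on custom-controller-owned pods:",
		*allowScaleDownOnCustomControllerOwnedPods)
}
```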
@@ -250,80 +251,81 @@ func createAutoscalingOptions() config.AutoscalingOptions {
ScaleDownUnneededTime: *scaleDownUnneededTime,
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
},
CloudConfig: *cloudConfig,
CloudProviderName: *cloudProviderFlag,
NodeGroupAutoDiscovery: *nodeGroupAutoDiscoveryFlag,
MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage,
OkTotalUnreadyCount: *okTotalUnreadyCount,
ScaleUpFromZero: *scaleUpFromZero,
EstimatorName: *estimatorFlag,
ExpanderNames: *expanderFlag,
GRPCExpanderCert: *grpcExpanderCert,
GRPCExpanderURL: *grpcExpanderURL,
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
IgnoreMirrorPodsUtilization: *ignoreMirrorPodsUtilization,
MaxBulkSoftTaintCount: *maxBulkSoftTaintCount,
MaxBulkSoftTaintTime: *maxBulkSoftTaintTime,
MaxEmptyBulkDelete: *maxEmptyBulkDeleteFlag,
MaxGracefulTerminationSec: *maxGracefulTerminationFlag,
MaxNodeProvisionTime: *maxNodeProvisionTime,
MaxPodEvictionTime: *maxPodEvictionTime,
MaxNodesTotal: *maxNodesTotal,
MaxCoresTotal: maxCoresTotal,
MinCoresTotal: minCoresTotal,
MaxMemoryTotal: maxMemoryTotal,
MinMemoryTotal: minMemoryTotal,
GpuTotal: parsedGpuTotal,
NodeGroups: *nodeGroupsFlag,
EnforceNodeGroupMinSize: *enforceNodeGroupMinSize,
ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
ScaleDownEnabled: *scaleDownEnabled,
ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
ScaleDownCandidatesPoolRatio: *scaleDownCandidatesPoolRatio,
ScaleDownCandidatesPoolMinCount: *scaleDownCandidatesPoolMinCount,
WriteStatusConfigMap: *writeStatusConfigMapFlag,
StatusConfigMapName: *statusConfigMapName,
BalanceSimilarNodeGroups: *balanceSimilarNodeGroupsFlag,
ConfigNamespace: *namespace,
ClusterName: *clusterName,
NodeAutoprovisioningEnabled: *nodeAutoprovisioningEnabled,
MaxAutoprovisionedNodeGroupCount: *maxAutoprovisionedNodeGroupCount,
UnremovableNodeRecheckTimeout: *unremovableNodeRecheckTimeout,
ExpendablePodsPriorityCutoff: *expendablePodsPriorityCutoff,
Regional: *regional,
NewPodScaleUpDelay: *newPodScaleUpDelay,
IgnoredTaints: *ignoreTaintsFlag,
BalancingExtraIgnoredLabels: *balancingIgnoreLabelsFlag,
BalancingLabels: *balancingLabelsFlag,
KubeConfigPath: *kubeConfigFile,
KubeClientBurst: *kubeClientBurst,
KubeClientQPS: *kubeClientQPS,
NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout,
AWSUseStaticInstanceList: *awsUseStaticInstanceList,
ConcurrentGceRefreshes: *concurrentGceRefreshes,
ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative,
CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate,
DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes,
DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes,
UserAgent: *userAgent,
InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration,
MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration,
NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout,
MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
MaxDrainParallelism: *maxDrainParallelismFlag,
GceExpanderEphemeralStorageSupport: *gceExpanderEphemeralStorageSupport,
RecordDuplicatedEvents: *recordDuplicatedEvents,
MaxNodesPerScaleUp: *maxNodesPerScaleUp,
MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration,
NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval,
SkipNodesWithSystemPods: *skipNodesWithSystemPods,
SkipNodesWithLocalStorage: *skipNodesWithLocalStorage,
MinReplicaCount: *minReplicaCount,
NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint,
ScaleDownSimulationTimeout: *scaleDownSimulationTimeout,
ParallelDrain: *parallelDrain,
AllowScaleDownOnCustomControllerOwnedPods: *allowScaleDownOnCustomControllerOwnedPods,
NodeGroupSetRatios: config.NodeGroupDifferenceRatios{
MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio,
MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio,
4 changes: 4 additions & 0 deletions cluster-autoscaler/simulator/drain.go
@@ -38,6 +38,9 @@ type NodeDeleteOptions struct {
// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
// to allow their pods deletion in scale down
MinReplicaCount int
// AllowScaleDownOnCustomControllerOwnedPods, if true, allows a node to be scaled down even when a pod
// owned by a custom controller is running on it
AllowScaleDownOnCustomControllerOwnedPods bool
}

// GetPodsToMove returns a list of pods that should be moved elsewhere
@@ -57,6 +60,7 @@ func GetPodsToMove(nodeInfo *schedulerframework.NodeInfo, deleteOptions NodeDeleteOptions
pdbs,
deleteOptions.SkipNodesWithSystemPods,
deleteOptions.SkipNodesWithLocalStorage,
deleteOptions.AllowScaleDownOnCustomControllerOwnedPods,
listers,
int32(deleteOptions.MinReplicaCount),
timestamp)
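The drain helper that actually consumes this new argument is not part of the diff shown here. Purely as a hedged sketch of the decision the option relaxes — the helper name, the set of built-in controller kinds, and the exact rules are assumptions based on the commit description, not the real drain implementation:

```go
package sketch

import (
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// blocksScaleDown is an illustrative stand-in for the check the new option affects.
// A pod whose controller is not a built-in workload kind (a "custom controller")
// normally blocks scale-down; with allowCustomControllerOwned set it no longer does.
func blocksScaleDown(pod *apiv1.Pod, allowCustomControllerOwned bool) bool {
	ref := metav1.GetControllerOf(pod)
	if ref == nil {
		// A naked pod would not be recreated elsewhere, so it still blocks.
		return true
	}
	switch ref.Kind {
	case "ReplicationController", "ReplicaSet", "Job", "StatefulSet", "DaemonSet":
		// Managed by a built-in controller; the existing rules apply and
		// the pod can generally be moved.
		return false
	default:
		// Owned by a custom controller (e.g. a CRD-backed operator):
		// blocked unless the new option is enabled.
		return !allowCustomControllerOwned
	}
}
```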
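Picking up the TODO from the commit message, a hypothetical unit test for the new field could start as simply as the following — the test name and assertions are assumptions, and it only exercises the option struct, not the full drain path:

```go
package simulator

import "testing"

func TestAllowScaleDownOnCustomControllerOwnedPodsDefault(t *testing.T) {
	// The zero value must stay false so existing users see no behavior change.
	defaults := NodeDeleteOptions{
		SkipNodesWithSystemPods:   true,
		SkipNodesWithLocalStorage: true,
	}
	if defaults.AllowScaleDownOnCustomControllerOwnedPods {
		t.Fatal("AllowScaleDownOnCustomControllerOwnedPods should default to false")
	}

	// Explicitly opting in flips the field.
	enabled := NodeDeleteOptions{AllowScaleDownOnCustomControllerOwnedPods: true}
	if !enabled.AllowScaleDownOnCustomControllerOwnedPods {
		t.Fatal("expected AllowScaleDownOnCustomControllerOwnedPods to be true when set")
	}
}
```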
