feat: add flag to allow scale down on custom controller pods
- set to `false` by default
- the default will be changed to `true` in a future release
- for now, we want to preserve backwards compatibility and make the feature available only when the flag is explicitly set to `true`
- TODO: add unit tests for this code
Signed-off-by: vadasambar <[email protected]>
vadasambar committed Mar 6, 2023
1 parent 0ed6aba commit 1acb6b2
Showing 6 changed files with 212 additions and 100 deletions.
3 changes: 3 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -247,4 +247,7 @@ type AutoscalingOptions struct {
ParallelDrain bool
// NodeGroupSetRatios is a collection of ratios used by CA to make scaling decisions.
NodeGroupSetRatios NodeGroupDifferenceRatios
// AllowScaleDownOnCustomControllerOwnedPods, if true, allows a node to be scaled down even when a pod
// owned by a custom controller is running on it
AllowScaleDownOnCustomControllerOwnedPods bool
}
7 changes: 4 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -167,9 +167,10 @@ func NewStaticAutoscaler(
processors.ScaleDownCandidatesNotifier.Register(clusterStateRegistry)

deleteOptions := simulator.NodeDeleteOptions{
SkipNodesWithSystemPods: opts.SkipNodesWithSystemPods,
SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
MinReplicaCount: opts.MinReplicaCount,
AllowScaleDownOnCustomControllerOwnedPods: opts.AllowScaleDownOnCustomControllerOwnedPods,
}

// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
182 changes: 92 additions & 90 deletions cluster-autoscaler/main.go
@@ -205,22 +205,23 @@
"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
maxScaleDownParallelismFlag = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
maxDrainParallelismFlag = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
gceExpanderEphemeralStorageSupport = flag.Bool("gce-expander-ephemeral-storage-support", false, "Whether scale-up takes ephemeral storage resources into account for GCE cloud provider")
recordDuplicatedEvents = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
maxNodesPerScaleUp = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number of replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 5*time.Minute, "How long should we run scale down simulation.")
parallelDrain = flag.Bool("parallel-drain", false, "Whether to allow parallel drain of nodes.")
maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.")
maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
allowScaleDownOnCustomControllerOwnedPods = flag.Bool("allow-scale-down-on-custom-controller-owned-pods", false, "Don't block node scale-down if a pod owned by a custom controller is running on the node.")
)

func createAutoscalingOptions() config.AutoscalingOptions {
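The new flag is registered alongside the existing ones above. As a rough, standalone illustration (not part of this commit) of how such a flag parses and defaults with Go's flag package — the flag name, default, and description are taken from the diff, while the surrounding `main` is purely illustrative:

```go
package main

import (
	"flag"
	"fmt"
)

// Same name, default, and description as the flag added in this commit.
var allowScaleDownOnCustomControllerOwnedPods = flag.Bool(
	"allow-scale-down-on-custom-controller-owned-pods", false,
	"Don't block node scale-down if a pod owned by a custom controller is running on the node.")

func main() {
	flag.Parse()
	// With no arguments the value stays false, preserving today's behavior;
	// passing --allow-scale-down-on-custom-controller-owned-pods=true opts in.
	fmt.Println("allow scale-down on custom-controller-owned pods:",
		*allowScaleDownOnCustomControllerOwnedPods)
}
```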
@@ -250,80 +251,81 @@ func createAutoscalingOptions() config.AutoscalingOptions {
ScaleDownUnneededTime: *scaleDownUnneededTime,
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
},
CloudConfig: *cloudConfig,
CloudProviderName: *cloudProviderFlag,
NodeGroupAutoDiscovery: *nodeGroupAutoDiscoveryFlag,
MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage,
OkTotalUnreadyCount: *okTotalUnreadyCount,
ScaleUpFromZero: *scaleUpFromZero,
EstimatorName: *estimatorFlag,
ExpanderNames: *expanderFlag,
GRPCExpanderCert: *grpcExpanderCert,
GRPCExpanderURL: *grpcExpanderURL,
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
IgnoreMirrorPodsUtilization: *ignoreMirrorPodsUtilization,
MaxBulkSoftTaintCount: *maxBulkSoftTaintCount,
MaxBulkSoftTaintTime: *maxBulkSoftTaintTime,
MaxEmptyBulkDelete: *maxEmptyBulkDeleteFlag,
MaxGracefulTerminationSec: *maxGracefulTerminationFlag,
MaxNodeProvisionTime: *maxNodeProvisionTime,
MaxPodEvictionTime: *maxPodEvictionTime,
MaxNodesTotal: *maxNodesTotal,
MaxCoresTotal: maxCoresTotal,
MinCoresTotal: minCoresTotal,
MaxMemoryTotal: maxMemoryTotal,
MinMemoryTotal: minMemoryTotal,
GpuTotal: parsedGpuTotal,
NodeGroups: *nodeGroupsFlag,
EnforceNodeGroupMinSize: *enforceNodeGroupMinSize,
ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
ScaleDownEnabled: *scaleDownEnabled,
ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
ScaleDownCandidatesPoolRatio: *scaleDownCandidatesPoolRatio,
ScaleDownCandidatesPoolMinCount: *scaleDownCandidatesPoolMinCount,
WriteStatusConfigMap: *writeStatusConfigMapFlag,
StatusConfigMapName: *statusConfigMapName,
BalanceSimilarNodeGroups: *balanceSimilarNodeGroupsFlag,
ConfigNamespace: *namespace,
ClusterName: *clusterName,
NodeAutoprovisioningEnabled: *nodeAutoprovisioningEnabled,
MaxAutoprovisionedNodeGroupCount: *maxAutoprovisionedNodeGroupCount,
UnremovableNodeRecheckTimeout: *unremovableNodeRecheckTimeout,
ExpendablePodsPriorityCutoff: *expendablePodsPriorityCutoff,
Regional: *regional,
NewPodScaleUpDelay: *newPodScaleUpDelay,
IgnoredTaints: *ignoreTaintsFlag,
BalancingExtraIgnoredLabels: *balancingIgnoreLabelsFlag,
BalancingLabels: *balancingLabelsFlag,
KubeConfigPath: *kubeConfigFile,
KubeClientBurst: *kubeClientBurst,
KubeClientQPS: *kubeClientQPS,
NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout,
AWSUseStaticInstanceList: *awsUseStaticInstanceList,
ConcurrentGceRefreshes: *concurrentGceRefreshes,
ClusterAPICloudConfigAuthoritative: *clusterAPICloudConfigAuthoritative,
CordonNodeBeforeTerminate: *cordonNodeBeforeTerminate,
DaemonSetEvictionForEmptyNodes: *daemonSetEvictionForEmptyNodes,
DaemonSetEvictionForOccupiedNodes: *daemonSetEvictionForOccupiedNodes,
UserAgent: *userAgent,
InitialNodeGroupBackoffDuration: *initialNodeGroupBackoffDuration,
MaxNodeGroupBackoffDuration: *maxNodeGroupBackoffDuration,
NodeGroupBackoffResetTimeout: *nodeGroupBackoffResetTimeout,
MaxScaleDownParallelism: *maxScaleDownParallelismFlag,
MaxDrainParallelism: *maxDrainParallelismFlag,
GceExpanderEphemeralStorageSupport: *gceExpanderEphemeralStorageSupport,
RecordDuplicatedEvents: *recordDuplicatedEvents,
MaxNodesPerScaleUp: *maxNodesPerScaleUp,
MaxNodeGroupBinpackingDuration: *maxNodeGroupBinpackingDuration,
NodeDeletionBatcherInterval: *nodeDeletionBatcherInterval,
SkipNodesWithSystemPods: *skipNodesWithSystemPods,
SkipNodesWithLocalStorage: *skipNodesWithLocalStorage,
MinReplicaCount: *minReplicaCount,
NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint,
ScaleDownSimulationTimeout: *scaleDownSimulationTimeout,
ParallelDrain: *parallelDrain,
AllowScaleDownOnCustomControllerOwnedPods: *allowScaleDownOnCustomControllerOwnedPods,
NodeGroupSetRatios: config.NodeGroupDifferenceRatios{
MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio,
MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio,
4 changes: 4 additions & 0 deletions cluster-autoscaler/simulator/drain.go
@@ -38,6 +38,9 @@ type NodeDeleteOptions struct {
// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
// to allow their pods deletion in scale down
MinReplicaCount int
// AllowScaleDownOnCustomControllerOwnedPods, if true, allows a node to be scaled down even when a pod
// owned by a custom controller is running on it
AllowScaleDownOnCustomControllerOwnedPods bool
}

// GetPodsToMove returns a list of pods that should be moved elsewhere
@@ -57,6 +60,7 @@ func GetPodsToMove(nodeInfo *schedulerframework.NodeInfo, deleteOptions NodeDeleteOptions
pdbs,
deleteOptions.SkipNodesWithSystemPods,
deleteOptions.SkipNodesWithLocalStorage,
deleteOptions.AllowScaleDownOnCustomControllerOwnedPods,
listers,
int32(deleteOptions.MinReplicaCount),
timestamp)
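The drain helper that actually consumes this new argument is not part of the diff shown here. Purely as a hedged sketch of the decision the option relaxes — the helper name, the set of built-in controller kinds, and the exact rules are assumptions based on the commit description, not the real drain implementation:

```go
package sketch

import (
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// blocksScaleDown is an illustrative stand-in for the check the new option affects.
// A pod whose controller is not a built-in workload kind (a "custom controller")
// normally blocks scale-down; with allowCustomControllerOwned set it no longer does.
func blocksScaleDown(pod *apiv1.Pod, allowCustomControllerOwned bool) bool {
	ref := metav1.GetControllerOf(pod)
	if ref == nil {
		// A naked pod would not be recreated elsewhere, so it still blocks.
		return true
	}
	switch ref.Kind {
	case "ReplicationController", "ReplicaSet", "Job", "StatefulSet", "DaemonSet":
		// Managed by a built-in controller; the existing rules apply and
		// the pod can generally be moved.
		return false
	default:
		// Owned by a custom controller (e.g. a CRD-backed operator):
		// blocked unless the new option is enabled.
		return !allowCustomControllerOwned
	}
}
```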
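Picking up the TODO from the commit message, a hypothetical unit test for the new field could start as simply as the following — the test name and assertions are assumptions, and it only exercises the option struct, not the full drain path:

```go
package simulator

import "testing"

func TestAllowScaleDownOnCustomControllerOwnedPodsDefault(t *testing.T) {
	// The zero value must stay false so existing users see no behavior change.
	defaults := NodeDeleteOptions{
		SkipNodesWithSystemPods:   true,
		SkipNodesWithLocalStorage: true,
	}
	if defaults.AllowScaleDownOnCustomControllerOwnedPods {
		t.Fatal("AllowScaleDownOnCustomControllerOwnedPods should default to false")
	}

	// Explicitly opting in flips the field.
	enabled := NodeDeleteOptions{AllowScaleDownOnCustomControllerOwnedPods: true}
	if !enabled.AllowScaleDownOnCustomControllerOwnedPods {
		t.Fatal("expected AllowScaleDownOnCustomControllerOwnedPods to be true when set")
	}
}
```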
