From 1b98b3823a7f622d39aa41efa9e58507e9dc7f5c Mon Sep 17 00:00:00 2001 From: James Ravn Date: Tue, 29 Jun 2021 11:21:37 +0100 Subject: [PATCH] Allow balancing by labels exclusively Adds a new flag `--balance-label` which allows users to balance between node groups exclusively via labels. This gives users the flexibility to specify the similarity logic themselves when --balance-similar-node-groups is in use. --- cluster-autoscaler/FAQ.md | 1 + .../cloudprovider/aws/README.md | 4 + .../config/autoscaling_options.go | 3 + cluster-autoscaler/main.go | 30 +++--- .../nodegroupset/label_nodegroups.go | 49 ++++++++++ .../nodegroupset/label_nodegroups_test.go | 92 +++++++++++++++++++ 6 files changed, 168 insertions(+), 11 deletions(-) create mode 100644 cluster-autoscaler/processors/nodegroupset/label_nodegroups.go create mode 100644 cluster-autoscaler/processors/nodegroupset/label_nodegroups_test.go diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md index dc80cf2f4522..c9e2eae8702f 100644 --- a/cluster-autoscaler/FAQ.md +++ b/cluster-autoscaler/FAQ.md @@ -753,6 +753,7 @@ The following startup parameters are supported for cluster autoscaler: | `max-failing-time` | Maximum time from last recorded successful autoscaler run before automatic restart | 15 minutes | `balance-similar-node-groups` | Detect similar node groups and balance the number of nodes between them | false | `balancing-ignore-label` | Define a node label that should be ignored when considering node group similarity. One label per flag occurrence. | "" +| `balancing-label` | Define a node label to use when comparing node group similarity. If set, all other comparison logic is disabled, and only labels are considered when comparing groups. One label per flag occurrence. | "" | `node-autoprovisioning-enabled` | Should CA autoprovision node groups when needed | false | `max-autoprovisioned-node-group-count` | The maximum number of autoprovisioned groups in the cluster | 15 | `unremovable-node-recheck-timeout` | The timeout before we check again a node that couldn't be removed before | 5 minutes diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md index f2d1c802a1e6..dcd2b2f95e5f 100644 --- a/cluster-autoscaler/cloudprovider/aws/README.md +++ b/cluster-autoscaler/cloudprovider/aws/README.md @@ -362,6 +362,10 @@ spec: - i3.2xlarge ``` +Similarly, if using the `balancing-label` flag, you should only choose labels which have the same value for all nodes in +the node group. Otherwise you may get unexpected results, as the flag values will vary based on the nodes created by +the ASG. + ### Example usage: - Create a [Launch diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 34408a2ddf41..da0e1a41d96f 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -154,6 +154,9 @@ type AutoscalingOptions struct { // BalancingExtraIgnoredLabels is a list of labels to additionally ignore when comparing if two node groups are similar. // Labels in BasicIgnoredLabels and the cloud provider-specific ignored labels are always ignored. BalancingExtraIgnoredLabels []string + // BalancingLabels is a list of labels to use when comparing if two node groups are similar. + // If this is set, only labels are used to compare node groups. It is mutually exclusive with BalancingExtraIgnoredLabels. + BalancingLabels []string // AWSUseStaticInstanceList tells if AWS cloud provider use static instance type list or dynamically fetch from remote APIs. AWSUseStaticInstanceList bool // ConcurrentGceRefreshes is the maximum number of concurrently refreshed instance groups or instance templates. diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index a0489b55b862..7d0e7c3d56c4 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -180,6 +180,7 @@ var ( ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group") balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar") + balancingLabelsFlag = multiStringFlag("balancing-label", "Specifies a label to use for comparing if two node groups are similar, rather than the built in heuristics. Setting this flag disables all other comparison logic, and cannot be combined with --balancing-ignore-label.") awsUseStaticInstanceList = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. AWS only") concurrentGceRefreshes = flag.Int("gce-concurrent-refreshes", 1, "Maximum number of concurrent refreshes per cloud object type.") enableProfiling = flag.Bool("profiling", false, "Is debug/pprof endpoint enabled") @@ -275,6 +276,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { NewPodScaleUpDelay: *newPodScaleUpDelay, IgnoredTaints: *ignoreTaintsFlag, BalancingExtraIgnoredLabels: *balancingIgnoreLabelsFlag, + BalancingLabels: *balancingLabelsFlag, KubeConfigPath: *kubeConfigFile, NodeDeletionDelayTimeout: *nodeDeletionDelayTimeout, AWSUseStaticInstanceList: *awsUseStaticInstanceList, @@ -356,20 +358,26 @@ func buildAutoscaler(debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter opts.Processors.TemplateNodeInfoProvider = nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nodeInfoCacheExpireTime) opts.Processors.PodListProcessor = filteroutschedulable.NewFilterOutSchedulablePodListProcessor() - nodeInfoComparatorBuilder := nodegroupset.CreateGenericNodeInfoComparator - if autoscalingOptions.CloudProviderName == cloudprovider.AzureProviderName { - nodeInfoComparatorBuilder = nodegroupset.CreateAzureNodeInfoComparator - } else if autoscalingOptions.CloudProviderName == cloudprovider.AwsProviderName { - nodeInfoComparatorBuilder = nodegroupset.CreateAwsNodeInfoComparator - } else if autoscalingOptions.CloudProviderName == cloudprovider.GceProviderName { - nodeInfoComparatorBuilder = nodegroupset.CreateGceNodeInfoComparator - opts.Processors.TemplateNodeInfoProvider = nodeinfosprovider.NewAnnotationNodeInfoProvider(nodeInfoCacheExpireTime) - } else if autoscalingOptions.CloudProviderName == cloudprovider.ClusterAPIProviderName { - nodeInfoComparatorBuilder = nodegroupset.CreateClusterAPINodeInfoComparator + var nodeInfoComparator nodegroupset.NodeInfoComparator + if len(autoscalingOptions.BalancingLabels) > 0 { + nodeInfoComparator = nodegroupset.CreateLabelNodeInfoComparator(autoscalingOptions.BalancingLabels) + } else { + nodeInfoComparatorBuilder := nodegroupset.CreateGenericNodeInfoComparator + if autoscalingOptions.CloudProviderName == cloudprovider.AzureProviderName { + nodeInfoComparatorBuilder = nodegroupset.CreateAzureNodeInfoComparator + } else if autoscalingOptions.CloudProviderName == cloudprovider.AwsProviderName { + nodeInfoComparatorBuilder = nodegroupset.CreateAwsNodeInfoComparator + } else if autoscalingOptions.CloudProviderName == cloudprovider.GceProviderName { + nodeInfoComparatorBuilder = nodegroupset.CreateGceNodeInfoComparator + opts.Processors.TemplateNodeInfoProvider = nodeinfosprovider.NewAnnotationNodeInfoProvider(nodeInfoCacheExpireTime) + } else if autoscalingOptions.CloudProviderName == cloudprovider.ClusterAPIProviderName { + nodeInfoComparatorBuilder = nodegroupset.CreateClusterAPINodeInfoComparator + } + nodeInfoComparator = nodeInfoComparatorBuilder(autoscalingOptions.BalancingExtraIgnoredLabels) } opts.Processors.NodeGroupSetProcessor = &nodegroupset.BalancingNodeGroupSetProcessor{ - Comparator: nodeInfoComparatorBuilder(autoscalingOptions.BalancingExtraIgnoredLabels), + Comparator: nodeInfoComparator, } // These metrics should be published only once. diff --git a/cluster-autoscaler/processors/nodegroupset/label_nodegroups.go b/cluster-autoscaler/processors/nodegroupset/label_nodegroups.go new file mode 100644 index 000000000000..df25e6d237c6 --- /dev/null +++ b/cluster-autoscaler/processors/nodegroupset/label_nodegroups.go @@ -0,0 +1,49 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodegroupset + +import ( + klog "k8s.io/klog/v2" + schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" +) + +// CreateLabelNodeInfoComparator returns a comparator that checks for node group similarity using the given labels. +func CreateLabelNodeInfoComparator(labels []string) NodeInfoComparator { + return func(n1, n2 *schedulerframework.NodeInfo) bool { + return areLabelsSame(n1, n2, labels) + } +} + +func areLabelsSame(n1, n2 *schedulerframework.NodeInfo, labels []string) bool { + for _, label := range labels { + val1, exists := n1.Node().ObjectMeta.Labels[label] + if !exists { + klog.V(8).Infof("%s label not present on %s", label, n1.Node().Name) + return false + } + val2, exists := n2.Node().ObjectMeta.Labels[label] + if !exists { + klog.V(8).Infof("%s label not present on %s", label, n1.Node().Name) + return false + } + if val1 != val2 { + klog.V(8).Infof("%s label did not match. %s: %s, %s: %s", label, n1.Node().Name, val1, n2.Node().Name, val2) + return false + } + } + return true +} diff --git a/cluster-autoscaler/processors/nodegroupset/label_nodegroups_test.go b/cluster-autoscaler/processors/nodegroupset/label_nodegroups_test.go new file mode 100644 index 000000000000..aa3f7722ccbe --- /dev/null +++ b/cluster-autoscaler/processors/nodegroupset/label_nodegroups_test.go @@ -0,0 +1,92 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodegroupset + +import ( + "testing" + + . "k8s.io/autoscaler/cluster-autoscaler/utils/test" +) + +func TestNodeLabelComparison(t *testing.T) { + labels := []string{"node.kubernetes.io/instance-type", "kubernetes.io/arch"} + comparator := CreateLabelNodeInfoComparator(labels) + node1 := BuildTestNode("node1", 1000, 2000) + node2 := BuildTestNode("node2", 1000, 2000) + + for _, tc := range []struct { + description string + labels1 map[string]string + labels2 map[string]string + isSimilar bool + }{ + { + description: "both labels match", + labels1: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64"}, + labels2: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64"}, + isSimilar: true, + }, + { + description: "one label doesn't match", + labels1: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64"}, + labels2: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "i386"}, + isSimilar: false, + }, + { + description: "unspecified labels are not considered", + labels1: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64", "unspecified-label": "eu-west1"}, + labels2: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64", "unspecified-label": "eu-west2"}, + isSimilar: true, + }, + { + description: "no labels are set", + labels1: map[string]string{}, + labels2: map[string]string{}, + isSimilar: false, + }, + { + description: "single label matches, label is unset on second group", + labels1: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64"}, + labels2: map[string]string{"kubernetes.io/arch": "amd64"}, + isSimilar: false, + }, + { + description: "single label matches, label is unset on first group", + labels1: map[string]string{"kubernetes.io/arch": "amd64"}, + labels2: map[string]string{"node.kubernetes.io/instance-type": "m5.4xlarge", "kubernetes.io/arch": "amd64"}, + isSimilar: false, + }, + { + description: "labels are explicitly set to be empty", + labels1: map[string]string{"node.kubernetes.io/instance-type": "", "kubernetes.io/arch": ""}, + labels2: map[string]string{"node.kubernetes.io/instance-type": "", "kubernetes.io/arch": ""}, + isSimilar: true, + }, + { + description: "one labels is explicitly set to be empty", + labels1: map[string]string{"node.kubernetes.io/instance-type": "", "kubernetes.io/arch": "amd64"}, + labels2: map[string]string{"node.kubernetes.io/instance-type": "", "kubernetes.io/arch": "amd64"}, + isSimilar: true, + }, + } { + t.Run(tc.description, func(t *testing.T) { + node1.ObjectMeta.Labels = tc.labels1 + node2.ObjectMeta.Labels = tc.labels2 + checkNodesSimilar(t, node1, node2, comparator, tc.isSimilar) + }) + } +}