From a153405f43aee09acd22cc04518c90aecc78075e Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Mon, 14 Dec 2020 11:31:24 +0100 Subject: [PATCH] NodeInfo processor to Refine synthetic NodeInfos Comparing synthetic NodeInfos obtained from nodegroup's TemplateInfo() to NodeInfos obtained from real-world nodes is bound to fail, even with kube reservations provided through nodegroups labels/annotations (for instance: kernel mem reservation is hard to predict). This makes `balance-similar-node-groups` likely to misbehave when `scale-up-from-zero` is enabled (and a first nodegroup gets a real node), for instance. Following [Maciek Pytel suggestion](https://github.com/kubernetes/autoscaler/pull/3608#issuecomment-709308255) (from discussions on a previous attempt at solving this), we can implement a NodeInfo Processor that would improve template-generated NodeInfos whenever a node was created off a similar nodegroup. We're storing node's virtual origin through machineid, which works fine but is a bit ugly (suggestions welcome). Tested this solves balance-similar-node-groups + scale-up-from-zero, with various instance types on AWS and GCP. Previous attempts to solve that issue/discussions: * https://github.com/kubernetes/autoscaler/issues/2892#issuecomment-595769765 * https://github.com/kubernetes/autoscaler/pull/3608#issuecomment-709308255 --- .../config/autoscaling_options.go | 3 + cluster-autoscaler/core/utils/utils.go | 27 ++- cluster-autoscaler/core/utils/utils_test.go | 2 +- cluster-autoscaler/main.go | 10 + .../nodeinfos/refine_node_infos_processor.go | 184 ++++++++++++++++++ .../refine_node_infos_processor_test.go | 129 ++++++++++++ 6 files changed, 351 insertions(+), 4 deletions(-) create mode 100644 cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor.go create mode 100644 cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor_test.go diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 0071bfada451..73559fa23d69 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -121,6 +121,9 @@ type AutoscalingOptions struct { StatusConfigMapName string // BalanceSimilarNodeGroups enables logic that identifies node groups with similar machines and tries to balance node count between them. BalanceSimilarNodeGroups bool + // EnableRefineUsingSimilarNodeGroups tells wether cluster-autoscaler should try to refine synthetic nodeinfos (generated + // by using cloud providers TemplateInfos()) by using nodeinfos obtained from real-world nodes from similar nodegroups. + EnableRefineUsingSimilarNodeGroups bool // ConfigNamespace is the namespace cluster-autoscaler is running in and all related configmaps live in ConfigNamespace string // ClusterName if available diff --git a/cluster-autoscaler/core/utils/utils.go b/cluster-autoscaler/core/utils/utils.go index c60b050c2c4d..def6fa24a596 100644 --- a/cluster-autoscaler/core/utils/utils.go +++ b/cluster-autoscaler/core/utils/utils.go @@ -38,6 +38,8 @@ import ( klog "k8s.io/klog/v2" ) +const templatedNodeAnnotation = "synthetic-autoscaler-node-from-template" + // GetNodeInfosForGroups finds NodeInfos for all node groups used to manage the given nodes. It also returns a node group to sample node mapping. func GetNodeInfosForGroups(nodes []*apiv1.Node, nodeInfoCache map[string]*schedulerframework.NodeInfo, cloudProvider cloudprovider.CloudProvider, listers kube_util.ListerRegistry, // TODO(mwielgus): This returns map keyed by url, while most code (including scheduler) uses node.Name for a key. @@ -67,7 +69,7 @@ func GetNodeInfosForGroups(nodes []*apiv1.Node, nodeInfoCache map[string]*schedu if err != nil { return false, "", err } - sanitizedNodeInfo, err := sanitizeNodeInfo(nodeInfo, id, ignoredTaints) + sanitizedNodeInfo, err := SanitizeNodeInfo(nodeInfo, id, ignoredTaints) if err != nil { return false, "", err } @@ -181,13 +183,31 @@ func GetNodeInfoFromTemplate(nodeGroup cloudprovider.NodeGroup, daemonsets []*ap } fullNodeInfo := schedulerframework.NewNodeInfo(pods...) fullNodeInfo.SetNode(baseNodeInfo.Node()) - sanitizedNodeInfo, typedErr := sanitizeNodeInfo(fullNodeInfo, id, ignoredTaints) + SetNodeInfoBuiltFromTemplate(fullNodeInfo) + sanitizedNodeInfo, typedErr := SanitizeNodeInfo(fullNodeInfo, id, ignoredTaints) if typedErr != nil { return nil, typedErr } return sanitizedNodeInfo, nil } +// SetNodeInfoBuiltFromTemplate marks a NodeInfo as generated from a cloud provider's TemplateInfo() +func SetNodeInfoBuiltFromTemplate(nodeInfo *schedulerframework.NodeInfo) { + node := nodeInfo.Node() + if node.Annotations == nil { + node.Annotations = make(map[string]string) + } + node.Annotations[templatedNodeAnnotation] = "true" + nodeInfo.SetNode(node) +} + +// IsNodeInfoBuiltFromTemplate returns true if a NodeInfo was generated from a cloud +// provider's TemplateInfo(), false if generated from a real-world node. +func IsNodeInfoBuiltFromTemplate(nodeInfo *schedulerframework.NodeInfo) bool { + _, ok := nodeInfo.Node().Annotations[templatedNodeAnnotation] + return ok +} + // isVirtualNode determines if the node is created by virtual kubelet func isVirtualNode(node *apiv1.Node) bool { return node.ObjectMeta.Labels["type"] == "virtual-kubelet" @@ -228,7 +248,8 @@ func deepCopyNodeInfo(nodeInfo *schedulerframework.NodeInfo) (*schedulerframewor return newNodeInfo, nil } -func sanitizeNodeInfo(nodeInfo *schedulerframework.NodeInfo, nodeGroupName string, ignoredTaints taints.TaintKeySet) (*schedulerframework.NodeInfo, errors.AutoscalerError) { +// SanitizeNodeInfo updates a provided, generic NodeInfo to match the nodegroup's name +func SanitizeNodeInfo(nodeInfo *schedulerframework.NodeInfo, nodeGroupName string, ignoredTaints taints.TaintKeySet) (*schedulerframework.NodeInfo, errors.AutoscalerError) { // Sanitize node name. sanitizedNode, err := sanitizeTemplateNode(nodeInfo.Node(), nodeGroupName, ignoredTaints) if err != nil { diff --git a/cluster-autoscaler/core/utils/utils_test.go b/cluster-autoscaler/core/utils/utils_test.go index f15234397dda..667dc0dfcbe9 100644 --- a/cluster-autoscaler/core/utils/utils_test.go +++ b/cluster-autoscaler/core/utils/utils_test.go @@ -228,7 +228,7 @@ func TestSanitizeNodeInfo(t *testing.T) { nodeInfo := schedulerframework.NewNodeInfo(pod) nodeInfo.SetNode(node) - res, err := sanitizeNodeInfo(nodeInfo, "test-group", nil) + res, err := SanitizeNodeInfo(nodeInfo, "test-group", nil) assert.NoError(t, err) assert.Equal(t, 1, len(res.Pods)) } diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go index 530c9f730c3e..87262d3259a7 100644 --- a/cluster-autoscaler/main.go +++ b/cluster-autoscaler/main.go @@ -43,6 +43,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/metrics" ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors" "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset" + "k8s.io/autoscaler/cluster-autoscaler/processors/nodeinfos" "k8s.io/autoscaler/cluster-autoscaler/simulator" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes" @@ -170,6 +171,7 @@ var ( regional = flag.Bool("regional", false, "Cluster is regional.") newPodScaleUpDelay = flag.Duration("new-pod-scale-up-delay", 0*time.Second, "Pods less than this old will not be considered for scale-up.") + enableRefineUsingSimilarNodeGroups = flag.Bool("refine-using-similar-nodegroups", false, "Mitigate unbalance among similar node groups when scale-up-from-zero is enabled") ignoreTaintsFlag = multiStringFlag("ignore-taint", "Specifies a taint to ignore in node templates when considering to scale a node group") balancingIgnoreLabelsFlag = multiStringFlag("balancing-ignore-label", "Specifies a label to ignore in addition to the basic and cloud-provider set of labels when comparing if two node groups are similar") awsUseStaticInstanceList = flag.Bool("aws-use-static-instance-list", false, "Should CA fetch instance types in runtime or use a static list. AWS only") @@ -237,6 +239,7 @@ func createAutoscalingOptions() config.AutoscalingOptions { WriteStatusConfigMap: *writeStatusConfigMapFlag, StatusConfigMapName: *statusConfigMapName, BalanceSimilarNodeGroups: *balanceSimilarNodeGroupsFlag, + EnableRefineUsingSimilarNodeGroups: *enableRefineUsingSimilarNodeGroups, ConfigNamespace: *namespace, ClusterName: *clusterName, NodeAutoprovisioningEnabled: *nodeAutoprovisioningEnabled, @@ -329,6 +332,13 @@ func buildAutoscaler() (core.Autoscaler, error) { Comparator: nodeInfoComparatorBuilder(autoscalingOptions.BalancingExtraIgnoredLabels), } + if autoscalingOptions.EnableRefineUsingSimilarNodeGroups { + opts.Processors.NodeInfoProcessor = &nodeinfos.RefineNodeInfosProcessor{ + NodeGroupSetProcessor: opts.Processors.NodeGroupSetProcessor, + RefineUsingSimilarNodeGroups: autoscalingOptions.EnableRefineUsingSimilarNodeGroups, + } + } + // These metrics should be published only once. metrics.UpdateNapEnabled(autoscalingOptions.NodeAutoprovisioningEnabled) metrics.UpdateMaxNodesCount(autoscalingOptions.MaxNodesTotal) diff --git a/cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor.go b/cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor.go new file mode 100644 index 000000000000..996d5a6ee0a7 --- /dev/null +++ b/cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor.go @@ -0,0 +1,184 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeinfos + +import ( + "sync" + "time" + + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + "k8s.io/autoscaler/cluster-autoscaler/context" + "k8s.io/autoscaler/cluster-autoscaler/core/utils" + "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset" + "k8s.io/autoscaler/cluster-autoscaler/utils/errors" + "k8s.io/autoscaler/cluster-autoscaler/utils/taints" + klog "k8s.io/klog/v2" + + schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" +) + +const templateNodeInfoCacheTTL = 10 * time.Minute + +// RefineNodeInfosProcessor improves NodeInfos accuracy where possible. +// For now it adjusts synthetic NodeInfos generated by TemplateInfo() with more accurate +// NodeInfos obtained from real-world nodes for similar nodegroups (when available). +type RefineNodeInfosProcessor struct { + NodeGroupSetProcessor nodegroupset.NodeGroupSetProcessor + RefineUsingSimilarNodeGroups bool + nodeInfoCache nodeInfoCache +} + +type nodeInfoCache struct { + sync.RWMutex + nodeInfos map[string]*schedulerframework.NodeInfo + lastUpdated time.Time +} + +// Process refines nodeInfos obtained by TemplateInfos when possible +func (p *RefineNodeInfosProcessor) Process(ctx *context.AutoscalingContext, nodeInfosForNodeGroups map[string]*schedulerframework.NodeInfo) (map[string]*schedulerframework.NodeInfo, error) { + inaccurateNodeInfos := make(map[string]*schedulerframework.NodeInfo) + templatesNodeInfo := make(map[string]*schedulerframework.NodeInfo) + + // for now this processor only supports refining by leveraging similar nodegroups + if !p.RefineUsingSimilarNodeGroups { + return nodeInfosForNodeGroups, nil + } + + nodeGroups := make(map[string]cloudprovider.NodeGroup) + for _, nodeGroup := range ctx.CloudProvider.NodeGroups() { + nodeGroups[nodeGroup.Id()] = nodeGroup + } + + if err := p.nodeInfoCache.update(nodeGroups); err != nil { + return nodeInfosForNodeGroups, err + } + + ignoredTaints := make(taints.TaintKeySet) + for _, taintKey := range ctx.IgnoredTaints { + ignoredTaints[taintKey] = true + } + + // build comparable (all from templates) NodeInfos + for groupID, nodeInfo := range nodeInfosForNodeGroups { + templatedNodeInfo, ok := p.nodeInfoCache.nodeInfos[groupID] + if !ok || templatedNodeInfo == nil { + continue + } + + templatesNodeInfo[groupID] = templatedNodeInfo + + if utils.IsNodeInfoBuiltFromTemplate(nodeInfo) { + // use the provided nodeInfo rather than the one we just generated: + // we want to keep the original daemonsets pods in refined template. + inaccurateNodeInfos[groupID] = nodeInfo + } + } + + // refine inaccurate nodeinfos when we can find similar nodeInfos built from real nodes + for groupID, nodeInfo := range inaccurateNodeInfos { + nodeGroup, ok := nodeGroups[groupID] + if !ok { + continue + } + + similars, err := p.NodeGroupSetProcessor.FindSimilarNodeGroups(ctx, nodeGroup, templatesNodeInfo) + if err != nil { + klog.Warningf("Failed to lookup for matching node groups for %s: %v", groupID, err) + return nodeInfosForNodeGroups, nil + } + + for _, nodeGroup := range similars { + similarInfo, found := nodeInfosForNodeGroups[nodeGroup.Id()] + if !found || utils.IsNodeInfoBuiltFromTemplate(similarInfo) { + continue + } + refinedNodeInfo, err := refineNodeInfoUsingSimilarGroup(nodeInfo, similarInfo, groupID, ignoredTaints) + if err != nil { + return nodeInfosForNodeGroups, err + } + nodeInfosForNodeGroups[groupID] = refinedNodeInfo + break + } + } + + return nodeInfosForNodeGroups, nil +} + +// CleanUp cleans up processor's internal structures. +func (p *RefineNodeInfosProcessor) CleanUp() { +} + +func refineNodeInfoUsingSimilarGroup(nodeInfo, similarInfo *schedulerframework.NodeInfo, nodeGroupName string, ignoredTaints taints.TaintKeySet) (*schedulerframework.NodeInfo, errors.AutoscalerError) { + nodeInfoCopy := nodeInfo.Clone() + + // original node identity and locality (region, zone, ...) must be retained + newNode := similarInfo.Node().DeepCopy() + if newNode.Labels == nil { + newNode.Labels = make(map[string]string) + } + for label, val := range nodeInfo.Node().GetLabels() { + if _, found := nodegroupset.BasicIgnoredLabels[label]; found { + newNode.Labels[label] = val + } + } + newNode.Name = nodeInfo.Node().Name + newNode.UID = nodeInfo.Node().UID + nodeInfoCopy.SetNode(newNode) + + utils.SetNodeInfoBuiltFromTemplate(nodeInfoCopy) + return utils.SanitizeNodeInfo(nodeInfoCopy, nodeGroupName, ignoredTaints) +} + +func (n *nodeInfoCache) update(nodeGroups map[string]cloudprovider.NodeGroup) error { + n.Lock() + defer n.Unlock() + + if len(n.nodeInfos) == 0 { + n.nodeInfos = make(map[string]*schedulerframework.NodeInfo) + n.lastUpdated = time.Now() + } + needsRefresh := n.lastUpdated.Add(templateNodeInfoCacheTTL).Before(time.Now()) + + for groupID := range n.nodeInfos { + if _, ok := nodeGroups[groupID]; !ok { + delete(n.nodeInfos, groupID) + } + } + + for groupID, nodeGroup := range nodeGroups { + cachedNodeInfo, ok := n.nodeInfos[groupID] + if ok && !needsRefresh && !utils.IsNodeInfoBuiltFromTemplate(cachedNodeInfo) { + continue + } + + nodeInfo, err := nodeGroup.TemplateNodeInfo() + if err != nil { + if err == cloudprovider.ErrNotImplemented { + continue + } else { + return errors.ToAutoscalerError(errors.CloudProviderError, err) + } + } + n.nodeInfos[groupID] = nodeInfo + } + + if needsRefresh { + n.lastUpdated = time.Now() + } + + return nil +} diff --git a/cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor_test.go b/cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor_test.go new file mode 100644 index 000000000000..1fe9158f4aa2 --- /dev/null +++ b/cluster-autoscaler/processors/nodeinfos/refine_node_infos_processor_test.go @@ -0,0 +1,129 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeinfos + +import ( + "testing" + + testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test" + "k8s.io/autoscaler/cluster-autoscaler/context" + "k8s.io/autoscaler/cluster-autoscaler/core/utils" + "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset" + . "k8s.io/autoscaler/cluster-autoscaler/utils/test" + schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" + + "github.com/stretchr/testify/assert" +) + +type testNodeGroup struct { + name string + cpu int64 + mem int64 + fromRealNode bool +} + +type refinetest struct { + name string + nodegroups []testNodeGroup + templates []testNodeGroup + expected []testNodeGroup +} + +var refineCases = []refinetest{ + { + "Generated templates should leverage NodeInfo from similar nodegroup when available", + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 3000, 3000, false}}, + []testNodeGroup{{"n1", 1000, 1000, false}, {"n2", 1000, 1000, false}}, + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 1000, 1000, true}}, + }, + { + "NodeInfos obtained from real-world nodes should remain untouched", + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 1000, 1000, true}}, + []testNodeGroup{{"n1", 3000, 3000, false}, {"n2", 3000, 3000, false}}, + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 1000, 1000, true}}, + }, + { + "Should be no-op on cloudproviders not implementing TemplateInfos", + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 2000, 2000, false}}, + nil, /* TemplateNodeInfo() returns cloudprovider.ErrNotImplemented */ + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 2000, 2000, false}}, + }, + { + "Should not use NodeInfo templates that aren't built from real nodes", + []testNodeGroup{{"n1", 1000, 1000, false}, {"n2", 7000, 7000, false}}, + []testNodeGroup{{"n1", 1000, 1000, false}, {"n2", 1000, 1000, false}}, + []testNodeGroup{{"n1", 1000, 1000, false}, {"n2", 7000, 7000, false}}, + }, + { + "Should not use NodeInfo templates from dissimilar nodegroups", + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 7000, 7000, false}}, + []testNodeGroup{{"n1", 1000, 1000, false}, {"n2", 7000, 7000, false}}, + []testNodeGroup{{"n1", 1000, 1000, true}, {"n2", 7000, 7000, false}}, + }, +} + +func buildRefineNodeInfosProcessorTest(r refinetest) (*context.AutoscalingContext, map[string]*schedulerframework.NodeInfo) { + templates := toNodeInfoMap(r.templates) + nodeinfos := toNodeInfoMap(r.nodegroups) + provider := testprovider.NewTestAutoprovisioningCloudProvider(nil, nil, nil, nil, []string{}, templates) + if r.templates == nil { + provider = testprovider.NewTestCloudProvider(nil, nil) + } + for _, ng := range r.nodegroups { + provider.AddNodeGroup(ng.name, 1, 10, 1) + } + + ctx := &context.AutoscalingContext{ + CloudProvider: provider, + } + + return ctx, nodeinfos +} + +func toNodeInfoMap(nodeGroups []testNodeGroup) map[string]*schedulerframework.NodeInfo { + nodeinfos := make(map[string]*schedulerframework.NodeInfo) + for _, ng := range nodeGroups { + node := BuildTestNode(ng.name, ng.cpu, ng.mem) + ni := schedulerframework.NewNodeInfo() + ni.SetNode(node) + if !ng.fromRealNode { + utils.SetNodeInfoBuiltFromTemplate(ni) + } + nodeinfos[ng.name] = ni + } + return nodeinfos +} + +func TestRefineNodeInfosProcessor(t *testing.T) { + for _, tt := range refineCases { + proc := RefineNodeInfosProcessor{ + NodeGroupSetProcessor: nodegroupset.NewDefaultNodeGroupSetProcessor([]string{}), + RefineUsingSimilarNodeGroups: true, + } + ctx, nodeInfosForNodeGroups := buildRefineNodeInfosProcessorTest(tt) + out, err := proc.Process(ctx, nodeInfosForNodeGroups) + + assert.NoError(t, err) + expected := toNodeInfoMap(tt.expected) + comparator := nodegroupset.CreateGenericNodeInfoComparator([]string{}) + + for nodeGroupName, nodeGroup := range expected { + assert.Contains(t, out, nodeGroupName, tt.name) + assert.True(t, comparator(out[nodeGroupName], nodeGroup), tt.name) + } + } +}