From 9831623810aac0f85383dd509a36cb459363603e Mon Sep 17 00:00:00 2001 From: Maciek Pytel Date: Fri, 12 Feb 2021 19:37:30 +0100 Subject: [PATCH] Set different hostname label for upcoming nodes Function copying template node to use for upcoming nodes was not chaning hostname label, meaning that features relying on this label (ex. pod antiaffinity on hostname topology) would treat all upcoming nodes as a single node. This resulted in triggering too many scale-ups for pods using such features. Analogous function in binpacking didn't have the same bug (but it didn't set unique UID or pod names). I extracted the functionality to a util function used in both places to avoid the two functions getting out of sync again. --- cluster-autoscaler/core/static_autoscaler.go | 19 ++------------ .../estimator/binpacking_estimator.go | 20 +++++---------- .../utils/scheduler/scheduler.go | 25 +++++++++++++++++++ 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 71b8bfa22b9e..4809035f281a 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -23,7 +23,6 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/util/uuid" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" @@ -42,6 +41,7 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint" "k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" + scheduler_utils "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler" "k8s.io/autoscaler/cluster-autoscaler/utils/taints" "k8s.io/autoscaler/cluster-autoscaler/utils/tpu" @@ -775,21 +775,6 @@ func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool { return found && oldest.Add(unschedulablePodWithGpuTimeBuffer).After(currentTime) } -func deepCopyNodeInfo(nodeTemplate *schedulerframework.NodeInfo, index int) *schedulerframework.NodeInfo { - node := nodeTemplate.Node().DeepCopy() - node.Name = fmt.Sprintf("%s-%d", node.Name, index) - node.UID = uuid.NewUUID() - nodeInfo := schedulerframework.NewNodeInfo() - nodeInfo.SetNode(node) - for _, podInfo := range nodeTemplate.Pods { - pod := podInfo.Pod.DeepCopy() - pod.Name = fmt.Sprintf("%s-%d", podInfo.Pod.Name, index) - pod.UID = uuid.NewUUID() - nodeInfo.AddPod(pod) - } - return nodeInfo -} - func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos map[string]*schedulerframework.NodeInfo) []*schedulerframework.NodeInfo { upcomingNodes := make([]*schedulerframework.NodeInfo, 0) for nodeGroup, numberOfNodes := range registry.GetUpcomingNodes() { @@ -808,7 +793,7 @@ func getUpcomingNodeInfos(registry *clusterstate.ClusterStateRegistry, nodeInfos // Ensure new nodes have different names because nodeName // will be used as a map key. Also deep copy pods (daemonsets & // any pods added by cloud provider on template). - upcomingNodes = append(upcomingNodes, deepCopyNodeInfo(nodeTemplate, i)) + upcomingNodes = append(upcomingNodes, scheduler_utils.DeepCopyTemplateNode(nodeTemplate, i)) } } return upcomingNodes diff --git a/cluster-autoscaler/estimator/binpacking_estimator.go b/cluster-autoscaler/estimator/binpacking_estimator.go index 6cbbf389aaf4..b998143cce70 100644 --- a/cluster-autoscaler/estimator/binpacking_estimator.go +++ b/cluster-autoscaler/estimator/binpacking_estimator.go @@ -17,13 +17,12 @@ limitations under the License. package estimator import ( - "fmt" "sort" - "time" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/simulator" + "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler" klog "k8s.io/klog/v2" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ) @@ -75,7 +74,6 @@ func (estimator *BinpackingNodeEstimator) Estimate( } }() - newNodeNameTimestamp := time.Now() newNodeNameIndex := 0 for _, podInfo := range podInfos { @@ -94,7 +92,7 @@ func (estimator *BinpackingNodeEstimator) Estimate( if !found { // Add new node - newNodeName, err := estimator.addNewNodeToSnapshot(nodeTemplate, newNodeNameTimestamp, newNodeNameIndex) + newNodeName, err := estimator.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex) if err != nil { klog.Errorf("Error while adding new node for template to ClusterSnapshot; %v", err) return 0 @@ -113,23 +111,17 @@ func (estimator *BinpackingNodeEstimator) Estimate( func (estimator *BinpackingNodeEstimator) addNewNodeToSnapshot( template *schedulerframework.NodeInfo, - nameTimestamp time.Time, nameIndex int) (string, error) { - newNode := template.Node().DeepCopy() - newNode.Name = fmt.Sprintf("%s-%d-%d", newNode.Name, nameTimestamp.Unix(), nameIndex) - if newNode.Labels == nil { - newNode.Labels = make(map[string]string) - } - newNode.Labels["kubernetes.io/hostname"] = newNode.Name + newNodeInfo := scheduler.DeepCopyTemplateNode(template, nameIndex) var pods []*apiv1.Pod - for _, podInfo := range template.Pods { + for _, podInfo := range newNodeInfo.Pods { pods = append(pods, podInfo.Pod) } - if err := estimator.clusterSnapshot.AddNodeWithPods(newNode, pods); err != nil { + if err := estimator.clusterSnapshot.AddNodeWithPods(newNodeInfo.Node(), pods); err != nil { return "", err } - return newNode.Name, nil + return newNodeInfo.Node().Name, nil } // Calculates score for all pods and returns podInfo structure. diff --git a/cluster-autoscaler/utils/scheduler/scheduler.go b/cluster-autoscaler/utils/scheduler/scheduler.go index c78e12a9fcfd..f3e4439f6748 100644 --- a/cluster-autoscaler/utils/scheduler/scheduler.go +++ b/cluster-autoscaler/utils/scheduler/scheduler.go @@ -17,7 +17,10 @@ limitations under the License. package scheduler import ( + "fmt" + apiv1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/uuid" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ) @@ -57,3 +60,25 @@ func CreateNodeNameToInfoMap(pods []*apiv1.Pod, nodes []*apiv1.Node) map[string] return nodeNameToNodeInfo } + +// DeepCopyTemplateNode copies NodeInfo object used as a template. It changes +// names of UIDs of both node and pods running on it, so that copies can be used +// to represent multiple nodes. +func DeepCopyTemplateNode(nodeTemplate *schedulerframework.NodeInfo, index int) *schedulerframework.NodeInfo { + node := nodeTemplate.Node().DeepCopy() + node.Name = fmt.Sprintf("%s-%d", node.Name, index) + node.UID = uuid.NewUUID() + if node.Labels == nil { + node.Labels = make(map[string]string) + } + node.Labels["kubernetes.io/hostname"] = node.Name + nodeInfo := schedulerframework.NewNodeInfo() + nodeInfo.SetNode(node) + for _, podInfo := range nodeTemplate.Pods { + pod := podInfo.Pod.DeepCopy() + pod.Name = fmt.Sprintf("%s-%d", podInfo.Pod.Name, index) + pod.UID = uuid.NewUUID() + nodeInfo.AddPod(pod) + } + return nodeInfo +}