From 004d51eb360edd12c9dcb05e5302169a40c40d69 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Mon, 3 May 2021 12:19:12 +0200 Subject: [PATCH] Introduce UpdateDeprecatedTemplateLabels to set beta/deprecated labels And at the same time only set stable labels in all buildGenericLabels implementations. This fixes issues when a node group has 0 nodes yet and node labels are built using buildGenericLabels and the node-template labels. Issues include (anti-)affinity and nodeSelectors for the given labels, giving false-negative results for candidate nodes, which leads to ASGs never scaling up. --- .../alicloud/alicloud_manager.go | 11 ++++----- .../cloudprovider/aws/aws_manager.go | 12 +++++----- .../cloudprovider/aws/aws_manager_test.go | 5 ++-- .../cloudprovider/azure/azure_template.go | 12 ++++------ .../cloudprovider/gce/templates.go | 13 +++-------- .../cloudprovider/gce/templates_test.go | 13 +++-------- .../huaweicloud_service_manager.go | 11 ++++----- cluster-autoscaler/core/utils/utils.go | 23 +++++++++++++++++++ 8 files changed, 51 insertions(+), 49 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/alicloud/alicloud_manager.go b/cluster-autoscaler/cloudprovider/alicloud/alicloud_manager.go index c321249c34c5..429df4e81f9a 100644 --- a/cluster-autoscaler/cloudprovider/alicloud/alicloud_manager.go +++ b/cluster-autoscaler/cloudprovider/alicloud/alicloud_manager.go @@ -27,7 +27,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/alicloud/alibaba-cloud-sdk-go/services/ess" klog "k8s.io/klog/v2" - kubeletapis "k8s.io/kubelet/pkg/apis" "math/rand" "time" ) @@ -237,13 +236,13 @@ func (m *AliCloudManager) buildNodeFromTemplate(sg *Asg, template *sgTemplate) ( func buildGenericLabels(template *sgTemplate, nodeName string) map[string]string { result := make(map[string]string) - result[kubeletapis.LabelArch] = cloudprovider.DefaultArch - result[kubeletapis.LabelOS] = cloudprovider.DefaultOS + result[apiv1.LabelArchStable] = cloudprovider.DefaultArch + result[apiv1.LabelOSStable] = cloudprovider.DefaultOS - result[apiv1.LabelInstanceType] = template.InstanceType.instanceTypeID + result[apiv1.LabelInstanceTypeStable] = template.InstanceType.instanceTypeID - result[apiv1.LabelZoneRegion] = template.Region - result[apiv1.LabelZoneFailureDomain] = template.Zone + result[apiv1.LabelTopologyRegion] = template.Region + result[apiv1.LabelTopologyZone] = template.Zone result[apiv1.LabelHostname] = nodeName // append custom node labels diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go index d9f387e34851..920a452bf58f 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go @@ -41,7 +41,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" klog "k8s.io/klog/v2" - kubeletapis "k8s.io/kubelet/pkg/apis" provider_aws "k8s.io/legacy-cloud-providers/aws" ) @@ -389,14 +388,15 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap func buildGenericLabels(template *asgTemplate, nodeName string) map[string]string { result := make(map[string]string) + // TODO: extract it somehow - result[kubeletapis.LabelArch] = cloudprovider.DefaultArch - result[kubeletapis.LabelOS] = cloudprovider.DefaultOS + result[apiv1.LabelArchStable] = cloudprovider.DefaultArch + result[apiv1.LabelOSStable] = cloudprovider.DefaultOS - result[apiv1.LabelInstanceType] = template.InstanceType.InstanceType + result[apiv1.LabelInstanceTypeStable] = template.InstanceType.InstanceType - result[apiv1.LabelZoneRegion] = template.Region - result[apiv1.LabelZoneFailureDomain] = template.Zone + result[apiv1.LabelTopologyRegion] = template.Region + result[apiv1.LabelTopologyZone] = template.Zone result[apiv1.LabelHostname] = nodeName return result } diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go b/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go index e03caed1283b..d23b7808f8db 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go @@ -38,7 +38,6 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" - kubeletapis "k8s.io/kubelet/pkg/apis" provider_aws "k8s.io/legacy-cloud-providers/aws" ) @@ -83,8 +82,8 @@ func TestBuildGenericLabels(t *testing.T) { assert.Equal(t, "us-east-1", labels[apiv1.LabelZoneRegion]) assert.Equal(t, "sillyname", labels[apiv1.LabelHostname]) assert.Equal(t, "c4.large", labels[apiv1.LabelInstanceType]) - assert.Equal(t, cloudprovider.DefaultArch, labels[kubeletapis.LabelArch]) - assert.Equal(t, cloudprovider.DefaultOS, labels[kubeletapis.LabelOS]) + assert.Equal(t, cloudprovider.DefaultArch, labels[apiv1.LabelArchStable]) + assert.Equal(t, cloudprovider.DefaultOS, labels[apiv1.LabelOSStable]) } func TestExtractAllocatableResourcesFromAsg(t *testing.T) { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index eb931f31c3a0..2995f53622ba 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -26,7 +26,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" cloudvolume "k8s.io/cloud-provider/volume" "k8s.io/klog/v2" - kubeletapis "k8s.io/kubelet/pkg/apis" "math/rand" "regexp" "strings" @@ -44,14 +43,11 @@ func buildInstanceOS(template compute.VirtualMachineScaleSet) string { func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string) map[string]string { result := make(map[string]string) - result[kubeletapis.LabelArch] = cloudprovider.DefaultArch result[apiv1.LabelArchStable] = cloudprovider.DefaultArch - - result[kubeletapis.LabelOS] = buildInstanceOS(template) result[apiv1.LabelOSStable] = buildInstanceOS(template) - result[apiv1.LabelInstanceType] = *template.Sku.Name - result[apiv1.LabelZoneRegion] = strings.ToLower(*template.Location) + result[apiv1.LabelInstanceTypeStable] = *template.Sku.Name + result[apiv1.LabelTopologyRegion] = strings.ToLower(*template.Location) if template.Zones != nil && len(*template.Zones) > 0 { failureDomains := make([]string, len(*template.Zones)) @@ -59,9 +55,9 @@ func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string failureDomains[k] = strings.ToLower(*template.Location) + "-" + v } - result[apiv1.LabelZoneFailureDomain] = strings.Join(failureDomains[:], cloudvolume.LabelMultiZoneDelimiter) + result[apiv1.LabelTopologyZone] = strings.Join(failureDomains[:], cloudvolume.LabelMultiZoneDelimiter) } else { - result[apiv1.LabelZoneFailureDomain] = "0" + result[apiv1.LabelTopologyZone] = "0" } result[apiv1.LabelHostname] = nodeName diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go index ff82c7dd7449..96be9bd4201b 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates.go +++ b/cluster-autoscaler/cloudprovider/gce/templates.go @@ -23,6 +23,7 @@ import ( "regexp" "strings" + "github.com/ghodss/yaml" gce "google.golang.org/api/compute/v1" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -30,9 +31,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/autoscaler/cluster-autoscaler/utils/units" - kubeletapis "k8s.io/kubelet/pkg/apis" - - "github.com/ghodss/yaml" klog "k8s.io/klog/v2" ) @@ -265,21 +263,16 @@ func BuildGenericLabels(ref GceRef, machineType string, nodeName string, os Oper } // TODO: extract it somehow - result[kubeletapis.LabelArch] = cloudprovider.DefaultArch result[apiv1.LabelArchStable] = cloudprovider.DefaultArch - result[kubeletapis.LabelOS] = string(os) result[apiv1.LabelOSStable] = string(os) - result[apiv1.LabelInstanceType] = machineType result[apiv1.LabelInstanceTypeStable] = machineType ix := strings.LastIndex(ref.Zone, "-") if ix == -1 { return nil, fmt.Errorf("unexpected zone: %s", ref.Zone) } - result[apiv1.LabelZoneRegion] = ref.Zone[:ix] - result[apiv1.LabelZoneRegionStable] = ref.Zone[:ix] - result[apiv1.LabelZoneFailureDomain] = ref.Zone - result[apiv1.LabelZoneFailureDomainStable] = ref.Zone + result[apiv1.LabelTopologyRegion] = ref.Zone[:ix] + result[apiv1.LabelTopologyZone] = ref.Zone result[gceCSITopologyKeyZone] = ref.Zone result[apiv1.LabelHostname] = nodeName return result, nil diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go index 9a27937f47ff..27e5c1676c5f 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates_test.go +++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go @@ -26,13 +26,11 @@ import ( gpuUtils "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/autoscaler/cluster-autoscaler/utils/units" + "github.com/stretchr/testify/assert" gce "google.golang.org/api/compute/v1" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" quota "k8s.io/apiserver/pkg/quota/v1" - kubeletapis "k8s.io/kubelet/pkg/apis" - - "github.com/stretchr/testify/assert" ) func TestBuildNodeFromTemplateSetsResources(t *testing.T) { @@ -228,16 +226,11 @@ func TestBuildGenericLabels(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { expectedLabels := map[string]string{ - apiv1.LabelZoneRegion: "us-central1", - apiv1.LabelZoneRegionStable: "us-central1", - apiv1.LabelZoneFailureDomain: "us-central1-b", - apiv1.LabelZoneFailureDomainStable: "us-central1-b", + apiv1.LabelTopologyRegion: "us-central1", + apiv1.LabelTopologyZone: "us-central1-b", gceCSITopologyKeyZone: "us-central1-b", apiv1.LabelHostname: "sillyname", - apiv1.LabelInstanceType: "n1-standard-8", apiv1.LabelInstanceTypeStable: "n1-standard-8", - kubeletapis.LabelArch: cloudprovider.DefaultArch, - kubeletapis.LabelOS: tc.expectedOsLabel, apiv1.LabelArchStable: cloudprovider.DefaultArch, apiv1.LabelOSStable: tc.expectedOsLabel, } diff --git a/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_service_manager.go b/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_service_manager.go index aacf7c7edb2f..23d1741e96a3 100644 --- a/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_service_manager.go +++ b/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud_service_manager.go @@ -35,7 +35,6 @@ import ( huaweicloudsdkecsmodel "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/huaweicloud/huaweicloud-sdk-go-v3/services/ecs/v2/model" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/klog/v2" - kubeletapis "k8s.io/kubelet/pkg/apis" ) // ElasticCloudServerService represents the elastic cloud server interfaces. @@ -569,13 +568,13 @@ func (csm *cloudServiceManager) buildNodeFromTemplate(asgName string, template * func buildGenericLabels(template *asgTemplate, nodeName string) map[string]string { result := make(map[string]string) - result[kubeletapis.LabelArch] = cloudprovider.DefaultArch - result[kubeletapis.LabelOS] = cloudprovider.DefaultOS + result[apiv1.LabelArchStable] = cloudprovider.DefaultArch + result[apiv1.LabelOSStable] = cloudprovider.DefaultOS - result[apiv1.LabelInstanceType] = template.name + result[apiv1.LabelInstanceTypeStable] = template.name - result[apiv1.LabelZoneRegion] = template.region - result[apiv1.LabelZoneFailureDomain] = template.zone + result[apiv1.LabelTopologyRegion] = template.region + result[apiv1.LabelTopologyZone] = template.zone result[apiv1.LabelHostname] = nodeName // append custom node labels diff --git a/cluster-autoscaler/core/utils/utils.go b/cluster-autoscaler/core/utils/utils.go index c60b050c2c4d..143114d3bf21 100644 --- a/cluster-autoscaler/core/utils/utils.go +++ b/cluster-autoscaler/core/utils/utils.go @@ -18,6 +18,7 @@ package utils import ( "fmt" + kubeletapis "k8s.io/kubelet/pkg/apis" "math/rand" "reflect" "time" @@ -172,6 +173,8 @@ func GetNodeInfoFromTemplate(nodeGroup cloudprovider.NodeGroup, daemonsets []*ap return nil, errors.ToAutoscalerError(errors.CloudProviderError, err) } + UpdateDeprecatedTemplateLabels(baseNodeInfo) + pods, err := daemonset.GetDaemonSetPodsForNode(baseNodeInfo, daemonsets, predicateChecker) if err != nil { return nil, errors.ToAutoscalerError(errors.InternalError, err) @@ -188,6 +191,26 @@ func GetNodeInfoFromTemplate(nodeGroup cloudprovider.NodeGroup, daemonsets []*ap return sanitizedNodeInfo, nil } +// UpdateDeprecatedTemplateLabels updates beta and deprecated labels from stable labels +func UpdateDeprecatedTemplateLabels(nodeInfo *schedulerframework.NodeInfo) { + node := nodeInfo.Node() + if v, ok := node.ObjectMeta.Labels[apiv1.LabelArchStable]; ok { + node.ObjectMeta.Labels[kubeletapis.LabelArch] = v + } + if v, ok := node.ObjectMeta.Labels[apiv1.LabelOSStable]; ok { + node.ObjectMeta.Labels[kubeletapis.LabelOS] = v + } + if v, ok := node.ObjectMeta.Labels[apiv1.LabelInstanceTypeStable]; ok { + node.ObjectMeta.Labels[apiv1.LabelInstanceType] = v + } + if v, ok := node.ObjectMeta.Labels[apiv1.LabelTopologyRegion]; ok { + node.ObjectMeta.Labels[apiv1.LabelZoneRegion] = v + } + if v, ok := node.ObjectMeta.Labels[apiv1.LabelTopologyZone]; ok { + node.ObjectMeta.Labels[apiv1.LabelZoneFailureDomain] = v + } +} + // isVirtualNode determines if the node is created by virtual kubelet func isVirtualNode(node *apiv1.Node) bool { return node.ObjectMeta.Labels["type"] == "virtual-kubelet"