diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 05cf4301080f..68a64c672035 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -633,7 +633,9 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro return nil, err } - node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager, scaleSet.enableDynamicInstanceList) + inputLabels := map[string]string{} + inputTaints := "" + node, err := buildNodeFromTemplate(scaleSet.Name, inputLabels, inputTaints, template, scaleSet.manager, scaleSet.enableDynamicInstanceList) if err != nil { return nil, err diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index 39afb09af222..9411354be35b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -31,17 +31,61 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/klog/v2" + kubeletapis "k8s.io/kubelet/pkg/apis" ) const ( - azureDiskTopologyKey string = "topology.disk.csi.azure.com/zone" // AKSLabelPrefixValue represents the constant prefix for AKSLabelKeyPrefixValue AKSLabelPrefixValue = "kubernetes.azure.com" // AKSLabelKeyPrefixValue represents prefix for AKS Labels AKSLabelKeyPrefixValue = AKSLabelPrefixValue + "/" + + azureDiskTopologyKey = "topology.disk.csi.azure.com/zone" + // For NP-series SKU, the xilinx device plugin uses that resource name + // https://github.com/Xilinx/FPGA_as_a_Service/tree/master/k8s-fpga-device-plugin + xilinxFpgaResourceName = "xilinx.com/fpga-xilinx_u250_gen3x16_xdma_shell_2_1-0" + + // legacyPoolNameTag is the legacy tag that AKS adds to the VMSS with its value + // being the agentpool name + legacyPoolNameTag = "poolName" + // poolNameTag is the new tag that replaces the above one + // Newly created pools and clusters will have this one on the VMSS + // instead of the legacy one. We'll have to live with both tags for a while. + poolNameTag = "aks-managed-poolName" + + // This is the legacy label is added by agentbaker, agentpool={poolName} and we want to predict that + // a node added to this agentpool will have this as a node label. The value is fetched + // from the VMSS tag with key poolNameTag/legacyPoolNameTag + legacyAgentPoolNodeLabelKey = "agentpool" + // New label that replaces the above + agentPoolNodeLabelKey = AKSLabelKeyPrefixValue + "agentpool" + + // Storage profile node labels + legacyStorageProfileNodeLabelKey = "storageprofile" + storageProfileNodeLabelKey = AKSLabelKeyPrefixValue + "storageprofile" + + // Storage tier node labels + legacyStorageTierNodeLabelKey = "storagetier" + storageTierNodeLabelKey = AKSLabelKeyPrefixValue + "storagetier" + + // Fips node label + fipsNodeLabelKey = AKSLabelKeyPrefixValue + "fips_enabled" + + // OS Sku node Label + osSkuLabelKey = AKSLabelKeyPrefixValue + "os-sku" + + // Security node label + securityTypeLabelKey = AKSLabelKeyPrefixValue + "security-type" + + customCATrustEnabledLabelKey = AKSLabelKeyPrefixValue + "custom-ca-trust-enabled" + kataMshvVMIsolationLabelKey = AKSLabelKeyPrefixValue + "kata-mshv-vm-isolation" + + // Cluster node label + clusterLabelKey = AKSLabelKeyPrefixValue + "cluster" ) -func buildNodeFromTemplate(nodeGroupName string, template compute.VirtualMachineScaleSet, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) { +func buildNodeFromTemplate(nodeGroupName string, inputLabels map[string]string, inputTaints string, + template compute.VirtualMachineScaleSet, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) { node := apiv1.Node{} nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63()) @@ -90,7 +134,9 @@ func buildNodeFromTemplate(nodeGroupName string, template compute.VirtualMachine node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(vcpu, resource.DecimalSI) // isNPSeries returns if a SKU is an NP-series SKU // SKU API reports GPUs for NP-series but it's actually FPGAs - if !isNPSeries(*template.Sku.Name) { + if isNPSeries(*template.Sku.Name) { + node.Status.Capacity[xilinxFpgaResourceName] = *resource.NewQuantity(gpuCount, resource.DecimalSI) + } else { node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(gpuCount, resource.DecimalSI) } @@ -113,16 +159,78 @@ func buildNodeFromTemplate(nodeGroupName string, template compute.VirtualMachine // GenericLabels node.Labels = cloudprovider.JoinStringMaps(node.Labels, buildGenericLabels(template, nodeName)) + // Labels from the Scale Set's Tags - node.Labels = cloudprovider.JoinStringMaps(node.Labels, extractLabelsFromScaleSet(template.Tags)) + labels := make(map[string]string) + // Prefer the explicit labels in spec coming from RP over the VMSS template + if len(inputLabels) > 0 { + labels = inputLabels + } else { + labels = extractLabelsFromScaleSet(template.Tags) + } + + // Add the agentpool label, its value should come from the VMSS poolName tag + // NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one + // We will have to live with both labels for a while + if node.Labels[legacyPoolNameTag] != "" { + labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag] + labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag] + } + if node.Labels[poolNameTag] != "" { + labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag] + labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag] + } + + // Add the storage profile and storage tier labels + if template.VirtualMachineProfile != nil && template.VirtualMachineProfile.StorageProfile != nil && template.VirtualMachineProfile.StorageProfile.OsDisk != nil { + // ephemeral + if template.VirtualMachineProfile.StorageProfile.OsDisk.DiffDiskSettings != nil && template.VirtualMachineProfile.StorageProfile.OsDisk.DiffDiskSettings.Option == compute.Local { + labels[legacyStorageProfileNodeLabelKey] = "ephemeral" + labels[storageProfileNodeLabelKey] = "ephemeral" + } else { + labels[legacyStorageProfileNodeLabelKey] = "managed" + labels[storageProfileNodeLabelKey] = "managed" + } + if template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk != nil { + labels[legacyStorageTierNodeLabelKey] = string(template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk.StorageAccountType) + labels[storageTierNodeLabelKey] = string(template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk.StorageAccountType) + } + // Add ephemeral-storage value + if template.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB != nil { + node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(int(*template.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB)*1024*1024*1024), resource.DecimalSI) + klog.V(4).Infof("OS Disk Size from template is: %d", *template.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB) + klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage]) + } + } + + // If we are on GPU-enabled SKUs, append the accelerator + // label so that CA makes better decision when scaling from zero for GPU pools + if isNvidiaEnabledSKU(*template.Sku.Name) { + labels[GPULabel] = "nvidia" + labels[legacyGPULabel] = "nvidia" + } + + // Extract allocatables from tags resourcesFromTags := extractAllocatableResourcesFromScaleSet(template.Tags) for resourceName, val := range resourcesFromTags { node.Status.Capacity[apiv1.ResourceName(resourceName)] = *val } + node.Labels = cloudprovider.JoinStringMaps(node.Labels, labels) + klog.V(4).Infof("Setting node %s labels to: %s", nodeName, node.Labels) + + var taints []apiv1.Taint + // Prefer the explicit taints in spec over the VMSS template + if inputTaints != "" { + taints = extractTaintsFromSpecString(inputTaints) + } else { + taints = extractTaintsFromScaleSet(template.Tags) + } + // Taints from the Scale Set's Tags - node.Spec.Taints = extractTaintsFromScaleSet(template.Tags) + node.Spec.Taints = taints + klog.V(4).Infof("Setting node %s taints to: %s", nodeName, node.Spec.Taints) node.Status.Conditions = cloudprovider.BuildReadyConditions() return &node, nil @@ -140,10 +248,15 @@ func buildInstanceOS(template compute.VirtualMachineScaleSet) string { func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string) map[string]string { result := make(map[string]string) + result[kubeletapis.LabelArch] = cloudprovider.DefaultArch result[apiv1.LabelArchStable] = cloudprovider.DefaultArch + + result[kubeletapis.LabelOS] = buildInstanceOS(template) result[apiv1.LabelOSStable] = buildInstanceOS(template) + result[apiv1.LabelInstanceType] = *template.Sku.Name result[apiv1.LabelInstanceTypeStable] = *template.Sku.Name + result[apiv1.LabelZoneRegion] = strings.ToLower(*template.Location) result[apiv1.LabelTopologyRegion] = strings.ToLower(*template.Location) if template.Zones != nil && len(*template.Zones) > 0 { @@ -157,9 +270,11 @@ func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string //Note that the if the customer is to have some pod affinity picking exact zone, this logic won't work. //For now, discourage the customers from using podAffinity to pick the availability zones. randomZone := failureDomains[rand.Intn(len(failureDomains))] + result[apiv1.LabelZoneFailureDomain] = randomZone result[apiv1.LabelTopologyZone] = randomZone result[azureDiskTopologyKey] = randomZone } else { + result[apiv1.LabelZoneFailureDomain] = "0" result[apiv1.LabelTopologyZone] = "0" result[azureDiskTopologyKey] = "" } @@ -212,6 +327,37 @@ func extractTaintsFromScaleSet(tags map[string]*string) []apiv1.Taint { return taints } +// Example of a valid taints string, is the same argument to kubelet's `--register-with-taints` +// "dedicated=foo:NoSchedule,group=bar:NoExecute,app=fizz:PreferNoSchedule" +func extractTaintsFromSpecString(taintsString string) []apiv1.Taint { + taints := make([]apiv1.Taint, 0) + // First split the taints at the separator + splits := strings.Split(taintsString, ",") + for _, split := range splits { + taintSplit := strings.Split(split, "=") + if len(taintSplit) != 2 { + continue + } + + taintKey := taintSplit[0] + taintValue := taintSplit[1] + + r, _ := regexp.Compile("(.*):(?:NoSchedule|NoExecute|PreferNoSchedule)") + if !r.MatchString(taintValue) { + continue + } + + values := strings.SplitN(taintValue, ":", 2) + taints = append(taints, apiv1.Taint{ + Key: taintKey, + Value: values[0], + Effect: apiv1.TaintEffect(values[1]), + }) + } + + return taints +} + func extractAutoscalingOptionsFromScaleSetTags(tags map[string]*string) map[string]string { options := make(map[string]string) for tagName, tagValue := range tags { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template_test.go b/cluster-autoscaler/cloudprovider/azure/azure_template_test.go index d8eaeb1d8fe0..3711a11ce372 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template_test.go @@ -18,6 +18,7 @@ package azure import ( "fmt" + "strings" "testing" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" @@ -104,6 +105,45 @@ func TestExtractTaintsFromScaleSet(t *testing.T) { assert.Equal(t, makeTaintSet(expectedTaints), makeTaintSet(taints)) } +func TestExtractTaintsFromSpecString(t *testing.T) { + taintsString := []string{ + "dedicated=foo:NoSchedule", + "group=bar:NoExecute", + "app=fizz:PreferNoSchedule", + "k8s.io/testing/underscore/to/slash=fizz:PreferNoSchedule", + "bar=baz", + "blank=", + "nosplit=some_value", + } + + expectedTaints := []apiv1.Taint{ + { + Key: "dedicated", + Value: "foo", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "group", + Value: "bar", + Effect: apiv1.TaintEffectNoExecute, + }, + { + Key: "app", + Value: "fizz", + Effect: apiv1.TaintEffectPreferNoSchedule, + }, + { + Key: "k8s.io/testing/underscore/to/slash", + Value: "fizz", + Effect: apiv1.TaintEffectPreferNoSchedule, + }, + } + + taints := extractTaintsFromSpecString(strings.Join(taintsString, ",")) + assert.Len(t, taints, 4) + assert.Equal(t, makeTaintSet(expectedTaints), makeTaintSet(taints)) +} + func TestExtractAllocatableResourcesFromScaleSet(t *testing.T) { tags := map[string]*string{ fmt.Sprintf("%s%s", nodeResourcesTagName, "cpu"): to.StringPtr("100m"), @@ -123,14 +163,6 @@ func TestExtractAllocatableResourcesFromScaleSet(t *testing.T) { assert.Equal(t, (&exepectedCustomAllocatable).String(), labels["nvidia.com/Tesla-P100-PCIE"].String()) } -func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool { - set := make(map[apiv1.Taint]bool) - for _, taint := range taints { - set[taint] = true - } - return set -} - func TestTopologyFromScaleSet(t *testing.T) { testNodeName := "test-node" testSkuName := "test-sku" @@ -144,12 +176,16 @@ func TestTopologyFromScaleSet(t *testing.T) { Location: to.StringPtr("westus"), } expectedZoneValues := []string{"westus-1", "westus-2", "westus-3"} + labels := buildGenericLabels(testVmss, testNodeName) + failureDomain, ok := labels[apiv1.LabelZoneFailureDomain] + assert.True(t, ok) topologyZone, ok := labels[apiv1.LabelTopologyZone] assert.True(t, ok) azureDiskTopology, ok := labels[azureDiskTopologyKey] assert.True(t, ok) + assert.Contains(t, expectedZoneValues, failureDomain) assert.Contains(t, expectedZoneValues, topologyZone) assert.Contains(t, expectedZoneValues, azureDiskTopology) } @@ -165,10 +201,16 @@ func TestEmptyTopologyFromScaleSet(t *testing.T) { VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{OsProfile: nil}}, Location: to.StringPtr("westus"), } + + expectedFailureDomain := "0" expectedTopologyZone := "0" expectedAzureDiskTopology := "" labels := buildGenericLabels(testVmss, testNodeName) + failureDomain, ok := labels[apiv1.LabelZoneFailureDomain] + assert.True(t, ok) + assert.Equal(t, expectedFailureDomain, failureDomain) + topologyZone, ok := labels[apiv1.LabelTopologyZone] assert.True(t, ok) assert.Equal(t, expectedTopologyZone, topologyZone) @@ -177,3 +219,11 @@ func TestEmptyTopologyFromScaleSet(t *testing.T) { assert.True(t, ok) assert.Equal(t, expectedAzureDiskTopology, azureDiskTopology) } + +func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool { + set := make(map[apiv1.Taint]bool) + for _, taint := range taints { + set[taint] = true + } + return set +}