From 7212628934390a5df6a3c57bac13019df9c2a472 Mon Sep 17 00:00:00 2001 From: Jont828 Date: Wed, 10 Jan 2024 18:34:08 -0500 Subject: [PATCH] Implement force delete in Azure provider --- charts/cluster-autoscaler/Chart.yaml | 2 +- charts/cluster-autoscaler/README.md | 1 + .../templates/deployment.yaml | 2 + charts/cluster-autoscaler/values.yaml | 3 ++ .../azure/azure_cloud_provider_test.go | 1 + .../cloudprovider/azure/azure_config.go | 10 ++++ .../cloudprovider/azure/azure_scale_set.go | 18 ++++++- .../azure/azure_scale_set_test.go | 48 +++++++++++++++---- 8 files changed, 73 insertions(+), 12 deletions(-) diff --git a/charts/cluster-autoscaler/Chart.yaml b/charts/cluster-autoscaler/Chart.yaml index 504eea76d3a8..2438a226ff14 100644 --- a/charts/cluster-autoscaler/Chart.yaml +++ b/charts/cluster-autoscaler/Chart.yaml @@ -11,4 +11,4 @@ name: cluster-autoscaler sources: - https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler type: application -version: 9.34.1 +version: 9.35.0 diff --git a/charts/cluster-autoscaler/README.md b/charts/cluster-autoscaler/README.md index 9bc2619ca048..2c77e29c6438 100644 --- a/charts/cluster-autoscaler/README.md +++ b/charts/cluster-autoscaler/README.md @@ -382,6 +382,7 @@ vpa: | awsSecretAccessKey | string | `""` | AWS access secret key ([if AWS user keys used](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#using-aws-credentials)) | | azureClientID | string | `""` | Service Principal ClientID with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` | | azureClientSecret | string | `""` | Service Principal ClientSecret with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` | +| azureEnableForceDelete | bool | `false` | Whether to force delete VMs or VMSS instances when scaling down. | | azureResourceGroup | string | `""` | Azure resource group that the cluster is located. Required if `cloudProvider=azure` | | azureSubscriptionID | string | `""` | Azure subscription where the resources are located. Required if `cloudProvider=azure` | | azureTenantID | string | `""` | Azure tenant where the resources are located. Required if `cloudProvider=azure` | diff --git a/charts/cluster-autoscaler/templates/deployment.yaml b/charts/cluster-autoscaler/templates/deployment.yaml index ccbe4353edc8..02835eb85683 100644 --- a/charts/cluster-autoscaler/templates/deployment.yaml +++ b/charts/cluster-autoscaler/templates/deployment.yaml @@ -166,6 +166,8 @@ spec: secretKeyRef: key: VMType name: {{ default (include "cluster-autoscaler.fullname" .) .Values.secretKeyRefNameOverride }} + - name: AZURE_ENABLE_FORCE_DELETE + value: "{{ .Values.azureEnableForceDelete }}" {{- if .Values.azureUseWorkloadIdentityExtension }} - name: ARM_USE_WORKLOAD_IDENTITY_EXTENSION value: "true" diff --git a/charts/cluster-autoscaler/values.yaml b/charts/cluster-autoscaler/values.yaml index 15e3177a1c8a..7492e5d2db9e 100644 --- a/charts/cluster-autoscaler/values.yaml +++ b/charts/cluster-autoscaler/values.yaml @@ -96,6 +96,9 @@ azureUseWorkloadIdentityExtension: false # azureVMType -- Azure VM type. azureVMType: "vmss" +# azureEnableForceDelete -- Whether to force delete VMs or VMSS instances when scaling down. +azureEnableForceDelete: false + # cloudConfigPath -- Configuration file for cloud provider. cloudConfigPath: "" diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go index ff9d02749058..6038db4b5c43 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go @@ -53,6 +53,7 @@ func newTestAzureManager(t *testing.T) *AzureManager { VMType: vmTypeVMSS, MaxDeploymentsCount: 2, Deployment: "deployment", + EnableForceDelete: true, }, azClient: &azClient{ virtualMachineScaleSetsClient: mockVMSSClient, diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 9fe559d07e14..2f6f5f279ab2 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -130,6 +130,9 @@ type Config struct { CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration,omitempty" yaml:"cloudProviderBackoffDuration,omitempty"` CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter,omitempty" yaml:"cloudProviderBackoffJitter,omitempty"` + // EnableForceDelete defines whether to enable force deletion on the APIs + EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"` + // EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"` @@ -303,6 +306,13 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { } } + if enableForceDelete := os.Getenv("AZURE_ENABLE_FORCE_DELETE"); enableForceDelete != "" { + cfg.EnableForceDelete, err = strconv.ParseBool(enableForceDelete) + if err != nil { + return nil, fmt.Errorf("failed to parse AZURE_ENABLE_FORCE_DELETE: %q, %v", enableForceDelete, err) + } + } + err = initializeCloudProviderRateLimitConfig(&cfg.CloudProviderRateLimitConfig) if err != nil { return nil, err diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index f0b2e83f07e8..99c6af3c42c1 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -58,6 +58,8 @@ type ScaleSet struct { minSize int maxSize int + enableForceDelete bool + sizeMutex sync.Mutex curSize int64 @@ -87,6 +89,7 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) ( sizeRefreshPeriod: az.azureCache.refreshInterval, enableDynamicInstanceList: az.config.EnableDynamicInstanceList, instancesRefreshJitter: az.config.VmssVmsCacheJitter, + enableForceDelete: az.config.EnableForceDelete, } if az.config.VmssVmsCacheTTL != 0 { @@ -432,8 +435,15 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered resourceGroup := scaleSet.manager.config.ResourceGroup scaleSet.instanceMutex.Lock() - klog.V(3).Infof("Calling virtualMachineScaleSetsClient.DeleteInstancesAsync(%v)", requiredIds.InstanceIds) - future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsg.Id(), *requiredIds, false) + klog.V(3).Infof("Calling virtualMachineScaleSetsClient.DeleteInstancesAsync(%v), force delete set to %v", requiredIds.InstanceIds, scaleSet.enableForceDelete) + future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsg.Id(), *requiredIds, scaleSet.enableForceDelete) + + if scaleSet.enableForceDelete && isOperationNotAllowed(rerr) { + klog.Infof("falling back to normal delete for instances %v for %s", requiredIds.InstanceIds, scaleSet.Name) + future, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, + commonAsg.Id(), *requiredIds, false) + } + scaleSet.instanceMutex.Unlock() if rerr != nil { klog.Errorf("virtualMachineScaleSetsClient.DeleteInstancesAsync for instances %v failed: %v", requiredIds.InstanceIds, rerr) @@ -741,3 +751,7 @@ func (scaleSet *ScaleSet) getOrchestrationMode() (compute.OrchestrationMode, err } return vmss.OrchestrationMode, nil } + +func isOperationNotAllowed(rerr *retry.Error) bool { + return rerr != nil && rerr.ServiceErrorCode() == retry.OperationNotAllowed +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index cd7c7d56b772..9a409cd25949 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -37,9 +37,10 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet { azureRef: azureRef{ Name: name, }, - manager: manager, - minSize: 1, - maxSize: 5, + manager: manager, + minSize: 1, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, } } @@ -405,18 +406,48 @@ func TestDeleteNodes(t *testing.T) { vmssName := "test-asg" var vmssCapacity int64 = 3 - orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} - expectedVMSSVMs := newTestVMSSVMList(3) - expectedVMs := newTestVMList(3) + cases := []struct { + name string + orchestrationMode compute.OrchestrationMode + enableForceDelete bool + }{ + { + name: "uniform, force delete enabled", + orchestrationMode: compute.Uniform, + enableForceDelete: true, + }, + { + name: "uniform, force delete disabled", + orchestrationMode: compute.Uniform, + enableForceDelete: false, + }, + { + name: "flexible, force delete enabled", + orchestrationMode: compute.Flexible, + enableForceDelete: true, + }, + { + name: "flexible, force delete disabled", + orchestrationMode: compute.Flexible, + enableForceDelete: false, + }, + } - for _, orchMode := range orchestrationModes { + for _, tc := range cases { + orchMode := tc.orchestrationMode + enableForceDelete := tc.enableForceDelete + + expectedVMSSVMs := newTestVMSSVMList(3) + expectedVMs := newTestVMList(3) manager := newTestAzureManager(t) + manager.config.EnableForceDelete = enableForceDelete expectedScaleSets := newTestVMSSList(vmssCapacity, vmssName, "eastus", orchMode) + fmt.Printf("orchMode: %s, enableForceDelete: %t\n", orchMode, enableForceDelete) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) - mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), false).Return(nil, nil) + mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), enableForceDelete).Return(nil, nil) mockVMSSClient.EXPECT().WaitForDeleteInstancesResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient @@ -497,7 +528,6 @@ func TestDeleteNodes(t *testing.T) { instance2, found := scaleSet.getInstanceByProviderID(nodesToDelete[1].Spec.ProviderID) assert.True(t, found, true) assert.Equal(t, instance2.Status.State, cloudprovider.InstanceDeleting) - } }