From 2d37e7de7e956f94067c9ca30c8c71091fff0c70 Mon Sep 17 00:00:00 2001 From: willie-yao Date: Wed, 9 Oct 2024 23:51:47 +0000 Subject: [PATCH] Add toggle for fast delete Enable fast delete for flex --- .../cloudprovider/azure/azure_config.go | 9 +++++ .../cloudprovider/azure/azure_manager_test.go | 10 ++++-- .../cloudprovider/azure/azure_scale_set.go | 35 +++++++++++++++---- .../azure/azure_scale_set_instance_cache.go | 26 ++++++++++++++ .../azure/azure_scale_set_test.go | 18 +++++----- 5 files changed, 81 insertions(+), 17 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 5511001dde03..ecbc7ed044f3 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -100,6 +100,9 @@ type Config struct { UseFederatedWorkloadIdentityExtension bool `json:"useFederatedWorkloadIdentityExtension" yaml:"useFederatedWorkloadIdentityExtension"` AADFederatedTokenFile string `json:"aadFederatedTokenFile" yaml:"aadFederatedTokenFile"` + + // EnableFastDeleteOnFailedProvisioning defines whether to delete the experimental faster VMSS instance deletion on failed provisioning + EnableFastDeleteOnFailedProvisioning bool `json:"enableFastDeleteOnFailedProvisioning,omitempty" yaml:"enableFastDeleteOnFailedProvisioning,omitempty"` } // These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name. @@ -328,6 +331,12 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { cfg.DeploymentParameters = parameters } + if enableFastDeleteOnFailedProvisioning := os.Getenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING"); enableFastDeleteOnFailedProvisioning != "" { + cfg.EnableFastDeleteOnFailedProvisioning, err = strconv.ParseBool(enableFastDeleteOnFailedProvisioning) + if err != nil { + return nil, fmt.Errorf("failed to parse AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING: %q, %v", enableFastDeleteOnFailedProvisioning, err) + } + } providerazureconfig.InitializeCloudProviderRateLimitConfig(&cfg.CloudProviderRateLimitConfig) if err := cfg.validate(); err != nil { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 1b0f6a835a8c..33230550104b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -63,7 +63,8 @@ const validAzureCfg = `{ "routeRateLimit": { "cloudProviderRateLimit": true, "cloudProviderRateLimitQPS": 3 - } + }, + "enableFastDeleteOnFailedProvisioning": true }` const validAzureCfgLegacy = `{ @@ -268,8 +269,9 @@ func TestCreateAzureManagerValidConfig(t *testing.T) { }, }, }, - VmssVmsCacheJitter: 120, - MaxDeploymentsCount: 8, + VmssVmsCacheJitter: 120, + MaxDeploymentsCount: 8, + EnableFastDeleteOnFailedProvisioning: true, } assert.NoError(t, err) @@ -665,6 +667,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) { t.Setenv("CLUSTER_NAME", "mycluster") t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg") t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local") + t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true") t.Run("environment variables correctly set", func(t *testing.T) { manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient) @@ -906,6 +909,7 @@ func TestCreateAzureManagerWithEnvOverridingConfig(t *testing.T) { t.Setenv("CLUSTER_NAME", "mycluster") t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg") t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local") + t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true") t.Run("environment variables correctly set", func(t *testing.T) { manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfgForStandardVMType), cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 491a3c10c6c1..353731c9a4c3 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -87,6 +87,8 @@ type ScaleSet struct { // uses Azure Dedicated Host dedicatedHost bool + + enableFastDeleteOnFailedProvisioning bool } // NewScaleSet creates a new NewScaleSet. @@ -128,6 +130,8 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d klog.V(2).Infof("enableDetailedCSEMessage: %t", scaleSet.enableDetailedCSEMessage) } + scaleSet.enableFastDeleteOnFailedProvisioning = az.config.EnableFastDeleteOnFailedProvisioning + return scaleSet, nil } @@ -686,7 +690,7 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForFlex() error { return rerr.Error() } - scaleSet.instanceCache = buildInstanceCacheForFlex(vms) + scaleSet.instanceCache = buildInstanceCacheForFlex(vms, scaleSet.enableFastDeleteOnFailedProvisioning) scaleSet.lastInstanceRefresh = lastRefresh return nil @@ -744,21 +748,21 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForUniform() error { // Note that the GetScaleSetVms() results is not used directly because for the List endpoint, // their resource ID format is not consistent with Get endpoint // buildInstanceCacheForFlex used by orchestrationMode == compute.Flexible -func buildInstanceCacheForFlex(vms []compute.VirtualMachine) []cloudprovider.Instance { +func buildInstanceCacheForFlex(vms []compute.VirtualMachine, enableFastDeleteOnFailedProvisioning bool) []cloudprovider.Instance { var instances []cloudprovider.Instance for _, vm := range vms { powerState := vmPowerStateRunning if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) } - addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState) + addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState, enableFastDeleteOnFailedProvisioning) } return instances } // addVMToCache used by orchestrationMode == compute.Flexible -func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string) { +func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) { // The resource ID is empty string, which indicates the instance may be in deleting state. if len(*id) == 0 { return @@ -773,13 +777,13 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st *instances = append(*instances, cloudprovider.Instance{ Id: azurePrefix + resourceID, - Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState), + Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState, enableFastDeleteOnFailedProvisioning), }) } // instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus // instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible -func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus { +func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus { if provisioningState == nil { return nil } @@ -792,6 +796,25 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi status.State = cloudprovider.InstanceDeleting case string(compute.ProvisioningStateCreating): status.State = cloudprovider.InstanceCreating + case string(compute.ProvisioningStateFailed): + if enableFastDeleteOnFailedProvisioning { + // Provisioning can fail both during instance creation or after the instance is running. + // Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states, + // ProvisioningState represents the most recent provisioning state, therefore only report + // InstanceCreating errors when the power state indicates the instance has not yet started running + if !isRunningVmPowerState(powerState) { + klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceID, powerState) + status.State = cloudprovider.InstanceCreating + status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OutOfResourcesErrorClass, + ErrorCode: "provisioning-state-failed", + ErrorMessage: "Azure failed to provision a node for this node group", + } + } else { + klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceID, powerState) + status.State = cloudprovider.InstanceRunning + } + } default: status.State = cloudprovider.InstanceRunning } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go index bf8ae065cdbb..444523d1931a 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go @@ -18,6 +18,7 @@ package azure import ( "fmt" + "strconv" "sync" "time" @@ -211,6 +212,10 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe } return nil } + powerState := vmPowerStateRunning + if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { + powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) + } status := &cloudprovider.InstanceStatus{} switch *vm.ProvisioningState { @@ -218,6 +223,26 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe status.State = cloudprovider.InstanceDeleting case string(compute.ProvisioningStateCreating): status.State = cloudprovider.InstanceCreating + case string(compute.ProvisioningStateFailed): + klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning)) + if scaleSet.enableFastDeleteOnFailedProvisioning { + // Provisioning can fail both during instance creation or after the instance is running. + // Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states, + // ProvisioningState represents the most recent provisioning state, therefore only report + // InstanceCreating errors when the power state indicates the instance has not yet started running + if !isRunningVmPowerState(powerState) { + klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", *vm.ID, powerState) + status.State = cloudprovider.InstanceCreating + status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OutOfResourcesErrorClass, + ErrorCode: "provisioning-state-failed", + ErrorMessage: "Azure failed to provision a node for this node group", + } + } else { + klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", *vm.ID, powerState) + status.State = cloudprovider.InstanceRunning + } + } default: status.State = cloudprovider.InstanceRunning } @@ -225,6 +250,7 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe // Add vmssCSE Provisioning Failed Message in error info body for vmssCSE Extensions if enableDetailedCSEMessage is true if scaleSet.enableDetailedCSEMessage && vm.InstanceView != nil { if err, failed := scaleSet.cseErrors(vm.InstanceView.Extensions); failed { + klog.V(3).Infof("VM %s reports CSE failure: %v, with provisioning state %s, power state %s", to.String(vm.ID), err, to.String(vm.ProvisioningState), powerState) errorInfo := &cloudprovider.InstanceErrorInfo{ ErrorClass: cloudprovider.OtherErrorClass, ErrorCode: vmssExtensionProvisioningFailed, diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 477760f7fe72..fff3e9ae0ef3 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -43,10 +43,11 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet { azureRef: azureRef{ Name: name, }, - manager: manager, - minSize: 1, - maxSize: 5, - enableForceDelete: manager.config.EnableForceDelete, + manager: manager, + minSize: 1, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, + enableFastDeleteOnFailedProvisioning: true, } } @@ -55,10 +56,11 @@ func newTestScaleSetMinSizeZero(manager *AzureManager, name string) *ScaleSet { azureRef: azureRef{ Name: name, }, - manager: manager, - minSize: 0, - maxSize: 5, - enableForceDelete: manager.config.EnableForceDelete, + manager: manager, + minSize: 0, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, + enableFastDeleteOnFailedProvisioning: true, } }