Skip to content

Commit

Permalink
Add toggle for fast delete
Browse files Browse the repository at this point in the history
Enable fast delete for flex
  • Loading branch information
willie-yao committed Oct 16, 2024
1 parent 0c597db commit 2d37e7d
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 17 deletions.
9 changes: 9 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ type Config struct {

UseFederatedWorkloadIdentityExtension bool `json:"useFederatedWorkloadIdentityExtension" yaml:"useFederatedWorkloadIdentityExtension"`
AADFederatedTokenFile string `json:"aadFederatedTokenFile" yaml:"aadFederatedTokenFile"`

// EnableFastDeleteOnFailedProvisioning defines whether to delete the experimental faster VMSS instance deletion on failed provisioning
EnableFastDeleteOnFailedProvisioning bool `json:"enableFastDeleteOnFailedProvisioning,omitempty" yaml:"enableFastDeleteOnFailedProvisioning,omitempty"`
}

// These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name.
Expand Down Expand Up @@ -328,6 +331,12 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {

cfg.DeploymentParameters = parameters
}
if enableFastDeleteOnFailedProvisioning := os.Getenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING"); enableFastDeleteOnFailedProvisioning != "" {
cfg.EnableFastDeleteOnFailedProvisioning, err = strconv.ParseBool(enableFastDeleteOnFailedProvisioning)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING: %q, %v", enableFastDeleteOnFailedProvisioning, err)
}
}
providerazureconfig.InitializeCloudProviderRateLimitConfig(&cfg.CloudProviderRateLimitConfig)

if err := cfg.validate(); err != nil {
Expand Down
10 changes: 7 additions & 3 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ const validAzureCfg = `{
"routeRateLimit": {
"cloudProviderRateLimit": true,
"cloudProviderRateLimitQPS": 3
}
},
"enableFastDeleteOnFailedProvisioning": true
}`

const validAzureCfgLegacy = `{
Expand Down Expand Up @@ -268,8 +269,9 @@ func TestCreateAzureManagerValidConfig(t *testing.T) {
},
},
},
VmssVmsCacheJitter: 120,
MaxDeploymentsCount: 8,
VmssVmsCacheJitter: 120,
MaxDeploymentsCount: 8,
EnableFastDeleteOnFailedProvisioning: true,
}

assert.NoError(t, err)
Expand Down Expand Up @@ -665,6 +667,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) {
t.Setenv("CLUSTER_NAME", "mycluster")
t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg")
t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local")
t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true")

t.Run("environment variables correctly set", func(t *testing.T) {
manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
Expand Down Expand Up @@ -906,6 +909,7 @@ func TestCreateAzureManagerWithEnvOverridingConfig(t *testing.T) {
t.Setenv("CLUSTER_NAME", "mycluster")
t.Setenv("ARM_CLUSTER_RESOURCE_GROUP", "myrg")
t.Setenv("ARM_BASE_URL_FOR_AP_CLIENT", "nodeprovisioner-svc.nodeprovisioner.svc.cluster.local")
t.Setenv("AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING", "true")

t.Run("environment variables correctly set", func(t *testing.T) {
manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfgForStandardVMType), cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient)
Expand Down
35 changes: 29 additions & 6 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ type ScaleSet struct {

// uses Azure Dedicated Host
dedicatedHost bool

enableFastDeleteOnFailedProvisioning bool
}

// NewScaleSet creates a new NewScaleSet.
Expand Down Expand Up @@ -128,6 +130,8 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d
klog.V(2).Infof("enableDetailedCSEMessage: %t", scaleSet.enableDetailedCSEMessage)
}

scaleSet.enableFastDeleteOnFailedProvisioning = az.config.EnableFastDeleteOnFailedProvisioning

return scaleSet, nil
}

Expand Down Expand Up @@ -686,7 +690,7 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForFlex() error {
return rerr.Error()
}

scaleSet.instanceCache = buildInstanceCacheForFlex(vms)
scaleSet.instanceCache = buildInstanceCacheForFlex(vms, scaleSet.enableFastDeleteOnFailedProvisioning)
scaleSet.lastInstanceRefresh = lastRefresh

return nil
Expand Down Expand Up @@ -744,21 +748,21 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForUniform() error {
// Note that the GetScaleSetVms() results is not used directly because for the List endpoint,
// their resource ID format is not consistent with Get endpoint
// buildInstanceCacheForFlex used by orchestrationMode == compute.Flexible
func buildInstanceCacheForFlex(vms []compute.VirtualMachine) []cloudprovider.Instance {
func buildInstanceCacheForFlex(vms []compute.VirtualMachine, enableFastDeleteOnFailedProvisioning bool) []cloudprovider.Instance {
var instances []cloudprovider.Instance
for _, vm := range vms {
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}
addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState)
addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState, enableFastDeleteOnFailedProvisioning)
}

return instances
}

// addVMToCache used by orchestrationMode == compute.Flexible
func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string) {
func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) {
// The resource ID is empty string, which indicates the instance may be in deleting state.
if len(*id) == 0 {
return
Expand All @@ -773,13 +777,13 @@ func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *st

*instances = append(*instances, cloudprovider.Instance{
Id: azurePrefix + resourceID,
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState),
Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState, enableFastDeleteOnFailedProvisioning),
})
}

// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus
// instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus {
func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string, enableFastDeleteOnFailedProvisioning bool) *cloudprovider.InstanceStatus {
if provisioningState == nil {
return nil
}
Expand All @@ -792,6 +796,25 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisi
status.State = cloudprovider.InstanceDeleting
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
if enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceID, powerState)
status.State = cloudprovider.InstanceRunning
}
}
default:
status.State = cloudprovider.InstanceRunning
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package azure

import (
"fmt"
"strconv"
"sync"
"time"

Expand Down Expand Up @@ -211,20 +212,45 @@ func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSe
}
return nil
}
powerState := vmPowerStateRunning
if vm.InstanceView != nil && vm.InstanceView.Statuses != nil {
powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses)
}

status := &cloudprovider.InstanceStatus{}
switch *vm.ProvisioningState {
case string(compute.ProvisioningStateDeleting):
status.State = cloudprovider.InstanceDeleting
case string(compute.ProvisioningStateCreating):
status.State = cloudprovider.InstanceCreating
case string(compute.ProvisioningStateFailed):
klog.V(3).Infof("VM %s reports failed provisioning state with power state: %s, eligible for fast delete: %s", to.String(vm.ID), powerState, strconv.FormatBool(scaleSet.enableFastDeleteOnFailedProvisioning))
if scaleSet.enableFastDeleteOnFailedProvisioning {
// Provisioning can fail both during instance creation or after the instance is running.
// Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states,
// ProvisioningState represents the most recent provisioning state, therefore only report
// InstanceCreating errors when the power state indicates the instance has not yet started running
if !isRunningVmPowerState(powerState) {
klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", *vm.ID, powerState)
status.State = cloudprovider.InstanceCreating
status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "provisioning-state-failed",
ErrorMessage: "Azure failed to provision a node for this node group",
}
} else {
klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", *vm.ID, powerState)
status.State = cloudprovider.InstanceRunning
}
}
default:
status.State = cloudprovider.InstanceRunning
}

// Add vmssCSE Provisioning Failed Message in error info body for vmssCSE Extensions if enableDetailedCSEMessage is true
if scaleSet.enableDetailedCSEMessage && vm.InstanceView != nil {
if err, failed := scaleSet.cseErrors(vm.InstanceView.Extensions); failed {
klog.V(3).Infof("VM %s reports CSE failure: %v, with provisioning state %s, power state %s", to.String(vm.ID), err, to.String(vm.ProvisioningState), powerState)
errorInfo := &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OtherErrorClass,
ErrorCode: vmssExtensionProvisioningFailed,
Expand Down
18 changes: 10 additions & 8 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
enableFastDeleteOnFailedProvisioning: true,
}
}

Expand All @@ -55,10 +56,11 @@ func newTestScaleSetMinSizeZero(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 0,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
manager: manager,
minSize: 0,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
enableFastDeleteOnFailedProvisioning: true,
}
}

Expand Down

0 comments on commit 2d37e7d

Please sign in to comment.