Skip to content

Commit

Permalink
Implement force delete in Azure provider
Browse files Browse the repository at this point in the history
  • Loading branch information
Jont828 committed Jan 10, 2024
1 parent d29ffd0 commit 7212628
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 12 deletions.
2 changes: 1 addition & 1 deletion charts/cluster-autoscaler/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ name: cluster-autoscaler
sources:
- https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler
type: application
version: 9.34.1
version: 9.35.0
1 change: 1 addition & 0 deletions charts/cluster-autoscaler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ vpa:
| awsSecretAccessKey | string | `""` | AWS access secret key ([if AWS user keys used](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#using-aws-credentials)) |
| azureClientID | string | `""` | Service Principal ClientID with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` |
| azureClientSecret | string | `""` | Service Principal ClientSecret with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` |
| azureEnableForceDelete | bool | `false` | Whether to force delete VMs or VMSS instances when scaling down. |
| azureResourceGroup | string | `""` | Azure resource group that the cluster is located. Required if `cloudProvider=azure` |
| azureSubscriptionID | string | `""` | Azure subscription where the resources are located. Required if `cloudProvider=azure` |
| azureTenantID | string | `""` | Azure tenant where the resources are located. Required if `cloudProvider=azure` |
Expand Down
2 changes: 2 additions & 0 deletions charts/cluster-autoscaler/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ spec:
secretKeyRef:
key: VMType
name: {{ default (include "cluster-autoscaler.fullname" .) .Values.secretKeyRefNameOverride }}
- name: AZURE_ENABLE_FORCE_DELETE
value: "{{ .Values.azureEnableForceDelete }}"
{{- if .Values.azureUseWorkloadIdentityExtension }}
- name: ARM_USE_WORKLOAD_IDENTITY_EXTENSION
value: "true"
Expand Down
3 changes: 3 additions & 0 deletions charts/cluster-autoscaler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ azureUseWorkloadIdentityExtension: false
# azureVMType -- Azure VM type.
azureVMType: "vmss"

# azureEnableForceDelete -- Whether to force delete VMs or VMSS instances when scaling down.
azureEnableForceDelete: false

# cloudConfigPath -- Configuration file for cloud provider.
cloudConfigPath: ""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ func newTestAzureManager(t *testing.T) *AzureManager {
VMType: vmTypeVMSS,
MaxDeploymentsCount: 2,
Deployment: "deployment",
EnableForceDelete: true,
},
azClient: &azClient{
virtualMachineScaleSetsClient: mockVMSSClient,
Expand Down
10 changes: 10 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ type Config struct {
CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration,omitempty" yaml:"cloudProviderBackoffDuration,omitempty"`
CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter,omitempty" yaml:"cloudProviderBackoffJitter,omitempty"`

// EnableForceDelete defines whether to enable force deletion on the APIs
EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"`

// EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check
EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"`

Expand Down Expand Up @@ -303,6 +306,13 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
}
}

if enableForceDelete := os.Getenv("AZURE_ENABLE_FORCE_DELETE"); enableForceDelete != "" {
cfg.EnableForceDelete, err = strconv.ParseBool(enableForceDelete)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_ENABLE_FORCE_DELETE: %q, %v", enableForceDelete, err)
}
}

err = initializeCloudProviderRateLimitConfig(&cfg.CloudProviderRateLimitConfig)
if err != nil {
return nil, err
Expand Down
18 changes: 16 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ type ScaleSet struct {
minSize int
maxSize int

enableForceDelete bool

sizeMutex sync.Mutex
curSize int64

Expand Down Expand Up @@ -87,6 +89,7 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
sizeRefreshPeriod: az.azureCache.refreshInterval,
enableDynamicInstanceList: az.config.EnableDynamicInstanceList,
instancesRefreshJitter: az.config.VmssVmsCacheJitter,
enableForceDelete: az.config.EnableForceDelete,
}

if az.config.VmssVmsCacheTTL != 0 {
Expand Down Expand Up @@ -432,8 +435,15 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered
resourceGroup := scaleSet.manager.config.ResourceGroup

scaleSet.instanceMutex.Lock()
klog.V(3).Infof("Calling virtualMachineScaleSetsClient.DeleteInstancesAsync(%v)", requiredIds.InstanceIds)
future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsg.Id(), *requiredIds, false)
klog.V(3).Infof("Calling virtualMachineScaleSetsClient.DeleteInstancesAsync(%v), force delete set to %v", requiredIds.InstanceIds, scaleSet.enableForceDelete)
future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsg.Id(), *requiredIds, scaleSet.enableForceDelete)

if scaleSet.enableForceDelete && isOperationNotAllowed(rerr) {
klog.Infof("falling back to normal delete for instances %v for %s", requiredIds.InstanceIds, scaleSet.Name)
future, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup,
commonAsg.Id(), *requiredIds, false)
}

scaleSet.instanceMutex.Unlock()
if rerr != nil {
klog.Errorf("virtualMachineScaleSetsClient.DeleteInstancesAsync for instances %v failed: %v", requiredIds.InstanceIds, rerr)
Expand Down Expand Up @@ -741,3 +751,7 @@ func (scaleSet *ScaleSet) getOrchestrationMode() (compute.OrchestrationMode, err
}
return vmss.OrchestrationMode, nil
}

func isOperationNotAllowed(rerr *retry.Error) bool {
return rerr != nil && rerr.ServiceErrorCode() == retry.OperationNotAllowed
}
48 changes: 39 additions & 9 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet {
azureRef: azureRef{
Name: name,
},
manager: manager,
minSize: 1,
maxSize: 5,
manager: manager,
minSize: 1,
maxSize: 5,
enableForceDelete: manager.config.EnableForceDelete,
}
}

Expand Down Expand Up @@ -405,18 +406,48 @@ func TestDeleteNodes(t *testing.T) {

vmssName := "test-asg"
var vmssCapacity int64 = 3
orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible}
expectedVMSSVMs := newTestVMSSVMList(3)
expectedVMs := newTestVMList(3)
cases := []struct {
name string
orchestrationMode compute.OrchestrationMode
enableForceDelete bool
}{
{
name: "uniform, force delete enabled",
orchestrationMode: compute.Uniform,
enableForceDelete: true,
},
{
name: "uniform, force delete disabled",
orchestrationMode: compute.Uniform,
enableForceDelete: false,
},
{
name: "flexible, force delete enabled",
orchestrationMode: compute.Flexible,
enableForceDelete: true,
},
{
name: "flexible, force delete disabled",
orchestrationMode: compute.Flexible,
enableForceDelete: false,
},
}

for _, orchMode := range orchestrationModes {
for _, tc := range cases {
orchMode := tc.orchestrationMode
enableForceDelete := tc.enableForceDelete

expectedVMSSVMs := newTestVMSSVMList(3)
expectedVMs := newTestVMList(3)

manager := newTestAzureManager(t)
manager.config.EnableForceDelete = enableForceDelete
expectedScaleSets := newTestVMSSList(vmssCapacity, vmssName, "eastus", orchMode)
fmt.Printf("orchMode: %s, enableForceDelete: %t\n", orchMode, enableForceDelete)

mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2)
mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), false).Return(nil, nil)
mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), enableForceDelete).Return(nil, nil)
mockVMSSClient.EXPECT().WaitForDeleteInstancesResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes()
manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient

Expand Down Expand Up @@ -497,7 +528,6 @@ func TestDeleteNodes(t *testing.T) {
instance2, found := scaleSet.getInstanceByProviderID(nodesToDelete[1].Spec.ProviderID)
assert.True(t, found, true)
assert.Equal(t, instance2.Status.State, cloudprovider.InstanceDeleting)

}
}

Expand Down

0 comments on commit 7212628

Please sign in to comment.