Skip to content

Commit

Permalink
Merge pull request #2981 from nilo19/propagate-error
Browse files Browse the repository at this point in the history
Properly propagate scale-up failures in scale sets.
  • Loading branch information
k8s-ci-robot authored Mar 28, 2020
2 parents c31848d + b2988e5 commit d253829
Show file tree
Hide file tree
Showing 5 changed files with 575 additions and 31 deletions.
7 changes: 1 addition & 6 deletions cluster-autoscaler/cloudprovider/azure/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,16 @@ import (

"github.com/Azure/azure-sdk-for-go/services/containerservice/mgmt/2018-03-31/containerservice"
"github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources"
"github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2018-07-01/storage"
"github.com/Azure/go-autorest/autorest"
"github.com/Azure/go-autorest/autorest/adal"
"github.com/Azure/go-autorest/autorest/azure"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/azure/clients/vmssclient"
"k8s.io/klog"
"k8s.io/legacy-cloud-providers/azure/clients/diskclient"
"k8s.io/legacy-cloud-providers/azure/clients/interfaceclient"
"k8s.io/legacy-cloud-providers/azure/clients/storageaccountclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient"
)

Expand Down Expand Up @@ -135,10 +134,6 @@ func (az *azDeploymentsClient) Delete(ctx context.Context, resourceGroupName, de
return future.Response(), err
}

type azAccountsClient struct {
client storage.AccountsClient
}

type azClient struct {
virtualMachineScaleSetsClient vmssclient.Interface
virtualMachineScaleSetVMsClient vmssvmclient.Interface
Expand Down
11 changes: 11 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_fakes.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2019-06-01/network"
"github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources"
"github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2019-06-01/storage"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/stretchr/testify/mock"

"k8s.io/legacy-cloud-providers/azure/retry"
Expand Down Expand Up @@ -72,6 +73,16 @@ func (client *VirtualMachineScaleSetsClientMock) CreateOrUpdate(ctx context.Cont
return nil
}

// CreateOrUpdateAsync sends the request to arm client and DO NOT wait for the response
func (client *VirtualMachineScaleSetsClientMock) CreateOrUpdateAsync(ctx context.Context, resourceGroupName string, VMScaleSetName string, parameters compute.VirtualMachineScaleSet) (*azure.Future, *retry.Error) {
return nil, nil
}

// WaitForAsyncOperationResult waits for the response of the request
func (client *VirtualMachineScaleSetsClientMock) WaitForAsyncOperationResult(ctx context.Context, future *azure.Future) (*http.Response, error) {
return nil, nil
}

// DeleteInstances deletes a set of instances for specified VirtualMachineScaleSet.
func (client *VirtualMachineScaleSetsClientMock) DeleteInstances(ctx context.Context, resourceGroupName string, vmScaleSetName string, vmInstanceIDs compute.VirtualMachineScaleSetVMInstanceRequiredIDs) *retry.Error {
args := client.Called(resourceGroupName, vmScaleSetName, vmInstanceIDs)
Expand Down
63 changes: 38 additions & 25 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
"github.com/Azure/go-autorest/autorest"
"github.com/Azure/go-autorest/autorest/azure"
)

var (
Expand Down Expand Up @@ -227,24 +228,47 @@ func (scaleSet *ScaleSet) GetScaleSetSize() (int64, error) {
}

// updateVMSSCapacity invokes virtualMachineScaleSetsClient to update the capacity for VMSS.
func (scaleSet *ScaleSet) updateVMSSCapacity(size int64) {
var vmssInfo compute.VirtualMachineScaleSet
var rerr *retry.Error
func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) {
var err error

defer func() {
if rerr != nil {
klog.Errorf("Failed to update the capacity for vmss %s with error %v, invalidate the cache so as to get the real size from API", scaleSet.Name, rerr)
if err != nil {
klog.Errorf("Failed to update the capacity for vmss %s with error %v, invalidate the cache so as to get the real size from API", scaleSet.Name, err)
// Invalidate the VMSS size cache in order to fetch the size from the API.
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()
scaleSet.lastSizeRefresh = time.Now().Add(-1 * scaleSet.sizeRefreshPeriod)
}
}()

vmssInfo, rerr = scaleSet.getVMSSInfo()
ctx, cancel := getContextWithCancel()
defer cancel()
httpResponse, err := scaleSet.manager.azClient.virtualMachineScaleSetsClient.WaitForAsyncOperationResult(ctx, future)

isSuccess, err := isSuccessHTTPResponse(httpResponse, err)
if isSuccess {
klog.V(3).Infof("virtualMachineScaleSetsClient.WaitForAsyncOperationResult(%s) success", scaleSet.Name)
scaleSet.invalidateInstanceCache()

return
}

klog.Errorf("virtualMachineScaleSetsClient.WaitForCreateOrUpdate for scale set %q failed: %v", scaleSet.Name, err)
}

// SetScaleSetSize sets ScaleSet size.
func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()

// Proactively set the VMSS size so autoscaler makes better decisions.
scaleSet.curSize = size
scaleSet.lastSizeRefresh = time.Now()

vmssInfo, rerr := scaleSet.getVMSSInfo()
if rerr != nil {
klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, rerr)
return
return rerr.Error()
}

// Update the new capacity to cache.
Expand All @@ -260,27 +284,17 @@ func (scaleSet *ScaleSet) updateVMSSCapacity(size int64) {
}
ctx, cancel := getContextWithCancel()
defer cancel()
klog.V(3).Infof("Waiting for virtualMachineScaleSetsClient.CreateOrUpdate(%s)", scaleSet.Name)
rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.CreateOrUpdate(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name, op)
klog.V(3).Infof("Waiting for virtualMachineScaleSetsClient.CreateOrUpdateAsync(%s)", scaleSet.Name)
future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.CreateOrUpdateAsync(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name, op)
if rerr != nil {
klog.Errorf("virtualMachineScaleSetsClient.CreateOrUpdate for scale set %q failed: %v", scaleSet.Name, rerr)
return
return rerr.Error()
}

klog.V(3).Infof("virtualMachineScaleSetsClient.CreateOrUpdate(%s) success", scaleSet.Name)
scaleSet.invalidateInstanceCache()
return
}

// SetScaleSetSize sets ScaleSet size.
func (scaleSet *ScaleSet) SetScaleSetSize(size int64) {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()
klog.V(3).Infof("create a goroutine to wait for the result of the virtualMachineScaleSetsClient.CreateOrUpdate request")
go scaleSet.updateVMSSCapacity(future)

// Proactively set the VMSS size so autoscaler makes better decisions.
scaleSet.curSize = size
scaleSet.lastSizeRefresh = time.Now()
go scaleSet.updateVMSSCapacity(size)
return nil
}

// TargetSize returns the current TARGET size of the node group. It is possible that the
Expand All @@ -305,8 +319,7 @@ func (scaleSet *ScaleSet) IncreaseSize(delta int) error {
return fmt.Errorf("size increase too large - desired:%d max:%d", int(size)+delta, scaleSet.MaxSize())
}

scaleSet.SetScaleSetSize(size + int64(delta))
return nil
return scaleSet.SetScaleSetSize(size + int64(delta))
}

// GetScaleSetVms returns list of nodes for the given scale set.
Expand Down
Loading

0 comments on commit d253829

Please sign in to comment.