From 33aaae52c02847ab9328c9bec710f1512f989f3f Mon Sep 17 00:00:00 2001 From: Pengfei Ni Date: Tue, 19 Feb 2019 16:10:50 +0800 Subject: [PATCH] Fix error message for long-waiting operations Get HTTP responses together with error, so that we could get full error messages. It also fixes some edge cases for success responses. --- .../cloudprovider/azure/azure_agent_pool.go | 17 +++-- .../azure/azure_container_service_pool.go | 22 ++++++- .../cloudprovider/azure/azure_fakes.go | 4 +- .../cloudprovider/azure/azure_scale_set.go | 17 +++-- .../cloudprovider/azure/azure_util.go | 19 ++++++ .../cloudprovider/azure/azure_util_test.go | 65 +++++++++++++++++++ 6 files changed, 128 insertions(+), 16 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go index 1778c72e1a03..0c2e711f49ee 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go @@ -209,15 +209,20 @@ func (as *AgentPool) IncreaseSize(delta int) error { } ctx, cancel := getContextWithCancel() defer cancel() - _, err = as.manager.azClient.deploymentsClient.CreateOrUpdate(ctx, as.manager.config.ResourceGroup, newDeploymentName, newDeployment) klog.V(3).Infof("Waiting for deploymentsClient.CreateOrUpdate(%s, %s, %v)", as.manager.config.ResourceGroup, newDeploymentName, newDeployment) - if err != nil { - return err + resp, err := as.manager.azClient.deploymentsClient.CreateOrUpdate(ctx, as.manager.config.ResourceGroup, newDeploymentName, newDeployment) + isSuccess, realError := isSuccessHTTPResponse(resp, err) + if isSuccess { + klog.V(3).Infof("deploymentsClient.CreateOrUpdate(%s, %s, %v) success", as.manager.config.ResourceGroup, newDeploymentName, newDeployment) + + // Update cache after scale success. + as.curSize = int64(expectedSize) + as.lastRefresh = time.Now() + return nil } - as.curSize = int64(expectedSize) - as.lastRefresh = time.Now() - return err + klog.Errorf("deploymentsClient.CreateOrUpdate for deployment %q failed: %v", newDeploymentName, realError) + return realError } // GetVirtualMachines returns list of nodes for the given agent pool. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_container_service_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_container_service_pool.go index b8e1887eda70..b0a8fe3d2390 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_container_service_pool.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_container_service_pool.go @@ -193,7 +193,16 @@ func (agentPool *ContainerServiceAgentPool) setAKSNodeCount(count int) error { klog.Errorf("Failed to update AKS cluster (%q): %v", agentPool.clusterName, err) return err } - return future.WaitForCompletionRef(updateCtx, aksClient.Client) + + err = future.WaitForCompletionRef(updateCtx, aksClient.Client) + isSuccess, realError := isSuccessHTTPResponse(future.Response(), err) + if isSuccess { + klog.V(3).Infof("aksClient.CreateOrUpdate for aks cluster %q success", agentPool.clusterName) + return nil + } + + klog.Errorf("aksClient.CreateOrUpdate for aks cluster %q failed: %v", agentPool.clusterName, realError) + return realError } // setACSNodeCount sets node count for ACS agent pool. @@ -226,7 +235,16 @@ func (agentPool *ContainerServiceAgentPool) setACSNodeCount(count int) error { klog.Errorf("Failed to update ACS cluster (%q): %v", agentPool.clusterName, err) return err } - return future.WaitForCompletionRef(updateCtx, acsClient.Client) + + err = future.WaitForCompletionRef(updateCtx, acsClient.Client) + isSuccess, realError := isSuccessHTTPResponse(future.Response(), err) + if isSuccess { + klog.V(3).Infof("acsClient.CreateOrUpdate for acs cluster %q success", agentPool.clusterName) + return nil + } + + klog.Errorf("acsClient.CreateOrUpdate for acs cluster %q failed: %v", agentPool.clusterName, realError) + return realError } //GetNodeCount returns the count of nodes from the managed agent pool profile diff --git a/cluster-autoscaler/cloudprovider/azure/azure_fakes.go b/cluster-autoscaler/cloudprovider/azure/azure_fakes.go index d6e08e1c5dea..dd1f532bb88c 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_fakes.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_fakes.go @@ -63,7 +63,9 @@ func (client *VirtualMachineScaleSetsClientMock) CreateOrUpdate(ctx context.Cont } client.FakeStore[resourceGroupName][VMScaleSetName] = parameters - return nil, nil + return &http.Response{ + StatusCode: http.StatusOK, + }, nil } // DeleteInstances deletes a set of instances for specified VirtualMachineScaleSet. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 9af9e84ce824..16db7bc4e937 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -153,15 +153,18 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error { op.VirtualMachineScaleSetProperties.ProvisioningState = nil updateCtx, updateCancel := getContextWithCancel() defer updateCancel() - _, err = scaleSet.manager.azClient.virtualMachineScaleSetsClient.CreateOrUpdate(updateCtx, resourceGroup, scaleSet.Name, op) - if err != nil { - klog.Errorf("virtualMachineScaleSetsClient.CreateOrUpdate for scale set %q failed: %v", scaleSet.Name, err) - return err + klog.V(3).Infof("Waiting for virtualMachineScaleSetsClient.CreateOrUpdate(%s)", scaleSet.Name) + resp, err := scaleSet.manager.azClient.virtualMachineScaleSetsClient.CreateOrUpdate(updateCtx, resourceGroup, scaleSet.Name, op) + isSuccess, realError := isSuccessHTTPResponse(resp, err) + if isSuccess { + klog.V(3).Infof("virtualMachineScaleSetsClient.CreateOrUpdate(%s) success", scaleSet.Name) + scaleSet.curSize = size + scaleSet.lastRefresh = time.Now() + return nil } - scaleSet.curSize = size - scaleSet.lastRefresh = time.Now() - return nil + klog.Errorf("virtualMachineScaleSetsClient.CreateOrUpdate for scale set %q failed: %v", scaleSet.Name, realError) + return realError } // TargetSize returns the current TARGET size of the node group. It is possible that the diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util.go b/cluster-autoscaler/cloudprovider/azure/azure_util.go index 138efd5431fe..61892118a321 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util.go @@ -609,3 +609,22 @@ func checkResourceExistsFromError(err error) (bool, error) { } return false, v } + +// isSuccessHTTPResponse determines if the response from an HTTP request suggests success +func isSuccessHTTPResponse(resp *http.Response, err error) (isSuccess bool, realError error) { + if err != nil { + return false, err + } + + if resp != nil { + // HTTP 2xx suggests a successful response + if 199 < resp.StatusCode && resp.StatusCode < 300 { + return true, nil + } + + return false, fmt.Errorf("failed with HTTP status code %d", resp.StatusCode) + } + + // This shouldn't happen, it only ensures all exceptions are handled. + return false, fmt.Errorf("failed with unknown error") +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go index 1813a12ff8c8..c93cb0f3825f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go @@ -18,9 +18,11 @@ package azure import ( "fmt" + "net/http" "testing" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute" + "github.com/stretchr/testify/assert" ) func TestSplitBlobURI(t *testing.T) { @@ -124,3 +126,66 @@ func TestGetVMNameIndexWindows(t *testing.T) { t.Fatalf("unexpected error: %s", err) } } + +func TestIsSuccessResponse(t *testing.T) { + tests := []struct { + name string + resp *http.Response + err error + expected bool + expectedError error + }{ + { + name: "both resp and err nil should report error", + expected: false, + expectedError: fmt.Errorf("failed with unknown error"), + }, + { + name: "http.StatusNotFound should report error", + resp: &http.Response{ + StatusCode: http.StatusNotFound, + }, + expected: false, + expectedError: fmt.Errorf("failed with HTTP status code %d", http.StatusNotFound), + }, + { + name: "http.StatusInternalServerError should report error", + resp: &http.Response{ + StatusCode: http.StatusInternalServerError, + }, + expected: false, + expectedError: fmt.Errorf("failed with HTTP status code %d", http.StatusInternalServerError), + }, + { + name: "http.StatusOK shouldn't report error", + resp: &http.Response{ + StatusCode: http.StatusOK, + }, + expected: true, + }, + { + name: "non-nil response error with http.StatusOK should report error", + resp: &http.Response{ + StatusCode: http.StatusOK, + }, + err: fmt.Errorf("test error"), + expected: false, + expectedError: fmt.Errorf("test error"), + }, + { + name: "non-nil response error with http.StatusInternalServerError should report error", + resp: &http.Response{ + StatusCode: http.StatusInternalServerError, + }, + err: fmt.Errorf("test error"), + expected: false, + expectedError: fmt.Errorf("test error"), + }, + } + + for _, test := range tests { + result, realError := isSuccessHTTPResponse(test.resp, test.err) + assert.Equal(t, test.expected, result, "[%s] expected: %v, saw: %v", test.name, result, test.expected) + assert.Equal(t, test.expectedError, realError, "[%s] expected: %v, saw: %v", test.name, realError, test.expectedError) + } +}