From 5c6541bdca7e10dcefde8385f4af818eb8414515 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Tue, 2 Jan 2018 16:13:13 -0800 Subject: [PATCH 01/12] return deployment status during upgrade --- pkg/armhelpers/interfaces.go | 8 +++++++ .../kubernetesupgrade/upgradeagentnode.go | 23 +++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pkg/armhelpers/interfaces.go b/pkg/armhelpers/interfaces.go index ae1e6cb177..7e927aefd2 100644 --- a/pkg/armhelpers/interfaces.go +++ b/pkg/armhelpers/interfaces.go @@ -78,6 +78,14 @@ type ACSEngineClient interface { GetKubernetesClient(masterURL, kubeConfig string, interval, timeout time.Duration) (KubernetesClient, error) ListProviders() (resources.ProviderListResult, error) + + // DEPLOYMENTS + + // ListDeploymentOperations gets all deployments operations for a deployment. + ListDeploymentOperations(resourceGroupName string, deploymentName string, top *int32) (result resources.DeploymentOperationsListResult, err error) + + // ListDeploymentOperationsNextResults retrieves the next set of results, if any. + ListDeploymentOperationsNextResults(lastResults resources.DeploymentOperationsListResult) (result resources.DeploymentOperationsListResult, err error) } // ACSStorageClient interface models the azure storage client diff --git a/pkg/operations/kubernetesupgrade/upgradeagentnode.go b/pkg/operations/kubernetesupgrade/upgradeagentnode.go index 74910bf55c..8ec4bbe5ba 100644 --- a/pkg/operations/kubernetesupgrade/upgradeagentnode.go +++ b/pkg/operations/kubernetesupgrade/upgradeagentnode.go @@ -87,11 +87,30 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { kan.ParametersMap, nil) + //if err == nil { + // return nil + //} + // Retrieve deployment error + // errors := []string{err.Error()} + + var top int32 = 1 + res, err := kan.Client.ListDeploymentOperations(kan.ResourceGroup, deploymentName, &top) if err != nil { - return err + kan.logger.Warnf("unable to list deployment operations: %v", err) + } else { + fmt.Printf("Response: %v\n", res.Response) } - return nil + for res.NextLink != nil { + res, err = kan.Client.ListDeploymentOperationsNextResults(res) + if err != nil { + kan.logger.Warnf("unable to list next deployment operations: %v", err) + break + } + fmt.Printf("Response: %v\n", res.Response) + // results = append(results, res) + } + return err } // Validate will verify that agent node has been upgraded as expected. From 37953d369269001366f1288273dc5d37af50d4a6 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Thu, 4 Jan 2018 12:11:37 -0800 Subject: [PATCH 02/12] added deployment error --- pkg/api/orchestrators.go | 3 + pkg/armhelpers/deploymentError.go | 150 ++++++++++++++++++ .../kubernetesupgrade/upgradeagentnode.go | 33 ++-- 3 files changed, 164 insertions(+), 22 deletions(-) create mode 100644 pkg/armhelpers/deploymentError.go diff --git a/pkg/api/orchestrators.go b/pkg/api/orchestrators.go index c23af769d8..234b5d5481 100644 --- a/pkg/api/orchestrators.go +++ b/pkg/api/orchestrators.go @@ -181,6 +181,9 @@ func kubernetesUpgrades(csOrch *OrchestratorProfile) ([]*OrchestratorProfile, er case currentMajor == 1 && currentMinor == 7: nextMajor = 1 nextMinor = 8 + case currentMajor == 1 && currentMinor == 8: + nextMajor = 1 + nextMinor = 9 } for ver, supported := range common.AllKubernetesSupportedVersions { if !supported { diff --git a/pkg/armhelpers/deploymentError.go b/pkg/armhelpers/deploymentError.go new file mode 100644 index 0000000000..172c4bc293 --- /dev/null +++ b/pkg/armhelpers/deploymentError.go @@ -0,0 +1,150 @@ +package armhelpers + +import ( + "encoding/json" + "fmt" + + "github.com/Azure/acs-engine/pkg/api" + "github.com/Azure/azure-sdk-for-go/arm/resources/resources" + "github.com/sirupsen/logrus" +) + +// Error is the OData v4 format, used by the RPC and +// will go into the v2.2 Azure REST API guidelines +type Error struct { + Code string `json:"code"` + Message string `json:"message"` + Target string `json:"target,omitempty"` + Details []Error `json:"details,omitempty"` +} + +// ErrorResponse defines Resource Provider API 2.0 Error Response Content structure +type ErrorResponse struct { + Body Error `json:"error"` +} + +// Error implements error interface to return error in json +func (e *ErrorResponse) Error() string { + return e.Body.Error() +} + +// Error implements error interface to return error in json +func (e *Error) Error() string { + output, err := json.MarshalIndent(e, " ", " ") + if err != nil { + return err.Error() + } + return string(output) +} + +func toArmError(logger *logrus.Entry, operation resources.DeploymentOperation) (*ErrorResponse, error) { + errresp := &ErrorResponse{} + if operation.Properties != nil && operation.Properties.StatusMessage != nil { + b, err := json.MarshalIndent(operation.Properties.StatusMessage, "", " ") + if err != nil { + logger.Errorf("Error occurred marshalling JSON: '%v'", err) + return nil, err + } + + if err := json.Unmarshal(b, errresp); err != nil { + logger.Errorf("Error occurred unmarshalling JSON: '%v' JSON: '%s'", err, string(b)) + return nil, err + } + } + + // In some cases ARM returns "ResourceDeploymentFailure" as the top error code and error + // code returned by the resource provider is in the child array "details". The following code + // covers both of those scenarios. Also note, that details array is recursive i.e. the objects + // in details array can contain another details array with no max depth of this tree enforced + // by ARM. The following code only evaluates the first level property of "StatusMessage": "error" + // and its child array: "details" for error codes. + + // If error code is ResourceDeploymentFailure then RP error is defined in the child object field: "details + if errresp.Body.Code == "ResourceDeploymentFailure" { + // StatusMessage.error.details array supports multiple errors but in this particular case + // DeploymentOperationProperties contains error from one specific resource type so the + // chances of multiple deployment errors being returned for a single resource type is slim + // (but possible) based on current error/QoS analysis. In those cases where multiple errors + // are returned ACS will pick the first error code for determining whether this is an internal + // or a client error. This can be reevaluated later based on practical experience. + // However, note that customer will be returned the entire contents of "StatusMessage" object + // (like before) so they have access to all the errors returned by ARM. + logger.Infof("Found ResourceDeploymentFailure error code - error response = '%v'", *errresp) + details := errresp.Body.Details + if len(details) > 0 { + errresp.Body.Code = details[0].Code + errresp.Body.Message = details[0].Message + errresp.Body.Target = details[0].Target + } + } + + return errresp, nil +} + +func toArmErrors(logger *logrus.Entry, deploymentName string, operationsList resources.DeploymentOperationsListResult) ([]*ErrorResponse, error) { + ret := []*ErrorResponse{} + + if operationsList.Value == nil { + return ret, nil + } + + for _, operation := range *operationsList.Value { + if operation.Properties == nil || *operation.Properties.ProvisioningState != string(api.Failed) { + continue + } + + errresp, err := toArmError(logger, operation) + if err != nil { + logger.Errorf("unable to convert deployment operation to error response in deployment %s from ARM. error: %v", deploymentName, err) + return ret, err + } + + if len(errresp.Body.Code) > 0 { + logger.Warnf("got failed deployment operation in deployment %s. error: %v", deploymentName, errresp.Error()) + } + ret = append(ret, errresp) + } + return ret, nil +} + +// GetDeploymentError returns deployment error +func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) (*ErrorResponse, error) { + errList := []*ErrorResponse{} + logger.Infof("Getting detailed deployment errors for %s", deploymentName) + + var top int32 = 1 + operationList, err := az.ListDeploymentOperations(resourceGroupName, deploymentName, &top) + if err != nil { + logger.Warnf("unable to list deployment operations: %v", err) + return nil, err + } + eList, err := toArmErrors(logger, deploymentName, operationList) + if err != nil { + return nil, err + } + errList = append(errList, eList...) + for operationList.NextLink != nil { + operationList, err = az.ListDeploymentOperationsNextResults(operationList) + if err != nil { + logger.Warnf("unable to list next deployment operations: %v", err) + break + } + eList, err := toArmErrors(logger, deploymentName, operationList) + if err != nil { + return nil, err + } + errList = append(errList, eList...) + } + switch len(errList) { + case 0: + return nil, fmt.Errorf("unable to extract deployment error for %s", deploymentName) + case 1: + return errList[0], nil + default: + // combine all errors + for _, e := range errList[1:] { + errList[0].Body.Details = append(errList[0].Body.Details, e.Body.Details...) + } + return errList[0], nil + } +} diff --git a/pkg/operations/kubernetesupgrade/upgradeagentnode.go b/pkg/operations/kubernetesupgrade/upgradeagentnode.go index 8ec4bbe5ba..69b4f51697 100644 --- a/pkg/operations/kubernetesupgrade/upgradeagentnode.go +++ b/pkg/operations/kubernetesupgrade/upgradeagentnode.go @@ -87,30 +87,19 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { kan.ParametersMap, nil) - //if err == nil { - // return nil - //} - // Retrieve deployment error - // errors := []string{err.Error()} - - var top int32 = 1 - res, err := kan.Client.ListDeploymentOperations(kan.ResourceGroup, deploymentName, &top) - if err != nil { - kan.logger.Warnf("unable to list deployment operations: %v", err) - } else { - fmt.Printf("Response: %v\n", res.Response) + if err == nil { + return nil } - for res.NextLink != nil { - res, err = kan.Client.ListDeploymentOperationsNextResults(res) - if err != nil { - kan.logger.Warnf("unable to list next deployment operations: %v", err) - break - } - fmt.Printf("Response: %v\n", res.Response) - // results = append(results, res) + kan.logger.Errorf("Deployment %s failed with error %v", deploymentName, err) + // Get deployment error details + errResp, e := armhelpers.GetDeploymentError(kan.Client, kan.logger, kan.ResourceGroup, deploymentName) + if e != nil { + kan.logger.Errorf("Failed to get error details for deployment %s: %v", deploymentName, e) + // return original deployment error + return err } - return err + return errResp } // Validate will verify that agent node has been upgraded as expected. @@ -119,7 +108,7 @@ func (kan *UpgradeAgentNode) Validate(vmName *string) error { kan.logger.Warningf("VM name was empty. Skipping node condition check") return nil } - + kan.logger.Infof("Validating %s", *vmName) var masterURL string if kan.UpgradeContainerService.Properties.HostedMasterProfile != nil { masterURL = kan.UpgradeContainerService.Properties.HostedMasterProfile.FQDN From b12322f2134f357f7910033f093da79ec2d46177 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Thu, 4 Jan 2018 13:01:19 -0800 Subject: [PATCH 03/12] removed debug info --- cmd/upgrade.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/upgrade.go b/cmd/upgrade.go index bb3f429ab6..39d1dee4c4 100644 --- a/cmd/upgrade.go +++ b/cmd/upgrade.go @@ -195,7 +195,7 @@ func (uc *upgradeCmd) run(cmd *cobra.Command, args []string) error { Client: uc.client, StepTimeout: uc.timeout, } - log.Fatalf("%v", upgradeCluster.StepTimeout) + kubeConfig, err := acsengine.GenerateKubeConfig(uc.containerService.Properties, uc.location) if err != nil { log.Fatalf("failed to generate kube config") // TODO: cleanup From 5e20b45a77208d279f1fc161a77de2a8b56799e4 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Thu, 4 Jan 2018 13:31:12 -0800 Subject: [PATCH 04/12] removed unrelated changes --- pkg/api/orchestrators.go | 3 --- pkg/armhelpers/mockclients.go | 10 ++++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/api/orchestrators.go b/pkg/api/orchestrators.go index 234b5d5481..c23af769d8 100644 --- a/pkg/api/orchestrators.go +++ b/pkg/api/orchestrators.go @@ -181,9 +181,6 @@ func kubernetesUpgrades(csOrch *OrchestratorProfile) ([]*OrchestratorProfile, er case currentMajor == 1 && currentMinor == 7: nextMajor = 1 nextMinor = 8 - case currentMajor == 1 && currentMinor == 8: - nextMajor = 1 - nextMinor = 9 } for ver, supported := range common.AllKubernetesSupportedVersions { if !supported { diff --git a/pkg/armhelpers/mockclients.go b/pkg/armhelpers/mockclients.go index 3f163e4412..6d67b13190 100644 --- a/pkg/armhelpers/mockclients.go +++ b/pkg/armhelpers/mockclients.go @@ -397,3 +397,13 @@ func (mc *MockACSEngineClient) ListProviders() (resources.ProviderListResult, er return resources.ProviderListResult{}, nil } + +// ListDeploymentOperations gets all deployments operations for a deployment. +func (mc *MockACSEngineClient) ListDeploymentOperations(resourceGroupName string, deploymentName string, top *int32) (result resources.DeploymentOperationsListResult, err error) { + return resources.DeploymentOperationsListResult{}, nil +} + +// ListDeploymentOperationsNextResults retrieves the next set of results, if any. +func (mc *MockACSEngineClient) ListDeploymentOperationsNextResults(lastResults resources.DeploymentOperationsListResult) (result resources.DeploymentOperationsListResult, err error) { + return resources.DeploymentOperationsListResult{}, nil +} From 9cca8cffd2928961bbbea760b10ef3469a20b696 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Fri, 5 Jan 2018 11:00:09 -0800 Subject: [PATCH 05/12] addressed comments --- pkg/armhelpers/deploymentError.go | 50 ++----------------- .../kubernetesupgrade/upgradeagentnode.go | 11 ++-- 2 files changed, 13 insertions(+), 48 deletions(-) diff --git a/pkg/armhelpers/deploymentError.go b/pkg/armhelpers/deploymentError.go index 172c4bc293..16fcb092e6 100644 --- a/pkg/armhelpers/deploymentError.go +++ b/pkg/armhelpers/deploymentError.go @@ -2,7 +2,6 @@ package armhelpers import ( "encoding/json" - "fmt" "github.com/Azure/acs-engine/pkg/api" "github.com/Azure/azure-sdk-for-go/arm/resources/resources" @@ -45,39 +44,11 @@ func toArmError(logger *logrus.Entry, operation resources.DeploymentOperation) ( logger.Errorf("Error occurred marshalling JSON: '%v'", err) return nil, err } - if err := json.Unmarshal(b, errresp); err != nil { logger.Errorf("Error occurred unmarshalling JSON: '%v' JSON: '%s'", err, string(b)) return nil, err } } - - // In some cases ARM returns "ResourceDeploymentFailure" as the top error code and error - // code returned by the resource provider is in the child array "details". The following code - // covers both of those scenarios. Also note, that details array is recursive i.e. the objects - // in details array can contain another details array with no max depth of this tree enforced - // by ARM. The following code only evaluates the first level property of "StatusMessage": "error" - // and its child array: "details" for error codes. - - // If error code is ResourceDeploymentFailure then RP error is defined in the child object field: "details - if errresp.Body.Code == "ResourceDeploymentFailure" { - // StatusMessage.error.details array supports multiple errors but in this particular case - // DeploymentOperationProperties contains error from one specific resource type so the - // chances of multiple deployment errors being returned for a single resource type is slim - // (but possible) based on current error/QoS analysis. In those cases where multiple errors - // are returned ACS will pick the first error code for determining whether this is an internal - // or a client error. This can be reevaluated later based on practical experience. - // However, note that customer will be returned the entire contents of "StatusMessage" object - // (like before) so they have access to all the errors returned by ARM. - logger.Infof("Found ResourceDeploymentFailure error code - error response = '%v'", *errresp) - details := errresp.Body.Details - if len(details) > 0 { - errresp.Body.Code = details[0].Code - errresp.Body.Message = details[0].Message - errresp.Body.Target = details[0].Target - } - } - return errresp, nil } @@ -89,14 +60,14 @@ func toArmErrors(logger *logrus.Entry, deploymentName string, operationsList res } for _, operation := range *operationsList.Value { - if operation.Properties == nil || *operation.Properties.ProvisioningState != string(api.Failed) { + if operation.Properties == nil || operation.Properties.ProvisioningState == nil || *operation.Properties.ProvisioningState != string(api.Failed) { continue } errresp, err := toArmError(logger, operation) if err != nil { - logger.Errorf("unable to convert deployment operation to error response in deployment %s from ARM. error: %v", deploymentName, err) - return ret, err + logger.Warnf("unable to convert deployment operation to error response in deployment %s from ARM. error: %v", deploymentName, err) + continue } if len(errresp.Body.Code) > 0 { @@ -108,7 +79,7 @@ func toArmErrors(logger *logrus.Entry, deploymentName string, operationsList res } // GetDeploymentError returns deployment error -func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) (*ErrorResponse, error) { +func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) ([]*ErrorResponse, error) { errList := []*ErrorResponse{} logger.Infof("Getting detailed deployment errors for %s", deploymentName) @@ -135,16 +106,5 @@ func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupN } errList = append(errList, eList...) } - switch len(errList) { - case 0: - return nil, fmt.Errorf("unable to extract deployment error for %s", deploymentName) - case 1: - return errList[0], nil - default: - // combine all errors - for _, e := range errList[1:] { - errList[0].Body.Details = append(errList[0].Body.Details, e.Body.Details...) - } - return errList[0], nil - } + return errList, nil } diff --git a/pkg/operations/kubernetesupgrade/upgradeagentnode.go b/pkg/operations/kubernetesupgrade/upgradeagentnode.go index 8ae9c176cf..0f69ba70b4 100644 --- a/pkg/operations/kubernetesupgrade/upgradeagentnode.go +++ b/pkg/operations/kubernetesupgrade/upgradeagentnode.go @@ -3,6 +3,7 @@ package kubernetesupgrade import ( "fmt" "math/rand" + "strings" "time" "k8s.io/client-go/pkg/api/v1/node" @@ -93,13 +94,17 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { kan.logger.Errorf("Deployment %s failed with error %v", deploymentName, err) // Get deployment error details - errResp, e := armhelpers.GetDeploymentError(kan.Client, kan.logger, kan.ResourceGroup, deploymentName) - if e != nil { + errRespList, e := armhelpers.GetDeploymentError(kan.Client, kan.logger, kan.ResourceGroup, deploymentName) + if e != nil || len(errRespList) == 0 { kan.logger.Errorf("Failed to get error details for deployment %s: %v", deploymentName, e) // return original deployment error return err } - return errResp + errStrList := make([]string, len(errRespList)) + for i, errResp := range errRespList { + errStrList[i] = errResp.Error() + } + return fmt.Errorf("%s", strings.Join(errStrList, " | ")) } // Validate will verify that agent node has been upgraded as expected. From a88c55bf64ceeb4ec4ebad36b60ff23e8cb85f63 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Tue, 16 Jan 2018 09:50:14 -0800 Subject: [PATCH 06/12] tmp --- pkg/armhelpers/deploymentError.go | 86 +++++++++++++++++-- .../kubernetesupgrade/upgradeagentnode.go | 13 +-- 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/pkg/armhelpers/deploymentError.go b/pkg/armhelpers/deploymentError.go index 16fcb092e6..8d1a9a89e8 100644 --- a/pkg/armhelpers/deploymentError.go +++ b/pkg/armhelpers/deploymentError.go @@ -2,12 +2,26 @@ package armhelpers import ( "encoding/json" + "strings" "github.com/Azure/acs-engine/pkg/api" "github.com/Azure/azure-sdk-for-go/arm/resources/resources" "github.com/sirupsen/logrus" ) +//TODO move pkg/core/apierror/ to acs-engine + +// ErrorCategory indicates the kind of error +type ErrorCategory string + +const ( + // ClientError is expected error + ClientError ErrorCategory = "ClientError" + + // InternalError is system or internal error + InternalError ErrorCategory = "InternalError" +) + // Error is the OData v4 format, used by the RPC and // will go into the v2.2 Azure REST API guidelines type Error struct { @@ -15,13 +29,34 @@ type Error struct { Message string `json:"message"` Target string `json:"target,omitempty"` Details []Error `json:"details,omitempty"` + + Category ErrorCategory `json:"-"` } -// ErrorResponse defines Resource Provider API 2.0 Error Response Content structure +// ErrorResponse defines Resource Provider API 2.0 Error Response Content structure type ErrorResponse struct { Body Error `json:"error"` } +// DeploymentError defines deployment error along with deployment operation errors +type DeploymentError struct { + RootError error + OperationErrors []*ErrorResponse +} + +// Error implements error interface to return error in json +func (e *DeploymentError) Error() string { + if len(e.OperationErrors) == 0 { + return e.RootError.Error() + } + errStrList := make([]string, len(e.OperationErrors)+1) + errStrList[0] = e.RootError.Error() + for i, errResp := range e.OperationErrors { + errStrList[i+1] = errResp.Error() + } + return strings.Join(errStrList, " | ") +} + // Error implements error interface to return error in json func (e *ErrorResponse) Error() string { return e.Body.Error() @@ -78,10 +113,49 @@ func toArmErrors(logger *logrus.Entry, deploymentName string, operationsList res return ret, nil } +//TODO errorCode is ErrorCode +func newErrorResponse(errorCategory ErrorCategory, errorCode string, message string) *ErrorResponse { + return &ErrorResponse{ + Body: Error{ + Code: errorCode, + Message: message, + Category: errorCategory, + }, + } +} + // GetDeploymentError returns deployment error -func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) ([]*ErrorResponse, error) { - errList := []*ErrorResponse{} +func GetDeploymentError(res *resources.DeploymentExtended, rootError error, az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) (*DeploymentError, error) { + if rootError == nil { + return nil, nil + } logger.Infof("Getting detailed deployment errors for %s", deploymentName) + deploymentError := &DeploymentError{RootError: rootError} + + if res != nil && res.Response.Response != nil && res.Body != nil { + armErr := &ErrorResponse{} + if d := json.NewDecoder(res.Body); d != nil { + if err := d.Decode(armErr); err == nil { + logger.Errorf("StatusCode: %d, ErrorCode: %s, ErrorMessage: %s", res.Response.StatusCode, armErr.Body.Code, armErr.Body.Message) + deploymentError.OperationErrors = append(deploymentError.OperationErrors, armErr) + switch { + case res.Response.StatusCode < 500 && res.Response.StatusCode >= 400: + armErr.Body.Category = ClientError + return deploymentError, nil + case res.Response.StatusCode >= 500: + armErr.Body.Category = InternalError + return deploymentError, nil + } + } else { + logger.Errorf("unable to unmarshal response into apierror: %v", err) + } + } + } else { + logger.Errorf("Got error from Azure SDK without response from ARM, error: %v", rootError) + // This is the failed sdk validation before calling ARM path + deploymentError.OperationErrors = append(deploymentError.OperationErrors, newErrorResponse(InternalError, "InternalOperationError", rootError.Error())) + return deploymentError, nil + } var top int32 = 1 operationList, err := az.ListDeploymentOperations(resourceGroupName, deploymentName, &top) @@ -93,7 +167,7 @@ func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupN if err != nil { return nil, err } - errList = append(errList, eList...) + deploymentError.OperationErrors = append(deploymentError.OperationErrors, eList...) for operationList.NextLink != nil { operationList, err = az.ListDeploymentOperationsNextResults(operationList) if err != nil { @@ -104,7 +178,7 @@ func GetDeploymentError(az ACSEngineClient, logger *logrus.Entry, resourceGroupN if err != nil { return nil, err } - errList = append(errList, eList...) + deploymentError.OperationErrors = append(deploymentError.OperationErrors, eList...) } - return errList, nil + return deploymentError, nil } diff --git a/pkg/operations/kubernetesupgrade/upgradeagentnode.go b/pkg/operations/kubernetesupgrade/upgradeagentnode.go index 0f69ba70b4..b4dbab9e9a 100644 --- a/pkg/operations/kubernetesupgrade/upgradeagentnode.go +++ b/pkg/operations/kubernetesupgrade/upgradeagentnode.go @@ -3,7 +3,6 @@ package kubernetesupgrade import ( "fmt" "math/rand" - "strings" "time" "k8s.io/client-go/pkg/api/v1/node" @@ -81,7 +80,7 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { deploymentSuffix := random.Int31() deploymentName := fmt.Sprintf("agent-%s-%d", time.Now().Format("06-01-02T15.04.05"), deploymentSuffix) - _, err := kan.Client.DeployTemplate( + depExt, err := kan.Client.DeployTemplate( kan.ResourceGroup, deploymentName, kan.TemplateMap, @@ -94,17 +93,13 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { kan.logger.Errorf("Deployment %s failed with error %v", deploymentName, err) // Get deployment error details - errRespList, e := armhelpers.GetDeploymentError(kan.Client, kan.logger, kan.ResourceGroup, deploymentName) - if e != nil || len(errRespList) == 0 { + deploymentError, e := armhelpers.GetDeploymentError(depExt, err, kan.Client, kan.logger, kan.ResourceGroup, deploymentName) + if e != nil { kan.logger.Errorf("Failed to get error details for deployment %s: %v", deploymentName, e) // return original deployment error return err } - errStrList := make([]string, len(errRespList)) - for i, errResp := range errRespList { - errStrList[i] = errResp.Error() - } - return fmt.Errorf("%s", strings.Join(errStrList, " | ")) + return deploymentError } // Validate will verify that agent node has been upgraded as expected. From 283ef5264fd8c34508d9561abd3085ba18e42b5a Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Tue, 23 Jan 2018 17:20:29 -0800 Subject: [PATCH 07/12] revised logic --- pkg/api/types.go | 2 + pkg/apierror/apierror.go | 16 ++ pkg/apierror/apierror_test.go | 33 +++ pkg/apierror/const.go | 63 +++++ pkg/apierror/helper.go | 47 ++++ pkg/apierror/helper_test.go | 21 ++ pkg/apierror/types.go | 39 +++ pkg/armhelpers/deploymentError.go | 242 ++++++++---------- pkg/armhelpers/deployments.go | 2 +- .../kubernetesupgrade/upgradeagentnode.go | 2 +- 10 files changed, 325 insertions(+), 142 deletions(-) create mode 100644 pkg/apierror/apierror.go create mode 100644 pkg/apierror/apierror_test.go create mode 100644 pkg/apierror/const.go create mode 100644 pkg/apierror/helper.go create mode 100644 pkg/apierror/helper_test.go create mode 100644 pkg/apierror/types.go diff --git a/pkg/api/types.go b/pkg/api/types.go index 6ad45516d7..1b5df8398e 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -149,6 +149,8 @@ const ( Migrating ProvisioningState = "Migrating" // Upgrading means an existing ContainerService resource is being upgraded Upgrading ProvisioningState = "Upgrading" + // Canceled means a deployment has been canceled + Canceled ProvisioningState = "Canceled" ) // OrchestratorProfile contains Orchestrator properties diff --git a/pkg/apierror/apierror.go b/pkg/apierror/apierror.go new file mode 100644 index 0000000000..d32b751180 --- /dev/null +++ b/pkg/apierror/apierror.go @@ -0,0 +1,16 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +package apierror + +// New creates an ErrorResponse +func New(errorCategory ErrorCategory, errorCode ErrorCode, message string) *ErrorResponse { + return &ErrorResponse{ + Body: Error{ + Code: errorCode, + Message: message, + Category: errorCategory, + }, + } +} diff --git a/pkg/apierror/apierror_test.go b/pkg/apierror/apierror_test.go new file mode 100644 index 0000000000..08f46a86b3 --- /dev/null +++ b/pkg/apierror/apierror_test.go @@ -0,0 +1,33 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +package apierror + +import ( + "testing" + + . "github.com/onsi/gomega" +) + +func TestNewAPIError(t *testing.T) { + RegisterTestingT(t) + + apiError := New( + ClientError, + InvalidParameter, + "error test") + + Expect(apiError.Body.Code).Should(Equal(ErrorCode("InvalidParameter"))) +} + +func TestAcsNewAPIError(t *testing.T) { + RegisterTestingT(t) + + apiError := New( + ClientError, + ScaleDownInternalError, + "error test") + + Expect(apiError.Body.Code).Should(Equal(ErrorCode("ScaleDownInternalError"))) +} diff --git a/pkg/apierror/const.go b/pkg/apierror/const.go new file mode 100644 index 0000000000..27cb6df165 --- /dev/null +++ b/pkg/apierror/const.go @@ -0,0 +1,63 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +package apierror + +// ErrorCategory indicates the kind of error +type ErrorCategory string + +const ( + // ClientError is expected error + ClientError ErrorCategory = "ClientError" + + // InternalError is system or internal error + InternalError ErrorCategory = "InternalError" +) + +// Common Azure Resource Provider API error code +type ErrorCode string + +const ( + // From Microsoft.Azure.ResourceProvider.API.ErrorCode + InvalidParameter ErrorCode = "InvalidParameter" + BadRequest ErrorCode = "BadRequest" + NotFound ErrorCode = "NotFound" + Conflict ErrorCode = "Conflict" + PreconditionFailed ErrorCode = "PreconditionFailed" + OperationNotAllowed ErrorCode = "OperationNotAllowed" + OperationPreempted ErrorCode = "OperationPreempted" + PropertyChangeNotAllowed ErrorCode = "PropertyChangeNotAllowed" + InternalOperationError ErrorCode = "InternalOperationError" + InvalidSubscriptionStateTransition ErrorCode = "InvalidSubscriptionStateTransition" + UnregisterWithResourcesNotAllowed ErrorCode = "UnregisterWithResourcesNotAllowed" + InvalidParameterConflictingProperties ErrorCode = "InvalidParameterConflictingProperties" + SubscriptionNotRegistered ErrorCode = "SubscriptionNotRegistered" + ConflictingUserInput ErrorCode = "ConflictingUserInput" + ProvisioningInternalError ErrorCode = "ProvisioningInternalError" + ProvisioningFailed ErrorCode = "ProvisioningFailed" + NetworkingInternalOperationError ErrorCode = "NetworkingInternalOperationError" + QuotaExceeded ErrorCode = "QuotaExceeded" + Unauthorized ErrorCode = "Unauthorized" + ResourcesOverConstrained ErrorCode = "ResourcesOverConstrained" + ControlPlaneProvisioningInternalError ErrorCode = "ControlPlaneProvisioningInternalError" + ControlPlaneProvisioningSyncError ErrorCode = "ControlPlaneProvisioningSyncError" + ControlPlaneInternalError ErrorCode = "ControlPlaneInternalError" + ControlPlaneCloudProviderNotSet ErrorCode = "ControlPlaneCloudProviderNotSet" + + // From Microsoft.WindowsAzure.ContainerService.API.AcsErrorCode + ScaleDownInternalError ErrorCode = "ScaleDownInternalError" + + // New + PreconditionCheckTimeOut ErrorCode = "PreconditionCheckTimeOut" + UpgradeFailed ErrorCode = "UpgradeFailed" + ScaleError ErrorCode = "ScaleError" + CreateRoleAssignmentError ErrorCode = "CreateRoleAssignmentError" + ServicePrincipalNotFound ErrorCode = "ServicePrincipalNotFound" + ClusterResourceGroupNotFound ErrorCode = "ClusterResourceGroupNotFound" + + // Error codes returned by HCP + UnderlayNotFound ErrorCode = "UnderlayNotFound" + UnderlaysOverConstrained ErrorCode = "UnderlaysOverConstrained" + UnexpectedUnderlayCount ErrorCode = "UnexpectedUnderlayCount" +) diff --git a/pkg/apierror/helper.go b/pkg/apierror/helper.go new file mode 100644 index 0000000000..4b96a479de --- /dev/null +++ b/pkg/apierror/helper.go @@ -0,0 +1,47 @@ +package apierror + +import ( + "encoding/json" + "net/http" + + "github.com/Azure/azure-sdk-for-go/arm/resources/resources" +) + +// ExtractCodeFromARMHttpResponse returns the ARM error's Code field +// If not found return defaultCode +func ExtractCodeFromARMHttpResponse(resp *http.Response, defaultCode ErrorCode) ErrorCode { + if resp == nil { + return defaultCode + } + decoder := json.NewDecoder(resp.Body) + errorJSON := ErrorResponse{} + if err := decoder.Decode(&errorJSON); err != nil { + return defaultCode + } + + if errorJSON.Body.Code == "" { + return defaultCode + } + return ErrorCode(errorJSON.Body.Code) +} + +//ConvertToAPIError turns a ManagementErrorWithDetails into a apierror.Error +func ConvertToAPIError(mError *resources.ManagementErrorWithDetails) *Error { + retVal := &Error{} + if mError.Code != nil { + retVal.Code = ErrorCode(*mError.Code) + } + if mError.Message != nil { + retVal.Message = *mError.Message + } + if mError.Target != nil { + retVal.Target = *mError.Target + } + if mError.Details != nil { + retVal.Details = []Error{} + for _, me := range *mError.Details { + retVal.Details = append(retVal.Details, *ConvertToAPIError(&me)) + } + } + return retVal +} diff --git a/pkg/apierror/helper_test.go b/pkg/apierror/helper_test.go new file mode 100644 index 0000000000..0c45711974 --- /dev/null +++ b/pkg/apierror/helper_test.go @@ -0,0 +1,21 @@ +package apierror + +import ( + "bytes" + "io/ioutil" + "net/http" + "testing" + + . "github.com/onsi/gomega" +) + +func TestExtractCodeFromARMHttpResponse(t *testing.T) { + RegisterTestingT(t) + + resp := &http.Response{ + Body: ioutil.NopCloser(bytes.NewBufferString(`{"error":{"code":"ResourceGroupNotFound","message":"Resource group 'jiren-fakegroup' could not be found."}}`)), + } + + code := ExtractCodeFromARMHttpResponse(resp, "") + Expect(code).To(Equal(ErrorCode("ResourceGroupNotFound"))) +} diff --git a/pkg/apierror/types.go b/pkg/apierror/types.go new file mode 100644 index 0000000000..307044a1a3 --- /dev/null +++ b/pkg/apierror/types.go @@ -0,0 +1,39 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +package apierror + +import "encoding/json" + +// Error is the OData v4 format, used by the RPC and +// will go into the v2.2 Azure REST API guidelines +type Error struct { + Code ErrorCode `json:"code"` + Message string `json:"message"` + Target string `json:"target,omitempty"` + Details []Error `json:"details,omitempty"` + + Category ErrorCategory `json:"-"` + ExceptionType string `json:"-"` + InternalDetail string `json:"-"` +} + +// ErrorResponse defines Resource Provider API 2.0 Error Response Content structure +type ErrorResponse struct { + Body Error `json:"error"` +} + +// Error implements error interface to return error in json +func (e *ErrorResponse) Error() string { + return e.Body.Error() +} + +// Error implements error interface to return error in json +func (e *Error) Error() string { + output, err := json.MarshalIndent(e, " ", " ") + if err != nil { + return err.Error() + } + return string(output) +} diff --git a/pkg/armhelpers/deploymentError.go b/pkg/armhelpers/deploymentError.go index 8d1a9a89e8..8efcb68660 100644 --- a/pkg/armhelpers/deploymentError.go +++ b/pkg/armhelpers/deploymentError.go @@ -5,74 +5,13 @@ import ( "strings" "github.com/Azure/acs-engine/pkg/api" + "github.com/Azure/acs-engine/pkg/apierror" "github.com/Azure/azure-sdk-for-go/arm/resources/resources" "github.com/sirupsen/logrus" ) -//TODO move pkg/core/apierror/ to acs-engine - -// ErrorCategory indicates the kind of error -type ErrorCategory string - -const ( - // ClientError is expected error - ClientError ErrorCategory = "ClientError" - - // InternalError is system or internal error - InternalError ErrorCategory = "InternalError" -) - -// Error is the OData v4 format, used by the RPC and -// will go into the v2.2 Azure REST API guidelines -type Error struct { - Code string `json:"code"` - Message string `json:"message"` - Target string `json:"target,omitempty"` - Details []Error `json:"details,omitempty"` - - Category ErrorCategory `json:"-"` -} - -// ErrorResponse defines Resource Provider API 2.0 Error Response Content structure -type ErrorResponse struct { - Body Error `json:"error"` -} - -// DeploymentError defines deployment error along with deployment operation errors -type DeploymentError struct { - RootError error - OperationErrors []*ErrorResponse -} - -// Error implements error interface to return error in json -func (e *DeploymentError) Error() string { - if len(e.OperationErrors) == 0 { - return e.RootError.Error() - } - errStrList := make([]string, len(e.OperationErrors)+1) - errStrList[0] = e.RootError.Error() - for i, errResp := range e.OperationErrors { - errStrList[i+1] = errResp.Error() - } - return strings.Join(errStrList, " | ") -} - -// Error implements error interface to return error in json -func (e *ErrorResponse) Error() string { - return e.Body.Error() -} - -// Error implements error interface to return error in json -func (e *Error) Error() string { - output, err := json.MarshalIndent(e, " ", " ") - if err != nil { - return err.Error() - } - return string(output) -} - -func toArmError(logger *logrus.Entry, operation resources.DeploymentOperation) (*ErrorResponse, error) { - errresp := &ErrorResponse{} +func toErrorResponse(logger *logrus.Entry, operation resources.DeploymentOperation) (*apierror.ErrorResponse, error) { + errresp := &apierror.ErrorResponse{} if operation.Properties != nil && operation.Properties.StatusMessage != nil { b, err := json.MarshalIndent(operation.Properties.StatusMessage, "", " ") if err != nil { @@ -84,101 +23,124 @@ func toArmError(logger *logrus.Entry, operation resources.DeploymentOperation) ( return nil, err } } + errresp.Body.Category = getErrorCategory(errresp.Body.Code) return errresp, nil } -func toArmErrors(logger *logrus.Entry, deploymentName string, operationsList resources.DeploymentOperationsListResult) ([]*ErrorResponse, error) { - ret := []*ErrorResponse{} - - if operationsList.Value == nil { - return ret, nil +func getErrorCategory(code apierror.ErrorCode) apierror.ErrorCategory { + switch code { + case apierror.InvalidParameter, + apierror.BadRequest, + apierror.OperationNotAllowed, + apierror.PropertyChangeNotAllowed, + apierror.UnregisterWithResourcesNotAllowed, + apierror.InvalidParameterConflictingProperties, + apierror.SubscriptionNotRegistered, + apierror.ConflictingUserInput, + apierror.QuotaExceeded, + apierror.Unauthorized, + apierror.ResourcesOverConstrained: + return apierror.ClientError + default: + return apierror.InternalError } +} - for _, operation := range *operationsList.Value { - if operation.Properties == nil || operation.Properties.ProvisioningState == nil || *operation.Properties.ProvisioningState != string(api.Failed) { - continue - } +// GetDeploymentError returns deployment error +func GetDeploymentError(res *resources.DeploymentExtended, az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) (*apierror.Error, error) { + logger.Infof("Getting detailed deployment errors for %s", deploymentName) - errresp, err := toArmError(logger, operation) + if res == nil || res.Properties == nil || res.Properties.ProvisioningState == nil { + return nil, nil + } + properties := res.Properties + + switch *properties.ProvisioningState { + case string(api.Canceled): + logger.Warning("template deployment has been canceled") + return &apierror.Error{ + Code: apierror.ProvisioningFailed, + Message: "template deployment has been canceled", + Category: apierror.ClientError}, nil + + case string(api.Failed): + var top int32 = 1 + results := make([]resources.DeploymentOperationsListResult, top) + res, err := az.ListDeploymentOperations(resourceGroupName, deploymentName, &top) if err != nil { - logger.Warnf("unable to convert deployment operation to error response in deployment %s from ARM. error: %v", deploymentName, err) - continue + logger.Errorf("unable to list deployment operations %s. error: %v", deploymentName, err) + return nil, err } + results[0] = res - if len(errresp.Body.Code) > 0 { - logger.Warnf("got failed deployment operation in deployment %s. error: %v", deploymentName, errresp.Error()) + for res.NextLink != nil { + res, err = az.ListDeploymentOperationsNextResults(res) + if err != nil { + logger.Warningf("unable to list next deployment operations %s. error: %v", deploymentName, err) + break + } + + results = append(results, res) } - ret = append(ret, errresp) - } - return ret, nil -} + return analyzeDeploymentResultAndSaveError(resourceGroupName, deploymentName, results, logger) -//TODO errorCode is ErrorCode -func newErrorResponse(errorCategory ErrorCategory, errorCode string, message string) *ErrorResponse { - return &ErrorResponse{ - Body: Error{ - Code: errorCode, - Message: message, - Category: errorCategory, - }, + default: + return nil, nil } } -// GetDeploymentError returns deployment error -func GetDeploymentError(res *resources.DeploymentExtended, rootError error, az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) (*DeploymentError, error) { - if rootError == nil { - return nil, nil - } - logger.Infof("Getting detailed deployment errors for %s", deploymentName) - deploymentError := &DeploymentError{RootError: rootError} - - if res != nil && res.Response.Response != nil && res.Body != nil { - armErr := &ErrorResponse{} - if d := json.NewDecoder(res.Body); d != nil { - if err := d.Decode(armErr); err == nil { - logger.Errorf("StatusCode: %d, ErrorCode: %s, ErrorMessage: %s", res.Response.StatusCode, armErr.Body.Code, armErr.Body.Message) - deploymentError.OperationErrors = append(deploymentError.OperationErrors, armErr) - switch { - case res.Response.StatusCode < 500 && res.Response.StatusCode >= 400: - armErr.Body.Category = ClientError - return deploymentError, nil - case res.Response.StatusCode >= 500: - armErr.Body.Category = InternalError - return deploymentError, nil - } +func analyzeDeploymentResultAndSaveError(resourceGroupName, deploymentName string, + operationLists []resources.DeploymentOperationsListResult, logger *logrus.Entry) (*apierror.Error, error) { + var errresp *apierror.ErrorResponse + var err error + errs := []string{} + isInternalErr := false + failedCnt := 0 + for _, operationsList := range operationLists { + if operationsList.Value == nil { + continue + } + + for _, operation := range *operationsList.Value { + if operation.Properties == nil || *operation.Properties.ProvisioningState != string(api.Failed) { + continue + } + + // log the full deployment operation error response + if operation.ID != nil && operation.OperationID != nil { + b, _ := json.Marshal(operation.Properties) + logger.Infof("deployment operation ID %s, operationID %s, prooperties: %s", *operation.ID, *operation.OperationID, b) } else { - logger.Errorf("unable to unmarshal response into apierror: %v", err) + logger.Error("either deployment ID or operationID is nil") } - } - } else { - logger.Errorf("Got error from Azure SDK without response from ARM, error: %v", rootError) - // This is the failed sdk validation before calling ARM path - deploymentError.OperationErrors = append(deploymentError.OperationErrors, newErrorResponse(InternalError, "InternalOperationError", rootError.Error())) - return deploymentError, nil - } - var top int32 = 1 - operationList, err := az.ListDeploymentOperations(resourceGroupName, deploymentName, &top) - if err != nil { - logger.Warnf("unable to list deployment operations: %v", err) - return nil, err - } - eList, err := toArmErrors(logger, deploymentName, operationList) - if err != nil { - return nil, err + failedCnt++ + errresp, err = toErrorResponse(logger, operation) + if err != nil { + logger.Errorf("unable to convert deployment operation to error response in deployment %s from ARM. error: %v", deploymentName, err) + return nil, err + } + if errresp.Body.Category == apierror.InternalError { + isInternalErr = true + } + errs = append(errs, errresp.Error()) + } } - deploymentError.OperationErrors = append(deploymentError.OperationErrors, eList...) - for operationList.NextLink != nil { - operationList, err = az.ListDeploymentOperationsNextResults(operationList) - if err != nil { - logger.Warnf("unable to list next deployment operations: %v", err) - break + provisionErr := &apierror.Error{} + if failedCnt > 0 { + if isInternalErr { + provisionErr.Category = apierror.InternalError + } else { + provisionErr.Category = apierror.ClientError } - eList, err := toArmErrors(logger, deploymentName, operationList) - if err != nil { - return nil, err + if failedCnt == 1 { + provisionErr = &errresp.Body + } else { + provisionErr.Code = apierror.ProvisioningFailed + provisionErr.Message = strings.Join(errs, "\n") } - deploymentError.OperationErrors = append(deploymentError.OperationErrors, eList...) + return provisionErr, nil } - return deploymentError, nil + + return nil, nil } diff --git a/pkg/armhelpers/deployments.go b/pkg/armhelpers/deployments.go index c354e7fce9..9c926a4d97 100644 --- a/pkg/armhelpers/deployments.go +++ b/pkg/armhelpers/deployments.go @@ -31,7 +31,7 @@ func (az *AzureClient) DeployTemplate(resourceGroupName, deploymentName string, return nil, err } - log.Infof("Finished ARM Deployment (%s).", deploymentName) + log.Infof("Finished ARM Deployment (%s). Error: %v", deploymentName, err) return &res, err } diff --git a/pkg/operations/kubernetesupgrade/upgradeagentnode.go b/pkg/operations/kubernetesupgrade/upgradeagentnode.go index b4dbab9e9a..deb2e94b49 100644 --- a/pkg/operations/kubernetesupgrade/upgradeagentnode.go +++ b/pkg/operations/kubernetesupgrade/upgradeagentnode.go @@ -93,7 +93,7 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { kan.logger.Errorf("Deployment %s failed with error %v", deploymentName, err) // Get deployment error details - deploymentError, e := armhelpers.GetDeploymentError(depExt, err, kan.Client, kan.logger, kan.ResourceGroup, deploymentName) + deploymentError, e := armhelpers.GetDeploymentError(depExt, kan.Client, kan.logger, kan.ResourceGroup, deploymentName) if e != nil { kan.logger.Errorf("Failed to get error details for deployment %s: %v", deploymentName, e) // return original deployment error From c4cae2cd242fbed20ccd376cd3d078f52f054c42 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Thu, 25 Jan 2018 13:53:28 -0800 Subject: [PATCH 08/12] refactored deployment error --- pkg/apierror/const.go | 86 ++++++----- pkg/armhelpers/deploymentError.go | 142 ++++++++++++++---- .../kubernetesupgrade/upgradeagentnode.go | 21 +-- 3 files changed, 159 insertions(+), 90 deletions(-) diff --git a/pkg/apierror/const.go b/pkg/apierror/const.go index 27cb6df165..6c2e5a1147 100644 --- a/pkg/apierror/const.go +++ b/pkg/apierror/const.go @@ -20,44 +20,52 @@ type ErrorCode string const ( // From Microsoft.Azure.ResourceProvider.API.ErrorCode - InvalidParameter ErrorCode = "InvalidParameter" - BadRequest ErrorCode = "BadRequest" - NotFound ErrorCode = "NotFound" - Conflict ErrorCode = "Conflict" - PreconditionFailed ErrorCode = "PreconditionFailed" - OperationNotAllowed ErrorCode = "OperationNotAllowed" - OperationPreempted ErrorCode = "OperationPreempted" - PropertyChangeNotAllowed ErrorCode = "PropertyChangeNotAllowed" - InternalOperationError ErrorCode = "InternalOperationError" - InvalidSubscriptionStateTransition ErrorCode = "InvalidSubscriptionStateTransition" - UnregisterWithResourcesNotAllowed ErrorCode = "UnregisterWithResourcesNotAllowed" + + // InvalidParameter error + InvalidParameter ErrorCode = "InvalidParameter" + // BadRequest error + BadRequest ErrorCode = "BadRequest" + // NotFound error + NotFound ErrorCode = "NotFound" + // Conflict error + Conflict ErrorCode = "Conflict" + // PreconditionFailed error + PreconditionFailed ErrorCode = "PreconditionFailed" + // OperationNotAllowed error + OperationNotAllowed ErrorCode = "OperationNotAllowed" + // OperationPreempted error + OperationPreempted ErrorCode = "OperationPreempted" + // PropertyChangeNotAllowed error + PropertyChangeNotAllowed ErrorCode = "PropertyChangeNotAllowed" + // InternalOperationError error + InternalOperationError ErrorCode = "InternalOperationError" + // InvalidSubscriptionStateTransition error + InvalidSubscriptionStateTransition ErrorCode = "InvalidSubscriptionStateTransition" + // UnregisterWithResourcesNotAllowed error + UnregisterWithResourcesNotAllowed ErrorCode = "UnregisterWithResourcesNotAllowed" + // InvalidParameterConflictingProperties error InvalidParameterConflictingProperties ErrorCode = "InvalidParameterConflictingProperties" - SubscriptionNotRegistered ErrorCode = "SubscriptionNotRegistered" - ConflictingUserInput ErrorCode = "ConflictingUserInput" - ProvisioningInternalError ErrorCode = "ProvisioningInternalError" - ProvisioningFailed ErrorCode = "ProvisioningFailed" - NetworkingInternalOperationError ErrorCode = "NetworkingInternalOperationError" - QuotaExceeded ErrorCode = "QuotaExceeded" - Unauthorized ErrorCode = "Unauthorized" - ResourcesOverConstrained ErrorCode = "ResourcesOverConstrained" - ControlPlaneProvisioningInternalError ErrorCode = "ControlPlaneProvisioningInternalError" - ControlPlaneProvisioningSyncError ErrorCode = "ControlPlaneProvisioningSyncError" - ControlPlaneInternalError ErrorCode = "ControlPlaneInternalError" - ControlPlaneCloudProviderNotSet ErrorCode = "ControlPlaneCloudProviderNotSet" - - // From Microsoft.WindowsAzure.ContainerService.API.AcsErrorCode - ScaleDownInternalError ErrorCode = "ScaleDownInternalError" - - // New - PreconditionCheckTimeOut ErrorCode = "PreconditionCheckTimeOut" - UpgradeFailed ErrorCode = "UpgradeFailed" - ScaleError ErrorCode = "ScaleError" - CreateRoleAssignmentError ErrorCode = "CreateRoleAssignmentError" - ServicePrincipalNotFound ErrorCode = "ServicePrincipalNotFound" - ClusterResourceGroupNotFound ErrorCode = "ClusterResourceGroupNotFound" - - // Error codes returned by HCP - UnderlayNotFound ErrorCode = "UnderlayNotFound" - UnderlaysOverConstrained ErrorCode = "UnderlaysOverConstrained" - UnexpectedUnderlayCount ErrorCode = "UnexpectedUnderlayCount" + // SubscriptionNotRegistered error + SubscriptionNotRegistered ErrorCode = "SubscriptionNotRegistered" + // ConflictingUserInput error + ConflictingUserInput ErrorCode = "ConflictingUserInput" + // ProvisioningInternalError error + ProvisioningInternalError ErrorCode = "ProvisioningInternalError" + // ProvisioningFailed error + ProvisioningFailed ErrorCode = "ProvisioningFailed" + // NetworkingInternalOperationError error + NetworkingInternalOperationError ErrorCode = "NetworkingInternalOperationError" + // QuotaExceeded error + QuotaExceeded ErrorCode = "QuotaExceeded" + // Unauthorized error + Unauthorized ErrorCode = "Unauthorized" + // ResourcesOverConstrained error + ResourcesOverConstrained ErrorCode = "ResourcesOverConstrained" + + // ResourceDeploymentFailure error + ResourceDeploymentFailure ErrorCode = "ResourceDeploymentFailure" + // InvalidTemplateDeployment error + InvalidTemplateDeployment ErrorCode = "InvalidTemplateDeployment" + // DeploymentFailed error + DeploymentFailed ErrorCode = "DeploymentFailed" ) diff --git a/pkg/armhelpers/deploymentError.go b/pkg/armhelpers/deploymentError.go index 8efcb68660..88d6fb7d77 100644 --- a/pkg/armhelpers/deploymentError.go +++ b/pkg/armhelpers/deploymentError.go @@ -1,7 +1,9 @@ package armhelpers import ( + "bytes" "encoding/json" + "fmt" "strings" "github.com/Azure/acs-engine/pkg/api" @@ -10,21 +12,47 @@ import ( "github.com/sirupsen/logrus" ) -func toErrorResponse(logger *logrus.Entry, operation resources.DeploymentOperation) (*apierror.ErrorResponse, error) { +func parseDeploymentOperation(logger *logrus.Entry, operation resources.DeploymentOperation) (*apierror.Error, error) { + if operation.Properties == nil || operation.Properties.StatusMessage == nil { + return nil, fmt.Errorf("DeploymentOperation.Properties is not set") + } + b, err := json.MarshalIndent(operation.Properties.StatusMessage, "", " ") + if err != nil { + logger.Errorf("Error occurred marshalling JSON: '%v'", err) + return nil, err + } + return toError(logger, b) +} + +func toError(logger *logrus.Entry, b []byte) (*apierror.Error, error) { errresp := &apierror.ErrorResponse{} - if operation.Properties != nil && operation.Properties.StatusMessage != nil { - b, err := json.MarshalIndent(operation.Properties.StatusMessage, "", " ") - if err != nil { - logger.Errorf("Error occurred marshalling JSON: '%v'", err) - return nil, err - } - if err := json.Unmarshal(b, errresp); err != nil { - logger.Errorf("Error occurred unmarshalling JSON: '%v' JSON: '%s'", err, string(b)) - return nil, err + + if err := json.Unmarshal(b, errresp); err != nil { + logger.Errorf("Error occurred unmarshalling JSON: '%v' JSON: '%s'", err, string(b)) + return nil, err + } + + armError := &errresp.Body + // If error code is ResourceDeploymentFailure then RP error is defined in the child object field: "details + switch armError.Code { + case apierror.ResourceDeploymentFailure, + apierror.InvalidTemplateDeployment, + apierror.DeploymentFailed: + // StatusMessage.error.details array supports multiple errors but in this particular case + // DeploymentOperationProperties contains error from one specific resource type so the + // chances of multiple deployment errors being returned for a single resource type is slim + // (but possible) based on current error/QoS analysis. In those cases where multiple errors + // are returned ACS will pick the first error code for determining whether this is an internal + // or a client error. This can be reevaluated later based on practical experience. + // However, note that customer will be returned the entire contents of "StatusMessage" object + // (like before) so they have access to all the errors returned by ARM. + logger.Infof("Found %s error code - error response = '%v'", armError.Code, armError) + if len(armError.Details) > 0 { + armError = &armError.Details[0] } } - errresp.Body.Category = getErrorCategory(errresp.Body.Code) - return errresp, nil + armError.Category = getErrorCategory(armError.Code) + return armError, nil } func getErrorCategory(code apierror.ErrorCode) apierror.ErrorCategory { @@ -46,14 +74,63 @@ func getErrorCategory(code apierror.ErrorCode) apierror.ErrorCategory { } } -// GetDeploymentError returns deployment error -func GetDeploymentError(res *resources.DeploymentExtended, az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string) (*apierror.Error, error) { +// DeployTemplateSync deploys the template and returns apierror +func DeployTemplateSync(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string, template map[string]interface{}, parameters map[string]interface{}) *apierror.Error { + depExt, depErr := az.DeployTemplate(resourceGroupName, deploymentName, template, parameters, nil) + if depErr == nil { + return nil + } + logger.Infof("Getting detailed deployment errors for %s", deploymentName) - if res == nil || res.Properties == nil || res.Properties.ProvisioningState == nil { - return nil, nil + if depExt == nil { + logger.Warn("DeploymentExtended is nil") + return &apierror.Error{ + Code: apierror.InternalOperationError, + Message: depErr.Error(), + Category: apierror.InternalError} + } + + // try to extract error from ARM Response + var armErr *apierror.Error + if depExt.Response.Response != nil && depExt.Body != nil { + buf := new(bytes.Buffer) + buf.ReadFrom(depExt.Body) + logger.Infof("StatusCode: %d, Error: %s", depExt.Response.StatusCode, buf.String()) + if resp, err := toError(logger, buf.Bytes()); err == nil { + switch { + case depExt.Response.StatusCode < 500 && depExt.Response.StatusCode >= 400: + resp.Category = apierror.ClientError + case depExt.Response.StatusCode >= 500: + resp.Category = apierror.InternalError + } + armErr = resp + } else { + logger.Errorf("unable to unmarshal response into apierror: %v", err) + } + } else { + logger.Errorf("Got error from Azure SDK without response from ARM") + // This is the failed sdk validation before calling ARM path + return &apierror.Error{ + Code: apierror.InternalOperationError, + Message: depErr.Error(), + Category: apierror.InternalError} } - properties := res.Properties + + // Check that ARM returned ErrorResponse + if armErr == nil || len(armErr.Message) == 0 || len(armErr.Code) == 0 { + logger.Warn("Not an ARM Response") + return &apierror.Error{ + Code: apierror.InternalOperationError, + Message: depErr.Error(), + Category: apierror.InternalError} + } + + if depExt.Properties == nil || depExt.Properties.ProvisioningState == nil { + logger.Warn("No resources.DeploymentExtended.Properties") + return armErr + } + properties := depExt.Properties switch *properties.ProvisioningState { case string(api.Canceled): @@ -61,7 +138,7 @@ func GetDeploymentError(res *resources.DeploymentExtended, az ACSEngineClient, l return &apierror.Error{ Code: apierror.ProvisioningFailed, Message: "template deployment has been canceled", - Category: apierror.ClientError}, nil + Category: apierror.ClientError} case string(api.Failed): var top int32 = 1 @@ -69,7 +146,7 @@ func GetDeploymentError(res *resources.DeploymentExtended, az ACSEngineClient, l res, err := az.ListDeploymentOperations(resourceGroupName, deploymentName, &top) if err != nil { logger.Errorf("unable to list deployment operations %s. error: %v", deploymentName, err) - return nil, err + return armErr } results[0] = res @@ -82,20 +159,25 @@ func GetDeploymentError(res *resources.DeploymentExtended, az ACSEngineClient, l results = append(results, res) } - return analyzeDeploymentResultAndSaveError(resourceGroupName, deploymentName, results, logger) + apierr, err := analyzeDeploymentResultAndSaveError(resourceGroupName, deploymentName, results, logger) + if err != nil || apierr == nil { + return armErr + } + return apierr default: - return nil, nil + logger.Warningf("Unexpected ProvisioningState %s", *properties.ProvisioningState) + return armErr } } func analyzeDeploymentResultAndSaveError(resourceGroupName, deploymentName string, operationLists []resources.DeploymentOperationsListResult, logger *logrus.Entry) (*apierror.Error, error) { - var errresp *apierror.ErrorResponse + var apierr *apierror.Error var err error errs := []string{} isInternalErr := false - failedCnt := 0 + for _, operationsList := range operationLists { if operationsList.Value == nil { continue @@ -114,33 +196,31 @@ func analyzeDeploymentResultAndSaveError(resourceGroupName, deploymentName strin logger.Error("either deployment ID or operationID is nil") } - failedCnt++ - errresp, err = toErrorResponse(logger, operation) + apierr, err = parseDeploymentOperation(logger, operation) if err != nil { logger.Errorf("unable to convert deployment operation to error response in deployment %s from ARM. error: %v", deploymentName, err) return nil, err } - if errresp.Body.Category == apierror.InternalError { + if apierr.Category == apierror.InternalError { isInternalErr = true } - errs = append(errs, errresp.Error()) + errs = append(errs, apierr.Error()) } } provisionErr := &apierror.Error{} - if failedCnt > 0 { + if len(errs) > 0 { if isInternalErr { provisionErr.Category = apierror.InternalError } else { provisionErr.Category = apierror.ClientError } - if failedCnt == 1 { - provisionErr = &errresp.Body + if len(errs) == 1 { + provisionErr = apierr } else { provisionErr.Code = apierror.ProvisioningFailed provisionErr.Message = strings.Join(errs, "\n") } return provisionErr, nil } - return nil, nil } diff --git a/pkg/operations/kubernetesupgrade/upgradeagentnode.go b/pkg/operations/kubernetesupgrade/upgradeagentnode.go index deb2e94b49..207b61e654 100644 --- a/pkg/operations/kubernetesupgrade/upgradeagentnode.go +++ b/pkg/operations/kubernetesupgrade/upgradeagentnode.go @@ -80,26 +80,7 @@ func (kan *UpgradeAgentNode) CreateNode(poolName string, agentNo int) error { deploymentSuffix := random.Int31() deploymentName := fmt.Sprintf("agent-%s-%d", time.Now().Format("06-01-02T15.04.05"), deploymentSuffix) - depExt, err := kan.Client.DeployTemplate( - kan.ResourceGroup, - deploymentName, - kan.TemplateMap, - kan.ParametersMap, - nil) - - if err == nil { - return nil - } - - kan.logger.Errorf("Deployment %s failed with error %v", deploymentName, err) - // Get deployment error details - deploymentError, e := armhelpers.GetDeploymentError(depExt, kan.Client, kan.logger, kan.ResourceGroup, deploymentName) - if e != nil { - kan.logger.Errorf("Failed to get error details for deployment %s: %v", deploymentName, e) - // return original deployment error - return err - } - return deploymentError + return armhelpers.DeployTemplateSync(kan.Client, kan.logger, kan.ResourceGroup, deploymentName, kan.TemplateMap, kan.ParametersMap) } // Validate will verify that agent node has been upgraded as expected. From 7c40de08b8767df493cb98bb15a5c955cf623248 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Fri, 26 Jan 2018 10:16:25 -0800 Subject: [PATCH 09/12] fixed apierror_test --- pkg/apierror/apierror_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/apierror/apierror_test.go b/pkg/apierror/apierror_test.go index 08f46a86b3..563911de2f 100644 --- a/pkg/apierror/apierror_test.go +++ b/pkg/apierror/apierror_test.go @@ -26,8 +26,8 @@ func TestAcsNewAPIError(t *testing.T) { apiError := New( ClientError, - ScaleDownInternalError, + InvalidSubscriptionStateTransition, "error test") - Expect(apiError.Body.Code).Should(Equal(ErrorCode("ScaleDownInternalError"))) + Expect(apiError.Body.Code).Should(Equal(ErrorCode("InvalidSubscriptionStateTransition"))) } From 283e3ac743dd877a01584d17c82ddb42a79388bc Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Fri, 26 Jan 2018 11:00:01 -0800 Subject: [PATCH 10/12] fixed lint warning --- pkg/apierror/const.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/apierror/const.go b/pkg/apierror/const.go index 6c2e5a1147..b5fcb7eade 100644 --- a/pkg/apierror/const.go +++ b/pkg/apierror/const.go @@ -15,7 +15,7 @@ const ( InternalError ErrorCategory = "InternalError" ) -// Common Azure Resource Provider API error code +// ErrorCode is Common Azure Resource Provider API error code type ErrorCode string const ( From 59cce2a643a1195fce5837a1470fed7d68c370f6 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Fri, 26 Jan 2018 11:24:14 -0800 Subject: [PATCH 11/12] changed func signature --- pkg/armhelpers/deploymentError.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/armhelpers/deploymentError.go b/pkg/armhelpers/deploymentError.go index 88d6fb7d77..3e63710c32 100644 --- a/pkg/armhelpers/deploymentError.go +++ b/pkg/armhelpers/deploymentError.go @@ -75,7 +75,7 @@ func getErrorCategory(code apierror.ErrorCode) apierror.ErrorCategory { } // DeployTemplateSync deploys the template and returns apierror -func DeployTemplateSync(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string, template map[string]interface{}, parameters map[string]interface{}) *apierror.Error { +func DeployTemplateSync(az ACSEngineClient, logger *logrus.Entry, resourceGroupName, deploymentName string, template map[string]interface{}, parameters map[string]interface{}) error { depExt, depErr := az.DeployTemplate(resourceGroupName, deploymentName, template, parameters, nil) if depErr == nil { return nil From 8a84f967837a14b92603afd7afbf9498c0ef32f2 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Fri, 26 Jan 2018 16:08:31 -0800 Subject: [PATCH 12/12] added unittests for DeployTemplateSync --- pkg/armhelpers/deploymentError_test.go | 55 +++++++++++++++++++++++++ pkg/armhelpers/mockclients.go | 56 ++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 4 deletions(-) create mode 100644 pkg/armhelpers/deploymentError_test.go diff --git a/pkg/armhelpers/deploymentError_test.go b/pkg/armhelpers/deploymentError_test.go new file mode 100644 index 0000000000..0872561d6a --- /dev/null +++ b/pkg/armhelpers/deploymentError_test.go @@ -0,0 +1,55 @@ +package armhelpers + +import ( + "testing" + + "github.com/Azure/acs-engine/pkg/apierror" + . "github.com/Azure/acs-engine/pkg/test" + . "github.com/onsi/gomega" + + . "github.com/onsi/ginkgo" + log "github.com/sirupsen/logrus" +) + +func TestUpgradeCluster(t *testing.T) { + RunSpecsWithReporters(t, "templatedeployment", "Server Suite") +} + +var _ = Describe("Template deployment tests", func() { + + It("Should return InternalOperationError error code", func() { + mockClient := &MockACSEngineClient{} + mockClient.FailDeployTemplate = true + logger := log.NewEntry(log.New()) + + err := DeployTemplateSync(mockClient, logger, "rg1", "agentvm", map[string]interface{}{}, map[string]interface{}{}) + Expect(err).NotTo(BeNil()) + apierr, ok := err.(*apierror.Error) + Expect(ok).To(BeTrue()) + Expect(apierr.Code).To(Equal(apierror.InternalOperationError)) + }) + + It("Should return QuotaExceeded error code, specified in details", func() { + mockClient := &MockACSEngineClient{} + mockClient.FailDeployTemplateQuota = true + logger := log.NewEntry(log.New()) + + err := DeployTemplateSync(mockClient, logger, "rg1", "agentvm", map[string]interface{}{}, map[string]interface{}{}) + Expect(err).NotTo(BeNil()) + apierr, ok := err.(*apierror.Error) + Expect(ok).To(BeTrue()) + Expect(apierr.Code).To(Equal(apierror.QuotaExceeded)) + }) + + It("Should return Conflict error code, specified in details", func() { + mockClient := &MockACSEngineClient{} + mockClient.FailDeployTemplateConflict = true + logger := log.NewEntry(log.New()) + + err := DeployTemplateSync(mockClient, logger, "rg1", "agentvm", map[string]interface{}{}, map[string]interface{}{}) + Expect(err).NotTo(BeNil()) + apierr, ok := err.(*apierror.Error) + Expect(ok).To(BeTrue()) + Expect(apierr.Code).To(Equal(apierror.Conflict)) + }) +}) diff --git a/pkg/armhelpers/mockclients.go b/pkg/armhelpers/mockclients.go index 6d67b13190..ae57d8e053 100644 --- a/pkg/armhelpers/mockclients.go +++ b/pkg/armhelpers/mockclients.go @@ -1,7 +1,11 @@ package armhelpers import ( + "bytes" + "errors" "fmt" + "io/ioutil" + "net/http" "time" "github.com/Azure/azure-sdk-for-go/arm/authorization" @@ -17,6 +21,8 @@ import ( //MockACSEngineClient is an implementation of ACSEngineClient where all requests error out type MockACSEngineClient struct { FailDeployTemplate bool + FailDeployTemplateQuota bool + FailDeployTemplateConflict bool FailEnsureResourceGroup bool FailListVirtualMachines bool FailListVirtualMachineScaleSets bool @@ -125,11 +131,53 @@ func (mc *MockACSEngineClient) AddAcceptLanguages(languages []string) { //DeployTemplate mock func (mc *MockACSEngineClient) DeployTemplate(resourceGroup, name string, template, parameters map[string]interface{}, cancel <-chan struct{}) (*resources.DeploymentExtended, error) { - if mc.FailDeployTemplate { - return nil, fmt.Errorf("DeployTemplate failed") + switch { + case mc.FailDeployTemplate: + return nil, errors.New("DeployTemplate failed") + + case mc.FailDeployTemplateQuota: + errmsg := `resources.DeploymentsClient#CreateOrUpdate: Failure responding to request: StatusCode=400 -- Original Error: autorest/azure: Service returned an error.` + resp := `{ +"error":{ + "code":"InvalidTemplateDeployment", + "message":"The template deployment is not valid according to the validation procedure. The tracking id is 'b5bd7d6b-fddf-4ec3-a3b0-ce285a48bd31'. See inner errors for details. Please see https://aka.ms/arm-deploy for usage details.", + "details":[{ + "code":"QuotaExceeded", + "message":"Operation results in exceeding quota limits of Core. Maximum allowed: 10, Current in use: 10, Additional requested: 2. Please read more about quota increase at http://aka.ms/corequotaincrease." +}]}}` + + return &resources.DeploymentExtended{ + Response: autorest.Response{ + Response: &http.Response{ + Status: "400 Bad Request", + StatusCode: 400, + Body: ioutil.NopCloser(bytes.NewReader([]byte(resp))), + }}}, + errors.New(errmsg) + + case mc.FailDeployTemplateConflict: + errmsg := `resources.DeploymentsClient#CreateOrUpdate: Failure sending request: StatusCode=200 -- Original Error: Long running operation terminated with status 'Failed': Code="DeploymentFailed" Message="At least one resource deployment operation failed. Please list deployment operations for details. Please see https://aka.ms/arm-debug for usage details.` + resp := `{ +"status":"Failed", +"error":{ + "code":"DeploymentFailed", + "message":"At least one resource deployment operation failed. Please list deployment operations for details. Please see https://aka.ms/arm-debug for usage details.", + "details":[{ + "code":"Conflict", + "message":"{\r\n \"error\": {\r\n \"code\": \"PropertyChangeNotAllowed\",\r\n \"target\": \"dataDisk.createOption\",\r\n \"message\": \"Changing property 'dataDisk.createOption' is not allowed.\"\r\n }\r\n}" +}]}}` + return &resources.DeploymentExtended{ + Response: autorest.Response{ + Response: &http.Response{ + Status: "200 OK", + StatusCode: 200, + Body: ioutil.NopCloser(bytes.NewReader([]byte(resp))), + }}}, + errors.New(errmsg) + + default: + return nil, nil } - - return nil, nil } //EnsureResourceGroup mock