Skip to content

Commit

Permalink
Add retry for internal 160009 errors (#8017) (#14727)
Browse files Browse the repository at this point in the history
Signed-off-by: Modular Magician <[email protected]>
  • Loading branch information
modular-magician authored and ScottSuarez committed May 25, 2023
1 parent 18e51a1 commit ca82db5
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 42 deletions.
6 changes: 6 additions & 0 deletions .changelog/8017.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
```release-note:bug
serviceusage: added retries to handle internal error: type: "googleapis.com" subject: "160009" when activating services
```
```release-note:bug
cloudresourcemanager: added retries to handle internal error: type: "googleapis.com" subject: "160009" when activating "compute.googleapis.com" to destroy the default network when `auto_create_network` is `false`
```
90 changes: 64 additions & 26 deletions google/resource_google_project.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,41 +629,79 @@ func EnableServiceUsageProjectServices(services []string, project, billingProjec

// doEnableServicesRequest enables the given services on project and waits for
// the resulting serviceusage operation.
//
// A single service is enabled through Services.Enable and several through
// Services.BatchEnable. When config.UserProjectOverride is set and
// billingProject is non-empty, the X-Goog-User-Project header is added to the
// call. The request is issued inside transport_tpg.RetryTimeDuration bounded
// by timeout, and the returned operation is handed to
// serviceUsageOperationWait. Returns nil on success and a non-nil error
// otherwise.
func doEnableServicesRequest(services []string, project, billingProject, userAgent string, config *transport_tpg.Config, timeout time.Duration) error {
var op *serviceusage.Operation
var call ServicesCall
err := transport_tpg.RetryTimeDuration(func() error {
var rerr error
if len(services) == 1 {
// BatchEnable returns an error for a single item, so just enable
// using service endpoint.
name := fmt.Sprintf("projects/%s/services/%s", project, services[0])
req := &serviceusage.EnableServiceRequest{}
call = config.NewServiceUsageClient(userAgent).Services.Enable(name, req)
} else {
// Batch enable for multiple services.
name := fmt.Sprintf("projects/%s", project)
req := &serviceusage.BatchEnableServicesRequest{ServiceIds: services}
call = config.NewServiceUsageClient(userAgent).Services.BatchEnable(name, req)

// Errors can surface at several points, so retries are nested here.
// logicalErr / waitErr: the overall error from the logical operation
// (enabling services); this can also include the rare errors hit while
// retrieving the LRO.
// err / reqErr: precondition errors returned in place of an LRO when the
// request is sent.
logicalErr := transport_tpg.RetryTimeDuration(func() error {
err := transport_tpg.RetryTimeDuration(func() error {
var reqErr error
var call ServicesCall
if len(services) == 1 {
// BatchEnable returns an error for a single item, so enable with single endpoint
name := fmt.Sprintf("projects/%s/services/%s", project, services[0])
req := &serviceusage.EnableServiceRequest{}
call = config.NewServiceUsageClient(userAgent).Services.Enable(name, req)
} else {
// Batch enable for multiple services.
name := fmt.Sprintf("projects/%s", project)
req := &serviceusage.BatchEnableServicesRequest{ServiceIds: services}
call = config.NewServiceUsageClient(userAgent).Services.BatchEnable(name, req)
}

if config.UserProjectOverride && billingProject != "" {
call.Header().Add("X-Goog-User-Project", billingProject)
}

op, reqErr = call.Do()
return handleServiceUsageRetryablePreconditionError(reqErr)
},
timeout,
transport_tpg.ServiceUsageServiceBeingActivated,
)
if err != nil {
return errwrap.Wrapf("failed on request preconditions: {{err}}", err)
}
if config.UserProjectOverride && billingProject != "" {
call.Header().Add("X-Goog-User-Project", billingProject)

waitErr := serviceUsageOperationWait(config, op, billingProject, fmt.Sprintf("Enable Project %q Services: %+v", project, services), userAgent, timeout)
if waitErr != nil {
return waitErr
}
op, rerr = call.Do()
return handleServiceUsageRetryableError(rerr)

return nil
},
timeout,
transport_tpg.ServiceUsageServiceBeingActivated,
transport_tpg.ServiceUsageInternalError160009,
)
if err != nil {
return errwrap.Wrapf("failed to send enable services request: {{err}}", err)
}
// Wait on the operation returned by the enable request.
waitErr := serviceUsageOperationWait(config, op, billingProject, fmt.Sprintf("Enable Project %q Services: %+v", project, services), userAgent, timeout)
if waitErr != nil {
return waitErr

if logicalErr != nil {
return errwrap.Wrapf("failed to enable services: {{err}}", logicalErr)
}

return nil
}

// handleServiceUsageRetryablePreconditionError rewrites a call-time
// serviceusage precondition failure as a 503 error.
//
// A *googleapi.Error whose code is 400 or 412 and whose message is exactly
// "Precondition check failed." is replaced by a new *googleapi.Error with
// code 503 (presumably so the surrounding retry logic treats it as transient;
// confirm against the retry helper). A nil error and every other error are
// returned unchanged.
//
// The errors in question are the ones listed at
// https://cloud.google.com/service-usage/docs/reference/rest/v1/services/batchEnable#response-body
// Errors reported by operations are handled separately.
// NOTE(rileykarson): This should probably be turned into a retry predicate
func handleServiceUsageRetryablePreconditionError(err error) error {
	apiErr, isAPIErr := err.(*googleapi.Error)
	if !isAPIErr {
		// Covers both a nil error and errors of any other type.
		return err
	}

	switch apiErr.Code {
	case 400, 412:
		if apiErr.Message == "Precondition check failed." {
			return &googleapi.Error{
				Code:    503,
				Message: "api returned \"precondition failed\" while enabling service",
			}
		}
	}
	return err
}

// Retrieve a project's services from the API
// if a service has been renamed, this function will list both the old and new
// forms of the service. LIST responses are expected to return only the old or
Expand Down
16 changes: 0 additions & 16 deletions google/serviceusage_operation.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"time"

transport_tpg "github.com/hashicorp/terraform-provider-google/google/transport"
"google.golang.org/api/googleapi"
"google.golang.org/api/serviceusage/v1"
)

Expand All @@ -22,18 +21,3 @@ func serviceUsageOperationWait(config *transport_tpg.Config, op *serviceusage.Op
}
return ServiceUsageOperationWaitTime(config, m, project, activity, userAgent, timeout)
}

// handleServiceUsageRetryableError converts a serviceusage precondition
// failure into a 503 error.
//
// Only a *googleapi.Error with code 400 or 412 and the exact message
// "Precondition check failed." is converted; nil and all other errors are
// passed through untouched.
func handleServiceUsageRetryableError(err error) error {
	const preconditionMessage = "Precondition check failed."

	gerr, isGoogleErr := err.(*googleapi.Error)
	if !isGoogleErr || gerr.Message != preconditionMessage {
		return err
	}
	if gerr.Code != 400 && gerr.Code != 412 {
		return err
	}

	replacement := &googleapi.Error{
		Code:    503,
		Message: "api returned \"precondition failed\" while enabling service",
	}
	return replacement
}
14 changes: 14 additions & 0 deletions google/transport/error_retry_predicates.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,20 @@ func ServiceUsageServiceBeingActivated(err error) (bool, string) {
return false, ""
}

// ServiceUsageInternalError160009 is a retry predicate for the internal
// "160009" error that can be reported when activating services. See
// https://github.com/hashicorp/terraform-provider-google/issues/14691 for
// details on the error message this handles.
//
// This is a post-operation error, so it would be surfaced as a
// tpgresource.CommonOpError rather than a googleapi.Error. A cyclical
// dependency between transport and tpgresource blocks using that type here,
// so the predicate matches on the error string instead.
//
// It returns (true, reason) when the error text contains all of
// "encountered internal error", "160009" and "with failed services", and
// (false, "") otherwise, including when err is nil.
func ServiceUsageInternalError160009(err error) (bool, string) {
	// Guard against a nil error: calling Error() on it would panic.
	if err == nil {
		return false, ""
	}

	// Ideally we'd inspect tpgresource.CommonOpError; see the doc comment for
	// why the message text is used instead.
	s := err.Error()
	if strings.Contains(s, "encountered internal error") && strings.Contains(s, "160009") && strings.Contains(s, "with failed services") {
		return true, "retrying internal error 160009."
	}

	return false, ""
}

// Retry if Bigquery operation returns a 403 with a specific message for
// concurrent operations (which are implemented in terms of 'edit quota').
func IsBigqueryIAMQuotaError(err error) (bool, string) {
Expand Down

0 comments on commit ca82db5

Please sign in to comment.