Skip to content

Commit

Permalink
Node pool operations should retry if they encountered quota error. (#…
Browse files Browse the repository at this point in the history
…8828) (#15820)

Signed-off-by: Modular Magician <[email protected]>
  • Loading branch information
modular-magician authored Sep 12, 2023
1 parent c0c6796 commit 68dc6cc
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 34 deletions.
3 changes: 3 additions & 0 deletions .changelog/8828.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
container: fixed concurrent ops' quota-error to be retriable in `google_container_node_pool`
```
56 changes: 39 additions & 17 deletions google/services/container/resource_container_node_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -517,9 +517,11 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
operation, err = clusterNodePoolsCreateCall.Do()

if err != nil {
if tpgresource.IsFailedPreconditionError(err) {
if tpgresource.IsFailedPreconditionError(err) || tpgresource.IsQuotaError(err) {
// We get failed precondition errors if the cluster is updating
// while we try to add the node pool.
// We get quota errors if the number of running concurrent
// operations reaches the quota.
return resource.RetryableError(err)
}
return resource.NonRetryableError(err)
Expand Down Expand Up @@ -722,9 +724,11 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
operation, err = clusterNodePoolsDeleteCall.Do()

if err != nil {
if tpgresource.IsFailedPreconditionError(err) {
if tpgresource.IsFailedPreconditionError(err) || tpgresource.IsQuotaError(err) {
// We get failed precondition errors if the cluster is updating
// while we try to delete the node pool.
// We get quota errors if the number of running concurrent
// operations reaches the quota.
return resource.RetryableError(err)
}
return resource.NonRetryableError(err)
Expand Down Expand Up @@ -1202,7 +1206,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated autoscaling in Node Pool %s", d.Id())
Expand Down Expand Up @@ -1240,7 +1244,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}

Expand Down Expand Up @@ -1294,7 +1298,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated tags for node pool %s", name)
Expand Down Expand Up @@ -1331,7 +1335,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
}

// Call update serially.
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}

Expand Down Expand Up @@ -1369,7 +1373,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
}

// Call update serially.
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}

Expand Down Expand Up @@ -1401,7 +1405,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated image type in Node Pool %s", d.Id())
Expand Down Expand Up @@ -1435,7 +1439,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated workload_metadata_config for node pool %s", name)
Expand Down Expand Up @@ -1468,7 +1472,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}

Expand Down Expand Up @@ -1501,7 +1505,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}

Expand Down Expand Up @@ -1532,7 +1536,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
nodePoolInfo.location, "updating GKE node pool size", userAgent,
timeout)
}
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] GKE node pool %s size has been updated to %d", name, newSize)
Expand Down Expand Up @@ -1567,7 +1571,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
nodePoolInfo.location, "updating GKE node pool management", userAgent, timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated management in Node Pool %s", name)
Expand All @@ -1594,7 +1598,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
nodePoolInfo.project,
nodePoolInfo.location, "updating GKE node pool version", userAgent, timeout)
}
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated version in Node Pool %s", name)
Expand All @@ -1619,7 +1623,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
return ContainerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool node locations", userAgent, timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated node locations in Node Pool %s", name)
Expand Down Expand Up @@ -1699,7 +1703,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
// Wait until it's updated
return ContainerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool upgrade settings", userAgent, timeout)
}
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}
log.Printf("[INFO] Updated upgrade settings in Node Pool %s", name)
Expand Down Expand Up @@ -1730,7 +1734,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
timeout)
}

if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
return err
}

Expand Down Expand Up @@ -1781,3 +1785,21 @@ func containerNodePoolAwaitRestingState(config *transport_tpg.Config, name, proj

return state, err
}

// retryWhileIncompatibleOperation invokes f through transport_tpg.LockedCall
// using lockKey, and keeps retrying via resource.Retry (bounded by timeout)
// for as long as the returned error is a FAILED_PRECONDITION or a
// RESOURCE_EXHAUSTED (quota) error. The former indicates that an incompatible
// operation is already running on the cluster; the latter that the allowed
// number of concurrent operations on the cluster has been reached. Both can
// be safely retried until the blocking operation completes and the newly
// requested operation can begin. Any other error stops the retry loop and is
// returned as-is.
func retryWhileIncompatibleOperation(timeout time.Duration, lockKey string, f func() error) error {
	attempt := func() *resource.RetryError {
		callErr := transport_tpg.LockedCall(lockKey, f)
		switch {
		case callErr == nil:
			return nil
		case tpgresource.IsFailedPreconditionError(callErr), tpgresource.IsQuotaError(callErr):
			// Transient: another operation is in the way, try again.
			return resource.RetryableError(callErr)
		default:
			return resource.NonRetryableError(callErr)
		}
	}
	return resource.Retry(timeout, attempt)
}
31 changes: 14 additions & 17 deletions google/tpgresource/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"github.com/hashicorp/errwrap"
fwDiags "github.com/hashicorp/terraform-plugin-framework/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-plugin-sdk/v2/terraform"
"google.golang.org/api/googleapi"
Expand Down Expand Up @@ -128,6 +127,20 @@ func IsFailedPreconditionError(err error) bool {
return false
}

// IsQuotaError reports whether err contains a *googleapi.Error (located via
// errwrap.GetType) whose Code is 429. It returns false when no such error is
// found or when the located error is a nil pointer.
func IsQuotaError(err error) bool {
	apiErr, found := errwrap.GetType(err, &googleapi.Error{}).(*googleapi.Error)
	return found && apiErr != nil && apiErr.Code == 429
}

func IsConflictError(err error) bool {
if e, ok := err.(*googleapi.Error); ok && (e.Code == 409 || e.Code == 412) {
return true
Expand Down Expand Up @@ -503,22 +516,6 @@ func CheckGoogleIamPolicy(value string) error {
return nil
}

// RetryWhileIncompatibleOperation invokes f through transport_tpg.LockedCall
// using lockKey, and keeps retrying via resource.Retry (bounded by timeout)
// while the returned error is a FAILED_PRECONDITION error, which indicates
// there is an incompatible operation already running on the cluster. Such an
// error can be safely retried until the incompatible operation completes and
// the newly requested operation can begin. Any other error stops the retry
// loop and is returned as-is.
func RetryWhileIncompatibleOperation(timeout time.Duration, lockKey string, f func() error) error {
	return resource.Retry(timeout, func() *resource.RetryError {
		callErr := transport_tpg.LockedCall(lockKey, f)
		if callErr == nil {
			return nil
		}
		if !IsFailedPreconditionError(callErr) {
			return resource.NonRetryableError(callErr)
		}
		// Transient: an incompatible operation is in progress, try again.
		return resource.RetryableError(callErr)
	})
}

func FrameworkDiagsToSdkDiags(fwD fwDiags.Diagnostics) *diag.Diagnostics {
var diags diag.Diagnostics
for _, e := range fwD.Errors() {
Expand Down

0 comments on commit 68dc6cc

Please sign in to comment.