Skip to content

Commit

Permalink
ec_deployment: Retry destroy on Timeout Exceeded (#308)
Browse files Browse the repository at this point in the history
Adds retries when the plan tracking message of a deployment shutdown
contains the Timeout Exceeded message, with a maximum of 3 retries.

Signed-off-by: Marc Lopez <[email protected]>
  • Loading branch information
marclop authored May 12, 2021
1 parent 2daa9af commit 28607b8
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .changelog/308.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:enhancement
resource/ec_deployment: Retries the Shutdown API call on the destroy operation up to 3 times when the transient "Timeout Exceeded" error returned from the Elastic Cloud API.
```
81 changes: 53 additions & 28 deletions ec/ecresource/deploymentresource/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,53 +20,78 @@ package deploymentresource
import (
"context"
"errors"
"strings"

"github.com/elastic/cloud-sdk-go/pkg/api"
"github.com/elastic/cloud-sdk-go/pkg/api/deploymentapi"
"github.com/elastic/cloud-sdk-go/pkg/client/deployments"
"github.com/elastic/cloud-sdk-go/pkg/multierror"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
)

// Delete shuts down and deletes the remote deployment.
func deleteResource(_ context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
// Delete shuts down and deletes the remote deployment retrying up to 3 times
// the Shutdown API call in case the plan returns with a failure that contains
// the "Timeout Exceeded" string, which is a fairly common transient error state
// returned from the API.
func deleteResource(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
const maxRetries = 3
var retries int
timeout := d.Timeout(schema.TimeoutDelete)
client := meta.(*api.API)

if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{
API: client, DeploymentID: d.Id(),
}); err != nil {
if alreadyDestroyed(err) {
d.SetId("")
return nil
return diag.FromErr(resource.RetryContext(ctx, timeout, func() *resource.RetryError {
if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{
API: client, DeploymentID: d.Id(),
}); err != nil {
if alreadyDestroyed(err) {
d.SetId("")
return nil
}
return resource.NonRetryableError(multierror.NewPrefixed(
"failed shutting down the deployment", err,
))
}
return diag.FromErr(multierror.NewPrefixed(
"failed shutting down the deployment", err,
))
}

if err := WaitForPlanCompletion(client, d.Id()); err != nil {
return diag.FromErr(err)
}
if err := WaitForPlanCompletion(client, d.Id()); err != nil {
if shouldRetryShutdown(err, retries, maxRetries) {
retries++
return resource.RetryableError(err)
}
return resource.NonRetryableError(err)
}

if err := handleTrafficFilterChange(d, client); err != nil {
return diag.FromErr(err)
}
if err := handleTrafficFilterChange(d, client); err != nil {
return resource.NonRetryableError(err)
}

// We don't particularly care if delete succeeds or not. It's better to
// remove it, but it might fail on ESS. For example, when user's aren't
// allowed to delete deployments, or on ECE when the cluster is "still
// being shutdown". Sumarizing, even if the call fails the deployment
// won't be there.
_, _ = deploymentapi.Delete(deploymentapi.DeleteParams{
API: client, DeploymentID: d.Id(),
})
// We don't particularly care if delete succeeds or not. It's better to
// remove it, but it might fail on ESS. For example, when user's aren't
// allowed to delete deployments, or on ECE when the cluster is "still
// being shutdown". Sumarizing, even if the call fails the deployment
// won't be there.
_, _ = deploymentapi.Delete(deploymentapi.DeleteParams{
API: client, DeploymentID: d.Id(),
})

d.SetId("")
return nil
d.SetId("")
return nil
}))
}

func alreadyDestroyed(err error) bool {
var destroyed *deployments.ShutdownDeploymentNotFound
return errors.As(err, &destroyed)
}

func shouldRetryShutdown(err error, retries, maxRetries int) bool {
const timeout = "Timeout exceeded"
needsRetry := retries < maxRetries

var isTimeout bool
if err != nil {
isTimeout = strings.Contains(err.Error(), timeout)
}
return isTimeout && needsRetry
}
80 changes: 78 additions & 2 deletions ec/ecresource/deploymentresource/delete_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ package deploymentresource

import (
"context"
"errors"
"testing"

"github.com/elastic/cloud-sdk-go/pkg/api"
"github.com/elastic/cloud-sdk-go/pkg/api/mock"
"github.com/elastic/cloud-sdk-go/pkg/multierror"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -55,7 +57,6 @@ func Test_deleteResource(t *testing.T) {
wantTC404.SetId("")

type args struct {
ctx context.Context
d *schema.ResourceData
meta interface{}
}
Expand Down Expand Up @@ -95,7 +96,7 @@ func Test_deleteResource(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := deleteResource(tt.args.ctx, tt.args.d, tt.args.meta)
got := deleteResource(context.Background(), tt.args.d, tt.args.meta)
assert.Equal(t, tt.want, got)
var want interface{}
if tt.wantRD != nil {
Expand All @@ -113,3 +114,78 @@ func Test_deleteResource(t *testing.T) {
})
}
}

func Test_shouldRetryShutdown(t *testing.T) {
type args struct {
err error
retries int
maxRetries int
}
tests := []struct {
name string
args args
want bool
}{
{
name: "returns false when error doesn't contain timeout string",
args: args{
err: errors.New("some error"),
retries: 1,
maxRetries: 10,
},
want: false,
},
{
name: "returns false when the error is nil",
args: args{
retries: 1,
maxRetries: 10,
},
want: false,
},
{
name: "returns false when error doesn't contain timeout string",
args: args{
err: errors.New("timeout exceeded"),
retries: 1,
maxRetries: 10,
},
want: false,
},
{
name: "returns true when error contains timeout string",
args: args{
err: errors.New("Timeout exceeded"),
retries: 1,
maxRetries: 10,
},
want: true,
},
{
name: "returns true when error contains timeout string",
args: args{
err: multierror.NewPrefixed("aa",
errors.New("Timeout exceeded"),
),
retries: 1,
maxRetries: 10,
},
want: true,
},
{
name: "returns false when error contains timeout string but exceeds max timeouts",
args: args{
err: errors.New("Timeout exceeded"),
retries: 10,
maxRetries: 10,
},
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := shouldRetryShutdown(tt.args.err, tt.args.retries, tt.args.maxRetries)
assert.Equal(t, tt.want, got)
})
}
}

0 comments on commit 28607b8

Please sign in to comment.