Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ec_deployment: Retry destroy on Timeout Exceeded #308

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/308.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:enhancement
resource/ec_deployment: Retries the Shutdown API call on the destroy operation up to 3 times when the transient "Timeout Exceeded" error returned from the Elastic Cloud API.
```
81 changes: 53 additions & 28 deletions ec/ecresource/deploymentresource/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,53 +20,78 @@ package deploymentresource
import (
"context"
"errors"
"strings"

"github.com/elastic/cloud-sdk-go/pkg/api"
"github.com/elastic/cloud-sdk-go/pkg/api/deploymentapi"
"github.com/elastic/cloud-sdk-go/pkg/client/deployments"
"github.com/elastic/cloud-sdk-go/pkg/multierror"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
)

// Delete shuts down and deletes the remote deployment.
func deleteResource(_ context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
// Delete shuts down and deletes the remote deployment retrying up to 3 times
// the Shutdown API call in case the plan returns with a failure that contains
// the "Timeout Exceeded" string, which is a fairly common transient error state
// returned from the API.
func deleteResource(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
const maxRetries = 3
var retries int
timeout := d.Timeout(schema.TimeoutDelete)
client := meta.(*api.API)

if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{
API: client, DeploymentID: d.Id(),
}); err != nil {
if alreadyDestroyed(err) {
d.SetId("")
return nil
return diag.FromErr(resource.RetryContext(ctx, timeout, func() *resource.RetryError {
if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{
API: client, DeploymentID: d.Id(),
}); err != nil {
if alreadyDestroyed(err) {
d.SetId("")
return nil
}
return resource.NonRetryableError(multierror.NewPrefixed(
"failed shutting down the deployment", err,
))
}
return diag.FromErr(multierror.NewPrefixed(
"failed shutting down the deployment", err,
))
}

if err := WaitForPlanCompletion(client, d.Id()); err != nil {
return diag.FromErr(err)
}
if err := WaitForPlanCompletion(client, d.Id()); err != nil {
if shouldRetryShutdown(err, retries, maxRetries) {
retries++
return resource.RetryableError(err)
}
return resource.NonRetryableError(err)
}

if err := handleTrafficFilterChange(d, client); err != nil {
return diag.FromErr(err)
}
if err := handleTrafficFilterChange(d, client); err != nil {
return resource.NonRetryableError(err)
}

// We don't particularly care if delete succeeds or not. It's better to
// remove it, but it might fail on ESS. For example, when user's aren't
// allowed to delete deployments, or on ECE when the cluster is "still
// being shutdown". Sumarizing, even if the call fails the deployment
// won't be there.
_, _ = deploymentapi.Delete(deploymentapi.DeleteParams{
API: client, DeploymentID: d.Id(),
})
// We don't particularly care if delete succeeds or not. It's better to
// remove it, but it might fail on ESS. For example, when user's aren't
// allowed to delete deployments, or on ECE when the cluster is "still
// being shutdown". Sumarizing, even if the call fails the deployment
// won't be there.
_, _ = deploymentapi.Delete(deploymentapi.DeleteParams{
API: client, DeploymentID: d.Id(),
})

d.SetId("")
return nil
d.SetId("")
return nil
}))
}

func alreadyDestroyed(err error) bool {
var destroyed *deployments.ShutdownDeploymentNotFound
return errors.As(err, &destroyed)
}

func shouldRetryShutdown(err error, retries, maxRetries int) bool {
const timeout = "Timeout exceeded"
needsRetry := retries < maxRetries

var isTimeout bool
if err != nil {
isTimeout = strings.Contains(err.Error(), timeout)
}
return isTimeout && needsRetry
}
80 changes: 78 additions & 2 deletions ec/ecresource/deploymentresource/delete_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ package deploymentresource

import (
"context"
"errors"
"testing"

"github.com/elastic/cloud-sdk-go/pkg/api"
"github.com/elastic/cloud-sdk-go/pkg/api/mock"
"github.com/elastic/cloud-sdk-go/pkg/multierror"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -55,7 +57,6 @@ func Test_deleteResource(t *testing.T) {
wantTC404.SetId("")

type args struct {
ctx context.Context
d *schema.ResourceData
meta interface{}
}
Expand Down Expand Up @@ -95,7 +96,7 @@ func Test_deleteResource(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := deleteResource(tt.args.ctx, tt.args.d, tt.args.meta)
got := deleteResource(context.Background(), tt.args.d, tt.args.meta)
assert.Equal(t, tt.want, got)
var want interface{}
if tt.wantRD != nil {
Expand All @@ -113,3 +114,78 @@ func Test_deleteResource(t *testing.T) {
})
}
}

func Test_shouldRetryShutdown(t *testing.T) {
type args struct {
err error
retries int
maxRetries int
}
tests := []struct {
name string
args args
want bool
}{
{
name: "returns false when error doesn't contain timeout string",
args: args{
err: errors.New("some error"),
retries: 1,
maxRetries: 10,
},
want: false,
},
{
name: "returns false when the error is nil",
args: args{
retries: 1,
maxRetries: 10,
},
want: false,
},
{
name: "returns false when error doesn't contain timeout string",
args: args{
err: errors.New("timeout exceeded"),
retries: 1,
maxRetries: 10,
},
want: false,
},
{
name: "returns true when error contains timeout string",
args: args{
err: errors.New("Timeout exceeded"),
retries: 1,
maxRetries: 10,
},
want: true,
},
{
name: "returns true when error contains timeout string",
args: args{
err: multierror.NewPrefixed("aa",
errors.New("Timeout exceeded"),
),
retries: 1,
maxRetries: 10,
},
want: true,
},
{
name: "returns false when error contains timeout string but exceeds max timeouts",
args: args{
err: errors.New("Timeout exceeded"),
retries: 10,
maxRetries: 10,
},
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := shouldRetryShutdown(tt.args.err, tt.args.retries, tt.args.maxRetries)
assert.Equal(t, tt.want, got)
})
}
}