diff --git a/.changelog/308.txt b/.changelog/308.txt new file mode 100644 index 000000000..c9c617290 --- /dev/null +++ b/.changelog/308.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +resource/ec_deployment: Retries the Shutdown API call on the destroy operation up to 3 times when the transient "Timeout Exceeded" error is returned from the Elastic Cloud API. +``` \ No newline at end of file diff --git a/ec/ecresource/deploymentresource/delete.go b/ec/ecresource/deploymentresource/delete.go index 306fb5326..4f5030d98 100644 --- a/ec/ecresource/deploymentresource/delete.go +++ b/ec/ecresource/deploymentresource/delete.go @@ -20,53 +20,78 @@ package deploymentresource import ( "context" "errors" + "strings" "github.com/elastic/cloud-sdk-go/pkg/api" "github.com/elastic/cloud-sdk-go/pkg/api/deploymentapi" "github.com/elastic/cloud-sdk-go/pkg/client/deployments" "github.com/elastic/cloud-sdk-go/pkg/multierror" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" ) -// Delete shuts down and deletes the remote deployment. -func deleteResource(_ context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { +// Delete shuts down and deletes the remote deployment, retrying up to 3 times +// the Shutdown API call in case the plan returns with a failure that contains +// the "Timeout Exceeded" string, which is a fairly common transient error state +// returned from the API. 
+func deleteResource(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { + const maxRetries = 3 + var retries int + timeout := d.Timeout(schema.TimeoutDelete) client := meta.(*api.API) - if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{ - API: client, DeploymentID: d.Id(), - }); err != nil { - if alreadyDestroyed(err) { - d.SetId("") - return nil + return diag.FromErr(resource.RetryContext(ctx, timeout, func() *resource.RetryError { + if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{ + API: client, DeploymentID: d.Id(), + }); err != nil { + if alreadyDestroyed(err) { + d.SetId("") + return nil + } + return resource.NonRetryableError(multierror.NewPrefixed( + "failed shutting down the deployment", err, + )) } - return diag.FromErr(multierror.NewPrefixed( - "failed shutting down the deployment", err, - )) - } - if err := WaitForPlanCompletion(client, d.Id()); err != nil { - return diag.FromErr(err) - } + if err := WaitForPlanCompletion(client, d.Id()); err != nil { + if shouldRetryShutdown(err, retries, maxRetries) { + retries++ + return resource.RetryableError(err) + } + return resource.NonRetryableError(err) + } - if err := handleTrafficFilterChange(d, client); err != nil { - return diag.FromErr(err) - } + if err := handleTrafficFilterChange(d, client); err != nil { + return resource.NonRetryableError(err) + } - // We don't particularly care if delete succeeds or not. It's better to - // remove it, but it might fail on ESS. For example, when user's aren't - // allowed to delete deployments, or on ECE when the cluster is "still - // being shutdown". Sumarizing, even if the call fails the deployment - // won't be there. - _, _ = deploymentapi.Delete(deploymentapi.DeleteParams{ - API: client, DeploymentID: d.Id(), - }) + // We don't particularly care if delete succeeds or not. It's better to + // remove it, but it might fail on ESS. 
For example, when users aren't + // allowed to delete deployments, or on ECE when the cluster is "still + // being shutdown". Summarizing, even if the call fails the deployment + // won't be there. + _, _ = deploymentapi.Delete(deploymentapi.DeleteParams{ + API: client, DeploymentID: d.Id(), + }) - d.SetId("") - return nil + d.SetId("") + return nil + })) } func alreadyDestroyed(err error) bool { var destroyed *deployments.ShutdownDeploymentNotFound return errors.As(err, &destroyed) } + +func shouldRetryShutdown(err error, retries, maxRetries int) bool { + const timeout = "Timeout exceeded" + needsRetry := retries < maxRetries + + var isTimeout bool + if err != nil { + isTimeout = strings.Contains(err.Error(), timeout) + } + return isTimeout && needsRetry +} diff --git a/ec/ecresource/deploymentresource/delete_test.go b/ec/ecresource/deploymentresource/delete_test.go index 21746d418..fea5882eb 100644 --- a/ec/ecresource/deploymentresource/delete_test.go +++ b/ec/ecresource/deploymentresource/delete_test.go @@ -19,10 +19,12 @@ package deploymentresource import ( "context" + "errors" "testing" "github.com/elastic/cloud-sdk-go/pkg/api" "github.com/elastic/cloud-sdk-go/pkg/api/mock" + "github.com/elastic/cloud-sdk-go/pkg/multierror" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" "github.com/stretchr/testify/assert" @@ -55,7 +57,6 @@ func Test_deleteResource(t *testing.T) { wantTC404.SetId("") type args struct { - ctx context.Context d *schema.ResourceData meta interface{} } @@ -95,7 +96,7 @@ func Test_deleteResource(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := deleteResource(tt.args.ctx, tt.args.d, tt.args.meta) + got := deleteResource(context.Background(), tt.args.d, tt.args.meta) assert.Equal(t, tt.want, got) var want interface{} if tt.wantRD != nil { @@ -113,3 +114,78 @@ func Test_deleteResource(t *testing.T) 
{ }) } } + +func Test_shouldRetryShutdown(t *testing.T) { + type args struct { + err error + retries int + maxRetries int + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "returns false when error doesn't contain timeout string", + args: args{ + err: errors.New("some error"), + retries: 1, + maxRetries: 10, + }, + want: false, + }, + { + name: "returns false when the error is nil", + args: args{ + retries: 1, + maxRetries: 10, + }, + want: false, + }, + { + name: "returns false when error doesn't contain timeout string", + args: args{ + err: errors.New("timeout exceeded"), + retries: 1, + maxRetries: 10, + }, + want: false, + }, + { + name: "returns true when error contains timeout string", + args: args{ + err: errors.New("Timeout exceeded"), + retries: 1, + maxRetries: 10, + }, + want: true, + }, + { + name: "returns true when error contains timeout string", + args: args{ + err: multierror.NewPrefixed("aa", + errors.New("Timeout exceeded"), + ), + retries: 1, + maxRetries: 10, + }, + want: true, + }, + { + name: "returns false when error contains timeout string but exceeds max timeouts", + args: args{ + err: errors.New("Timeout exceeded"), + retries: 10, + maxRetries: 10, + }, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := shouldRetryShutdown(tt.args.err, tt.args.retries, tt.args.maxRetries) + assert.Equal(t, tt.want, got) + }) + } +}