From de8c2436c4abc6d2ca3eeffe551bfd708b545928 Mon Sep 17 00:00:00 2001 From: Marc Lopez Date: Wed, 5 May 2021 17:20:11 +0500 Subject: [PATCH 1/3] ec_deployment: Retry destroy on Timeout Exceeded Adds retries when the plan tracking message of a deployment shutdown contains the Timeout Exceeded message, with a maximum of 3 retries. Signed-off-by: Marc Lopez --- ec/ecresource/deploymentresource/delete.go | 68 +++++++++++-------- .../deploymentresource/delete_test.go | 3 +- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/ec/ecresource/deploymentresource/delete.go b/ec/ecresource/deploymentresource/delete.go index 306fb5326..a05434974 100644 --- a/ec/ecresource/deploymentresource/delete.go +++ b/ec/ecresource/deploymentresource/delete.go @@ -20,50 +20,64 @@ package deploymentresource import ( "context" "errors" + "log" + "strings" "github.com/elastic/cloud-sdk-go/pkg/api" "github.com/elastic/cloud-sdk-go/pkg/api/deploymentapi" "github.com/elastic/cloud-sdk-go/pkg/client/deployments" "github.com/elastic/cloud-sdk-go/pkg/multierror" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" ) // Delete shuts down and deletes the remote deployment. -func deleteResource(_ context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { +func deleteResource(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { client := meta.(*api.API) + const maxRetries = 3 + var retries int - if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{ - API: client, DeploymentID: d.Id(), - }); err != nil { - if alreadyDestroyed(err) { - d.SetId("") - return nil + return diag.FromErr(resource.RetryContext(ctx, d.Timeout(schema.TimeoutDelete), func() *resource.RetryError { + if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{ + API: client, DeploymentID: d.Id(), + }); err != nil { + if alreadyDestroyed(err) { + d.SetId("") + return nil + } + return resource.NonRetryableError(multierror.NewPrefixed( + "failed shutting down the deployment", err, + )) } - return diag.FromErr(multierror.NewPrefixed( - "failed shutting down the deployment", err, - )) - } - if err := WaitForPlanCompletion(client, d.Id()); err != nil { - return diag.FromErr(err) - } + if err := WaitForPlanCompletion(client, d.Id()); err != nil { + isTimeout := strings.Contains(err.Error(), "Timeout exceeded") + log.Println("[DEBUG]: AA", err.Error(), isTimeout, retries, maxRetries) + if isTimeout && retries < maxRetries { + retries++ + log.Println("[DEBUG]: Retry", retries) + return resource.RetryableError(err) + } + return resource.NonRetryableError(err) + } - if err := handleTrafficFilterChange(d, client); err != nil { - return diag.FromErr(err) - } + if err := handleTrafficFilterChange(d, client); err != nil { + return resource.NonRetryableError(err) + } - // We don't particularly care if delete succeeds or not. It's better to - // remove it, but it might fail on ESS. For example, when user's aren't - // allowed to delete deployments, or on ECE when the cluster is "still - // being shutdown". Sumarizing, even if the call fails the deployment - // won't be there. - _, _ = deploymentapi.Delete(deploymentapi.DeleteParams{ - API: client, DeploymentID: d.Id(), - }) + // We don't particularly care if delete succeeds or not. It's better to + // remove it, but it might fail on ESS. For example, when user's aren't + // allowed to delete deployments, or on ECE when the cluster is "still + // being shutdown". Sumarizing, even if the call fails the deployment + // won't be there. + _, _ = deploymentapi.Delete(deploymentapi.DeleteParams{ + API: client, DeploymentID: d.Id(), + }) - d.SetId("") - return nil + d.SetId("") + return nil + })) } func alreadyDestroyed(err error) bool { diff --git a/ec/ecresource/deploymentresource/delete_test.go b/ec/ecresource/deploymentresource/delete_test.go index 21746d418..dedb316e1 100644 --- a/ec/ecresource/deploymentresource/delete_test.go +++ b/ec/ecresource/deploymentresource/delete_test.go @@ -55,7 +55,6 @@ func Test_deleteResource(t *testing.T) { wantTC404.SetId("") type args struct { - ctx context.Context d *schema.ResourceData meta interface{} } @@ -95,7 +94,7 @@ func Test_deleteResource(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := deleteResource(tt.args.ctx, tt.args.d, tt.args.meta) + got := deleteResource(context.Background(), tt.args.d, tt.args.meta) assert.Equal(t, tt.want, got) var want interface{} if tt.wantRD != nil { From f5e2da22538b010e64628a1bf3505870f70149f5 Mon Sep 17 00:00:00 2001 From: Marc Lopez Date: Thu, 6 May 2021 07:00:46 +0500 Subject: [PATCH 2/3] Refactor for readability Signed-off-by: Marc Lopez --- ec/ecresource/deploymentresource/delete.go | 27 +++++-- .../deploymentresource/delete_test.go | 77 +++++++++++++++++++ 2 files changed, 96 insertions(+), 8 deletions(-) diff --git a/ec/ecresource/deploymentresource/delete.go b/ec/ecresource/deploymentresource/delete.go index a05434974..4f5030d98 100644 --- a/ec/ecresource/deploymentresource/delete.go +++ b/ec/ecresource/deploymentresource/delete.go @@ -20,7 +20,6 @@ package deploymentresource import ( "context" "errors" - "log" "strings" "github.com/elastic/cloud-sdk-go/pkg/api" @@ -32,13 +31,17 @@ import ( "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" ) -// Delete shuts down and deletes the remote deployment. +// Delete shuts down and deletes the remote deployment retrying up to 3 times +// the Shutdown API call in case the plan returns with a failure that contains +// the "Timeout Exceeded" string, which is a fairly common transient error state +// returned from the API. func deleteResource(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { - client := meta.(*api.API) const maxRetries = 3 var retries int + timeout := d.Timeout(schema.TimeoutDelete) + client := meta.(*api.API) - return diag.FromErr(resource.RetryContext(ctx, d.Timeout(schema.TimeoutDelete), func() *resource.RetryError { + return diag.FromErr(resource.RetryContext(ctx, timeout, func() *resource.RetryError { if _, err := deploymentapi.Shutdown(deploymentapi.ShutdownParams{ API: client, DeploymentID: d.Id(), }); err != nil { @@ -52,11 +55,8 @@ func deleteResource(ctx context.Context, d *schema.ResourceData, meta interface{ } if err := WaitForPlanCompletion(client, d.Id()); err != nil { - isTimeout := strings.Contains(err.Error(), "Timeout exceeded") - log.Println("[DEBUG]: AA", err.Error(), isTimeout, retries, maxRetries) - if isTimeout && retries < maxRetries { + if shouldRetryShutdown(err, retries, maxRetries) { retries++ - log.Println("[DEBUG]: Retry", retries) return resource.RetryableError(err) } return resource.NonRetryableError(err) @@ -84,3 +84,14 @@ func alreadyDestroyed(err error) bool { var destroyed *deployments.ShutdownDeploymentNotFound return errors.As(err, &destroyed) } + +func shouldRetryShutdown(err error, retries, maxRetries int) bool { + const timeout = "Timeout exceeded" + needsRetry := retries < maxRetries + + var isTimeout bool + if err != nil { + isTimeout = strings.Contains(err.Error(), timeout) + } + return isTimeout && needsRetry +} diff --git a/ec/ecresource/deploymentresource/delete_test.go b/ec/ecresource/deploymentresource/delete_test.go index dedb316e1..fea5882eb 100644 --- a/ec/ecresource/deploymentresource/delete_test.go +++ b/ec/ecresource/deploymentresource/delete_test.go @@ -19,10 +19,12 @@ package deploymentresource import ( "context" + "errors" "testing" "github.com/elastic/cloud-sdk-go/pkg/api" "github.com/elastic/cloud-sdk-go/pkg/api/mock" + "github.com/elastic/cloud-sdk-go/pkg/multierror" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" "github.com/stretchr/testify/assert" @@ -112,3 +114,78 @@ func Test_deleteResource(t *testing.T) { }) } } + +func Test_shouldRetryShutdown(t *testing.T) { + type args struct { + err error + retries int + maxRetries int + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "returns false when error doesn't contain timeout string", + args: args{ + err: errors.New("some error"), + retries: 1, + maxRetries: 10, + }, + want: false, + }, + { + name: "returns false when the error is nil", + args: args{ + retries: 1, + maxRetries: 10, + }, + want: false, + }, + { + name: "returns false when error doesn't contain timeout string", + args: args{ + err: errors.New("timeout exceeded"), + retries: 1, + maxRetries: 10, + }, + want: false, + }, + { + name: "returns true when error contains timeout string", + args: args{ + err: errors.New("Timeout exceeded"), + retries: 1, + maxRetries: 10, + }, + want: true, + }, + { + name: "returns true when error contains timeout string", + args: args{ + err: multierror.NewPrefixed("aa", + errors.New("Timeout exceeded"), + ), + retries: 1, + maxRetries: 10, + }, + want: true, + }, + { + name: "returns false when error contains timeout string but exceeds max timeouts", + args: args{ + err: errors.New("Timeout exceeded"), + retries: 10, + maxRetries: 10, + }, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := shouldRetryShutdown(tt.args.err, tt.args.retries, tt.args.maxRetries) + assert.Equal(t, tt.want, got) + }) + } +} From b3b172af69cc7ebd660b75f598e62efc7ba30094 Mon Sep 17 00:00:00 2001 From: Marc Lopez Date: Thu, 6 May 2021 07:01:00 +0500 Subject: [PATCH 3/3] Add changelog entry Signed-off-by: Marc Lopez --- .changelog/308.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/308.txt diff --git a/.changelog/308.txt b/.changelog/308.txt new file mode 100644 index 000000000..c9c617290 --- /dev/null +++ b/.changelog/308.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +resource/ec_deployment: Retries the Shutdown API call on the destroy operation up to 3 times when the transient "Timeout Exceeded" error returned from the Elastic Cloud API. +``` \ No newline at end of file