From ea8e736a13d0b0f5a0ab90d1d97bb4ccd9ef1300 Mon Sep 17 00:00:00 2001 From: Aditya Maru Date: Tue, 28 Jun 2022 15:55:30 -0400 Subject: [PATCH] amazon: add custom retryer to retry on `read: connection reset` This change implements a custom retryer that we use when initializing a new s3 client for interaction with the external storage sink. This change was motivated by the increased number of backup job failures we were observing with a `read: connection reset` error being thrown by s3. A read connection reset error is thrown when the SDK is unable to read the response of an underlying API request due to a connection reset. The DefaultRetryer in the AWS SDK does not treat this error as a retryable error since the SDK does not have knowledge about the idempotence of the request, and whether it is safe to retry - https://github.com/aws/aws-sdk-go/pull/2926#issuecomment-553637658. In CRDB all operations with s3 (read, write, list) are considered idempotent, and so we can treat the read connection reset error as retryable too. Release note (bug fix): Retry s3 operations when they error out with a read connection reset error instead of failing the top level job. --- pkg/cloud/amazon/BUILD.bazel | 11 +++++---- pkg/cloud/amazon/s3_storage.go | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/pkg/cloud/amazon/BUILD.bazel b/pkg/cloud/amazon/BUILD.bazel index 146fdc92c8d7..3098b179ff5a 100644 --- a/pkg/cloud/amazon/BUILD.bazel +++ b/pkg/cloud/amazon/BUILD.bazel @@ -11,17 +11,21 @@ go_library( deps = [ "//pkg/base", "//pkg/cloud", - "//pkg/roachpb:with-mocks", + "//pkg/roachpb", "//pkg/server/telemetry", "//pkg/settings", "//pkg/settings/cluster", "//pkg/util/contextutil", + "//pkg/util/ioctx", "//pkg/util/log", "//pkg/util/syncutil", "//pkg/util/tracing", "@com_github_aws_aws_sdk_go//aws", "@com_github_aws_aws_sdk_go//aws/awserr", + "@com_github_aws_aws_sdk_go//aws/client", "@com_github_aws_aws_sdk_go//aws/credentials", + "@com_github_aws_aws_sdk_go//aws/credentials/stscreds", + "@com_github_aws_aws_sdk_go//aws/request", "@com_github_aws_aws_sdk_go//aws/session", "@com_github_aws_aws_sdk_go//service/kms", "@com_github_aws_aws_sdk_go//service/s3", @@ -35,7 +39,6 @@ go_test( name = "amazon_test", srcs = [ "aws_kms_test.go", - "kms_test.go", "s3_storage_test.go", ], embed = [":amazon"], @@ -44,8 +47,8 @@ go_test( "//pkg/blobs", "//pkg/cloud", "//pkg/cloud/cloudtestutils", - "//pkg/roachpb:with-mocks", - "//pkg/security", + "//pkg/roachpb", + "//pkg/security/username", "//pkg/settings/cluster", "//pkg/testutils", "//pkg/testutils/skip", diff --git a/pkg/cloud/amazon/s3_storage.go b/pkg/cloud/amazon/s3_storage.go index e5aa0750c6df..f80c105df44b 100644 --- a/pkg/cloud/amazon/s3_storage.go +++ b/pkg/cloud/amazon/s3_storage.go @@ -22,7 +22,9 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/aws/client" "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/s3" "github.com/aws/aws-sdk-go/service/s3/s3manager" @@ -81,6 +83,39 @@ type s3Storage struct { cached *s3Client } +var _ request.Retryer = &customRetryer{} + +// customRetryer implements the `request.Retryer` interface and allows for +// customization of the retry behavior of an AWS client. +type customRetryer struct { + client.DefaultRetryer +} + +// isErrReadConnectionReset returns true if the underlying error is a read +// connection reset error. +// +// NB: A read connection reset error is thrown when the SDK is unable to read +// the response of an underlying API request due to a connection reset. The +// DefaultRetryer in the AWS SDK does not treat this error as a retryable error +// since the SDK does not have knowledge about the idempotence of the request, +// and whether it is safe to retry - +// https://github.com/aws/aws-sdk-go/pull/2926#issuecomment-553637658. +// +// In CRDB all operations with s3 (read, write, list) are considered idempotent, +// and so we can treat the read connection reset error as retryable too. +func isErrReadConnectionReset(err error) bool { + // The error string must match the one in + // github.com/aws/aws-sdk-go/aws/request/connection_reset_error.go. This is + // unfortunate but the only solution until the SDK exposes a specialized error + // code or type for this class of errors. + return err != nil && strings.Contains(err.Error(), "read: connection reset") +} + +// ShouldRetry implements the request.Retryer interface. +func (sr *customRetryer) ShouldRetry(r *request.Request) bool { + return sr.DefaultRetryer.ShouldRetry(r) || isErrReadConnectionReset(r.Error) +} + // s3Client wraps an SDK client and uploader for a given session. type s3Client struct { client *s3.S3 @@ -340,6 +375,12 @@ func newClient( opts.Config.LogLevel = aws.LogLevel(aws.LogDebugWithRequestRetries | aws.LogDebugWithRequestErrors) } + retryer := &customRetryer{ + DefaultRetryer: client.DefaultRetryer{ + NumMaxRetries: *opts.Config.MaxRetries, + }, + } + opts.Config.Retryer = retryer sess, err := session.NewSessionWithOptions(opts) if err != nil { return s3Client{}, "", errors.Wrap(err, "new aws session")