From cb92673fe7933ec468789b4fc37f611cece48b40 Mon Sep 17 00:00:00 2001 From: Rui Hu Date: Fri, 29 Jul 2022 11:03:19 -0400 Subject: [PATCH] cloud/gcp: add custom retryer for gcs storage, retry on stream INTERNAL_ERROR Currently, errors like `stream error: stream ID ; INTERNAL_ERROR; received from peer` are not being retried. Create a custom retryer to retry these errors as suggested by: https://github.com/googleapis/google-cloud-go/issues/3735 https://github.com/googleapis/google-cloud-go/issues/784 Fixes: #85217, #85216, #85204, #84162 Release note: None --- build/bazelutil/nogo_config.json | 3 +- pkg/cloud/gcp/BUILD.bazel | 5 ++ pkg/cloud/gcp/gcs_retry.go | 78 ++++++++++++++++++++++++++++++++ pkg/cloud/gcp/gcs_storage.go | 30 ++++++++++++ pkg/testutils/lint/lint_test.go | 2 + 5 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 pkg/cloud/gcp/gcs_retry.go diff --git a/build/bazelutil/nogo_config.json b/build/bazelutil/nogo_config.json index b17463c08e64..94122bde6c21 100644 --- a/build/bazelutil/nogo_config.json +++ b/build/bazelutil/nogo_config.json @@ -53,7 +53,8 @@ "cockroach/pkg/sql/pgwire/pgerror/pgcode\\.go$": "invalid direct cast on error object", "cockroach/pkg/testutils/lint/lint_test\\.go$": "invalid direct cast on error object", "cockroach/pkg/util/contextutil/timeout_error\\.go$": "invalid direct cast on error object", - "cockroach/pkg/util/sysutil/sysutil_.*": "type can change by system" + "cockroach/pkg/util/sysutil/sysutil_.*": "type can change by system", + "cockroach/pkg/cloud/gcp/gcs_retry\\.go$": "invalid direct cast on error object" }, "only_files": { "cockroach/pkg/.*$": "first-party code" diff --git a/pkg/cloud/gcp/BUILD.bazel b/pkg/cloud/gcp/BUILD.bazel index 81407c889602..db08e83e80d6 100644 --- a/pkg/cloud/gcp/BUILD.bazel +++ b/pkg/cloud/gcp/BUILD.bazel @@ -5,6 +5,7 @@ go_library( name = "gcp", srcs = [ "gcs_kms.go", + "gcs_retry.go", "gcs_storage.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/cloud/gcp", @@ -23,11 +24,15 @@ go_library( "@com_github_gogo_protobuf//types", "@com_google_cloud_go_kms//apiv1", "@com_google_cloud_go_storage//:storage", + "@org_golang_google_api//googleapi", "@org_golang_google_api//impersonate", "@org_golang_google_api//iterator", "@org_golang_google_api//option", "@org_golang_google_genproto//googleapis/cloud/kms/v1:kms", + "@org_golang_google_grpc//codes", + "@org_golang_google_grpc//status", "@org_golang_google_protobuf//types/known/wrapperspb", + "@org_golang_x_net//http2", "@org_golang_x_oauth2//:oauth2", ], ) diff --git a/pkg/cloud/gcp/gcs_retry.go b/pkg/cloud/gcp/gcs_retry.go new file mode 100644 index 000000000000..f70051c18ff9 --- /dev/null +++ b/pkg/cloud/gcp/gcs_retry.go @@ -0,0 +1,78 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package gcp + +import ( + "io" + "net" + "net/url" + "strings" + + "github.com/cockroachdb/errors" + "google.golang.org/api/googleapi" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// defaultShouldRetry is google-cloud's default predicate for determining +// whether an error can be retried. +// +// TODO(rui): Currently this code is copied as-is from the google-cloud-go SDK +// in order to get the default retry behavior on top of our own customizations. +// There's currently a PR in google-cloud-go that exposes the default retry +// function, so this can be removed when it is merged: +// https://github.com/googleapis/google-cloud-go/pull/6370 +func defaultShouldRetry(err error) bool { + if err == nil { + return false + } + if errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + + switch e := err.(type) { + case *net.OpError: + if strings.Contains(e.Error(), "use of closed network connection") { + // TODO: check against net.ErrClosed (go 1.16+) instead of string + return true + } + case *googleapi.Error: + // Retry on 408, 429, and 5xx, according to + // https://cloud.google.com/storage/docs/exponential-backoff. + return e.Code == 408 || e.Code == 429 || (e.Code >= 500 && e.Code < 600) + case *url.Error: + // Retry socket-level errors ECONNREFUSED and ECONNRESET (from syscall). + // Unfortunately the error type is unexported, so we resort to string + // matching. + retriable := []string{"connection refused", "connection reset"} + for _, s := range retriable { + if strings.Contains(e.Error(), s) { + return true + } + } + case interface{ Temporary() bool }: + if e.Temporary() { + return true + } + } + // HTTP 429, 502, 503, and 504 all map to gRPC UNAVAILABLE per + // https://grpc.github.io/grpc/core/md_doc_http-grpc-status-mapping.html. + // + // This is only necessary for the experimental gRPC-based media operations. + if st, ok := status.FromError(err); ok && st.Code() == codes.Unavailable { + return true + } + // Unwrap is only supported in go1.13.x+ + if e, ok := err.(interface{ Unwrap() error }); ok { + return defaultShouldRetry(e.Unwrap()) + } + return false +} diff --git a/pkg/cloud/gcp/gcs_storage.go b/pkg/cloud/gcp/gcs_storage.go index 9787007b2c01..8cc9fa31946d 100644 --- a/pkg/cloud/gcp/gcs_storage.go +++ b/pkg/cloud/gcp/gcs_storage.go @@ -31,6 +31,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/errors" "github.com/gogo/protobuf/types" + "golang.org/x/net/http2" "golang.org/x/oauth2" "google.golang.org/api/impersonate" "google.golang.org/api/iterator" @@ -174,6 +175,7 @@ func makeGCSStorage( if err != nil { return nil, errors.Wrap(err, "failed to create google cloud client") } + g.SetRetry(gcs.WithErrorFunc(shouldRetry)) bucket := g.Bucket(conf.Bucket) if conf.BillingProject != `` { bucket = bucket.UserProject(conf.BillingProject) @@ -343,6 +345,34 @@ func (g *gcsStorage) Close() error { return g.client.Close() } +// shouldRetry is the predicate that determines whether a GCS client error +// should be retried. The predicate combines google-cloud-go's default retry +// predicate and some additional predicates when determining whether the error +// is retried. The additional predicates are: +// +// - http2.StreamError error with code http2.ErrCodeInternal: this error has +// been recommended to be retried in several issues in the google-cloud-go repo: +// https://github.com/googleapis/google-cloud-go/issues/3735 +// https://github.com/googleapis/google-cloud-go/issues/784 +// Remove if this error ever becomes part of the default retry predicate. +func shouldRetry(err error) bool { + if defaultShouldRetry(err) { + return true + } + + if e := (http2.StreamError{}); errors.As(err, &e) { + if e.Code == http2.ErrCodeInternal { + return true + } + } + + if e := (errors.Wrapper)(nil); errors.As(err, &e) { + return shouldRetry(e.Unwrap()) + } + + return false +} + func init() { cloud.RegisterExternalStorageProvider(cloudpb.ExternalStorageProvider_gs, parseGSURL, makeGCSStorage, cloud.RedactedParams(CredentialsParam, BearerTokenParam), "gs") diff --git a/pkg/testutils/lint/lint_test.go b/pkg/testutils/lint/lint_test.go index b40606c973bf..e7c6e7004d74 100644 --- a/pkg/testutils/lint/lint_test.go +++ b/pkg/testutils/lint/lint_test.go @@ -2208,6 +2208,8 @@ func TestLint(t *testing.T) { stream.GrepNot(`pkg/roachprod/logger/log\.go:.*format argument is not a constant expression`), // We purposefully produce nil dereferences in this file to test crash conditions stream.GrepNot(`pkg/util/log/logcrash/crash_reporting_test\.go:.*nil dereference in type assertion`), + // Temporarily copied code from google-cloud-go's retry predicate. + stream.GrepNot(`pkg/cloud/gcp/gcs_retry\.go:.*invalid direct cast on error object`), // Spawning naked goroutines is ok when it's not as part of the main CRDB // binary. This is for now - if we use #58164 to introduce more aggressive // pooling, etc, then test code needs to adhere as well.