From 9064a0c53606ffd5e0639bddedde78ea58b8c5e4 Mon Sep 17 00:00:00 2001 From: Yevgeniy Miretskiy Date: Fri, 10 Jun 2022 18:57:40 -0400 Subject: [PATCH] changefeedccl: Handle node unavailable error. Treat NodeUnavailable error as a retryable changefeed error. NodeUnavailable error may be returned by changefeed processors if the node is being shut down/drained. However, this error return is racy. Sometimes, the coordinator would see "rpc context cancellation" error instead -- in those cases it would treat the error as retryable. However, sometimes it is possible to get this error propagated (for example: when server shutdown races with starting up kv feed, which uses Stopper, which may return NodeUnavailable error if it's being shut down). Release Notes (bug fix): Treat node unavailable error as a retryable changefeed error. --- pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel | 1 + pkg/ccl/changefeedccl/changefeedbase/errors.go | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel b/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel index 88b7f4fcae03..c007b56469b8 100644 --- a/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel +++ b/pkg/ccl/changefeedccl/changefeedbase/BUILD.bazel @@ -15,6 +15,7 @@ go_library( "//pkg/clusterversion", "//pkg/jobs/joberror", "//pkg/jobs/jobspb", + "//pkg/roachpb", "//pkg/settings", "//pkg/sql", "//pkg/sql/catalog", diff --git a/pkg/ccl/changefeedccl/changefeedbase/errors.go b/pkg/ccl/changefeedccl/changefeedbase/errors.go index ec73d85110d2..66ec4adee015 100644 --- a/pkg/ccl/changefeedccl/changefeedbase/errors.go +++ b/pkg/ccl/changefeedccl/changefeedbase/errors.go @@ -14,6 +14,7 @@ import ( "strings" "github.com/cockroachdb/cockroach/pkg/jobs/joberror" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/flowinfra" "github.com/cockroachdb/errors" ) @@ -117,7 +118,9 @@ func IsRetryableError(err error) bool { return true } - return 
(joberror.IsDistSQLRetryableError(err) || flowinfra.IsNoInboundStreamConnectionError(err)) + return (joberror.IsDistSQLRetryableError(err) || + flowinfra.IsNoInboundStreamConnectionError(err) || + errors.HasType(err, (*roachpb.NodeUnavailableError)(nil))) } // MaybeStripRetryableErrorMarker performs some minimal attempt to clean the