From f366520210152c6cd7c144f1df92c281c144061f Mon Sep 17 00:00:00 2001 From: Evan Wall Date: Tue, 24 Jan 2023 23:00:08 +0000 Subject: [PATCH] sql: improve stack trace for get-user-timeout timeouts Fixes #95794 The cause of the `get-user-timeout errors` is unknown. Part of the problem is that the stack trace gets cut off at ``` | | github.com/cockroachdb/cockroach/pkg/sql.retrieveSessionInitInfoWithCache | | github.com/cockroachdb/cockroach/pkg/sql/user.go:238 ``` which does not explain what is actually being blocked. The reason that the stack trace is cut off is that the timeout is initiated by `contextutil.RunWithTimeout` which results in a "simple" (no stack trace) `context.DeadlineExceeded` error. `retrieveSessionInitInfoWithCache` is the first line in the stack trace because it calls `errors.Wrap` on `context.DeadlineExceeded`. To get a fuller stack trace, `context.DeadlineExceeded` must be wrapped immediately (`errors.Wrap` or `errors.WithStack`) before it bubbles up. Release note: None --- pkg/util/contextutil/context.go | 13 +++++++++++++ pkg/util/contextutil/context_test.go | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pkg/util/contextutil/context.go b/pkg/util/contextutil/context.go index db7dbb03597c..87a139437153 100644 --- a/pkg/util/contextutil/context.go +++ b/pkg/util/contextutil/context.go @@ -79,6 +79,18 @@ func wrap(ctx context.Context, cancel context.CancelFunc) (context.Context, cont } } +// ctxWithStacktrace overrides Err to annotate context.DeadlineExceeded and +// context.Canceled errors with a stacktrace. +// See: https://github.com/cockroachdb/cockroach/issues/95794 +type ctxWithStacktrace struct { + context.Context +} + +// Err implements the context.Context interface. +func (ctx *ctxWithStacktrace) Err() error { + return errors.WithStack(ctx.Context.Err()) +} + // RunWithTimeout runs a function with a timeout, the same way you'd do with // context.WithTimeout. It improves the opaque error messages returned by // WithTimeout by augmenting them with the op string that is passed in. @@ -86,6 +98,7 @@ func RunWithTimeout( ctx context.Context, op string, timeout time.Duration, fn func(ctx context.Context) error, ) error { ctx, cancel := context.WithTimeout(ctx, timeout) + ctx = &ctxWithStacktrace{Context: ctx} defer cancel() start := timeutil.Now() err := fn(ctx) diff --git a/pkg/util/contextutil/context_test.go b/pkg/util/contextutil/context_test.go index f313f8f540df..f84748f5a535 100644 --- a/pkg/util/contextutil/context_test.go +++ b/pkg/util/contextutil/context_test.go @@ -12,6 +12,7 @@ package contextutil import ( "context" + "fmt" "net" "testing" "time" @@ -70,6 +71,24 @@ func TestRunWithTimeout(t *testing.T) { } } +func testFuncA(ctx context.Context) error { + return testFuncB(ctx) +} + +func testFuncB(ctx context.Context) error { + <-ctx.Done() + return ctx.Err() +} + +func TestRunWithTimeoutCtxWithStacktrace(t *testing.T) { + ctx := context.Background() + err := RunWithTimeout(ctx, "foo", 1, testFuncA) + require.Error(t, err) + stacktrace := fmt.Sprintf("%+v", err) + require.Contains(t, stacktrace, "testFuncB") + require.Contains(t, stacktrace, "testFuncA") +} + // TestRunWithTimeoutWithoutDeadlineExceeded ensures that when a timeout on the // context occurs but the underlying error does not have // context.DeadlineExceeded as its Cause (perhaps due to serialization) the