Skip to content

Commit

Permalink
kvclient: improve logging of slow RPCs in DistSender
Browse files Browse the repository at this point in the history
Release justification: small change improving logging, hoping to help
debug cockroachdb#53197.

This patch moves the logging that the DistSender does when it detects an
RPC that is slow such that it kicks in in more cases:
1) if the RPC was successful (but slow)
2) if the lower layer in the DistSender failed to contact any replica
   and returned a sendError{}. In this case the range descriptor will be
   refreshed and the RPC will be retried, similar to the case in which a
   replica returned RangeKeyMismatchError. The RKME case was already
   covered by the logging, but the sendError case wasn't.

Touches cockroachdb#53197

Release note: None
  • Loading branch information
andreimatei committed Aug 31, 2020
1 parent d2c73b3 commit 038045b
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 22 deletions.
46 changes: 32 additions & 14 deletions pkg/kv/kvclient/kvcoord/dist_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -1370,9 +1370,21 @@ func (ds *DistSender) sendPartialBatchAsync(
}

func slowRangeRPCWarningStr(
dur time.Duration, attempts int64, desc *roachpb.RangeDescriptor, pErr *roachpb.Error,
ba roachpb.BatchRequest,
dur time.Duration,
attempts int64,
desc *roachpb.RangeDescriptor,
err error,
br *roachpb.BatchResponse,
) string {
return fmt.Sprintf("have been waiting %.2fs (%d attempts) for RPC to %s: %s", dur.Seconds(), attempts, desc, pErr)
var resp string
if err != nil {
resp = err.Error()
} else {
resp = br.String()
}
return fmt.Sprintf("have been waiting %.2fs (%d attempts) for RPC %s to %s; resp: %s",
dur.Seconds(), attempts, ba, desc, resp)
}

func slowRangeRPCReturnWarningStr(dur time.Duration, attempts int64) string {
Expand Down Expand Up @@ -1475,6 +1487,24 @@ func (ds *DistSender) sendPartialBatch(
}

reply, err = ds.sendToReplicas(ctx, ba, routing, withCommit)

const slowDistSenderThreshold = time.Minute
if dur := timeutil.Since(tBegin); dur > slowDistSenderThreshold && !tBegin.IsZero() {
log.Warningf(ctx, "slow range RPC: %v",
slowRangeRPCWarningStr(ba, dur, attempts, routing.Desc(), err, reply))
// If the RPC wasn't successful, defer the logging of a message once the
// RPC is not retried any more.
if err != nil || reply.Error != nil {
ds.metrics.SlowRPCs.Inc(1)
defer func(tBegin time.Time, attempts int64) {
ds.metrics.SlowRPCs.Dec(1)
log.Warningf(ctx, "slow RPC response: %v",
slowRangeRPCReturnWarningStr(timeutil.Since(tBegin), attempts))
}(tBegin, attempts)
}
tBegin = time.Time{} // prevent reentering branch for this RPC
}

if err != nil {
// Set pErr so that, if we don't perform any more retries, the
// deduceRetryEarlyExitError() call below the loop is inhibited.
Expand Down Expand Up @@ -1522,18 +1552,6 @@ func (ds *DistSender) sendPartialBatch(
pErr.Index.Index = int32(positions[pErr.Index.Index])
}

const slowDistSenderThreshold = time.Minute
if dur := timeutil.Since(tBegin); dur > slowDistSenderThreshold && !tBegin.IsZero() {
ds.metrics.SlowRPCs.Inc(1)
log.Warningf(ctx, "slow range RPC: %v",
slowRangeRPCWarningStr(dur, attempts, routing.Desc(), pErr))
defer func(tBegin time.Time, attempts int64) {
ds.metrics.SlowRPCs.Dec(1)
log.Warningf(ctx, "slow RPC response: %v",
slowRangeRPCReturnWarningStr(timeutil.Since(tBegin), attempts))
}(tBegin, attempts)
tBegin = time.Time{} // prevent reentering branch for this RPC
}
log.VErrEventf(ctx, 2, "reply error %s: %s", ba, pErr)

// Error handling: If the error indicates that our range
Expand Down
17 changes: 9 additions & 8 deletions pkg/kv/kvclient/kvcoord/dist_sender_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3945,16 +3945,17 @@ func TestDistSenderSlowLogMessage(t *testing.T) {
dur = 8158 * time.Millisecond
attempts = 120
)
var ba roachpb.BatchRequest
get := &roachpb.GetRequest{}
get.Key = roachpb.Key("a")
ba.Add(get)
br := &roachpb.BatchResponse{}
br.Error = roachpb.NewError(errors.New("boom"))
desc := &roachpb.RangeDescriptor{RangeID: 9, StartKey: roachpb.RKey("x")}
{
exp := `have been waiting 8.16s (120 attempts) for RPC to` +
` r9:{-} [<no replicas>, next=0, gen=0]: boom`
act := slowRangeRPCWarningStr(
dur,
120,
desc,
roachpb.NewError(errors.New("boom")))

exp := `have been waiting 8.16s (120 attempts) for RPC Get ["a",/Min) to` +
` r9:{-} [<no replicas>, next=0, gen=0]; resp: (err: boom)`
act := slowRangeRPCWarningStr(ba, dur, attempts, desc, nil /* err */, br)
require.Equal(t, exp, act)
}

Expand Down

0 comments on commit 038045b

Please sign in to comment.