Skip to content

Commit

Permalink
kvserver: remove extraneous circuit breaker check in Raft transport
Browse files Browse the repository at this point in the history
See #68419.

Release justification: bug fix
Release note (bug fix): Previously, after a temporary node outage, other
nodes in the cluster could fail to connect to the restarted node due to
their circuit breakers not resetting. This would manifest in the logs
via messages "unable to dial nXX: breaker open", where `XX` is the ID
of the restarted node. (Note that such errors are expected for nodes
that are truly unreachable, and may still occur around the time of
the restart, but for no longer than a few seconds).
  • Loading branch information
tbg committed Aug 26, 2021
1 parent 2a16eed commit 81ba3c7
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
6 changes: 5 additions & 1 deletion pkg/kv/kvserver/raft_transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,11 +614,15 @@ func (t *RaftTransport) startProcessNewQueue(
}
defer cleanup(ch)
defer t.queues[class].Delete(int64(toNodeID))
conn, err := t.dialer.Dial(ctx, toNodeID, class)
// NB: we dial without a breaker here because the caller has already
// checked the breaker. Checking it again can cause livelock, see:
// https://github.com/cockroachdb/cockroach/issues/68419
conn, err := t.dialer.DialNoBreaker(ctx, toNodeID, class)
if err != nil {
// DialNode already logs sufficiently, so just return.
return
}

client := NewMultiRaftClient(conn)
batchCtx, cancel := context.WithCancel(ctx)
defer cancel()
Expand Down
10 changes: 6 additions & 4 deletions pkg/rpc/nodedialer/nodedialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,12 @@ func (n *Dialer) Dial(
breaker.Fail(err)
return nil, err
}
return n.dial(ctx, nodeID, addr, breaker, class)
return n.dial(ctx, nodeID, addr, breaker, true /* checkBreaker */, class)
}

// DialNoBreaker ignores the breaker if there is an error dialing. This function
// should only be used when there is good reason to believe that the node is reachable.
// The result of the dial will be announced to the breaker.
func (n *Dialer) DialNoBreaker(
ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass,
) (_ *grpc.ClientConn, err error) {
Expand All @@ -103,7 +104,7 @@ func (n *Dialer) DialNoBreaker(
if err != nil {
return nil, err
}
return n.dial(ctx, nodeID, addr, nil /* breaker */, class)
return n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), false /* checkBreaker */, class)
}

// DialInternalClient is a specialization of DialClass for callers that
Expand Down Expand Up @@ -131,7 +132,7 @@ func (n *Dialer) DialInternalClient(
return localCtx, localClient, nil
}
log.VEventf(ctx, 2, "sending request to %s", addr)
conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), class)
conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), true /* checkBreaker */, class)
if err != nil {
return nil, nil, err
}
Expand All @@ -145,13 +146,14 @@ func (n *Dialer) dial(
nodeID roachpb.NodeID,
addr net.Addr,
breaker *wrappedBreaker,
checkBreaker bool,
class rpc.ConnectionClass,
) (_ *grpc.ClientConn, err error) {
// Don't trip the breaker if we're already canceled.
if ctxErr := ctx.Err(); ctxErr != nil {
return nil, ctxErr
}
if breaker != nil && !breaker.Ready() {
if checkBreaker && !breaker.Ready() {
err = errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID)
return nil, err
}
Expand Down

0 comments on commit 81ba3c7

Please sign in to comment.