diff --git a/pkg/kv/kvserver/raft_transport.go b/pkg/kv/kvserver/raft_transport.go index ff92d4ede2e8..7eb0bb452be6 100644 --- a/pkg/kv/kvserver/raft_transport.go +++ b/pkg/kv/kvserver/raft_transport.go @@ -614,11 +614,15 @@ func (t *RaftTransport) startProcessNewQueue( } defer cleanup(ch) defer t.queues[class].Delete(int64(toNodeID)) - conn, err := t.dialer.Dial(ctx, toNodeID, class) + // NB: we dial without a breaker here because the caller has already + // checked the breaker. Checking it again can cause livelock, see: + // https://github.com/cockroachdb/cockroach/issues/68419 + conn, err := t.dialer.DialNoBreaker(ctx, toNodeID, class) if err != nil { // DialNode already logs sufficiently, so just return. return } + client := NewMultiRaftClient(conn) batchCtx, cancel := context.WithCancel(ctx) defer cancel() diff --git a/pkg/rpc/nodedialer/nodedialer.go b/pkg/rpc/nodedialer/nodedialer.go index 1d42d308d557..9ebc40e2e3d0 100644 --- a/pkg/rpc/nodedialer/nodedialer.go +++ b/pkg/rpc/nodedialer/nodedialer.go @@ -88,11 +88,12 @@ func (n *Dialer) Dial( breaker.Fail(err) return nil, err } - return n.dial(ctx, nodeID, addr, breaker, class) + return n.dial(ctx, nodeID, addr, breaker, true /* checkBreaker */, class) } // DialNoBreaker ignores the breaker if there is an error dialing. This function // should only be used when there is good reason to believe that the node is reachable. +// The result of the dial will be announced to the breaker. func (n *Dialer) DialNoBreaker( ctx context.Context, nodeID roachpb.NodeID, class rpc.ConnectionClass, ) (_ *grpc.ClientConn, err error) { @@ -103,7 +104,7 @@ func (n *Dialer) DialNoBreaker( if err != nil { return nil, err } - return n.dial(ctx, nodeID, addr, nil /* breaker */, class) + return n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), false /* checkBreaker */, class) } // DialInternalClient is a specialization of DialClass for callers that @@ -131,7 +132,7 @@ func (n *Dialer) DialInternalClient( return localCtx, localClient, nil } log.VEventf(ctx, 2, "sending request to %s", addr) - conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), class) + conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID, class), true /* checkBreaker */, class) if err != nil { return nil, nil, err } @@ -145,13 +146,14 @@ func (n *Dialer) dial( nodeID roachpb.NodeID, addr net.Addr, breaker *wrappedBreaker, + checkBreaker bool, class rpc.ConnectionClass, ) (_ *grpc.ClientConn, err error) { // Don't trip the breaker if we're already canceled. if ctxErr := ctx.Err(); ctxErr != nil { return nil, ctxErr } - if breaker != nil && !breaker.Ready() { + if checkBreaker && !breaker.Ready() { err = errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID) return nil, err }