diff --git a/pkg/storage/store.go b/pkg/storage/store.go index ad4550df8a9e..34d384a68190 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -3795,7 +3795,24 @@ func (s *Store) raftTickLoop(ctx context.Context) { rangeIDs = rangeIDs[:0] // Update the liveness map. if s.cfg.NodeLiveness != nil { - s.livenessMap.Store(s.cfg.NodeLiveness.GetIsLiveMap()) + nextMap := s.cfg.NodeLiveness.GetIsLiveMap() + for nodeID, isLive := range nextMap { + if isLive { + continue + } + // Liveness claims that this node is down, but ConnHealth gets the last say + // because we'd rather quiesce a range too rarely than too often. + // + // NB: This has false negatives. If a node doesn't have a conn open to it + // when ConnHealth is called, then ConnHealth will return + // rpc.ErrNotHeartbeated regardless of whether the node is up or not. That + // said, for the nodes that matter, we're likely talking to them via the + // Raft transport, so ConnHealth should usually indicate a real problem if + // it gives us an error back. The check can also have false positives if the + // node goes down after populating the map, but that matters even less. + nextMap[nodeID] = (s.cfg.NodeDialer.ConnHealth(nodeID) == nil) + } + s.livenessMap.Store(nextMap) } s.unquiescedReplicas.Lock()