From 960868d393db787bc31bdb88400e877448ed3cf8 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Mon, 16 Jul 2018 22:23:29 +0200 Subject: [PATCH] storage: quiesce less aggressively PR #26911 added a mechanism which during quiescence decisions ignored nodes which (at some recent point in time) seemed non-live. Unfortunately, the corresponding mechanism introduced to unquiesce when the node becomes live is quite racy, at least when tormented with the wild timings we throw at it in unit testing. The basic discrepancy is that unquiescing is triggered by an explicit liveness event (i.e. the kv record being overwritten), whereas a node may seem non-live for mere reasons of timing in between. In effect this means that quiescence is more aggressive than unquiescence. I explored various avenues to address this (going as far as introducing an unquiesce message to the Raft scheduler and serializing unquiescence through the store Raft ticker goroutine), but it's hopeless. It's an equally idle diversion to try to adjust the test liveness durations. So, in this commit, the liveness check becomes more lenient: even when a node seems non-live, we count it as live if there's a healthy connection established to it. This effectively deals with issues in tests but also seems like a good addition for real world workloads: when the liveness range breaks down, we don't want to aggressively quiesce. Fixes #27545. Fixes #27607. Release note: None --- pkg/storage/store.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/storage/store.go b/pkg/storage/store.go index ad4550df8a9e..34d384a68190 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -3795,7 +3795,24 @@ func (s *Store) raftTickLoop(ctx context.Context) { rangeIDs = rangeIDs[:0] // Update the liveness map. 
if s.cfg.NodeLiveness != nil { - s.livenessMap.Store(s.cfg.NodeLiveness.GetIsLiveMap()) + nextMap := s.cfg.NodeLiveness.GetIsLiveMap() + for nodeID, isLive := range nextMap { + if isLive { + continue + } + // Liveness claims that this node is down, but ConnHealth gets the last say + // because we'd rather quiesce a range too little than one too often. + // + // NB: This has false negatives. If a node doesn't have a conn open to it + // when ConnHealth is called, then ConnHealth will return + // rpc.ErrNotHeartbeated regardless of whether the node is up or not. That + // said, for the nodes that matter, we're likely talking to them via the + // Raft transport, so ConnHealth should usually indicate a real problem if + // it gives us an error back. The check can also have false positives if the + // node goes down after populating the map, but that matters even less. + nextMap[nodeID] = (s.cfg.NodeDialer.ConnHealth(nodeID) == nil) + } + s.livenessMap.Store(nextMap) } s.unquiescedReplicas.Lock()