From 960868d393db787bc31bdb88400e877448ed3cf8 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Mon, 16 Jul 2018 22:23:29 +0200 Subject: [PATCH] storage: quiesce less aggressively PR #26911 added a mechanism which during quiescence decisions ignored nodes which (at some recent point in time) seemed non-live. Unfortunately, the corresponding mechanism introduced to unquiesce when the node becomes live is quite racy, at least when tormented with the wild timings we throw at it in unit testing. The basic discrepancy is that unquiescing is triggered by an explicit liveness event (i.e. the kv record being overwritten), whereas a node may seem non-live for mere reasons of timing in between. In effect this means that quiescence is more aggressive than unquiescence. I explored various avenues to address this (going as far as introducing an unquiesce message to the Raft scheduler and serializing unquiescence through the store Raft ticker goroutine), but it's hopeless. It's an equally idle diversion to try to adjust the test liveness durations. So, in this commit, the liveness check becomes more lenient: even when a node seems non-live, we count it as live if there's a healthy connection established to it. This effectively deals with issues in tests but also seems like a good addition for real world workloads: when the liveness range breaks down, we don't want to aggressively quiesce. Fixes #27545. Fixes #27607. Release note: None --- pkg/storage/store.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pkg/storage/store.go b/pkg/storage/store.go index ad4550df8a9e..34d384a68190 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -3795,7 +3795,24 @@ func (s *Store) raftTickLoop(ctx context.Context) { rangeIDs = rangeIDs[:0] // Update the liveness map. 
if s.cfg.NodeLiveness != nil { - s.livenessMap.Store(s.cfg.NodeLiveness.GetIsLiveMap()) + nextMap := s.cfg.NodeLiveness.GetIsLiveMap() + for nodeID, isLive := range nextMap { + if isLive { + continue + } + // Liveness claims that this node is down, but ConnHealth gets the last say + // because we'd rather quiesce a range too little than one too often. + // + // NB: This has false negatives. If a node doesn't have a conn open to it + // when ConnHealth is called, then ConnHealth will return + // rpc.ErrNotHeartbeated regardless of whether the node is up or not. That + // said, for the nodes that matter, we're likely talking to them via the + // Raft transport, so ConnHealth should usually indicate a real problem if + // it gives us an error back. The check can also have false positives if the + // node goes down after populating the map, but that matters even less. + nextMap[nodeID] = (s.cfg.NodeDialer.ConnHealth(nodeID) == nil) + } + s.livenessMap.Store(nextMap) } s.unquiescedReplicas.Lock()