From 1ae7e1c87880d88738a4a9b22080557e09ceb643 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 21 May 2019 14:49:07 +0200
Subject: [PATCH 1/3] builtins: don't request consistency checker diff by default

Release note: None
---
 pkg/sql/sem/builtins/generator_builtins.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go
index 0fa5ddd3ed4a..d393ac1c6878 100644
--- a/pkg/sql/sem/builtins/generator_builtins.go
+++ b/pkg/sql/sem/builtins/generator_builtins.go
@@ -965,8 +965,10 @@ func (c *checkConsistencyGenerator) Start() error {
 			Key:    c.from,
 			EndKey: c.to,
 		},
-		Mode:     c.mode,
-		WithDiff: true,
+		Mode: c.mode,
+		// No meaningful diff can be created if we're checking the stats only,
+		// so request one only if a full check is run.
+		WithDiff: c.mode == roachpb.ChecksumMode_CHECK_FULL,
 	})
 	if err := c.db.Run(c.ctx, &b); err != nil {
 		return err

From 492967a0b0582415b7b44be19b1bb4450c00ee84 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 21 May 2019 12:33:10 +0200
Subject: [PATCH 2/3] batcheval: bump ReplicaChecksumVersion

In #35861, I made changes to the consistency checksum computation that
were not backwards-compatible. When a 19.1 node asks a 2.1 node for a
fast SHA, the 2.1 node would run a full computation and return a
corresponding SHA which wouldn't match the leaseholder's.

Bump ReplicaChecksumVersion to make sure that we don't attempt to
compare SHAs across these two releases.

Fixes #37425.

Release note (bug fix): Fixed a potential source of (faux) replica
inconsistencies that can be reported while running a mixed v19.1 / v2.1
cluster. This error (in that situation only) is benign and can be
resolved by upgrading to the latest v19.1 patch release. Every time this
error occurs, a "checkpoint" is created which will occupy a large amount
of disk space and which needs to be removed manually (see
/auxiliary/checkpoints).
---
 pkg/storage/batcheval/cmd_compute_checksum.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/storage/batcheval/cmd_compute_checksum.go b/pkg/storage/batcheval/cmd_compute_checksum.go
index 44c8e3900857..86ed7557828c 100644
--- a/pkg/storage/batcheval/cmd_compute_checksum.go
+++ b/pkg/storage/batcheval/cmd_compute_checksum.go
@@ -42,7 +42,7 @@ func declareKeysComputeChecksum(

 // Version numbers for Replica checksum computation. Requests silently no-op
 // unless the versions are compatible.
 const (
-	ReplicaChecksumVersion    = 3
+	ReplicaChecksumVersion    = 4
 	ReplicaChecksumGCInterval = time.Hour
 )

From 0ceaf61490dbd16183c2573b7b0bc7efbb90e1ff Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 21 May 2019 12:57:23 +0200
Subject: [PATCH 3/3] server: avoid stalled process in ./cockroach quit

There are "known unknown" problems regarding shutting down gRPC or the
stopper, which can leave the process dangling while its sockets are
already closed. The UX is poor, and finding, fixing, and guarding
against regressions of the root cause is likely to be very annoying.
Luckily, all we want to achieve is that the process is dead soon after
the client disconnects, and we can do that.

If I were to rewrite this code, I would probably not even bother with
stopping the stopper or gRPC, but just call `os.Exit` straight away. I'm
not doing this right now to minimize fallout, since this change will be
backported to release-19.1.

Release note (bug fix): Fixed a case in which `./cockroach quit` would
return success even though the server process was still running in a
severely degraded state.
---
 pkg/server/admin.go | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/pkg/server/admin.go b/pkg/server/admin.go
index e0a1c0a1f328..622c642651f9 100644
--- a/pkg/server/admin.go
+++ b/pkg/server/admin.go
@@ -19,6 +19,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"os"
 	"sort"
 	"strconv"
 	"strings"
@@ -1353,12 +1354,12 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr
 		return nil
 	}

-	s.server.grpc.Stop()
-
 	go func() {
-		// The explicit closure here allows callers.Lookup() to return something
-		// sensible referring to this file (otherwise it ends up in runtime
-		// internals).
+		// TODO(tbg): why don't we stop the stopper first? Stopping the stopper
+		// first seems more reasonable since grpc.Stop closes the listener right
+		// away (and who knows whether gRPC-goroutines are tied up in some
+		// stopper task somewhere).
+		s.server.grpc.Stop()
 		s.server.stopper.Stop(ctx)
 	}()

@@ -1367,6 +1368,23 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr
 		return nil
 	case <-ctx.Done():
 		return ctx.Err()
+	case <-time.After(10 * time.Second):
+		// This is a hack to work around the problem in
+		// https://github.com/cockroachdb/cockroach/issues/37425#issuecomment-494336131
+		//
+		// There appear to be deadlock scenarios in which we don't manage to
+		// fully stop the grpc server (which implies closing the listener, i.e.
+		// seeming dead to the outside world) or don't manage to shut down the
+		// stopper (the evidence in #37425 is inconclusive which one it is).
+		//
+		// Other problems in this area are known, such as
+		// https://github.com/cockroachdb/cockroach/pull/31692
+		//
+		// The signal-based shutdown path uses a similar time-based escape hatch.
+		// Until we spend (potentially lots of time to) understand and fix this
+		// issue, this will serve us well.
+		os.Exit(1)
+		return errors.New("unreachable")
 	}
 }
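For readers who want to see the time-based escape hatch from the last hunk in
isolation, the following is a minimal standalone Go sketch (not part of the
patches; all names are illustrative): a select waits for a graceful shutdown to
finish, but falls back to os.Exit if the shutdown stalls.

package main

import (
	"fmt"
	"os"
	"time"
)

// stalledShutdown simulates a graceful-shutdown routine that deadlocks: the
// returned channel is never closed, just as a stuck stopper or gRPC server
// would never signal completion.
func stalledShutdown() <-chan struct{} {
	done := make(chan struct{})
	go func() {
		select {} // block forever; a working implementation would close(done)
	}()
	return done
}

func main() {
	select {
	case <-stalledShutdown():
		fmt.Println("clean shutdown")
	case <-time.After(2 * time.Second): // the patch uses 10 seconds
		// Escape hatch: the graceful path stalled, so terminate the process
		// outright so that the client's quit request still takes effect.
		fmt.Println("shutdown stalled; exiting forcefully")
		os.Exit(1)
	}
}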