From 1ae7e1c87880d88738a4a9b22080557e09ceb643 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 21 May 2019 14:49:07 +0200
Subject: [PATCH 1/3] builtins: don't request consistency checker diff by default

Release note: None
---
 pkg/sql/sem/builtins/generator_builtins.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go
index 0fa5ddd3ed4a..d393ac1c6878 100644
--- a/pkg/sql/sem/builtins/generator_builtins.go
+++ b/pkg/sql/sem/builtins/generator_builtins.go
@@ -965,8 +965,10 @@ func (c *checkConsistencyGenerator) Start() error {
 			Key:    c.from,
 			EndKey: c.to,
 		},
-		Mode:     c.mode,
-		WithDiff: true,
+		Mode: c.mode,
+		// No meaningful diff can be created if we're checking the stats only,
+		// so request one only if a full check is run.
+		WithDiff: c.mode == roachpb.ChecksumMode_CHECK_FULL,
 	})
 	if err := c.db.Run(c.ctx, &b); err != nil {
 		return err

From 492967a0b0582415b7b44be19b1bb4450c00ee84 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 21 May 2019 12:33:10 +0200
Subject: [PATCH 2/3] batcheval: bump ReplicaChecksumVersion

In #35861, I made changes to the consistency checksum computation that
were not backwards-compatible. When a 19.1 node asks a 2.1 node for a
fast SHA, the 2.1 node would run a full computation and return a
corresponding SHA which wouldn't match the leaseholder's.

Bump ReplicaChecksumVersion to make sure that we don't attempt to
compare SHAs across these two releases.

Fixes #37425.

Release note (bug fix): Fixed a potential source of (faux) replica
inconsistencies that can be reported while running a mixed v19.1 / v2.1
cluster. This error (in that situation only) is benign and can be
resolved by upgrading to the latest v19.1 patch release. Every time this
error occurs, a "checkpoint" is created which will occupy a large amount
of disk space and which needs to be removed manually (see
/auxiliary/checkpoints).
---
 pkg/storage/batcheval/cmd_compute_checksum.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/storage/batcheval/cmd_compute_checksum.go b/pkg/storage/batcheval/cmd_compute_checksum.go
index 44c8e3900857..86ed7557828c 100644
--- a/pkg/storage/batcheval/cmd_compute_checksum.go
+++ b/pkg/storage/batcheval/cmd_compute_checksum.go
@@ -42,7 +42,7 @@ func declareKeysComputeChecksum(

 // Version numbers for Replica checksum computation. Requests silently no-op
 // unless the versions are compatible.
 const (
-	ReplicaChecksumVersion    = 3
+	ReplicaChecksumVersion    = 4
 	ReplicaChecksumGCInterval = time.Hour
 )

From 0ceaf61490dbd16183c2573b7b0bc7efbb90e1ff Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Tue, 21 May 2019 12:57:23 +0200
Subject: [PATCH 3/3] server: avoid stalled process in ./cockroach quit

There are "known unknown" problems regarding shutting down gRPC or the
stopper, which can leave the process dangling while its sockets are
already closed. The UX is poor, and finding, fixing, and guarding
against regressions of the root cause is likely to be very annoying.
Luckily, all we want to achieve is that the process is dead soon after
the client disconnects, and we can do that.

If I were to rewrite this code, I would probably not even bother with
stopping the stopper or gRPC, but just call `os.Exit` straight away. I'm
not doing this right now to minimize fallout, since this change will be
backported to release-19.1.

Release note (bug fix): Fixed a case in which `./cockroach quit` would
return success even though the server process was still running in a
severely degraded state.
---
 pkg/server/admin.go | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/pkg/server/admin.go b/pkg/server/admin.go
index e0a1c0a1f328..622c642651f9 100644
--- a/pkg/server/admin.go
+++ b/pkg/server/admin.go
@@ -19,6 +19,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"os"
 	"sort"
 	"strconv"
 	"strings"
@@ -1353,12 +1354,12 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr
 		return nil
 	}

-	s.server.grpc.Stop()
-
 	go func() {
-		// The explicit closure here allows callers.Lookup() to return something
-		// sensible referring to this file (otherwise it ends up in runtime
-		// internals).
+		// TODO(tbg): why don't we stop the stopper first? Stopping the stopper
+		// first seems more reasonable since grpc.Stop closes the listener right
+		// away (and who knows whether gRPC-goroutines are tied up in some
+		// stopper task somewhere).
+		s.server.grpc.Stop()
 		s.server.stopper.Stop(ctx)
 	}()

@@ -1367,6 +1368,23 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr
 		return nil
 	case <-ctx.Done():
 		return ctx.Err()
+	case <-time.After(10 * time.Second):
+		// This is a hack to work around the problem in
+		// https://github.com/cockroachdb/cockroach/issues/37425#issuecomment-494336131
+		//
+		// There appear to be deadlock scenarios in which we don't manage to
+		// fully stop the grpc server (which implies closing the listener, i.e.
+		// seeming dead to the outside world) or don't manage to shut down the
+		// stopper (the evidence in #37425 is inconclusive which one it is).
+		//
+		// Other problems in this area are known, such as
+		// https://github.com/cockroachdb/cockroach/pull/31692
+		//
+		// The signal-based shutdown path uses a similar time-based escape hatch.
+		// Until we spend (potentially lots of time to) understand and fix this
+		// issue, this will serve us well.
+		os.Exit(1)
+		return errors.New("unreachable")
 	}
 }
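For readers who want to see the time-based escape hatch from the last hunk in
isolation, the following is a minimal standalone Go sketch (not part of the
patches; all names are illustrative): a select waits for a graceful shutdown to
finish, but falls back to os.Exit if the shutdown stalls.

package main

import (
	"fmt"
	"os"
	"time"
)

// stalledShutdown simulates a graceful-shutdown routine that deadlocks: the
// returned channel is never closed, just as a stuck stopper or gRPC server
// would never signal completion.
func stalledShutdown() <-chan struct{} {
	done := make(chan struct{})
	go func() {
		select {} // block forever; a working implementation would close(done)
	}()
	return done
}

func main() {
	select {
	case <-stalledShutdown():
		fmt.Println("clean shutdown")
	case <-time.After(2 * time.Second): // the patch uses 10 seconds
		// Escape hatch: the graceful path stalled, so terminate the process
		// outright so that the client's quit request still takes effect.
		fmt.Println("shutdown stalled; exiting forcefully")
		os.Exit(1)
	}
}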