From 973cca913ec04e638e4009e4b1d9e257864f4c4a Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 21 May 2019 12:57:23 +0200 Subject: [PATCH 1/5] server: avoid stalled process in ./cockroach quit There are "known unknown" problems regarding shutting down gRPC or the stopper, which can leave the process dangling while its sockets are already closed. The UX is poor and it's likely very annoying to find, fix, and regress against the root cause. Luckily, all we want to achieve is that the process is dead soon after the client disconnects, and we can do that. If I were to rewrite this code, I would probably not even bother with stopping the stopper or grpc, but just call `os.Exit` straight away. I'm not doing this right now to minimize fallout since this change will be backported to release-19.1. Release note (bug fix): Fixed a case in which `./cockroach quit` would return success even though the server process was still running in a severely degraded state. --- pkg/server/admin.go | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/pkg/server/admin.go b/pkg/server/admin.go index bdecb9bf6c66..2208d8d9354d 100644 --- a/pkg/server/admin.go +++ b/pkg/server/admin.go @@ -19,6 +19,7 @@ import ( "context" "encoding/json" "fmt" + "os" "sort" "strconv" "strings" @@ -1353,12 +1354,12 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr return nil } - s.server.grpc.Stop() - go func() { - // The explicit closure here allows callers.Lookup() to return something - // sensible referring to this file (otherwise it ends up in runtime - // internals). + // TODO(tbg): why don't we stop the stopper first? Stopping the stopper + // first seems more reasonable since grpc.Stop closes the listener right + // away (and who knows whether gRPC-goroutines are tied up in some + // stopper task somewhere). + s.server.grpc.Stop() s.server.stopper.Stop(ctx) }() @@ -1367,6 +1368,23 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr return nil case <-ctx.Done(): return ctx.Err() + case <-time.After(10 * time.Second): + // This is a hack to work around the problem in + // https://github.com/cockroachdb/cockroach/issues/37425#issuecomment-494336131 + // + // There appear to be deadlock scenarios in which we don't manage to + // fully stop the grpc server (which implies closing the listener, i.e. + // seeming dead to the outside world) or don't manage to shut down the + // stopper (the evidence in #37425 is inconclusive which one it is). + // + // Other problems in this area are known, such as + // https://github.com/cockroachdb/cockroach/pull/31692 + // + // The signal-based shutdown path uses a similar time-based escape hatch. + // Until we spend (potentially lots of time to) understand and fix this + // issue, this will serve us well. + os.Exit(1) + return errors.New("unreachable") } } From e0184978dff36f313790f282d926a537b6b615b9 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 21 May 2019 12:33:10 +0200 Subject: [PATCH 2/5] batcheval: bump ReplicaChecksumVersion In #35861, I made changes to the consistency checksum computation that were not backwards-compatible. When a 19.1 node asks a 2.1 node for a fast SHA, the 2.1 node would run a full computation and return a corresponding SHA which wouldn't match with the leaseholder's. Bump ReplicaChecksumVersion to make sure that we don't attempt to compare SHAs across these two releases. Fixes #37425. Release note (bug fix): Fixed a potential source of (faux) replica inconsistencies that can be reported while running a mixed v19.1 / v2.1 cluster. This error (in that situation only) is benign and can be resolved by upgrading to the latest v19.1 patch release. Every time this error occurs a "checkpoint" is created which will occupy a large amount of disk space and which needs to be removed manually (see /auxiliary/checkpoints). --- pkg/storage/batcheval/cmd_compute_checksum.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/storage/batcheval/cmd_compute_checksum.go b/pkg/storage/batcheval/cmd_compute_checksum.go index 44c8e3900857..86ed7557828c 100644 --- a/pkg/storage/batcheval/cmd_compute_checksum.go +++ b/pkg/storage/batcheval/cmd_compute_checksum.go @@ -42,7 +42,7 @@ func declareKeysComputeChecksum( // Version numbers for Replica checksum computation. Requests silently no-op // unless the versions are compatible. const ( - ReplicaChecksumVersion = 3 + ReplicaChecksumVersion = 4 ReplicaChecksumGCInterval = time.Hour ) From a473886b4700033f42dd99daf650e99bccd56d99 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 21 May 2019 14:35:17 +0200 Subject: [PATCH 3/5] roachtest: actually check correct node The mixed version test was always verifying the first node by accident. Release note: None --- pkg/cmd/roachtest/version.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cmd/roachtest/version.go b/pkg/cmd/roachtest/version.go index 4eef75a06855..0137eab1ef42 100644 --- a/pkg/cmd/roachtest/version.go +++ b/pkg/cmd/roachtest/version.go @@ -95,7 +95,7 @@ func registerVersion(r *registry) { // Make sure everyone is still running. for i := 1; i <= nodes; i++ { t.WorkerStatus("checking ", i) - db := c.Conn(ctx, 1) + db := c.Conn(ctx, i) defer db.Close() rows, err := db.Query(`SHOW DATABASES`) if err != nil { From b2904de78eae6a41572596bd938ed5c13be88b8f Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 21 May 2019 14:45:05 +0200 Subject: [PATCH 4/5] roachtest: run inconsistency checks during mixed version test This regression tests #37425, which exposed an incompatibility between v19.1 and v2.1. `./bin/roachtest run --local version/mixed/nodes=3` ran successfully after these changes. I took the opportunity to address a TODO in FailOnReplicaDivergence. Release note: None --- pkg/cmd/roachtest/cluster.go | 74 +++++++++++++++++++++++++++--------- pkg/cmd/roachtest/version.go | 5 +++ 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 73d14e3e8295..69519d7c54f9 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1041,39 +1041,75 @@ func (c *cluster) FailOnDeadNodes(ctx context.Context, t *test) { }) } -// FailOnReplicaDivergence fails the test if -// crdb_internal.check_consistency(true, '', '') indicates that any ranges' -// replicas are inconsistent with each other. -func (c *cluster) FailOnReplicaDivergence(ctx context.Context, t *test) { - if c.nodes < 1 { - return // unit tests - } - // TODO(tbg): n1 isn't necessarily online at this point. Try to sniff out - // a node that is. - db := c.Conn(ctx, 1) - defer db.Close() - - c.l.Printf("running (fast) consistency checks") +// CheckReplicaDivergenceOnDB runs a fast consistency check of the whole keyspace +// against the provided db. If an inconsistency is found, it returns it in the +// error. Note that this will swallow errors returned directly from the consistency +// check since we know that such spurious errors are possibly without any relation +// to the check having failed. +func (c *cluster) CheckReplicaDivergenceOnDB(ctx context.Context, db *gosql.DB) error { rows, err := db.QueryContext(ctx, ` SELECT t.range_id, t.start_key_pretty, t.status, t.detail FROM crdb_internal.check_consistency(true, '', '') as t WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) if err != nil { - c.l.Printf("%s", err) - return + // TODO(tbg): the checks can fail for silly reasons like missing gossiped + // descriptors, etc. -- not worth failing the test for. Ideally this would + // be rock solid. + c.l.Printf("consistency check failed with %v; ignoring", err) + return nil } + var buf bytes.Buffer for rows.Next() { var rangeID int32 var prettyKey, status, detail string if err := rows.Scan(&rangeID, &prettyKey, &status, &detail); err != nil { - c.l.Printf("%s", err) - break + return err } - t.Fatalf("r%d (%s) is inconsistent: %s %s", rangeID, prettyKey, status, detail) + fmt.Fprintf(&buf, "r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail) } if err := rows.Err(); err != nil { - c.l.Printf("%s", err) + return err + } + + msg := buf.String() + if msg != "" { + return errors.New(msg) + } + return nil +} + +// FailOnReplicaDivergence fails the test if +// crdb_internal.check_consistency(true, '', '') indicates that any ranges' +// replicas are inconsistent with each other. It uses the first node that +// is up to run the query. +func (c *cluster) FailOnReplicaDivergence(ctx context.Context, t *test) { + if c.nodes < 1 { + return // unit tests + } + + // Find a live node to run against, if one exists. + var db *gosql.DB + for i := 1; i <= c.nodes; i++ { + db = c.Conn(ctx, i) + _, err := db.Exec(`SELECT 1`) + if err != nil { + _ = db.Close() + db = nil + continue + } + c.l.Printf("running (fast) consistency checks on node %d", i) + break + } + if db == nil { + c.l.Printf("no live node found, skipping consistency check") + return + } + + defer db.Close() + + if err := c.CheckReplicaDivergenceOnDB(ctx, db); err != nil { + t.Fatal(err) } } diff --git a/pkg/cmd/roachtest/version.go b/pkg/cmd/roachtest/version.go index 0137eab1ef42..bf3bfda05c4e 100644 --- a/pkg/cmd/roachtest/version.go +++ b/pkg/cmd/roachtest/version.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/binfetcher" _ "github.com/lib/pq" + "github.com/pkg/errors" ) func registerVersion(r *registry) { @@ -104,6 +105,10 @@ func registerVersion(r *registry) { if err := rows.Close(); err != nil { return err } + // Regression test for #37425. + if err := c.CheckReplicaDivergenceOnDB(ctx, db); err != nil { + return errors.Wrapf(err, "node %d", i) + } } return nil } From e3ae436816d43833a1e56b61b92f8a4cd7381844 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 21 May 2019 14:49:07 +0200 Subject: [PATCH 5/5] builtins: don't request consistency checker diff by default Release note: None --- pkg/sql/sem/builtins/generator_builtins.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index 79f0297908f1..f905e4b3f6ba 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -962,8 +962,10 @@ func (c *checkConsistencyGenerator) Start() error { Key: c.from, EndKey: c.to, }, - Mode: c.mode, - WithDiff: true, + Mode: c.mode, + // No meaningful diff can be created if we're checking the stats only, + // so request one only if a full check is run. + WithDiff: c.mode == roachpb.ChecksumMode_CHECK_FULL, }) if err := c.db.Run(c.ctx, &b); err != nil { return err