Skip to content

Commit

Permalink
Merge pull request #37722 from tbg/backport19.1-37668
Browse files Browse the repository at this point in the history
backport-19.1: storage: fix and test a bogus source of replica divergence errors
  • Loading branch information
tbg authored May 22, 2019
2 parents 1810a4e + 0ceaf61 commit 699f675
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 8 deletions.
28 changes: 23 additions & 5 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"context"
"encoding/json"
"fmt"
"os"
"sort"
"strconv"
"strings"
Expand Down Expand Up @@ -1353,12 +1354,12 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr
return nil
}

s.server.grpc.Stop()

go func() {
// The explicit closure here allows callers.Lookup() to return something
// sensible referring to this file (otherwise it ends up in runtime
// internals).
// TODO(tbg): why don't we stop the stopper first? Stopping the stopper
// first seems more reasonable since grpc.Stop closes the listener right
// away (and who knows whether gRPC-goroutines are tied up in some
// stopper task somewhere).
s.server.grpc.Stop()
s.server.stopper.Stop(ctx)
}()

Expand All @@ -1367,6 +1368,23 @@ func (s *adminServer) Drain(req *serverpb.DrainRequest, stream serverpb.Admin_Dr
return nil
case <-ctx.Done():
return ctx.Err()
case <-time.After(10 * time.Second):
// This is a hack to work around the problem in
// https://github.com/cockroachdb/cockroach/issues/37425#issuecomment-494336131
//
// There appear to be deadlock scenarios in which we don't manage to
// fully stop the grpc server (which implies closing the listener, i.e.
// seeming dead to the outside world) or don't manage to shut down the
// stopper (the evidence in #37425 is inconclusive which one it is).
//
// Other problems in this area are known, such as
// https://github.com/cockroachdb/cockroach/pull/31692
//
// The signal-based shutdown path uses a similar time-based escape hatch.
// Until we spend (potentially lots of time to) understand and fix this
// issue, this will serve us well.
os.Exit(1)
return errors.New("unreachable")
}
}

Expand Down
6 changes: 4 additions & 2 deletions pkg/sql/sem/builtins/generator_builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -965,8 +965,10 @@ func (c *checkConsistencyGenerator) Start() error {
Key: c.from,
EndKey: c.to,
},
Mode: c.mode,
WithDiff: true,
Mode: c.mode,
// No meaningful diff can be created if we're checking the stats only,
// so request one only if a full check is run.
WithDiff: c.mode == roachpb.ChecksumMode_CHECK_FULL,
})
if err := c.db.Run(c.ctx, &b); err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_compute_checksum.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func declareKeysComputeChecksum(
// Version numbers for Replica checksum computation. Requests silently no-op
// unless the versions are compatible.
const (
ReplicaChecksumVersion = 3
ReplicaChecksumVersion = 4
ReplicaChecksumGCInterval = time.Hour
)

Expand Down

0 comments on commit 699f675

Please sign in to comment.