From d4777beb4b64bdfbe2e850bceaf87fc59d709fee Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Sun, 13 May 2018 14:49:50 -0400 Subject: [PATCH] storage: fix deadlock in consistency queue When `CheckConsistency` returns an error, the queue checks whether the store is draining to decide whether the error is worth logging. Unfortunately this check was incorrect and would block until the store actually started draining. A toy example of this problem is below (this will deadlock). The dual return form of chan receive isn't non-blocking -- the second parameter indicates whether the received value corresponds to a closing of the channel. Switch to a `select` instead. ``` package main import ( "fmt" ) func main() { ch := make(chan struct{}) _, ok := <-ch fmt.Println(ok) } ``` Touches #21824. Release note (bug fix): Prevent the consistency checker from deadlocking. This would previously manifest itself as a steady number of replicas queued for consistency checking on one or more nodes and would resolve by restarting the affected nodes. --- pkg/storage/consistency_queue.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/storage/consistency_queue.go b/pkg/storage/consistency_queue.go index 8e2f6942686a..c522aed3f41d 100644 --- a/pkg/storage/consistency_queue.go +++ b/pkg/storage/consistency_queue.go @@ -102,9 +102,17 @@ func (q *consistencyQueue) process( if q.interval() <= 0 { return nil } + + // We only need this variable in the `pErr != nil` branch below, but for + // better testing coverage it was moved here. + var shouldQuiesce bool + select { + case <-repl.store.Stopper().ShouldQuiesce(): + shouldQuiesce = true + } + req := roachpb.CheckConsistencyRequest{} if _, pErr := repl.CheckConsistency(ctx, req); pErr != nil { - _, shouldQuiesce := <-repl.store.Stopper().ShouldQuiesce() if !shouldQuiesce || !grpcutil.IsClosedConnection(pErr.GoError()) { // Suppress noisy errors about closed GRPC connections when the // server is quiescing.