Merge #37477

37477: storage: Fix deadlock in TestStoreRangeMergeSlowWatcher r=andy-kimball a=andy-kimball This test is occasionally flaking under heavy race stress in CI runs. Here is the probable sequence of events: 1. A<-B merge starts and Subsume request locks down B. 2. Watcher on B sends PushTxn request, which is intercepted by the TestingRequestFilter in the test. 3. The merge txn aborts due to interference with the replica GC, since it's concurrently reading range descriptors. 4. The merge txn is retried, but the Watcher on B is locking the range, so it aborts again. 5. #4 repeats until the allowPushTxn channel fills up (it has capacity of 10). This causes a deadlock because the merge txn can't continue. Meanwhile, the watcher is blocked waiting for the results of the PushTxn request, which gets blocked waiting for the merge txn. The fix is to get rid of the arbitrarily limited channel size of 10 and use sync.Cond synchronization instead. Multiple retries of the merge txn will repeatedly signal the Cond, rather than fill up the channel. One of the problems with the channel was that there can be an imbalance between the number of items sent to the channel (by merge txns) with the number of items received from the channel (by the watcher). This imbalance meant the channel gradually filled up until finally the right sequence of events caused deadlock. Using a sync.Cond also fixes a race condition I saw several times, in which the merge transaction tries to send to the channel while it is being concurrently closed. Release note: None Co-authored-by: Andrew Kimball <[email protected]>
cockroachdb · May 15, 2019 · 8a6cc79 · 8a6cc79
2 parents c03a8b3 + aa605bf
commit 8a6cc79
Showing 1 changed file with 8 additions and 6 deletions.
diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go
@@ -23,6 +23,7 @@ import (
 	"reflect"
 	"regexp"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -2850,8 +2851,6 @@ func testMergeWatcher(t *testing.T, injectFailures bool) {
 func TestStoreRangeMergeSlowWatcher(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
-	t.Skip("https://github.com/cockroachdb/cockroach/issues/37191")
-
 	ctx := context.Background()
 	aKey, bKey, cKey := roachpb.RKey("a"), roachpb.RKey("b"), roachpb.RKey("c")
 	storeCfg := storage.TestStoreConfig(nil)
@@ -2862,23 +2861,26 @@ func TestStoreRangeMergeSlowWatcher(t *testing.T) {
 	// Force PushTxn requests generated by the watcher goroutine to wait on a
 	// channel. This is how we control when store1's merge watcher goroutine hears
 	// about the status of the A <- B merge.
-	allowPushTxn := make(chan struct{}, 10) // buffered for headroom if the merge txn restarts
+	var syn syncutil.Mutex
+	cond := sync.NewCond(&syn)
 	storeCfg.TestingKnobs.TestingRequestFilter = func(ba roachpb.BatchRequest) *roachpb.Error {
+		syn.Lock()
+		defer syn.Unlock()
 		for _, req := range ba.Requests {
 			// We can detect PushTxn requests generated by the watcher goroutine
 			// because they use the minimum transaction priority. Note that we
 			// only block the watcher goroutine on store1 so that we only interfere
 			// with the first merge (A <- B) and not the later merge (AB <- C).
 			if pt := req.GetPushTxn(); pt != nil && pt.PusherTxn.Priority == enginepb.MinTxnPriority &&
 				ba.GatewayNodeID == store1.Ident.NodeID {
-				<-allowPushTxn
+				cond.Wait()
 			}
 			if et := req.GetEndTransaction(); et != nil && !et.Commit && ba.Txn.Name == "merge" {
 				// The merge transaction needed to restart for some reason. To avoid
 				// deadlocking, we need to allow the watcher goroutine's PushTxn request
 				// through so that it allows traffic on the range again. We'll try again
 				// with the restarted merge transaction.
-				allowPushTxn <- struct{}{}
+				cond.Signal()
 			}
 		}
 		return nil
@@ -2962,7 +2964,7 @@ func TestStoreRangeMergeSlowWatcher(t *testing.T) {
 
 	// With the meta2CKey intent cleaned up, allow store1's merge watcher
 	// goroutine to proceed.
-	close(allowPushTxn)
+	cond.Signal()
 
 	// We *must* see a RangeNotFound error from the get request we sent earlier
 	// because we sent it after the merge completed. Anything else is a