roachtest: fix bank/cluster-recovery

Informs cockroachdb#38785. The "stalls" detected boil down to the time between chaos monkey iterations. Within each chaos monkey iteration we do the following: - Lock all clients, in sequence - Restart each node - Unlock all clients - Sleep until at least one client has made progress Given our stall timeout is only 30s, we have just that long to go through all of the above. In each client we lock around the UPDATE query so as to not be interrupted. The problem is that every now and then these UPDATE queries take a lot longer than a few milliseconds. This is expected behaviour: this is primarily due to txnwait procedures and having to wait for the expiration of an extant contending txn. More importantly, it's not what we're testing here as the clients are still making progress. Given the chaos monkey first locks each client, it has to drain out these requests, which eats out of the 30s or so we have for each chaos monkey iteration. This is made worse by the fact that we do this in sequence for each client. When we're unlucky, we run into this particular convoy situation and we're unable to finish the round in time, and a "stall" is detected. We should really only be interested in how long it takes for the chaos monkey to restart a set of nodes, and ensuring that after it does, that clients are still making progress. We already have statement timeouts for the UPDATE queries that fail if we take "too long". Removing the stopClients apparatus gives us what we need. Release justification: Category 1: Non-production code changes Release note: None
irfansharif · Sep 23, 2019 · e7b4be4 · e7b4be4
1 parent bffadfe
commit e7b4be4
Showing 1 changed file with 4 additions and 21 deletions.
diff --git a/pkg/cmd/roachtest/bank.go b/pkg/cmd/roachtest/bank.go
@@ -202,13 +202,11 @@ func (s *bankState) verifyAccounts(ctx context.Context, t *test) {
 	}
 }
 
-// startChaosMonkey picks a set of nodes and restarts them. If stopClients is set
-// all the clients are locked before the nodes are restarted.
+// startChaosMonkey picks a set of nodes and restarts them.
 func (s *bankState) startChaosMonkey(
 	ctx context.Context,
 	t *test,
 	c *cluster,
-	stopClients bool,
 	pickNodes func() []int,
 	consistentIdx int,
 ) {
@@ -237,12 +235,6 @@ func (s *bankState) startChaosMonkey(
 			// Pick nodes to be restarted.
 			nodes := pickNodes()
 
-			if stopClients {
-				// Prevent all clients from writing while nodes are being restarted.
-				for i := 0; i < len(s.clients); i++ {
-					s.clients[i].Lock()
-				}
-			}
 			t.l.Printf("round %d: restarting nodes %v\n", curRound, nodes)
 			for _, i := range nodes {
 				if s.done(ctx) {
@@ -252,15 +244,6 @@ func (s *bankState) startChaosMonkey(
 
 				c.Stop(ctx, c.Node(i))
 				c.Start(ctx, t, c.Node(i))
-				if stopClients {
-					// Reinitialize the client talking to the restarted node.
-					s.initClient(ctx, c, i)
-				}
-			}
-			if stopClients {
-				for i := 0; i < len(s.clients); i++ {
-					s.clients[i].Unlock()
-				}
 			}
 
 			preCount := s.counts()
@@ -457,7 +440,7 @@ func runBankClusterRecovery(ctx context.Context, t *test, c *cluster) {
 		}
 		return nodes
 	}
-	s.startChaosMonkey(ctx, t, c, true, pickNodes, -1)
+	s.startChaosMonkey(ctx, t, c, pickNodes, -1)
 
 	s.waitClientsStop(ctx, t, c, 30*time.Second)
 
@@ -499,7 +482,7 @@ func runBankNodeRestart(ctx context.Context, t *test, c *cluster) {
 	pickNodes := func() []int {
 		return []int{1 + rnd.Intn(clientIdx)}
 	}
-	s.startChaosMonkey(ctx, t, c, false, pickNodes, clientIdx)
+	s.startChaosMonkey(ctx, t, c, pickNodes, clientIdx)
 
 	s.waitClientsStop(ctx, t, c, 30*time.Second)
 
@@ -578,7 +561,7 @@ func runBankZeroSumRestart(ctx context.Context, t *test, c *cluster) {
 	}
 
 	// Starting up the goroutines that restart and do splits and lease moves.
-	s.startChaosMonkey(ctx, t, c, false, pickNodes, -1)
+	s.startChaosMonkey(ctx, t, c, pickNodes, -1)
 	s.startSplitMonkey(ctx, 2*time.Second, c)
 	s.waitClientsStop(ctx, t, c, 30*time.Second)