Skip to content

Commit

Permalink
roachtest: fix bank/cluster-recovery
Browse files Browse the repository at this point in the history
Informs cockroachdb#38785.
The "stalls" detected boil down to the time between chaos monkey
iterations. Within each chaos monkey iteration we do the following:

  - Lock all clients, in sequence
  - Restart each node
  - Unlock all clients
  - Sleep until at least one client has made progress

Given our stall timeout is only 30s, we have just that long to go through all
of the above. In each client we lock around the UPDATE query so as to not be
interrupted. The problem is that every now and then these UPDATE queries take a
lot longer than a few milliseconds. This is expected behaviour: this is
primarily due to txnwait procedures and having to wait for the expiration of
an extant contending txn. More importantly, it's not what we're testing here as
the clients are still making progress. Given the chaos monkey first locks each
client, it has to drain out these requests, which eats out of the 30s or so we
have for each chaos monkey iteration. This is made worse by the fact that we do
this in sequence for each client. When we're unlucky, we run into this
particular convoy situation and we're unable to finish the round in time, and a
"stall" is detected.

We should really only be interested in how long it takes for the chaos monkey
to restart a set of nodes, and ensuring that after it does, that clients are
still making progress. We already have statement timeouts for the UPDATE
queries that fail if we take "too long". Removing the stopClients apparatus
gives us what we need.

Release justification: Category 1: Non-production code changes

Release note: None
  • Loading branch information
irfansharif committed Sep 23, 2019
1 parent bffadfe commit e7b4be4
Showing 1 changed file with 4 additions and 21 deletions.
25 changes: 4 additions & 21 deletions pkg/cmd/roachtest/bank.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,11 @@ func (s *bankState) verifyAccounts(ctx context.Context, t *test) {
}
}

// startChaosMonkey picks a set of nodes and restarts them. If stopClients is set
// all the clients are locked before the nodes are restarted.
// startChaosMonkey picks a set of nodes and restarts them.
func (s *bankState) startChaosMonkey(
ctx context.Context,
t *test,
c *cluster,
stopClients bool,
pickNodes func() []int,
consistentIdx int,
) {
Expand Down Expand Up @@ -237,12 +235,6 @@ func (s *bankState) startChaosMonkey(
// Pick nodes to be restarted.
nodes := pickNodes()

if stopClients {
// Prevent all clients from writing while nodes are being restarted.
for i := 0; i < len(s.clients); i++ {
s.clients[i].Lock()
}
}
t.l.Printf("round %d: restarting nodes %v\n", curRound, nodes)
for _, i := range nodes {
if s.done(ctx) {
Expand All @@ -252,15 +244,6 @@ func (s *bankState) startChaosMonkey(

c.Stop(ctx, c.Node(i))
c.Start(ctx, t, c.Node(i))
if stopClients {
// Reinitialize the client talking to the restarted node.
s.initClient(ctx, c, i)
}
}
if stopClients {
for i := 0; i < len(s.clients); i++ {
s.clients[i].Unlock()
}
}

preCount := s.counts()
Expand Down Expand Up @@ -457,7 +440,7 @@ func runBankClusterRecovery(ctx context.Context, t *test, c *cluster) {
}
return nodes
}
s.startChaosMonkey(ctx, t, c, true, pickNodes, -1)
s.startChaosMonkey(ctx, t, c, pickNodes, -1)

s.waitClientsStop(ctx, t, c, 30*time.Second)

Expand Down Expand Up @@ -499,7 +482,7 @@ func runBankNodeRestart(ctx context.Context, t *test, c *cluster) {
pickNodes := func() []int {
return []int{1 + rnd.Intn(clientIdx)}
}
s.startChaosMonkey(ctx, t, c, false, pickNodes, clientIdx)
s.startChaosMonkey(ctx, t, c, pickNodes, clientIdx)

s.waitClientsStop(ctx, t, c, 30*time.Second)

Expand Down Expand Up @@ -578,7 +561,7 @@ func runBankZeroSumRestart(ctx context.Context, t *test, c *cluster) {
}

// Starting up the goroutines that restart and do splits and lease moves.
s.startChaosMonkey(ctx, t, c, false, pickNodes, -1)
s.startChaosMonkey(ctx, t, c, pickNodes, -1)
s.startSplitMonkey(ctx, 2*time.Second, c)
s.waitClientsStop(ctx, t, c, 30*time.Second)

Expand Down

0 comments on commit e7b4be4

Please sign in to comment.