Skip to content

Commit

Permalink
cmd/roachtest: deflake gossip/chaos roachtest
Browse files Browse the repository at this point in the history
Deflake `gossip/chaos` by adding a missing
`waitForFullReplication` call. This test loops, killing a node and then
verifying that the remaining nodes in the cluster stabilize on the same
view of gossip connectivity. Periodically the test was failing because
gossip wasn't stabilizing. The root issue was that the SQL query to
retrieve the gossip connectivity from one node was hanging, and that
query was hanging because a range was unavailable. Logs show that the
leaseholder for that range was on a down node and that the range
seemed to contain only a single replica. This could happen near the start of
the test if we started killing nodes before full replication was
achieved.

Fixes #38829

Release note: None
  • Loading branch information
petermattis authored and tbg committed Mar 12, 2020
1 parent 2863173 commit 2783f1a
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions pkg/cmd/roachtest/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ import (

func registerGossip(r *testRegistry) {
runGossipChaos := func(ctx context.Context, t *test, c *cluster) {
args := startArgs("--args=--vmodule=*=1")
c.Put(ctx, cockroach, "./cockroach", c.All())
c.Start(ctx, t, c.All())
c.Start(ctx, t, c.All(), args)
waitForFullReplication(t, c.Conn(ctx, 1))

gossipNetwork := func(node int) string {
const query = `
Expand Down Expand Up @@ -65,6 +67,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
if i == deadNode {
continue
}
c.l.Printf("%d: checking gossip\n", i)
s := gossipNetwork(i)
if !initialized {
deadNodeStr := fmt.Sprint(deadNode)
Expand All @@ -88,7 +91,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
return false
}
}
fmt.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds())
c.l.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds())
return true
}

Expand All @@ -109,7 +112,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
deadNode = nodes.randNode()[0]
c.Stop(ctx, c.Node(deadNode))
waitForGossip()
c.Start(ctx, t, c.Node(deadNode))
c.Start(ctx, t, c.Node(deadNode), args)
}
}

Expand Down

0 comments on commit 2783f1a

Please sign in to comment.