Skip to content

Commit

Permalink
cmd/roachtest: deflake gossip/chaos roachtest
Browse files Browse the repository at this point in the history
Deflake `gossip/chaos` by adding a missing call to
`waitForFullReplication`. This test loops, killing a node and then
verifying that the remaining nodes in the cluster stabilize on the same
view of gossip connectivity. Periodically the test was failing because
gossip wasn't stabilizing. The root issue was that the SQL query to
retrieve the gossip connectivity from one node was hanging, and that
query was hanging because a range was unavailable. Logs show that the
leaseholder for that range was on a down node and that the range only
seemed to contain a single replica. This could happen near the start of
the test if we started killing nodes before full replication was
achieved.

Fixes #38829

Release note: None
  • Loading branch information
petermattis committed Feb 11, 2020
1 parent e1da62b commit a7a9146
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions pkg/cmd/roachtest/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ import (

func registerGossip(r *testRegistry) {
runGossipChaos := func(ctx context.Context, t *test, c *cluster) {
args := startArgs("--args=--vmodule=*=1")
c.Put(ctx, cockroach, "./cockroach", c.All())
c.Start(ctx, t, c.All())
c.Start(ctx, t, c.All(), args)
waitForFullReplication(t, c.Conn(ctx, 1))

gossipNetwork := func(node int) string {
const query = `
Expand Down Expand Up @@ -65,6 +67,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
if i == deadNode {
continue
}
c.l.Printf("%d: checking gossip\n", i)
s := gossipNetwork(i)
if !initialized {
deadNodeStr := fmt.Sprint(deadNode)
Expand All @@ -88,7 +91,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
return false
}
}
fmt.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds())
c.l.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds())
return true
}

Expand All @@ -109,7 +112,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
deadNode = nodes.randNode()[0]
c.Stop(ctx, c.Node(deadNode))
waitForGossip()
c.Start(ctx, t, c.Node(deadNode))
c.Start(ctx, t, c.Node(deadNode), args)
}
}

Expand Down

0 comments on commit a7a9146

Please sign in to comment.