From 8b159d8fd7019b0f890f60a27171408499c671b2 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Thu, 22 Jul 2021 14:09:59 +0200
Subject: [PATCH] roachtest: fix replicagc-changed-peers

The test ends up in the following situation:

n1: down, no replicas
n2: down, no replicas
n3: alive, with a constraint that wants all replicas to move, and
    there may be a few ranges still on n3
n4-n6: alive, where the ranges are predominantly 3x-replicated

The test then verifies that the replica count on n3 (as in, replicas
physically present on n3, in contrast to replicas assigned to n3 via
the meta ranges) drops to zero. However, system ranges cannot move in
this configuration. The number of cluster nodes is six
(decommission{ing,ed} nodes would be excluded, but no nodes are
decommission{ing,ed} here), and so the system ranges operate at a
replication factor of five. There are only four live nodes here, so if
n3 is still a member of any system ranges, they will stay there and the
test fails.

This commit attempts to rectify that by making sure that while n3 is
down earlier in the test, all replicas are moved off it. That was
always the intent of the test, which is concerned with n3 realizing
that its replicas have moved elsewhere and initiating replicaGC;
however, prior to this commit it was left to chance whether n3 would or
would not have replicas assigned to it by the time the test moved to
the stage above.

The reason the test wasn't previously waiting for all replicas to be
moved off n3 while it was down is that doing so requires checking the
meta ranges, which wasn't necessary for the other two nodes.

This commit passed all five runs of replicagc-changed-peers/restart=false,
so I think it reliably addresses the problem. There is still the
lingering question of why this is failing only now (note that both
flavors of the test failed on master last night, so I doubt it is
rare). We just merged
https://github.com/cockroachdb/cockroach/pull/67319, which is likely
somehow related.

Fixes #67910.
Fixes #67914.

Release note: None
---
 pkg/cmd/roachtest/replicagc.go | 42 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/pkg/cmd/roachtest/replicagc.go b/pkg/cmd/roachtest/replicagc.go
index 3229c3e581eb..472fa18cdd6f 100644
--- a/pkg/cmd/roachtest/replicagc.go
+++ b/pkg/cmd/roachtest/replicagc.go
@@ -17,7 +17,9 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/cockroachdb/cockroach/pkg/util/retry"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"github.com/cockroachdb/errors"
 )
 
 func registerReplicaGC(r *testRegistry) {
@@ -72,7 +74,7 @@ func runReplicaGCChangedPeers(ctx context.Context, t *test, c *cluster, withRest
 	// Start three new nodes that will take over all data.
 	c.Start(ctx, t, args, c.Range(4, 6))
 
-	// Recommission n1-3, with n3 in absentia, moving the replicas to n4-6.
+	// Decommission n1-3, with n3 in absentia, moving the replicas to n4-6.
 	if err := h.decommission(ctx, c.Range(1, 3), 2, "--wait=none"); err != nil {
 		t.Fatal(err)
 	}
@@ -83,6 +85,16 @@ func runReplicaGCChangedPeers(ctx context.Context, t *test, c *cluster, withRest
 	t.Status("waiting for zero replicas on n2")
 	h.waitForZeroReplicas(ctx, 2)
 
+	// Wait for the replica count on n3 to also drop to zero. This makes the test
+	// "test more", but it also prevents the test from failing spuriously, as later
+	// in the test any system ranges still on n3 would have a replication factor
+	// of five applied to them, and they would be unable to move off n3 as n1 and
+	// n2 will be down at that point. For details, see:
+	//
+	// https://github.com/cockroachdb/cockroach/issues/67910#issuecomment-884856356
+	t.Status("waiting for zero replicas on n3")
+	waitForZeroReplicasOnN3(ctx, t, c.Conn(ctx, 1))
+
 	// Stop the remaining two old nodes, no replicas remaining there.
 	c.Stop(ctx, c.Range(1, 2))
 
@@ -238,3 +250,31 @@ func (h *replicagcTestHelper) isolateDeadNodes(ctx context.Context, runNode int)
 		}
 	}
 }
+
+func waitForZeroReplicasOnN3(ctx context.Context, t *test, db *gosql.DB) {
+	if err := retry.ForDuration(5*time.Minute, func() error {
+		const q = `select range_id, replicas from crdb_internal.ranges_no_leases where replicas @> ARRAY[3];`
+		rows, err := db.QueryContext(ctx, q)
+		if err != nil {
+			return err
+		}
+		m := make(map[int64]string)
+		for rows.Next() {
+			var rangeID int64
+			var replicas string
+			if err := rows.Scan(&rangeID, &replicas); err != nil {
+				return err
+			}
+			m[rangeID] = replicas
+		}
+		if err := rows.Err(); err != nil {
+			return err
+		}
+		if len(m) == 0 {
+			return nil
+		}
+		return errors.Errorf("ranges remained on n3 (according to meta2): %+v", m)
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
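
Editor's note: for anyone who wants to reproduce the meta2 check by hand outside the
roachtest harness, below is a minimal standalone sketch of the same query loop. It is
an illustration under stated assumptions, not part of the patch: the connection string
(an insecure local cluster on localhost:26257), the lib/pq driver, and the
5-minute/5-second polling cadence are placeholders chosen for the example.

// Standalone sketch: poll meta2 (via crdb_internal.ranges_no_leases) until no
// range descriptor lists store 3 anymore. Assumes an insecure local cluster at
// localhost:26257 and the github.com/lib/pq driver; both are illustrative.
package main

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"time"

	_ "github.com/lib/pq" // registers the "postgres" driver
)

func main() {
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	ctx := context.Background()
	deadline := time.Now().Add(5 * time.Minute)
	const q = `SELECT range_id, replicas FROM crdb_internal.ranges_no_leases WHERE replicas @> ARRAY[3]`

	for {
		remaining, err := rangesOnN3(ctx, db, q)
		if err != nil {
			log.Fatal(err)
		}
		if len(remaining) == 0 {
			fmt.Println("no ranges reference n3 anymore")
			return
		}
		if time.Now().After(deadline) {
			log.Fatalf("ranges remained on n3 (according to meta2): %+v", remaining)
		}
		time.Sleep(5 * time.Second)
	}
}

// rangesOnN3 returns a map from range ID to the stringified replicas column for
// every range whose descriptor still contains store/node 3.
func rangesOnN3(ctx context.Context, db *sql.DB, q string) (map[int64]string, error) {
	rows, err := db.QueryContext(ctx, q)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	m := make(map[int64]string)
	for rows.Next() {
		var rangeID int64
		var replicas string
		if err := rows.Scan(&rangeID, &replicas); err != nil {
			return nil, err
		}
		m[rangeID] = replicas
	}
	return m, rows.Err()
}

The query is the same one the new helper uses: crdb_internal.ranges_no_leases exposes
the range descriptors recorded in the meta ranges, and replicas @> ARRAY[3] filters to
ranges whose descriptor still lists store 3, i.e. ranges that have not yet moved off n3.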