Skip to content

Commit

Permalink
kvserver: deflake and unskip rq dead non voters
Browse files Browse the repository at this point in the history
`TestReplicateQueueDeadNonVoters` would flake periodically due to racing
between the snapshot and replicate queue.

Add a filter to queue processing in order to only process the scratch
range. This speeds up the test and prevents timing conditions between
the two queues under stress. The test remains skipped under stress-race
due to its size (5 nodes) causing timeouts.

Resolves: #76996
Epic: none
Release note: None
  • Loading branch information
kvoli committed Sep 18, 2023
1 parent 0cde11b commit 399eaf7
Showing 1 changed file with 14 additions and 15 deletions.
29 changes: 14 additions & 15 deletions pkg/kv/kvserver/replicate_queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1031,23 +1031,25 @@ func getLeaseholderStore(
func TestReplicateQueueDeadNonVoters(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.WithIssue(t, 76996)
skip.UnderRace(t, "takes a long time or times out under race")

ctx := context.Background()

// Disable the replicate queue for all ranges except the scratch range. This
// speeds up the test, as the queue only needs to up-replicate the dead
// replica (non-voter) for a single range.
var scratchRangeID int64
atomic.StoreInt64(&scratchRangeID, -1)
var livenessTrap atomic.Value
setupFn := func(t *testing.T) (*testcluster.TestCluster, roachpb.RangeDescriptor) {
tc := testcluster.StartTestCluster(t, 5,
base.TestClusterArgs{
ReplicationMode: base.ReplicationAuto,
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
ScanMinIdleTime: time.Millisecond,
ScanMaxIdleTime: time.Millisecond,
Knobs: base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
ReplicaPlannerKnobs: plan.ReplicaPlannerTestingKnobs{
DisableReplicaRebalancing: true,
BaseQueueDisabledBypassFilter: func(rangeID roachpb.RangeID) bool {
return rangeID == roachpb.RangeID(atomic.LoadInt64(&scratchRangeID))
},
},
SpanConfig: &spanconfig.TestingKnobs{
Expand All @@ -1066,14 +1068,11 @@ func TestReplicateQueueDeadNonVoters(t *testing.T) {
},
},
)
_, err := tc.ServerConn(0).Exec(
`SET CLUSTER SETTING server.failed_reservation_timeout='1ms'`)
require.NoError(t, err)

// Setup a scratch range on a test cluster with 2 non-voters and 1 voter.
scratchKey := tc.ScratchRange(t)
scratchRange := tc.LookupRangeOrFatal(t, scratchKey)
_, err = tc.ServerConn(0).Exec(
atomic.StoreInt64(&scratchRangeID, int64(scratchRange.RangeID))
_, err := tc.ServerConn(0).Exec(
`ALTER RANGE DEFAULT CONFIGURE ZONE USING num_replicas = 3, num_voters = 1`,
)
require.NoError(t, err)
Expand Down Expand Up @@ -1175,18 +1174,18 @@ func TestReplicateQueueDeadNonVoters(t *testing.T) {
// AllocatorRemoveDeadNonVoter` code path. The test does the following:
//
// 1. Instantiate a range with 1 voter and 2 non-voters on a 5-node cluster.
// 2. Turn off the replicateQueue
// 2. Turn off the queue bypasss (disable replicate queue processing).
// 3. Change the zone configs such that there should be no non-voters --
// the two existing non-voters should now be considered "over-replicated"
// by the system.
// 4. Kill the nodes that have non-voters.
// 5. Turn on the replicateQueue
// 5. Turn on the queue bypass (enable the replicate queue processing).
// 6. Make sure that the non-voters are downreplicated from the dead nodes.
t.Run("remove", func(t *testing.T) {
tc, scratchRange := setupFn(t)
defer tc.Stopper().Stop(ctx)

toggleReplicationQueues(tc, false)
atomic.StoreInt64(&scratchRangeID, -1)
_, err := tc.ServerConn(0).Exec(
// Remove all non-voters.
"ALTER RANGE default CONFIGURE ZONE USING num_replicas = 1",
Expand Down Expand Up @@ -1216,8 +1215,8 @@ func TestReplicateQueueDeadNonVoters(t *testing.T) {

beforeNodeIDs := getNonVoterNodeIDs(scratchRange)
markDead(beforeNodeIDs)
atomic.StoreInt64(&scratchRangeID, int64(scratchRange.RangeID))

toggleReplicationQueues(tc, true)
require.Eventually(t, func() bool {
ok, err := checkReplicaCount(ctx, tc, &scratchRange, 1 /* voterCount */, 0 /* nonVoterCount */)
if err != nil {
Expand Down

0 comments on commit 399eaf7

Please sign in to comment.