From 300f22983cf72daf12cf82e7103fa75090014099 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 30 Sep 2024 16:40:05 -0400 Subject: [PATCH] kvserver: metamorphically enable kv.raft.leader_fortification.fraction_enabled Part of #123847. This commit metamorphically enables the `kv.raft.leader_fortification.fraction_enabled` to exercise raft fortification and leader leases. The commit also includes a few other WIP changes to try to stabilize this. It won't be fully stable until defortification is implemented. Release note: None --- pkg/cli/debug_recover_loss_of_quorum_test.go | 4 ++++ .../kvclient/kvcoord/dist_sender_ambiguous_test.go | 1 + pkg/kv/kvserver/client_merge_test.go | 6 +++++- pkg/kv/kvserver/client_raft_test.go | 3 +++ pkg/kv/kvserver/client_replica_test.go | 12 +++++++++++- pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go | 4 ++++ pkg/kv/kvserver/replica_range_lease.go | 1 + pkg/kv/kvserver/replica_store_liveness.go | 8 ++++---- pkg/kv/kvserver/storeliveness/support_manager.go | 2 +- 9 files changed, 34 insertions(+), 7 deletions(-) diff --git a/pkg/cli/debug_recover_loss_of_quorum_test.go b/pkg/cli/debug_recover_loss_of_quorum_test.go index 5a2c4582602a..b49c756a9e05 100644 --- a/pkg/cli/debug_recover_loss_of_quorum_test.go +++ b/pkg/cli/debug_recover_loss_of_quorum_test.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server" "github.com/cockroachdb/cockroach/pkg/server/serverpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage/fs" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/listenerutil" @@ -182,8 +183,11 @@ func TestLossOfQuorumRecovery(t *testing.T) { // would not be able to progress, but we will apply recovery procedure and // mark on replicas on node 1 as designated survivors. After that, starting // single node should succeed. + st := cluster.MakeTestingClusterSettings() + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) tcBefore := testcluster.NewTestCluster(t, 3, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, // This logic is specific to the storage layer. DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant, }, diff --git a/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go b/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go index eee3e18f40dc..4d8b7f5c76aa 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go @@ -341,6 +341,7 @@ func TestTransactionUnexpectedlyCommitted(t *testing.T) { ctx := context.Background() st := cluster.MakeTestingClusterSettings() + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) // Disable closed timestamps for control over when transaction gets bumped. closedts.TargetDuration.Override(ctx, &st.SV, 1*time.Hour) diff --git a/pkg/kv/kvserver/client_merge_test.go b/pkg/kv/kvserver/client_merge_test.go index 1b424175562a..e3d52aa2e584 100644 --- a/pkg/kv/kvserver/client_merge_test.go +++ b/pkg/kv/kvserver/client_merge_test.go @@ -493,7 +493,7 @@ func mergeCheckingTimestampCaches( st := cluster.MakeTestingClusterSettings() // This test explicitly sets up a leader/leaseholder partition, which doesn't // work with expiration leases (the lease expires). - kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // override metamorphism + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, @@ -2991,13 +2991,17 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) { // RHS, as it interpreted destroyReasonMergePending to mean that the RHS replica // had already been garbage collected. func TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected(t *testing.T) { + defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{}, }) defer tc.Stopper().Stop(ctx) scratch := tc.ScratchRange(t) diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 05f3b9437897..71b0b1cbcffd 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -5735,6 +5735,8 @@ func TestElectionAfterRestart(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) // We use a single node to avoid rare flakes due to dueling elections. // The code is set up to support multiple nodes, though the test will @@ -5756,6 +5758,7 @@ func TestElectionAfterRestart(t *testing.T) { ReplicationMode: replMode, ParallelStart: parallel, ServerArgs: base.TestServerArgs{ + Settings: st, RaftConfig: base.RaftConfig{ RaftElectionTimeoutTicks: electionTimeoutTicks, RaftTickInterval: raftTickInterval, diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 76f90eeb05a5..39a0643f9a4a 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -3308,12 +3308,17 @@ func TestChangeReplicasSwapVoterWithNonVoter(t *testing.T) { func TestReplicaTombstone(t *testing.T) { defer leaktest.AfterTest(t)() + ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) + t.Run("(1) ChangeReplicasTrigger", func(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - ctx := context.Background() + tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ DisableReplicaGCQueue: true, }}, @@ -3351,6 +3356,7 @@ func TestReplicaTombstone(t *testing.T) { ctx := context.Background() tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, RaftConfig: base.RaftConfig{ // Make the tick interval short so we don't need to wait too long for // the partitioned node to time out. @@ -3418,6 +3424,7 @@ func TestReplicaTombstone(t *testing.T) { ctx := context.Background() tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ DisableReplicaGCQueue: true, }}, @@ -3455,6 +3462,7 @@ func TestReplicaTombstone(t *testing.T) { ctx := context.Background() tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ DisableReplicaGCQueue: true, }}, @@ -3504,6 +3512,7 @@ func TestReplicaTombstone(t *testing.T) { ctx := context.Background() tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, RaftConfig: base.RaftConfig{ // Make the tick interval short so we don't need to wait too long // for a heartbeat to be sent. @@ -3629,6 +3638,7 @@ func TestReplicaTombstone(t *testing.T) { proposalFilter.Store(noopProposalFilter) tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ DisableReplicaGCQueue: true, TestingProposalFilter: func(args kvserverbase.ProposalFilterArgs) *kvpb.Error { diff --git a/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go b/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go index de114845264e..05569b6a5265 100644 --- a/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go +++ b/pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" @@ -160,8 +161,11 @@ func checkRaftLog( RaftLogTruncationThreshold: math.MaxInt64, } + st := cluster.MakeTestingClusterSettings() + kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV) tc := testcluster.NewTestCluster(t, 2, base.TestClusterArgs{ ServerArgs: base.TestServerArgs{ + Settings: st, Knobs: base.TestingKnobs{ Store: &kvserver.StoreTestingKnobs{ DisableGCQueue: true, diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 5507a7906054..17b417c29b4e 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -166,6 +166,7 @@ func OverrideDefaultLeaseType(ctx context.Context, sv *settings.Values, typ roac switch typ { case roachpb.LeaseExpiration: ExpirationLeasesOnly.Override(ctx, sv, true) + RaftLeaderFortificationFractionEnabled.Override(ctx, sv, 0.0) case roachpb.LeaseEpoch: ExpirationLeasesOnly.Override(ctx, sv, false) RaftLeaderFortificationFractionEnabled.Override(ctx, sv, 0.0) diff --git a/pkg/kv/kvserver/replica_store_liveness.go b/pkg/kv/kvserver/replica_store_liveness.go index fe0f518f431d..6dd7dddd67a0 100644 --- a/pkg/kv/kvserver/replica_store_liveness.go +++ b/pkg/kv/kvserver/replica_store_liveness.go @@ -16,9 +16,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/raft/raftstoreliveness" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" - "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/metamorphic" ) // RaftLeaderFortificationFractionEnabled controls the fraction of ranges for @@ -33,9 +33,9 @@ var RaftLeaderFortificationFractionEnabled = settings.RegisterFloatSetting( "by extension, use Leader leases for all ranges which do not require "+ "expiration-based leases. Set to a value between 0.0 and 1.0 to gradually "+ "roll out Leader leases across the ranges in a cluster.", - // TODO(nvanbenschoten): make this a metamorphic constant once raft leader - // fortification and leader leases are sufficiently stable. - envutil.EnvOrDefaultFloat64("COCKROACH_LEADER_FORTIFICATION_FRACTION_ENABLED", 0.0), + metamorphic.ConstantWithTestChoice("kv.raft.leader_fortification.fraction_enabled", + 1.0, /* defaultValue */ + 1.0 /* otherValues */), settings.FloatInRange(0.0, 1.0), settings.WithPublic, ) diff --git a/pkg/kv/kvserver/storeliveness/support_manager.go b/pkg/kv/kvserver/storeliveness/support_manager.go index 97956b8772f0..c21bbe07b2c6 100644 --- a/pkg/kv/kvserver/storeliveness/support_manager.go +++ b/pkg/kv/kvserver/storeliveness/support_manager.go @@ -243,7 +243,7 @@ func (sm *SupportManager) startLoop(ctx context.Context) { select { case <-sm.storesToAdd.sig: sm.maybeAddStores(ctx) - sm.sendHeartbeats(ctx) + continue case <-heartbeatTicker.C: sm.sendHeartbeats(ctx)