Merge #94060

94060: kvserver: rm in-memory replica tombstone tracking r=tbg a=pavelkalinnikov The invariants guaranteed by getOrCreateReplica make it unnecessary to track tombstoneMinReplicaID for the split trigger purposes. This field was introduced to guard against Replica changing its ID backwards, however since then Replica no longer can change its ID in the first place. Invariants of the Store (at most one replica for rangeID), in combination with the fact that replicaID is immutable, and with the fact that both in-memory replica ID, and replica tombstone in storage, are always consulted before creating a Replica, guarantee that active replica ID never regresses. Release note: None Epic: CRDB-220 Co-authored-by: Pavel Kalinnikov <[email protected]>
cockroachdb · Jan 3, 2023 · ab6a865 · ab6a865
2 parents 66419e9 + 0703807
commit ab6a865
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 29 deletions.
diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go
@@ -339,6 +339,9 @@ type Replica struct {
 		// scheduled for destruction or has been GCed.
 		// destroyStatus should only be set while also holding the raftMu and
 		// readOnlyCmdMu.
+		//
+		// When this replica is being removed, the destroyStatus is updated and
+		// RangeTombstone is written in the same raftMu critical section.
 		destroyStatus
 		// Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce
 		// whenever a Raft operation is performed.
@@ -494,9 +497,6 @@ type Replica struct {
 		applyingEntries bool
 		// The replica's Raft group "node".
 		internalRaftGroup *raft.RawNode
-		// The minimum allowed ID for this replica. Initialized from
-		// RangeTombstone.NextReplicaID.
-		tombstoneMinReplicaID roachpb.ReplicaID
 
 		// The ID of the leader replica within the Raft group. NB: this is updated
 		// in a separate critical section from the Raft group, and can therefore
@@ -1734,8 +1734,7 @@ func (r *Replica) shouldWaitForPendingMergeRLocked(
 // have been removed from this store after the split.
 //
 // TODO(tbg): the below is true as of 22.2: we persist any Replica's ReplicaID
-// under RaftReplicaIDKey, so the below caveats should be addressed now and we
-// should be able to simplify isNewerThanSplit to just compare replicaIDs.
+// under RaftReplicaIDKey, so the below caveats should be addressed now.
 //
 // TODO(ajwerner):  There is one false negative where false will be returned but
 // the hard state may be due to a newer replica which is outlined below. It
@@ -1763,21 +1762,12 @@ func (r *Replica) shouldWaitForPendingMergeRLocked(
 //
 // See TestProcessSplitAfterRightHandSideHasBeenRemoved.
 func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool {
-	r.mu.RLock()
-	defer r.mu.RUnlock()
-	return r.isNewerThanSplitRLocked(split)
-}
-
-func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool {
 	rightDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID())
-	// If we have written a tombstone for this range then we know that the RHS
-	// must have already been removed at the split replica ID.
-	return r.mu.tombstoneMinReplicaID != 0 ||
-		// If the first raft message we received for the RHS range was for a replica
-		// ID which is above the replica ID of the split then we would not have
-		// written a tombstone but we will have a replica ID that will exceed the
-		// split replica ID.
-		r.replicaID > rightDesc.ReplicaID
+	// If the first raft message we received for the RHS range was for a replica
+	// ID which is above the replica ID of the split then we would not have
+	// written a tombstone but we will have a replica ID that will exceed the
+	// split replica ID.
+	return r.replicaID > rightDesc.ReplicaID
 }
 
 // WatchForMerge is like maybeWatchForMergeLocked, except it expects a merge to

diff --git a/pkg/kv/kvserver/replica_destroy.go b/pkg/kv/kvserver/replica_destroy.go
@@ -227,12 +227,9 @@ func (r *Replica) setTombstoneKey(
 ) error {
 	r.mu.Lock()
 	nextReplicaID := r.mu.state.Desc.NextReplicaID
-	if nextReplicaID < externalNextReplicaID {
+	if externalNextReplicaID > nextReplicaID {
 		nextReplicaID = externalNextReplicaID
 	}
-	if nextReplicaID > r.mu.tombstoneMinReplicaID {
-		r.mu.tombstoneMinReplicaID = nextReplicaID
-	}
 	r.mu.Unlock()
 	return writeTombstoneKey(ctx, writer, r.RangeID, nextReplicaID)
 }

diff --git a/pkg/kv/kvserver/store_create_replica.go b/pkg/kv/kvserver/store_create_replica.go
@@ -25,10 +25,37 @@ import (
 
 var errRetry = errors.New("retry: orphaned replica")
 
-// getOrCreateReplica returns a replica for the given RangeID, creating an
-// uninitialized replica if necessary. The caller must not hold the store's
-// lock. The returned replica has Replica.raftMu locked and it is the caller's
-// responsibility to unlock it.
+// getOrCreateReplica returns an existing or newly created replica with the
+// given replicaID for the given rangeID, or roachpb.RaftGroupDeletedError if
+// this replicaID has been deleted. A returned replica's Replica.raftMu is
+// locked, and the caller is responsible for unlocking it.
+//
+// Commonly, if the requested replica is present in Store's memory and is not
+// being destroyed, it gets returned. Otherwise, this replica is either deleted
+// (which is confirmed by reading the RangeTombstone), or gets created (as an
+// uninitialized replica) in memory and storage, and loaded from storage if it
+// was stored before.
+//
+// The above assertions and actions on the in-memory (Store, Replica) and stored
+// (RaftReplicaID, RangeTombstone) state can't all be done atomically, but this
+// method effectively makes them appear atomically done under the returned
+// replica's Replica.raftMu.
+//
+// In particular, if getOrCreateReplica returns a replica, the guarantee is that
+// the following invariants (derived from raftMu and Store invariants) are true
+// while Replica.raftMu is held:
+//
+//   - Store.GetReplica(rangeID) successfully returns this and only this replica
+//   - The Replica is not being removed as seen by its Replica.mu.destroyStatus
+//   - The RangeTombstone in storage does not see this replica as removed
+//
+// If getOrCreateReplica returns roachpb.RaftGroupDeletedError, the guarantee is:
+//
+//   - getOrCreateReplica will never return this replica
+//   - Store.GetReplica(rangeID) can now only return replicas with higher IDs
+//   - The RangeTombstone in storage does see this replica as removed
+//
+// The caller must not hold the store's lock.
 func (s *Store) getOrCreateReplica(
 	ctx context.Context,
 	rangeID roachpb.RangeID,
@@ -167,7 +194,6 @@ func (s *Store) tryGetOrCreateReplica(
 	// replica even outside of raft processing. Have to do this after grabbing
 	// Store.mu to maintain lock ordering invariant.
 	repl.mu.Lock()
-	repl.mu.tombstoneMinReplicaID = tombstone.NextReplicaID
 
 	// NB: A Replica should never be in the store's replicas map with a nil
 	// descriptor. Assign it directly here. In the case that the Replica should

diff --git a/pkg/kv/kvserver/store_split.go b/pkg/kv/kvserver/store_split.go
@@ -245,7 +245,7 @@ func prepareRightReplicaForSplit(
 	// If we know that the RHS has already been removed at this replica ID
 	// then we also know that its data has already been removed by the preApply
 	// so we skip initializing it as the RHS of the split.
-	if rightRepl.isNewerThanSplitRLocked(split) {
+	if rightRepl.isNewerThanSplit(split) {
 		return nil
 	}