From 3c7cc9c5bf18ad74b9a75c670974d57434cf6613 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Thu, 22 Dec 2022 13:56:49 +0000 Subject: [PATCH 1/3] kvserver: clarify getOrCreateReplica semantics Release note: None Epic: CRDB-220 --- pkg/kv/kvserver/store_create_replica.go | 35 ++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/pkg/kv/kvserver/store_create_replica.go b/pkg/kv/kvserver/store_create_replica.go index 4a2f59d93820..9a849d43c2d6 100644 --- a/pkg/kv/kvserver/store_create_replica.go +++ b/pkg/kv/kvserver/store_create_replica.go @@ -25,10 +25,37 @@ import ( var errRetry = errors.New("retry: orphaned replica") -// getOrCreateReplica returns a replica for the given RangeID, creating an -// uninitialized replica if necessary. The caller must not hold the store's -// lock. The returned replica has Replica.raftMu locked and it is the caller's -// responsibility to unlock it. +// getOrCreateReplica returns an existing or newly created replica with the +// given replicaID for the given rangeID, or roachpb.RaftGroupDeletedError if +// this replicaID has been deleted. A returned replica's Replica.raftMu is +// locked, and the caller is responsible for unlocking it. +// +// Commonly, if the requested replica is present in Store's memory and is not +// being destroyed, it gets returned. Otherwise, this replica is either deleted +// (which is confirmed by reading the RangeTombstone), or gets created (as an +// uninitialized replica) in memory and storage, and loaded from storage if it +// was stored before. +// +// The above assertions and actions on the in-memory (Store, Replica) and stored +// (RaftReplicaID, RangeTombstone) state can't all be done atomically, but this +// method effectively makes them appear atomically done under the returned +// replica's Replica.raftMu. +// +// In particular, if getOrCreateReplica returns a replica, the guarantee is that +// the following invariants (derived from raftMu and Store invariants) are true +// while Replica.raftMu is held: +// +// - Store.GetReplica(rangeID) successfully returns this and only this replica +// - The Replica is not being removed as seen by its Replica.mu.destroyStatus +// - The RangeTombstone in storage does not see this replica as removed +// +// If getOrCreateReplica returns roachpb.RaftGroupDeletedError, the guarantee is: +// +// - getOrCreateReplica will never return this replica +// - Store.GetReplica(rangeID) can now only return replicas with higher IDs +// - The RangeTombstone in storage does see this replica as removed +// +// The caller must not hold the store's lock. func (s *Store) getOrCreateReplica( ctx context.Context, rangeID roachpb.RangeID, From 269de538048dc622df4b65f9df16f71d265902ac Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Fri, 23 Dec 2022 12:36:27 +0000 Subject: [PATCH 2/3] kvserver: note RangeTombstone in destroyStatus Epic: none --- pkg/kv/kvserver/replica.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index 971b649a97b3..f7b41e1f67df 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -339,6 +339,9 @@ type Replica struct { // scheduled for destruction or has been GCed. // destroyStatus should only be set while also holding the raftMu and // readOnlyCmdMu. + // + // When this replica is being removed, the destroyStatus is updated and + // RangeTombstone is written in the same raftMu critical section. destroyStatus // Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce // whenever a Raft operation is performed. From 0703807d018fb1c0f5352e8d83315b463c90d1e9 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Wed, 21 Dec 2022 14:45:44 +0000 Subject: [PATCH 3/3] kvserver: rm in-memory replica tombstone tracking The invariants guaranteed by getOrCreateReplica make it unnecessary to track tombstoneMinReplicaID for the split trigger purposes. This field was introduced to guard against Replica changing its ID backwards, however since then Replica no longer can change its ID in the first place. Invariants of the Store (at most one replica for rangeID), in combination with the fact that replicaID is immutable, and with the fact that both in-memory replica ID, and replica tombstone in storage, are always consulted before creating a Replica, guarantee that active replica ID never regresses. Release note: None Epic: CRDB-220 --- pkg/kv/kvserver/replica.go | 25 ++++++------------------- pkg/kv/kvserver/replica_destroy.go | 5 +---- pkg/kv/kvserver/store_create_replica.go | 1 - pkg/kv/kvserver/store_split.go | 2 +- 4 files changed, 8 insertions(+), 25 deletions(-) diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index f7b41e1f67df..7bc906f85073 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -497,9 +497,6 @@ type Replica struct { applyingEntries bool // The replica's Raft group "node". internalRaftGroup *raft.RawNode - // The minimum allowed ID for this replica. Initialized from - // RangeTombstone.NextReplicaID. - tombstoneMinReplicaID roachpb.ReplicaID // The ID of the leader replica within the Raft group. Used to determine // when the leadership changes. @@ -1736,8 +1733,7 @@ func (r *Replica) shouldWaitForPendingMergeRLocked( // have been removed from this store after the split. // // TODO(tbg): the below is true as of 22.2: we persist any Replica's ReplicaID -// under RaftReplicaIDKey, so the below caveats should be addressed now and we -// should be able to simplify isNewerThanSplit to just compare replicaIDs. +// under RaftReplicaIDKey, so the below caveats should be addressed now. // // TODO(ajwerner): There is one false negative where false will be returned but // the hard state may be due to a newer replica which is outlined below. It @@ -1765,21 +1761,12 @@ func (r *Replica) shouldWaitForPendingMergeRLocked( // // See TestProcessSplitAfterRightHandSideHasBeenRemoved. func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool { - r.mu.RLock() - defer r.mu.RUnlock() - return r.isNewerThanSplitRLocked(split) -} - -func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool { rightDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID()) - // If we have written a tombstone for this range then we know that the RHS - // must have already been removed at the split replica ID. - return r.mu.tombstoneMinReplicaID != 0 || - // If the first raft message we received for the RHS range was for a replica - // ID which is above the replica ID of the split then we would not have - // written a tombstone but we will have a replica ID that will exceed the - // split replica ID. - r.replicaID > rightDesc.ReplicaID + // If the first raft message we received for the RHS range was for a replica + // ID which is above the replica ID of the split then we would not have + // written a tombstone but we will have a replica ID that will exceed the + // split replica ID. + return r.replicaID > rightDesc.ReplicaID } // WatchForMerge is like maybeWatchForMergeLocked, except it expects a merge to diff --git a/pkg/kv/kvserver/replica_destroy.go b/pkg/kv/kvserver/replica_destroy.go index ba9518b08d3f..a4c65b9a4780 100644 --- a/pkg/kv/kvserver/replica_destroy.go +++ b/pkg/kv/kvserver/replica_destroy.go @@ -227,12 +227,9 @@ func (r *Replica) setTombstoneKey( ) error { r.mu.Lock() nextReplicaID := r.mu.state.Desc.NextReplicaID - if nextReplicaID < externalNextReplicaID { + if externalNextReplicaID > nextReplicaID { nextReplicaID = externalNextReplicaID } - if nextReplicaID > r.mu.tombstoneMinReplicaID { - r.mu.tombstoneMinReplicaID = nextReplicaID - } r.mu.Unlock() return writeTombstoneKey(ctx, writer, r.RangeID, nextReplicaID) } diff --git a/pkg/kv/kvserver/store_create_replica.go b/pkg/kv/kvserver/store_create_replica.go index 9a849d43c2d6..aa558acde9c5 100644 --- a/pkg/kv/kvserver/store_create_replica.go +++ b/pkg/kv/kvserver/store_create_replica.go @@ -194,7 +194,6 @@ func (s *Store) tryGetOrCreateReplica( // replica even outside of raft processing. Have to do this after grabbing // Store.mu to maintain lock ordering invariant. repl.mu.Lock() - repl.mu.tombstoneMinReplicaID = tombstone.NextReplicaID // NB: A Replica should never be in the store's replicas map with a nil // descriptor. Assign it directly here. In the case that the Replica should diff --git a/pkg/kv/kvserver/store_split.go b/pkg/kv/kvserver/store_split.go index 395fe567d4b3..b8c80b8f8059 100644 --- a/pkg/kv/kvserver/store_split.go +++ b/pkg/kv/kvserver/store_split.go @@ -245,7 +245,7 @@ func prepareRightReplicaForSplit( // If we know that the RHS has already been removed at this replica ID // then we also know that its data has already been removed by the preApply // so we skip initializing it as the RHS of the split. - if rightRepl.isNewerThanSplitRLocked(split) { + if rightRepl.isNewerThanSplit(split) { return nil }