Skip to content

Commit

Permalink
Merge pull request #69801 from aayushshah15/backport20.2-69696
Browse files Browse the repository at this point in the history
release-20.2: kvserver: stop transferring leases to replicas that may need snapshots
  • Loading branch information
aayushshah15 authored Sep 3, 2021
2 parents 319f7ce + 7004f6e commit a16ddf2
Show file tree
Hide file tree
Showing 3 changed files with 275 additions and 29 deletions.
91 changes: 83 additions & 8 deletions pkg/kv/kvserver/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,11 @@ func (a *Allocator) TransferLeaseTarget(
ctx context.Context,
zone *zonepb.ZoneConfig,
existing []roachpb.ReplicaDescriptor,
leaseStoreID roachpb.StoreID,
leaseRepl interface {
RaftStatus() *raft.Status
StoreID() roachpb.StoreID
GetRangeID() roachpb.RangeID
},
stats *replicaStats,
checkTransferLeaseSource bool,
checkCandidateFullness bool,
Expand Down Expand Up @@ -801,7 +805,7 @@ func (a *Allocator) TransferLeaseTarget(
}
sl = makeStoreList(filteredDescs)

source, ok := a.storePool.getStoreDescriptor(leaseStoreID)
source, ok := a.storePool.getStoreDescriptor(leaseRepl.StoreID())
if !ok {
return roachpb.ReplicaDescriptor{}
}
Expand All @@ -819,14 +823,14 @@ func (a *Allocator) TransferLeaseTarget(
// it's too big a change to make right before a major release.
var candidates []roachpb.ReplicaDescriptor
for _, repl := range existing {
if repl.StoreID != leaseStoreID {
if repl.StoreID != leaseRepl.StoreID() {
candidates = append(candidates, repl)
}
}
preferred = a.preferredLeaseholders(zone, candidates)
}
if len(preferred) == 1 {
if preferred[0].StoreID == leaseStoreID {
if preferred[0].StoreID == leaseRepl.StoreID() {
return roachpb.ReplicaDescriptor{}
}
// Verify that the preferred replica is eligible to receive the lease.
Expand All @@ -839,17 +843,36 @@ func (a *Allocator) TransferLeaseTarget(
// If the current leaseholder is not preferred, set checkTransferLeaseSource
// to false to motivate the below logic to transfer the lease.
existing = preferred
if !storeHasReplica(leaseStoreID, preferred) {
if !storeHasReplica(leaseRepl.StoreID(), preferred) {
checkTransferLeaseSource = false
}
}

// Only consider live, non-draining replicas.
existing, _ = a.storePool.liveAndDeadReplicas(existing)

// Only proceed with the lease transfer if we are also the raft leader (we
// already know we are the leaseholder at this point), and only consider
// replicas that are in `StateReplicate` as potential candidates.
//
// NB: The RaftStatus() only returns a non-empty and non-nil result on the
// Raft leader (since Raft followers do not track the progress of other
// replicas, only the leader does).
//
// NB: On every Raft tick, we try to ensure that leadership is collocated with
// leaseholdership (see
// Replica.maybeTransferRaftLeadershipToLeaseholderLocked()). This means that
// on a range that is not already borked (i.e. can accept writes), periods of
// leader/leaseholder misalignment should be ephemeral and rare. We choose to
// be pessimistic here and choose to bail on the lease transfer, as opposed to
// potentially transferring the lease to a replica that may be waiting for a
// snapshot (which will wedge the range until the replica applies that
// snapshot).
existing = excludeReplicasInNeedOfSnapshots(ctx, leaseRepl.RaftStatus(), existing)

// Short-circuit if there are no valid targets out there.
if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == leaseStoreID) {
log.VEventf(ctx, 2, "no lease transfer target found")
if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == leaseRepl.StoreID()) {
log.VEventf(ctx, 2, "no lease transfer target found for r%d", leaseRepl.GetRangeID())
return roachpb.ReplicaDescriptor{}
}

Expand Down Expand Up @@ -886,7 +909,7 @@ func (a *Allocator) TransferLeaseTarget(
var bestOption roachpb.ReplicaDescriptor
bestOptionLeaseCount := int32(math.MaxInt32)
for _, repl := range existing {
if leaseStoreID == repl.StoreID {
if leaseRepl.StoreID() == repl.StoreID {
continue
}
storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID)
Expand Down Expand Up @@ -1294,6 +1317,58 @@ func replicaIsBehind(raftStatus *raft.Status, replicaID roachpb.ReplicaID) bool
return true
}

// replicaMayNeedSnapshot determines whether the replica referred to by
// `replicaID` may be in need of a raft snapshot. If this function is called
// with an empty or nil `raftStatus` (as will be the case when its called by a
// replica that is not the raft leader), we pessimistically assume that
// `replicaID` may need a snapshot.
func replicaMayNeedSnapshot(raftStatus *raft.Status, replicaID roachpb.ReplicaID) bool {
if raftStatus == nil || len(raftStatus.Progress) == 0 {
return true
}
if progress, ok := raftStatus.Progress[uint64(replicaID)]; ok {
// We can only reasonably assume that the follower replica is not in need of
// a snapshot iff it is in `StateReplicate`. However, even this is racey
// because we can still possibly have an ill-timed log truncation between
// when we make this determination and when we act on it.
return progress.State != tracker.StateReplicate
}
return true
}

// excludeReplicasInNeedOfSnapshots filters out the `replicas` that may be in
// need of a raft snapshot. If this function is called with the `raftStatus` of
// a non-raft leader replica, an empty slice is returned.
func excludeReplicasInNeedOfSnapshots(
ctx context.Context, raftStatus *raft.Status, replicas []roachpb.ReplicaDescriptor,
) []roachpb.ReplicaDescriptor {
if raftStatus == nil || len(raftStatus.Progress) == 0 {
log.VEventf(
ctx,
5,
"raft leader not collocated with the leaseholder; will not produce any lease transfer targets",
)
return []roachpb.ReplicaDescriptor{}
}

filled := 0
for _, repl := range replicas {
if replicaMayNeedSnapshot(raftStatus, repl.ReplicaID) {
log.VEventf(
ctx,
5,
"not considering [n%d, s%d] as a potential candidate for a lease transfer"+
" because the replica may be waiting for a snapshot",
repl.NodeID, repl.StoreID,
)
continue
}
replicas[filled] = repl
filled++
}
return replicas[:filled]
}

// simulateFilterUnremovableReplicas removes any unremovable replicas from the
// supplied slice. Unlike filterUnremovableReplicas, brandNewReplicaID is
// considered up-to-date (and thus can participate in quorum), but is not
Expand Down
Loading

0 comments on commit a16ddf2

Please sign in to comment.