-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
kvserver: clean up replica unquiescence #105041
Changes from 7 commits
b798cc0
f032313
391647c
69089a8
151fd45
fdbef91
50d7264
3dd1a37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,17 +52,27 @@ func (r *Replica) quiesceLocked(ctx context.Context, lagging laggingReplicaSet) | |
} | ||
} | ||
|
||
func (r *Replica) maybeUnquiesce() bool { | ||
// maybeUnquiesce unquiesces the replica if it is quiesced and can be | ||
// unquiesced, returning true in that case. See maybeUnquiesceLocked() for | ||
// details. | ||
func (r *Replica) maybeUnquiesce(wakeLeader, mayCampaign bool) bool { | ||
r.mu.Lock() | ||
defer r.mu.Unlock() | ||
return r.maybeUnquiesceLocked() | ||
return r.maybeUnquiesceLocked(wakeLeader, mayCampaign) | ||
} | ||
|
||
func (r *Replica) maybeUnquiesceLocked() bool { | ||
return r.maybeUnquiesceWithOptionsLocked(true /* campaignOnWake */) | ||
} | ||
|
||
func (r *Replica) maybeUnquiesceWithOptionsLocked(campaignOnWake bool) bool { | ||
// maybeUnquiesceLocked unquiesces the replica if it is quiesced and can be | ||
// unquiesced, returning true in that case. | ||
// | ||
// If wakeLeader is true, wake the leader by proposing an empty command. Should | ||
// typically be true, unless e.g. the caller is either about to propose a | ||
// command anyway, or it knows the leader is awake because it received a message | ||
// from it. | ||
// | ||
// If mayCampaign is true, the replica may campaign if appropriate. This will | ||
// respect PreVote and CheckQuorum, and thus won't disrupt a current leader. | ||
// Should typically be true, unless the caller wants to avoid election ties. | ||
func (r *Replica) maybeUnquiesceLocked(wakeLeader, mayCampaign bool) bool { | ||
if !r.canUnquiesceRLocked() { | ||
return false | ||
} | ||
|
@@ -75,34 +85,29 @@ func (r *Replica) maybeUnquiesceWithOptionsLocked(campaignOnWake bool) bool { | |
r.store.unquiescedReplicas.Lock() | ||
r.store.unquiescedReplicas.m[r.RangeID] = struct{}{} | ||
r.store.unquiescedReplicas.Unlock() | ||
if campaignOnWake { | ||
r.maybeCampaignOnWakeLocked(ctx) | ||
} | ||
// NB: we know there's a non-nil RaftStatus because internalRaftGroup isn't nil. | ||
r.mu.lastUpdateTimes.updateOnUnquiesce( | ||
r.mu.state.Desc.Replicas().Descriptors(), r.raftSparseStatusRLocked().Progress, timeutil.Now(), | ||
) | ||
return true | ||
} | ||
|
||
func (r *Replica) maybeUnquiesceAndWakeLeaderLocked() bool { | ||
if !r.canUnquiesceRLocked() { | ||
return false | ||
st := r.raftSparseStatusRLocked() | ||
if st.RaftState == raft.StateLeader { | ||
r.mu.lastUpdateTimes.updateOnUnquiesce( | ||
r.mu.state.Desc.Replicas().Descriptors(), st.Progress, timeutil.Now()) | ||
|
||
} else if st.RaftState == raft.StateFollower && wakeLeader { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not in … [remainder of the question, including its inline code, was lost in extraction] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Correct, a candidate has no leader, and won't send proposals. |
||
// Propose an empty command which will wake the leader. | ||
if log.V(3) { | ||
log.Infof(ctx, "waking %d leader", r.RangeID) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about "r%d" instead of "%d"? It's easier to search by "rNNN" in logs when investigating, and this avoids false positives. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was following the convention of the existing log messages here, didn't particularly feel like updating a bunch of log messages. But I can do a pass if there aren't that many to update. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tacked on a commit. |
||
} | ||
data := raftlog.EncodeRaftCommand(raftlog.EntryEncodingStandardWithoutAC, makeIDKey(), nil) | ||
_ = r.mu.internalRaftGroup.Propose(data) | ||
r.mu.lastProposalAtTicks = r.mu.ticks // delay imminent quiescence | ||
} | ||
ctx := r.AnnotateCtx(context.TODO()) | ||
if log.V(3) { | ||
log.Infof(ctx, "unquiescing %d: waking leader", r.RangeID) | ||
|
||
// NB: campaign after attempting to wake leader, since we won't send the | ||
// proposal in candidate state. This gives it a chance to assert leadership if | ||
// we're wrong about it being dead. | ||
if mayCampaign { | ||
r.maybeCampaignOnWakeLocked(ctx) | ||
} | ||
r.mu.quiescent = false | ||
r.mu.laggingFollowersOnQuiesce = nil | ||
r.store.unquiescedReplicas.Lock() | ||
r.store.unquiescedReplicas.m[r.RangeID] = struct{}{} | ||
r.store.unquiescedReplicas.Unlock() | ||
r.maybeCampaignOnWakeLocked(ctx) | ||
// Propose an empty command which will wake the leader. | ||
data := raftlog.EncodeRaftCommand(raftlog.EntryEncodingStandardWithoutAC, makeIDKey(), nil) | ||
_ = r.mu.internalRaftGroup.Propose(data) | ||
r.mu.lastProposalAtTicks = r.mu.ticks // delay imminent quiescence | ||
|
||
return true | ||
} | ||
|
||
|
@@ -184,14 +189,6 @@ func (r *Replica) canUnquiesceRLocked() bool { | |
// are behind, whether or not they are live. If any entry in the livenessMap is | ||
// nil, then the missing node ID is treated as live and will prevent the range | ||
// from quiescing. | ||
// | ||
// TODO(peter): There remains a scenario in which a follower is left unquiesced | ||
// while the leader is quiesced: the follower's receive queue is full and the | ||
// "quiesce" message is dropped. This seems very very unlikely because if the | ||
// follower isn't keeping up with raft messages it is unlikely that the leader | ||
// would quiesce. The fallout from this situation are undesirable raft | ||
// elections which will cause throughput hiccups to the range, but not | ||
// correctness issues. | ||
func (r *Replica) maybeQuiesceRaftMuLockedReplicaMuLocked( | ||
ctx context.Context, leaseStatus kvserverpb.LeaseStatus, livenessMap livenesspb.IsLiveMap, | ||
) bool { | ||
|
@@ -468,13 +465,12 @@ func (r *Replica) quiesceAndNotifyRaftMuLockedReplicaMuLocked( | |
if roachpb.ReplicaID(id) == r.replicaID { | ||
continue | ||
} | ||
toReplica, toErr := r.getReplicaDescriptorByIDRLocked( | ||
roachpb.ReplicaID(id), lastFromReplica) | ||
toReplica, toErr := r.getReplicaDescriptorByIDRLocked(roachpb.ReplicaID(id), lastFromReplica) | ||
if toErr != nil { | ||
if log.V(4) { | ||
log.Infof(ctx, "failed to quiesce: cannot find to replica (%d)", id) | ||
} | ||
r.maybeUnquiesceLocked() | ||
r.maybeUnquiesceLocked(false /* wakeLeader */, false /* mayCampaign */) // already leader | ||
return false | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -705,7 +705,7 @@ func (s *Store) nodeIsLiveCallback(l livenesspb.Liveness) { | |
lagging := r.mu.laggingFollowersOnQuiesce | ||
r.mu.RUnlock() | ||
if quiescent && lagging.MemberStale(l) { | ||
r.maybeUnquiesce() | ||
r.maybeUnquiesce(false /* wakeLeader */, false /* mayCampaign */) // already leader | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we know we're already leader here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because … [remainder of the reply was lost in extraction] |
||
} | ||
}) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The semantics of `MaybeUnquiesce` changed. Were there any callers who wanted the old behaviour (wakeLeader=mayCampaign=false)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The tests don't care -- explained in the commit message:
In case it wasn't clear, this is a test-only file, `helpers_test.go`.