storage: Maintain a separate set of unquiesced replicas
This means that idle replicas no longer have a per-tick CPU cost,
which is one of the bottlenecks limiting the amount of data we can
handle per store.

Fixes cockroachdb#17609

Release note (performance improvement): Reduced CPU overhead of idle
ranges
bdarnell committed Apr 20, 2018
1 parent 26e216e commit 646debb
Showing 2 changed files with 12 additions and 11 deletions.
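To illustrate the idea in this commit: the store keeps a second map alongside its full replica map, holding only the replicas that are not quiesced, and the tick loop ranges over that smaller set. The sketch below is a simplified standalone model, not the CockroachDB code; the names Store, Replica, and unquiescedReplicas mirror the diff, while RangeID, addReplica, quiesce, unquiesce, and tickAll are invented for the example, and a plain sync.Map stands in for syncutil.IntMap.

package main

import (
	"fmt"
	"sync"
)

// RangeID identifies a range, as in the diff's roachpb.RangeID.
type RangeID int64

type Replica struct {
	id        RangeID
	quiescent bool
	mu        sync.Mutex
}

// Store tracks all replicas in one map and the unquiesced subset in a
// second map, so the tick loop only visits replicas that need Raft ticks.
type Store struct {
	replicas           sync.Map // RangeID -> *Replica (all replicas)
	unquiescedReplicas sync.Map // RangeID -> *Replica (subset needing ticks)
}

func (s *Store) addReplica(r *Replica) {
	s.replicas.Store(r.id, r)
	// New replicas start unquiesced, mirroring addReplicaToRangeMapLocked.
	s.unquiescedReplicas.Store(r.id, r)
}

func (r *Replica) quiesce(s *Store) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if !r.quiescent {
		r.quiescent = true
		// Leaving the unquiesced set removes the per-tick cost.
		s.unquiescedReplicas.Delete(r.id)
	}
}

func (r *Replica) unquiesce(s *Store) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.quiescent {
		r.quiescent = false
		s.unquiescedReplicas.Store(r.id, r)
	}
}

// tickAll models one iteration of raftTickLoop: only the unquiesced
// subset is visited, regardless of how many replicas the store holds.
func (s *Store) tickAll() []RangeID {
	var rangeIDs []RangeID
	s.unquiescedReplicas.Range(func(k, _ interface{}) bool {
		rangeIDs = append(rangeIDs, k.(RangeID))
		return true
	})
	return rangeIDs
}

func main() {
	s := &Store{}
	for i := 1; i <= 5; i++ {
		s.addReplica(&Replica{id: RangeID(i)})
	}
	// Quiesce all but one replica; the tick loop now sees a single range.
	for i := 1; i <= 4; i++ {
		if v, ok := s.replicas.Load(RangeID(i)); ok {
			v.(*Replica).quiesce(s)
		}
	}
	fmt.Println("ranges to tick:", s.tickAll()) // e.g. [5]
}
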
pkg/storage/replica.go: 3 additions & 0 deletions
@@ -3388,6 +3388,7 @@ func (r *Replica) quiesceLocked() bool {
log.Infof(ctx, "quiescing")
}
r.mu.quiescent = true
+r.store.unquiescedReplicas.Delete(int64(r.RangeID))
} else if log.V(4) {
log.Infof(ctx, "already quiesced")
}
@@ -3401,6 +3402,7 @@ func (r *Replica) unquiesceLocked() {
log.Infof(ctx, "unquiescing")
}
r.mu.quiescent = false
+r.store.unquiescedReplicas.Store(int64(r.RangeID), unsafe.Pointer(r))
r.maybeCampaignOnWakeLocked(ctx)
r.refreshLastUpdateTimeForAllReplicasLocked()
}
@@ -4972,6 +4974,7 @@ func (r *Replica) acquireSplitLock(
rightRng.mu.destroyStatus.Set(errors.Errorf("%s: failed to initialize", rightRng), destroyReasonRemoved)
rightRng.mu.Unlock()
r.store.mu.Lock()
+r.store.unquiescedReplicas.Delete(int64(rightRng.RangeID))
r.store.mu.replicas.Delete(int64(rightRng.RangeID))
delete(r.store.mu.uninitReplicas, rightRng.RangeID)
r.store.replicaQueues.Delete(int64(rightRng.RangeID))
pkg/storage/store.go: 9 additions & 11 deletions
@@ -508,6 +508,9 @@ type Store struct {
replicaPlaceholders map[roachpb.RangeID]*ReplicaPlaceholder
}

+// The unquiesced subset of mu.replicas.
+unquiescedReplicas syncutil.IntMap

// replicaQueues is a map of per-Replica incoming request queues. These
// queues might more naturally belong in Replica, but are kept separate to
// avoid reworking the locking in getOrCreateReplica which requires
@@ -2092,6 +2095,7 @@ func (s *Store) SplitRange(ctx context.Context, origRng, newRng *Replica) error
log.Fatalf(ctx, "found unexpected uninitialized replica: %s vs %s", exRng, newRng)
}
delete(s.mu.uninitReplicas, newDesc.RangeID)
+s.unquiescedReplicas.Delete(int64(newDesc.RangeID))
s.mu.replicas.Delete(int64(newDesc.RangeID))
s.replicaQueues.Delete(int64(newDesc.RangeID))
}
@@ -2301,6 +2305,7 @@ func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
if _, loaded := s.mu.replicas.LoadOrStore(int64(repl.RangeID), unsafe.Pointer(repl)); loaded {
return errors.Errorf("%s: replica already exists", repl)
}
+s.unquiescedReplicas.Store(int64(repl.RangeID), unsafe.Pointer(repl))
return nil
}

@@ -2398,6 +2403,7 @@ func (s *Store) removeReplicaImpl(

s.mu.Lock()
defer s.mu.Unlock()
+s.unquiescedReplicas.Delete(int64(rep.RangeID))
s.mu.replicas.Delete(int64(rep.RangeID))
delete(s.mu.uninitReplicas, rep.RangeID)
s.replicaQueues.Delete(int64(rep.RangeID))
@@ -3539,23 +3545,14 @@ func (s *Store) raftTickLoop(ctx context.Context) {
case <-ticker.C:
rangeIDs = rangeIDs[:0]

-s.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool {
-// Fast-path handling of quiesced replicas. This avoids the overhead of
-// queueing the replica on the Raft scheduler. This overhead is
-// significant and there is overhead to filling the Raft scheduler with
-// replicas to tick. A node with 3TB of disk might contain 50k+
-// replicas. Filling the Raft scheduler with all of those replicas
-// every tick interval can starve other Raft processing of cycles.
-//
+s.unquiescedReplicas.Range(func(k int64, _ unsafe.Pointer) bool {
// Why do we bother to ever queue a Replica on the Raft scheduler for
// tick processing? Couldn't we just call Replica.tick() here? Yes, but
// then a single bad/slow Replica can disrupt tick processing for every
// Replica on the store which cascades into Raft elections and more
// disruption. Replica.maybeTickQuiesced only grabs short-duration
// locks and not locks that are held during disk I/O.
-if !(*Replica)(v).needsTick() {
-rangeIDs = append(rangeIDs, roachpb.RangeID(k))
-}
+rangeIDs = append(rangeIDs, roachpb.RangeID(k))
return true
})

@@ -3799,6 +3796,7 @@ func (s *Store) tryGetOrCreateReplica(
repl.mu.destroyStatus.Set(errors.Wrapf(err, "%s: failed to initialize", repl), destroyReasonRemoved)
repl.mu.Unlock()
s.mu.Lock()
+s.unquiescedReplicas.Delete(int64(rangeID))
s.mu.replicas.Delete(int64(rangeID))
delete(s.mu.uninitReplicas, rangeID)
s.replicaQueues.Delete(int64(rangeID))
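The comment retained in raftTickLoop explains why the loop still only collects range IDs and hands them to the Raft scheduler rather than calling Replica.tick() inline: a single bad or slow replica must not stall tick processing for every other replica on the store. The sketch below is a generic illustration of that enqueue-and-dispatch pattern, not the CockroachDB Raft scheduler; the tickScheduler type, its channel-based queue, the worker count, and the tick function are all invented for the example.

package main

import (
	"fmt"
	"sync"
	"time"
)

type RangeID int64

// tickScheduler is a toy stand-in for a Raft scheduler: the tick loop
// only enqueues range IDs, and a fixed pool of workers performs the
// (possibly slow) per-replica work. A stalled replica occupies one
// worker instead of blocking the loop that feeds the queue.
type tickScheduler struct {
	queue chan RangeID
	wg    sync.WaitGroup
}

func newTickScheduler(workers int, tick func(RangeID)) *tickScheduler {
	s := &tickScheduler{queue: make(chan RangeID, 1024)}
	for i := 0; i < workers; i++ {
		s.wg.Add(1)
		go func() {
			defer s.wg.Done()
			for id := range s.queue {
				tick(id)
			}
		}()
	}
	return s
}

// enqueue is what a tick loop would call for each unquiesced range ID.
func (s *tickScheduler) enqueue(ids ...RangeID) {
	for _, id := range ids {
		s.queue <- id
	}
}

func (s *tickScheduler) stop() {
	close(s.queue)
	s.wg.Wait()
}

func main() {
	tick := func(id RangeID) {
		if id == 2 {
			time.Sleep(50 * time.Millisecond) // simulate one slow replica
		}
		fmt.Println("ticked range", id)
	}
	s := newTickScheduler(4, tick)
	s.enqueue(1, 2, 3, 4, 5) // the slow range 2 does not delay 3, 4, 5
	s.stop()
}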
