compact: add shared compaction pool for multiple stores
This change adds a new compaction pool that enforces a global maximum
compaction concurrency in a multi-store configuration. Each Pebble store
(i.e. an instance of *DB) still maintains its own per-store compaction
concurrency, controlled by `opts.MaxConcurrentCompactions`. However, in a
multi-store configuration, disk I/O is a per-store resource while CPU is
shared across stores. A significant portion of compaction work is
CPU-intensive, so the shared pool ensures that excessive compactions don't
take CPU away from foreground work even if the disks could absorb the
additional compaction throughput.
anish-shanbhag committed Aug 23, 2024
1 parent 94561af commit ed2ae1e
Showing 7 changed files with 156 additions and 57 deletions.
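
To make the intended usage concrete, here is a minimal sketch of a multi-store setup sharing one pool, based on the NewCompactionPool constructor and Options.CompactionPool field added in this commit; the store directories and concurrency limits are made up for illustration.

    package main

    import (
        "log"

        "github.com/cockroachdb/pebble"
    )

    func main() {
        // One pool shared by all stores: at most 8 compactions run at once
        // across the whole process, regardless of how many stores there are.
        pool := pebble.NewCompactionPool(8)

        openStore := func(dir string) *pebble.DB {
            db, err := pebble.Open(dir, &pebble.Options{
                // The per-store limit still applies underneath the global cap.
                MaxConcurrentCompactions: func() int { return 4 },
                CompactionPool:           pool,
            })
            if err != nil {
                log.Fatal(err)
            }
            return db
        }

        // Two stores on separate disks contend for the same CPUs, so they
        // share the compaction pool.
        db1 := openStore("/mnt/store1/pebble")
        db2 := openStore("/mnt/store2/pebble")
        defer db1.Close()
        defer db2.Close()
    }

With this wiring, each store may still run up to 4 compactions against its own disk, but the pool never lets more than 8 run process-wide.
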
176 changes: 130 additions & 46 deletions compaction.go
@@ -9,8 +9,10 @@ import (
"context"
"fmt"
"math"
"runtime"
"runtime/pprof"
"slices"
"sync"
"sync/atomic"
"time"

@@ -1644,21 +1646,10 @@ func (d *DB) maybeScheduleCompactionAsync() {
d.mu.Unlock()
}

// maybeScheduleCompaction schedules a compaction if necessary.
//
// d.mu must be held when calling this.
func (d *DB) maybeScheduleCompaction() {
d.maybeScheduleCompactionPicker(pickAuto)
}

func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction {
return picker.pickAuto(env)
}

func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction {
return picker.pickElisionOnlyCompaction(env)
}

// tryScheduleDownloadCompaction tries to start a download compaction.
//
// Returns true if we started a download compaction (or completed it
@@ -1683,27 +1674,15 @@ func (d *DB) tryScheduleDownloadCompaction(env compactionEnv, maxConcurrentDownl
return false
}

// maybeScheduleCompactionPicker schedules a compaction if necessary,
// calling `pickFunc` to pick automatic compactions.
// withCompactionEnv runs the specified function after initializing the
// compaction picking environment. If the DB is read-only or has already been
// closed, the function will not be run.
//
// Requires d.mu to be held.
func (d *DB) maybeScheduleCompactionPicker(
pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
) {
func (d *DB) withCompactionEnv(f func(env compactionEnv)) {
if d.closed.Load() != nil || d.opts.ReadOnly {
return
}
maxCompactions := d.opts.MaxConcurrentCompactions()
maxDownloads := d.opts.MaxConcurrentDownloads()

if d.mu.compact.compactingCount >= maxCompactions &&
(len(d.mu.compact.downloads) == 0 || d.mu.compact.downloadingCount >= maxDownloads) {
if len(d.mu.compact.manual) > 0 {
// Inability to run head blocks later manual compactions.
d.mu.compact.manual[0].retries++
}
return
}

// Compaction picking needs a coherent view of a Version. In particular, we
// need to exclude concurrent ingestions from making a decision on which level
@@ -1722,9 +1701,102 @@ func (d *DB) maybeScheduleCompactionPicker(
diskAvailBytes: d.diskAvailBytes.Load(),
earliestSnapshotSeqNum: d.mu.snapshots.earliest(),
earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(),
inProgressCompactions: d.getInProgressCompactionInfoLocked(nil),
readCompactionEnv: readCompactionEnv{
readCompactions: &d.mu.compact.readCompactions,
flushing: d.mu.compact.flushing || d.passedFlushThreshold(),
rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction,
},
}

f(env)
}

// compactionPool enforces a global max compaction concurrency in a multi-store
// configuration. If multiple DBs are waiting to perform a compaction, it
// prioritizes the DB with the highest compaction score across levels.
type compactionPool struct {
mu sync.Mutex
compactingCount int
waiting map[*DB]struct{}
maxCompactionConcurrency int
}

// NewCompactionPool creates a new compactionPool with the specified
// maxCompactionConcurrency.
func NewCompactionPool(maxCompactionConcurrency int) *compactionPool {
if maxCompactionConcurrency <= 0 {
panic("pebble: maxCompactionConcurrency for a CompactionPool must be greater than 0")
}
return &compactionPool{
maxCompactionConcurrency: maxCompactionConcurrency,
waiting: make(map[*DB]struct{}),
}
}

var defaultCompactionPool = NewCompactionPool(runtime.GOMAXPROCS(0) * 2)

// maybeScheduleWaitingCompactionLocked attempts to schedule a waiting
// compaction from the list of waiting DBs. It prioritizes the DB with the
// highest compaction score across all levels. If no DBs have a compaction
// score above the threshold, it effectively picks a DB at random.
//
// c.mu must be held. DB.mu must not be held for any DB.
func (c *compactionPool) maybeScheduleWaitingCompactionLocked() {
if len(c.waiting) == 0 || c.compactingCount >= c.maxCompactionConcurrency {
return
}

// NB: highestScore starts at compactionScoreThreshold (currently 1) so that
// we effectively have no preference between two DBs that don't have any
// level with a score above the threshold.
highestScore := float64(compactionScoreThreshold)
var pickedDB *DB
for d := range c.waiting {
if len(c.waiting) == 1 {
pickedDB = d
// No need to calculate scores if only one DB is waiting.
break
}
d.mu.Lock()
inProgress := d.getInProgressCompactionInfoLocked(nil)
scores := d.mu.versions.picker.getScores(inProgress)
if pickedDB == nil || scores[0] >= highestScore {
highestScore = scores[0]
pickedDB = d
}
d.mu.Unlock()
}

pickedDB.mu.Lock()
if !pickedDB.tryScheduleAutoCompaction() {
// If we can't schedule a compaction for this DB right now, mark it as
// no longer waiting.
delete(c.waiting, pickedDB)
}
pickedDB.mu.Unlock()

c.maybeScheduleWaitingCompactionLocked()
}

// maybeScheduleCompaction schedules a compaction if necessary.
//
// Requires d.mu to be held.
func (d *DB) maybeScheduleCompaction() {
d.withCompactionEnv(func(env compactionEnv) {
maxDownloads := d.opts.MaxConcurrentDownloads()
for len(d.mu.compact.downloads) > 0 && d.mu.compact.downloadingCount < maxDownloads &&
d.tryScheduleDownloadCompaction(env, maxDownloads) {
}

maxCompactions := d.opts.MaxConcurrentCompactions()
if d.mu.compact.compactingCount >= maxCompactions {
if len(d.mu.compact.manual) > 0 {
// Inability to run head blocks later manual compactions.
d.mu.compact.manual[0].retries++
}
return
}

if d.mu.compact.compactingCount < maxCompactions {
// Check for delete-only compactions first, because they're expected to be
// cheap and reduce future compaction work.
if !d.opts.private.disableDeleteOnlyCompactions &&
@@ -1741,14 +1813,19 @@ func (d *DB) maybeScheduleCompactionPicker(
}
d.mu.compact.manual = d.mu.compact.manual[1:]
}
})

for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxCompactions &&
d.tryScheduleAutoCompaction(env, pickFunc) {
}
}

for len(d.mu.compact.downloads) > 0 && d.mu.compact.downloadingCount < maxDownloads &&
d.tryScheduleDownloadCompaction(env, maxDownloads) {
if !d.opts.DisableAutomaticCompactions {
// NB: we must release d.mu to avoid deadlock when calling
// maybeScheduleWaitingCompactionLocked below.
d.mu.Unlock()
d.compactionPool.mu.Lock()
// Mark this DB as waiting for an automatic compaction to
// be scheduled.
d.compactionPool.waiting[d] = struct{}{}
d.compactionPool.maybeScheduleWaitingCompactionLocked()
d.compactionPool.mu.Unlock()
d.mu.Lock()
}
}

@@ -1801,24 +1878,31 @@ func (d *DB) tryScheduleManualCompaction(env compactionEnv, manual *manualCompac
// Returns false if no automatic compactions are necessary or able to run at
// this time.
//
// Requires d.mu to be held.
func (d *DB) tryScheduleAutoCompaction(
env compactionEnv, pickFunc func(compactionPicker, compactionEnv) *pickedCompaction,
) bool {
env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
env.readCompactionEnv = readCompactionEnv{
readCompactions: &d.mu.compact.readCompactions,
flushing: d.mu.compact.flushing || d.passedFlushThreshold(),
rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction,
// Requires d.mu and d.compactionPool.mu to be held.
func (d *DB) tryScheduleAutoCompaction() bool {
if d.mu.compact.compactingCount >= d.opts.MaxConcurrentCompactions() {
return false
}
pc := pickFunc(d.mu.versions.picker, env)

var pc *pickedCompaction
d.withCompactionEnv(func(env compactionEnv) {
pc = pickAuto(d.mu.versions.picker, env)
})

if pc == nil {
return false
}
c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider())
d.mu.compact.compactingCount++
d.compactionPool.compactingCount++
d.addInProgressCompaction(c)
go d.compact(c, nil)
go func() {
d.compact(c, nil)
d.compactionPool.mu.Lock()
d.compactionPool.compactingCount--
d.compactionPool.maybeScheduleWaitingCompactionLocked()
d.compactionPool.mu.Unlock()
}()
return true
}

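
The selection logic in maybeScheduleWaitingCompactionLocked can be summarized with a small standalone sketch; the map of scores and the threshold value below are illustrative stand-ins for picker.getScores and compactionScoreThreshold, and the per-level score slice is collapsed to a single number per store.

    // Pick the waiting store whose score is highest; the running maximum
    // starts at the threshold so that scores below it carry little weight.
    func pickStore(scores map[string]float64, threshold float64) string {
        highest := threshold
        picked := ""
        for name, score := range scores {
            if picked == "" || score >= highest {
                highest = score
                picked = name
            }
        }
        return picked
    }

For example, with scores {s1: 0.4, s2: 2.6, s3: 1.3} and a threshold of 1, s2 is picked regardless of iteration order; the real code then calls tryScheduleAutoCompaction on the chosen DB and recurses until the pool is full or no DB remains waiting.
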
9 changes: 2 additions & 7 deletions compaction_picker_test.go
@@ -1413,16 +1413,11 @@ func TestCompactionPickerPickFile(t *testing.T) {
d.mu.Lock()
defer d.mu.Unlock()

// Use maybeScheduleCompactionPicker to take care of all of the
// initialization of the compaction-picking environment, but never
// pick a compaction; just call pickFile using the user-provided
// level.
var lf manifest.LevelFile
var ok bool
d.maybeScheduleCompactionPicker(func(untypedPicker compactionPicker, env compactionEnv) *pickedCompaction {
p := untypedPicker.(*compactionPickerByScore)
d.withCompactionEnv(func(env compactionEnv) {
p := d.mu.versions.picker.(*compactionPickerByScore)
lf, ok = pickCompactionSeedFile(p.vers, p.virtualBackings, opts, level, level+1, env.earliestSnapshotSeqNum)
return nil
})
if !ok {
return "(none)"
11 changes: 11 additions & 0 deletions db.go
@@ -323,6 +323,17 @@ type DB struct {
// compactionSchedulers.Wait() should not be called while the DB.mu is held.
compactionSchedulers sync.WaitGroup

// compactionPool enforces a global max compaction concurrency in a
// multi-store configuration. Each Pebble store (i.e. an instance of *DB)
// has its own per-store compaction concurrency which is controlled by
// opts.MaxConcurrentCompactions. However, in a multi-store configuration,
// disk I/O is a per-store resource while CPU is shared across stores.
// A significant portion of compaction is CPU-intensive, and so
// compactionPool is necessary to ensure that excessive compactions don't
// interrupt foreground CPU tasks even if the disks are capable of handling
// the additional throughput from those compactions.
compactionPool *compactionPool

// The main mutex protecting internal DB state. This mutex encompasses many
// fields because those fields need to be accessed and updated atomically. In
// particular, the current version, log.*, mem.*, and snapshot list need to
4 changes: 1 addition & 3 deletions format_major_version.go
@@ -402,9 +402,7 @@ func (d *DB) compactMarkedFilesLocked() error {
for curr.Stats.MarkedForCompaction > 0 {
// Attempt to schedule a compaction to rewrite a file marked for
// compaction.
d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction {
return picker.pickRewriteCompaction(env)
})
d.maybeScheduleCompaction()

// The above attempt might succeed and schedule a rewrite compaction. Or
// there might not be available compaction concurrency to schedule the
2 changes: 2 additions & 0 deletions open.go
@@ -597,6 +597,8 @@ func Open(dirname string, opts *Options) (db *DB, err error) {
}
d.calculateDiskAvailableBytes()

d.compactionPool = opts.CompactionPool

d.maybeScheduleFlush()
d.maybeScheduleCompaction()

9 changes: 9 additions & 0 deletions options.go
@@ -975,6 +975,12 @@ type Options struct {
// The default value is 1.
MaxConcurrentCompactions func() int

// CompactionPool is an instance of compactionPool that enforces a global
// maximum compaction concurrency in a multi-store configuration. By
// default, up to 2 * runtime.GOMAXPROCS(0) compactions are allowed to run
// concurrently.
CompactionPool *compactionPool

// MaxConcurrentDownloads specifies the maximum number of download
// compactions. These are compactions that copy an external file to the local
// store.
@@ -1268,6 +1274,9 @@ func (o *Options) EnsureDefaults() *Options {
if o.MaxConcurrentCompactions == nil {
o.MaxConcurrentCompactions = func() int { return 1 }
}
if o.CompactionPool == nil {
o.CompactionPool = defaultCompactionPool
}
if o.MaxConcurrentDownloads == nil {
o.MaxConcurrentDownloads = func() int { return 1 }
}
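
For completeness, a short fragment (inside any function that already imports pebble) showing the default path implied by the EnsureDefaults change above, assuming only the wiring added in this commit:

    opts := (&pebble.Options{}).EnsureDefaults()
    // With CompactionPool left nil, EnsureDefaults assigns the package-level
    // default pool (2 * runtime.GOMAXPROCS(0) slots), so every store opened
    // with zero-value options ends up sharing that single pool.

Passing a pool created with NewCompactionPool instead, as in the first sketch, overrides this default for that store.
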
2 changes: 1 addition & 1 deletion snapshot.go
@@ -117,7 +117,7 @@ func (s *Snapshot) closeLocked() error {
// If s was the previous earliest snapshot, we might be able to reclaim
// disk space by dropping obsolete records that were pinned by s.
if e := s.db.mu.snapshots.earliest(); e > s.seqNum {
s.db.maybeScheduleCompactionPicker(pickElisionOnly)
s.db.maybeScheduleCompaction()
}
s.db = nil
return nil
