Skip to content

Commit

Permalink
Merge pull request #96096 from itsbilal/backport22.2-96021
Browse files Browse the repository at this point in the history
release-22.2: storage: Make logging event listener async for DiskSlow
  • Loading branch information
itsbilal authored Jan 27, 2023
2 parents 363edec + edf383f commit d2bb630
Showing 1 changed file with 34 additions and 5 deletions.
39 changes: 34 additions & 5 deletions pkg/storage/pebble.go
Original file line number Diff line number Diff line change
Expand Up @@ -919,12 +919,35 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) {
return int(atomic.LoadUint64(&p.atomic.compactionConcurrency))
}

// NB: The ordering of the event listeners passed to TeeEventListener is
// deliberate. The listener returned by makeMetricEtcEventListener is
// responsible for crashing the process if a DiskSlow event indicates the
// disk is stalled. While the logging subsystem should also be robust to
// stalls and crash the process if unable to write logs, there's less risk
// to sequencing the crashing listener first.
//
// For the same reason, make the logging call asynchronous for DiskSlow events.
// This prevents slow logging calls during a disk slow/stall event from holding
// up Pebble's internal disk health checking, and better obeys the
// EventListener contract for not having any functions block or take a while to
// run. Creating goroutines is acceptable given their low cost, and the low
// write concurrency to Pebble's FS (Pebble compactions + flushes + SQL
// spilling to disk). If the maximum concurrency of DiskSlow events increases
// significantly in the future, we can improve the logic here by queueing up
// most of the logging work (except for the Fatalf call), and have it be done
// by a single goroutine.
lel := pebble.MakeLoggingEventListener(pebbleLogger{
ctx: logCtx,
depth: 2, // skip over the EventListener stack frame
})
oldDiskSlow := lel.DiskSlow
lel.DiskSlow = func(info pebble.DiskSlowInfo) {
// Run oldDiskSlow asynchronously.
go oldDiskSlow(info)
}
cfg.Opts.EventListener = pebble.TeeEventListener(
pebble.MakeLoggingEventListener(pebbleLogger{
ctx: logCtx,
depth: 2, // skip over the EventListener stack frame
}),
p.makeMetricEtcEventListener(ctx),
lel,
)
p.eventListener = &cfg.Opts.EventListener
p.wrappedIntentWriter = wrapIntentWriter(p)
Expand Down Expand Up @@ -1001,11 +1024,17 @@ func (p *Pebble) makeMetricEtcEventListener(ctx context.Context) pebble.EventLis
atomic.AddInt64(&p.diskStallCount, 1)
// Note that the below log messages go to the main cockroach log, not
// the pebble-specific log.
//
// Run non-fatal log.* calls in separate goroutines as they could block
// if the logging device is also slow/stalling, preventing pebble's disk
// health checking from functioning correctly. See the comment in
// pebble.EventListener on why it's important for this method to return
// quickly.
if fatalOnExceeded {
log.Fatalf(ctx, "disk stall detected: pebble unable to write to %s in %.2f seconds",
info.Path, redact.Safe(info.Duration.Seconds()))
} else {
log.Errorf(ctx, "disk stall detected: pebble unable to write to %s in %.2f seconds",
go log.Errorf(ctx, "disk stall detected: pebble unable to write to %s in %.2f seconds",
info.Path, redact.Safe(info.Duration.Seconds()))
}
return
Expand Down

0 comments on commit d2bb630

Please sign in to comment.