Skip to content

Commit

Permalink
kv: log on excessive latch hold duration
Browse files Browse the repository at this point in the history
This commit aims to help observability by logging request holding latches over
threshold. long_latch_hold_duration is a new cluster setting that is introduced
to set the latch holding time threshold, latches held over the threshold will
be logged at most every second. To achieve logging spanlatch.manager now
contains a pointer to cluster setting.

Fixes: cockroachdb#114609

Release note: None
  • Loading branch information
lyang24 authored and wenyihu6 committed Feb 21, 2024
1 parent 57beed3 commit 30d6a4d
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 7 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/concurrency/concurrency_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ func NewManager(cfg Config) Manager {
m: spanlatch.Make(
cfg.Stopper,
cfg.SlowLatchGauge,
cfg.Settings,
),
},
lt: lt,
Expand Down
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/spanlatch/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ go_library(
"doc.go",
"list.go",
"manager.go",
"settings.go",
"signal.go",
":latch_interval_btree.go", # keep
],
Expand All @@ -18,6 +19,8 @@ go_library(
"//pkg/kv/kvserver/concurrency/poison",
"//pkg/kv/kvserver/spanset",
"//pkg/roachpb",
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/metric",
Expand Down
30 changes: 24 additions & 6 deletions pkg/kv/kvserver/spanlatch/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ package spanlatch

import (
"context"
"time"
"unsafe"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/poison"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/metric"
Expand Down Expand Up @@ -62,8 +64,10 @@ type Manager struct {
idAlloc uint64
scopes [spanset.NumSpanScope]scopedManager

stopper *stop.Stopper
slowReqs *metric.Gauge
stopper *stop.Stopper
slowReqs *metric.Gauge
settings *cluster.Settings
everySecondLogger log.EveryN
}

// scopedManager is a latch manager scoped to either local or global keys.
Expand All @@ -75,10 +79,12 @@ type scopedManager struct {

// Make returns an initialized Manager. Using this constructor is optional as
// the type's zero value is valid to use directly.
func Make(stopper *stop.Stopper, slowReqs *metric.Gauge) Manager {
func Make(stopper *stop.Stopper, slowReqs *metric.Gauge, settings *cluster.Settings) Manager {
return Manager{
stopper: stopper,
slowReqs: slowReqs,
stopper: stopper,
slowReqs: slowReqs,
settings: settings,
everySecondLogger: log.Every(1 * time.Second),
}
}

Expand Down Expand Up @@ -137,7 +143,8 @@ type Guard struct {
latchesLens [spanset.NumSpanScope][spanset.NumSpanAccess]int32
// Non-nil only when AcquireOptimistic has retained the snapshot for later
// checking of conflicts, and waiting.
snap *snapshot
snap *snapshot
acquireTime int64
}

func (lg *Guard) latches(s spanset.SpanScope, a spanset.SpanAccess) []latch {
Expand Down Expand Up @@ -522,6 +529,7 @@ func (m *Manager) wait(ctx context.Context, lg *Guard, snap snapshot) error {
}
}
}
lg.acquireTime = timeutil.Now().UnixNano()
return nil
}

Expand Down Expand Up @@ -624,6 +632,16 @@ func (m *Manager) Release(lg *Guard) {
m.mu.Lock()
m.removeLocked(lg)
m.mu.Unlock()
held := timeutil.Now().UnixNano() - lg.acquireTime
if lg.acquireTime != 0 && m.settings != nil && held > longLatchHoldDuration.Get(&m.settings.SV).Nanoseconds() {
const longLatchHeldMsg = "%s has held latch for %d ns. Some possible causes are " +
"slow disk reads, slow raft replication, and expensive request processing."
if m.everySecondLogger.ShouldLog() {
log.Warningf(context.Background(), longLatchHeldMsg, lg.baFmt, held)
} else {
log.VEventf(context.Background(), 2, longLatchHeldMsg, lg.baFmt, held)
}
}
}

// removeLocked removes the latches owned by the provided Guard from the
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/spanlatch/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ func TestSizeOfLatch(t *testing.T) {
func TestSizeOfLatchGuard(t *testing.T) {
var lg Guard
size := int(unsafe.Sizeof(lg))
require.Equal(t, 112, size)
require.Equal(t, 120, size)
}

// TestLatchStringAndSafeformat tests the output of latch.SafeFormat.
Expand Down
29 changes: 29 additions & 0 deletions pkg/kv/kvserver/spanlatch/settings.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Package concurrency provides a concurrency manager structure that
// encapsulates the details of concurrency control and contention handling for
// serializable key-value transactions.

package spanlatch

import (
"time"

"github.com/cockroachdb/cockroach/pkg/settings"
)

var longLatchHoldDuration = settings.RegisterDurationSetting(
settings.SystemOnly,
"kv.concurrency.long_latch_hold_duration",
"duration threshold for logging long latch holding",
3*time.Second,
settings.NonNegativeDuration,
)

0 comments on commit 30d6a4d

Please sign in to comment.