From 30d6a4d3ce86e9cc7853ff59c8477b36ed15794a Mon Sep 17 00:00:00 2001 From: lyang24 Date: Wed, 6 Dec 2023 22:50:46 -0800 Subject: [PATCH] kv: log on excessive latch hold duration This commit aims to help observability by logging request holding latches over threshold. long_latch_hold_duration is a new cluster setting that is introduced to set the latch holding time threshold, latches held over the threshold will be logged at most every second. To achieve logging spanlatch.manager now contains a pointer to cluster setting. Fixes: https://github.com/cockroachdb/cockroach/issues/114609 Release note: None --- .../concurrency/concurrency_manager.go | 1 + pkg/kv/kvserver/spanlatch/BUILD.bazel | 3 ++ pkg/kv/kvserver/spanlatch/manager.go | 30 +++++++++++++++---- pkg/kv/kvserver/spanlatch/manager_test.go | 2 +- pkg/kv/kvserver/spanlatch/settings.go | 29 ++++++++++++++++++ 5 files changed, 58 insertions(+), 7 deletions(-) create mode 100644 pkg/kv/kvserver/spanlatch/settings.go diff --git a/pkg/kv/kvserver/concurrency/concurrency_manager.go b/pkg/kv/kvserver/concurrency/concurrency_manager.go index 1091fe0a99ad..e7e081c42093 100644 --- a/pkg/kv/kvserver/concurrency/concurrency_manager.go +++ b/pkg/kv/kvserver/concurrency/concurrency_manager.go @@ -172,6 +172,7 @@ func NewManager(cfg Config) Manager { m: spanlatch.Make( cfg.Stopper, cfg.SlowLatchGauge, + cfg.Settings, ), }, lt: lt, diff --git a/pkg/kv/kvserver/spanlatch/BUILD.bazel b/pkg/kv/kvserver/spanlatch/BUILD.bazel index 18f69ddf2952..acbba0a3fe08 100644 --- a/pkg/kv/kvserver/spanlatch/BUILD.bazel +++ b/pkg/kv/kvserver/spanlatch/BUILD.bazel @@ -7,6 +7,7 @@ go_library( "doc.go", "list.go", "manager.go", + "settings.go", "signal.go", ":latch_interval_btree.go", # keep ], @@ -18,6 +19,8 @@ go_library( "//pkg/kv/kvserver/concurrency/poison", "//pkg/kv/kvserver/spanset", "//pkg/roachpb", + "//pkg/settings", + "//pkg/settings/cluster", "//pkg/util/hlc", "//pkg/util/log", "//pkg/util/metric", diff --git a/pkg/kv/kvserver/spanlatch/manager.go b/pkg/kv/kvserver/spanlatch/manager.go index 2192c1517baf..b92cc8534e5d 100644 --- a/pkg/kv/kvserver/spanlatch/manager.go +++ b/pkg/kv/kvserver/spanlatch/manager.go @@ -12,6 +12,7 @@ package spanlatch import ( "context" + "time" "unsafe" "github.com/cockroachdb/cockroach/pkg/base" @@ -19,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/poison" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/metric" @@ -62,8 +64,10 @@ type Manager struct { idAlloc uint64 scopes [spanset.NumSpanScope]scopedManager - stopper *stop.Stopper - slowReqs *metric.Gauge + stopper *stop.Stopper + slowReqs *metric.Gauge + settings *cluster.Settings + everySecondLogger log.EveryN } // scopedManager is a latch manager scoped to either local or global keys. @@ -75,10 +79,12 @@ type scopedManager struct { // Make returns an initialized Manager. Using this constructor is optional as // the type's zero value is valid to use directly. -func Make(stopper *stop.Stopper, slowReqs *metric.Gauge) Manager { +func Make(stopper *stop.Stopper, slowReqs *metric.Gauge, settings *cluster.Settings) Manager { return Manager{ - stopper: stopper, - slowReqs: slowReqs, + stopper: stopper, + slowReqs: slowReqs, + settings: settings, + everySecondLogger: log.Every(1 * time.Second), } } @@ -137,7 +143,8 @@ type Guard struct { latchesLens [spanset.NumSpanScope][spanset.NumSpanAccess]int32 // Non-nil only when AcquireOptimistic has retained the snapshot for later // checking of conflicts, and waiting. - snap *snapshot + snap *snapshot + acquireTime int64 } func (lg *Guard) latches(s spanset.SpanScope, a spanset.SpanAccess) []latch { @@ -522,6 +529,7 @@ func (m *Manager) wait(ctx context.Context, lg *Guard, snap snapshot) error { } } } + lg.acquireTime = timeutil.Now().UnixNano() return nil } @@ -624,6 +632,16 @@ func (m *Manager) Release(lg *Guard) { m.mu.Lock() m.removeLocked(lg) m.mu.Unlock() + held := timeutil.Now().UnixNano() - lg.acquireTime + if lg.acquireTime != 0 && m.settings != nil && held > longLatchHoldDuration.Get(&m.settings.SV).Nanoseconds() { + const longLatchHeldMsg = "%s has held latch for %d ns. Some possible causes are " + + "slow disk reads, slow raft replication, and expensive request processing." + if m.everySecondLogger.ShouldLog() { + log.Warningf(context.Background(), longLatchHeldMsg, lg.baFmt, held) + } else { + log.VEventf(context.Background(), 2, longLatchHeldMsg, lg.baFmt, held) + } + } } // removeLocked removes the latches owned by the provided Guard from the diff --git a/pkg/kv/kvserver/spanlatch/manager_test.go b/pkg/kv/kvserver/spanlatch/manager_test.go index 11870cad0bfa..abac3c4a8d24 100644 --- a/pkg/kv/kvserver/spanlatch/manager_test.go +++ b/pkg/kv/kvserver/spanlatch/manager_test.go @@ -756,7 +756,7 @@ func TestSizeOfLatch(t *testing.T) { func TestSizeOfLatchGuard(t *testing.T) { var lg Guard size := int(unsafe.Sizeof(lg)) - require.Equal(t, 112, size) + require.Equal(t, 120, size) } // TestLatchStringAndSafeformat tests the output of latch.SafeFormat. diff --git a/pkg/kv/kvserver/spanlatch/settings.go b/pkg/kv/kvserver/spanlatch/settings.go new file mode 100644 index 000000000000..a14faa9009e4 --- /dev/null +++ b/pkg/kv/kvserver/spanlatch/settings.go @@ -0,0 +1,29 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +// Package concurrency provides a concurrency manager structure that +// encapsulates the details of concurrency control and contention handling for +// serializable key-value transactions. + +package spanlatch + +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/settings" +) + +var longLatchHoldDuration = settings.RegisterDurationSetting( + settings.SystemOnly, + "kv.concurrency.long_latch_hold_duration", + "duration threshold for logging long latch holding", + 3*time.Second, + settings.NonNegativeDuration, +)