diff --git a/pkg/kv/kvserver/concurrency/BUILD.bazel b/pkg/kv/kvserver/concurrency/BUILD.bazel index c71fe92632d3..682d3252b17d 100644 --- a/pkg/kv/kvserver/concurrency/BUILD.bazel +++ b/pkg/kv/kvserver/concurrency/BUILD.bazel @@ -27,6 +27,7 @@ go_library( "//pkg/settings", "//pkg/settings/cluster", "//pkg/storage/enginepb", + "//pkg/util/buildutil", "//pkg/util/hlc", "//pkg/util/humanizeutil", "//pkg/util/log", diff --git a/pkg/kv/kvserver/concurrency/lock_table.go b/pkg/kv/kvserver/concurrency/lock_table.go index a8e86e3da2d2..a0e9f77af02e 100644 --- a/pkg/kv/kvserver/concurrency/lock_table.go +++ b/pkg/kv/kvserver/concurrency/lock_table.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" @@ -151,6 +152,7 @@ type treeMu struct { // For constraining memory consumption. We need better memory accounting // than this. + // TODO(nvanbenschoten): use an atomic.Int64. numLocks int64 // For dampening the frequency with which we enforce lockTableImpl.maxLocks. @@ -226,6 +228,7 @@ type lockTableImpl struct { // reservation at A. // Now in the queues for A and B req1 is behind req3 and vice versa and // this deadlock has been created entirely due to the lock table's behavior. + // TODO(nvanbenschoten): use an atomic.Uint64. seqNum uint64 // locks contains the btree objects (wrapped in the treeMu structure) that @@ -1340,6 +1343,25 @@ func (l *lockState) isEmptyLock() bool { return false } +// assertEmptyLock asserts that the lockState is empty. This condition must hold +// for a lock to be safely removed from the tree. If it does not hold, requests +// with stale snapshots of the btree will still be able to enter the lock's +// wait-queue, after which point they will never hear of lock updates. +// REQUIRES: l.mu is locked. +func (l *lockState) assertEmptyLock() { + if !l.isEmptyLock() { + panic("lockState is not empty") + } +} + +// assertEmptyLockUnlocked is like assertEmptyLock, but it locks the lockState. +// REQUIRES: l.mu is not locked. +func (l *lockState) assertEmptyLockUnlocked() { + l.mu.Lock() + defer l.mu.Unlock() + l.assertEmptyLock() +} + // Returns the duration of time the lock has been tracked as held in the lock table. // REQUIRES: l.mu is locked. func (l *lockState) lockHeldDuration(now time.Time) time.Duration { @@ -1999,15 +2021,15 @@ func (l *lockState) tryClearLock(force bool) bool { if l.notRemovable > 0 && !force { return false } - replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil - // Remove unreplicated holder. - l.holder.holder[lock.Unreplicated] = lockHolderInfo{} + // Clear lock holder. While doing so, determine which waitingState to + // transition waiters to. var waitState waitingState + replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil if replicatedHeld && !force { lockHolderTxn, _ := l.getLockHolder() - // Note that none of the current waiters can be requests - // from lockHolderTxn. + // Note that none of the current waiters can be requests from lockHolderTxn, + // so they will never be told to waitElsewhere on themselves. 
waitState = waitingState{ kind: waitElsewhere, txn: lockHolderTxn, @@ -2018,11 +2040,11 @@ func (l *lockState) tryClearLock(force bool) bool { } else { // !replicatedHeld || force. Both are handled as doneWaiting since the // system is no longer tracking the lock that was possibly held. - l.clearLockHolder() waitState = waitingState{kind: doneWaiting} } + l.clearLockHolder() - l.distinguishedWaiter = nil + // Clear reservation. if l.reservation != nil { g := l.reservation g.mu.Lock() @@ -2030,6 +2052,8 @@ func (l *lockState) tryClearLock(force bool) bool { g.mu.Unlock() l.reservation = nil } + + // Clear waitingReaders. for e := l.waitingReaders.Front(); e != nil; { g := e.Value.(*lockTableGuardImpl) curr := e @@ -2043,6 +2067,7 @@ func (l *lockState) tryClearLock(force bool) bool { g.mu.Unlock() } + // Clear queuedWriters. waitState.guardAccess = spanset.SpanReadWrite for e := l.queuedWriters.Front(); e != nil; { qg := e.Value.(*queuedGuard) @@ -2059,6 +2084,12 @@ func (l *lockState) tryClearLock(force bool) bool { delete(g.mu.locks, l) g.mu.Unlock() } + + // Clear distinguishedWaiter. + l.distinguishedWaiter = nil + + // The lockState must now be empty. + l.assertEmptyLock() return true } @@ -2344,6 +2375,7 @@ func (l *lockState) lockIsFree() (gc bool) { } if l.queuedWriters.Len() == 0 { + l.assertEmptyLock() return true } @@ -2366,6 +2398,27 @@ func (l *lockState) lockIsFree() (gc bool) { return false } +// Delete removes the specified lock from the tree. +// REQUIRES: t.mu is locked. +func (t *treeMu) Delete(l *lockState) { + if buildutil.CrdbTestBuild { + l.assertEmptyLockUnlocked() + } + t.btree.Delete(l) +} + +// Reset removes all locks from the tree. +// REQUIRES: t.mu is locked. +func (t *treeMu) Reset() { + if buildutil.CrdbTestBuild { + iter := t.MakeIter() + for iter.First(); iter.Valid(); iter.Next() { + iter.Cur().assertEmptyLockUnlocked() + } + } + t.btree.Reset() +} + func (t *treeMu) nextLockSeqNum() (seqNum uint64, checkMaxLocks bool) { t.lockIDSeqNum++ checkMaxLocks = t.lockIDSeqNum%t.lockAddMaxLocksCheckInterval == 0 diff --git a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/wait_elsewhere b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/wait_elsewhere index 159997ac386e..b23fc629b438 100644 --- a/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/wait_elsewhere +++ b/pkg/kv/kvserver/concurrency/testdata/concurrency_manager/wait_elsewhere @@ -25,7 +25,6 @@ new-request name=reqWaiter txn=txnWaiter ts=20,1 put key=k value=val2 ---- - sequence req=reqFirstLock ---- [1] sequence reqFirstLock: sequencing request @@ -41,7 +40,6 @@ finish req=reqFirstLock ---- [-] finish reqFirstLock: finishing request - sequence req=reqWaiter ---- [2] sequence reqWaiter: sequencing request @@ -115,5 +113,173 @@ finish req=reqWaiter ---- [-] finish reqWaiter: finishing request +# --------------------------------------------------------------------------------- +# Exercise the case where the lock table is cleared due to a lock limit while a +# request's lock table guard is holding a tree snapshot. The removed lockStates +# should be emptied and ignored by requests. This is a regression test against +# the bug described in #99635. +# +# To test this, we sequence a read such that it discovers locks at key "k1" and +# "k2" without exceeding the lock limit. We then re-sequence the read such that +# it captures a lock table tree snapshot and blocks on a lock at key "k1" first. 
+# Next, we instruct a second read to discover another lock that overflows the +# lock table's lock limit, causing the lock table to be cleared. Finally, we +# release the lock at key "k1" and watch whether the read starts waiting on key +# "k2". If it did, it would get stranded and stall indefinitely. +# --------------------------------------------------------------------------------- + +debug-set-max-locks n=2 +---- + +new-txn name=txnThreeKeyWriter ts=10,1 epoch=0 +---- + +new-request name=reqThreeKeyWriter txn=txnThreeKeyWriter ts=10,1 + put key=k1 value=val1 + put key=k2 value=val2 + put key=k3 value=val3 +---- + +new-request name=reqTwoKeyWaiter txn=txnWaiter ts=20,1 + scan key=k1 endkey=k3 +---- + +new-request name=reqThreeKeyWaiter txn=txnWaiter ts=20,1 + scan key=k1 endkey=k4 +---- + +sequence req=reqThreeKeyWriter +---- +[6] sequence reqThreeKeyWriter: sequencing request +[6] sequence reqThreeKeyWriter: acquiring latches +[6] sequence reqThreeKeyWriter: scanning lock table for conflicting locks +[6] sequence reqThreeKeyWriter: sequencing complete, returned guard + +on-lock-acquired req=reqThreeKeyWriter key=k1 dur=r +---- +[-] acquire lock: txn 00000003 @ k1 + +on-lock-acquired req=reqThreeKeyWriter key=k2 dur=r +---- +[-] acquire lock: txn 00000003 @ k2 + +on-lock-acquired req=reqThreeKeyWriter key=k3 dur=r +---- +[-] acquire lock: txn 00000003 @ k3 + +finish req=reqThreeKeyWriter +---- +[-] finish reqThreeKeyWriter: finishing request + +debug-lock-table +---- +global: num=0 +local: num=0 + +sequence req=reqTwoKeyWaiter +---- +[7] sequence reqTwoKeyWaiter: sequencing request +[7] sequence reqTwoKeyWaiter: acquiring latches +[7] sequence reqTwoKeyWaiter: scanning lock table for conflicting locks +[7] sequence reqTwoKeyWaiter: sequencing complete, returned guard + +sequence req=reqThreeKeyWaiter +---- +[8] sequence reqThreeKeyWaiter: sequencing request +[8] sequence reqThreeKeyWaiter: acquiring latches +[8] sequence reqThreeKeyWaiter: scanning lock table for conflicting locks +[8] sequence reqThreeKeyWaiter: sequencing complete, returned guard + +# Simulate that the replicated locks were discovered, so they are added to the +# lock table. 
+handle-write-intent-error req=reqTwoKeyWaiter lease-seq=1 + intent txn=txnThreeKeyWriter key=k1 + intent txn=txnThreeKeyWriter key=k2 +---- +[9] handle write intent error reqTwoKeyWaiter: handled conflicting intents on "k1", "k2", released latches + +sequence req=reqTwoKeyWaiter +---- +[10] sequence reqTwoKeyWaiter: re-sequencing request +[10] sequence reqTwoKeyWaiter: acquiring latches +[10] sequence reqTwoKeyWaiter: scanning lock table for conflicting locks +[10] sequence reqTwoKeyWaiter: waiting in lock wait-queues +[10] sequence reqTwoKeyWaiter: lock wait-queue event: wait for (distinguished) txn 00000003 holding lock @ key "k1" (queuedWriters: 0, queuedReaders: 1) +[10] sequence reqTwoKeyWaiter: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = false +[10] sequence reqTwoKeyWaiter: pushing timestamp of txn 00000003 above 20.000000000,1 +[10] sequence reqTwoKeyWaiter: blocked on select in concurrency_test.(*cluster).PushTransaction + +debug-lock-table +---- +global: num=2 + lock: "k1" + holder: txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] + waiting readers: + req: 5, txn: 00000002-0000-0000-0000-000000000000 + distinguished req: 5 + lock: "k2" + holder: txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] +local: num=0 + +# Simulate that the replicated locks were discovered, so they are added to the +# lock table. Keys "k1" and "k2" were previously discovered, but "k3" is new. +handle-write-intent-error req=reqThreeKeyWaiter lease-seq=1 + intent txn=txnThreeKeyWriter key=k1 + intent txn=txnThreeKeyWriter key=k2 + intent txn=txnThreeKeyWriter key=k3 +---- +[11] handle write intent error reqThreeKeyWaiter: handled conflicting intents on "k1", "k2", "k3", released latches + +sequence req=reqThreeKeyWaiter +---- +[12] sequence reqThreeKeyWaiter: re-sequencing request +[12] sequence reqThreeKeyWaiter: acquiring latches +[12] sequence reqThreeKeyWaiter: scanning lock table for conflicting locks +[12] sequence reqThreeKeyWaiter: waiting in lock wait-queues +[12] sequence reqThreeKeyWaiter: lock wait-queue event: wait for txn 00000003 holding lock @ key "k1" (queuedWriters: 0, queuedReaders: 2) +[12] sequence reqThreeKeyWaiter: pushing after 0s for: liveness detection = false, deadlock detection = true, timeout enforcement = false, priority enforcement = false +[12] sequence reqThreeKeyWaiter: pushing timestamp of txn 00000003 above 20.000000000,1 +[12] sequence reqThreeKeyWaiter: blocked on select in concurrency_test.(*cluster).PushTransaction + +debug-lock-table +---- +global: num=1 + lock: "k1" + holder: txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0] + waiting readers: + req: 6, txn: 00000002-0000-0000-0000-000000000000 + req: 5, txn: 00000002-0000-0000-0000-000000000000 + distinguished req: 5 +local: num=0 + +# Before #99635 was fixed, reqTwoKeyWaiter would move on to waiting on key k2 +# and get stuck in lockTableWaiterImpl.WaitOn. Even after it resolved the intent +# at the pushed timestamp, its lock table guard would not be notified because it +# was enqueued in a leaked lock wait-queue (one attached to a lock that had been +# removed from the lockTable btree). 
+on-txn-updated txn=txnThreeKeyWriter status=pending ts=20,2 +---- +[-] update txn: increasing timestamp of txnThreeKeyWriter +[10] sequence reqTwoKeyWaiter: resolving intent "k1" for txn 00000003 with PENDING status +[10] sequence reqTwoKeyWaiter: lock wait-queue event: done waiting +[10] sequence reqTwoKeyWaiter: conflicted with 00000003-0000-0000-0000-000000000000 on "k1" for 0.000s +[10] sequence reqTwoKeyWaiter: acquiring latches +[10] sequence reqTwoKeyWaiter: scanning lock table for conflicting locks +[10] sequence reqTwoKeyWaiter: sequencing complete, returned guard +[12] sequence reqThreeKeyWaiter: resolving intent "k1" for txn 00000003 with PENDING status +[12] sequence reqThreeKeyWaiter: lock wait-queue event: done waiting +[12] sequence reqThreeKeyWaiter: conflicted with 00000003-0000-0000-0000-000000000000 on "k1" for 0.000s +[12] sequence reqThreeKeyWaiter: acquiring latches +[12] sequence reqThreeKeyWaiter: scanning lock table for conflicting locks +[12] sequence reqThreeKeyWaiter: sequencing complete, returned guard + +finish req=reqTwoKeyWaiter +---- +[-] finish reqTwoKeyWaiter: finishing request + +finish req=reqThreeKeyWaiter +---- +[-] finish reqThreeKeyWaiter: finishing request + reset ----
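The assertEmptyLock checks and the treeMu.Delete/Reset wrappers added above all guard the same invariant: a lockState may only leave the btree once it is fully empty, because requests holding a stale snapshot of the tree can still reach the removed object. To make that failure mode concrete, the following is a minimal, self-contained Go sketch of the idea; the types, names, and table layout are illustrative stand-ins, not CockroachDB's actual lockState, treeMu, or lock-table guard APIs.

package main

import "fmt"

// lockState is a toy stand-in for the real struct: a key, an optional holder,
// and a queue of waiting request names.
type lockState struct {
	key     string
	holder  string // "" if unheld
	waiters []string
}

func (l *lockState) isEmpty() bool { return l.holder == "" && len(l.waiters) == 0 }

// assertEmpty mirrors the assertEmptyLock check in the patch: a lock may only
// leave the tree once nothing can still interact with it.
func (l *lockState) assertEmpty() {
	if !l.isEmpty() {
		panic(fmt.Sprintf("lockState %q removed while non-empty", l.key))
	}
}

type lockTable struct {
	locks map[string]*lockState
}

// snapshot models a request capturing the set of lockState pointers it will
// scan; the real code clones a copy-on-write btree for the same effect.
func (t *lockTable) snapshot() []*lockState {
	snap := make([]*lockState, 0, len(t.locks))
	for _, l := range t.locks {
		snap = append(snap, l)
	}
	return snap
}

// clear models tryClearLock firing when the lock limit is exceeded. With
// fullyEmpty=false it drops locks from the table while leaving the holder
// populated, reproducing the leak; with fullyEmpty=true it empties each
// lockState and asserts emptiness before removal, as the patch does.
func (t *lockTable) clear(fullyEmpty bool) {
	for key, l := range t.locks {
		if fullyEmpty {
			l.holder = ""
			l.waiters = nil
			l.assertEmpty()
		}
		delete(t.locks, key)
	}
}

// scanAndWait models a request walking its (possibly stale) snapshot and
// queueing behind any lock that still appears to be held.
func scanAndWait(req string, snap []*lockState) {
	for _, l := range snap {
		if l.holder != "" {
			// The request enqueues here. If l is no longer in the table,
			// nothing will ever notify it: this is the stall in #99635.
			l.waiters = append(l.waiters, req)
			fmt.Printf("%s is waiting on a leaked lock %q\n", req, l.key)
		}
	}
}

func main() {
	t := &lockTable{locks: map[string]*lockState{
		"k1": {key: "k1", holder: "txnThreeKeyWriter"},
		"k2": {key: "k2", holder: "txnThreeKeyWriter"},
	}}

	snap := t.snapshot() // a reader captures its tree snapshot
	t.clear(true)        // lock limit exceeded; the table is cleared

	scanAndWait("reqTwoKeyWaiter", snap) // finds nothing to wait on when fixed
}

With clear(true), the stale snapshot points only at emptied lockStates, so the scan finds nothing to wait on and the request proceeds. Flipping it to clear(false) reproduces the shape of the bug exercised by the wait_elsewhere test above: the request enqueues on a lockState that the table no longer tracks and would never be notified, while the assertEmpty-style check (enabled in the real code only under buildutil.CrdbTestBuild) turns that silent leak into a loud test failure.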