Skip to content

Commit

Permalink
Merge pull request cockroachdb#100944 from cockroachdb/blathers/backport-release-22.1-100842
Browse files Browse the repository at this point in the history

release-22.1: kv: don't leak lock wait-queues for replicated locks on memory limit
  • Loading branch information
nvanbenschoten authored Apr 18, 2023
2 parents 20d13d9 + 948196f commit 5c0078b
Show file tree
Hide file tree
Showing 3 changed files with 229 additions and 9 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/concurrency/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ go_library(
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/storage/enginepb",
"//pkg/util/buildutil",
"//pkg/util/hlc",
"//pkg/util/humanizeutil",
"//pkg/util/log",
Expand Down
67 changes: 60 additions & 7 deletions pkg/kv/kvserver/concurrency/lock_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
Expand Down Expand Up @@ -151,6 +152,7 @@ type treeMu struct {

// For constraining memory consumption. We need better memory accounting
// than this.
// TODO(nvanbenschoten): use an atomic.Int64.
numLocks int64

// For dampening the frequency with which we enforce lockTableImpl.maxLocks.
Expand Down Expand Up @@ -226,6 +228,7 @@ type lockTableImpl struct {
// reservation at A.
// Now in the queues for A and B req1 is behind req3 and vice versa and
// this deadlock has been created entirely due to the lock table's behavior.
// TODO(nvanbenschoten): use an atomic.Uint64.
seqNum uint64

// locks contains the btree objects (wrapped in the treeMu structure) that
Expand Down Expand Up @@ -1340,6 +1343,25 @@ func (l *lockState) isEmptyLock() bool {
return false
}

// assertEmptyLock verifies that the lockState carries no remaining state, as
// reported by isEmptyLock. A lock may only be removed from the btree once it
// is empty; otherwise, requests holding stale snapshots of the btree could
// still enqueue in the lock's wait-queue, after which point they would never
// hear of lock updates.
// REQUIRES: l.mu is locked.
func (l *lockState) assertEmptyLock() {
	if l.isEmptyLock() {
		return
	}
	panic("lockState is not empty")
}

// assertEmptyLockUnlocked is like assertEmptyLock, but it locks the lockState.
// The unlock is deferred so that the mutex is released even if the assertion
// panics.
// REQUIRES: l.mu is not locked.
func (l *lockState) assertEmptyLockUnlocked() {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.assertEmptyLock()
}

// Returns the duration of time the lock has been tracked as held in the lock table.
// REQUIRES: l.mu is locked.
func (l *lockState) lockHeldDuration(now time.Time) time.Duration {
Expand Down Expand Up @@ -1999,15 +2021,15 @@ func (l *lockState) tryClearLock(force bool) bool {
if l.notRemovable > 0 && !force {
return false
}
replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil

// Remove unreplicated holder.
l.holder.holder[lock.Unreplicated] = lockHolderInfo{}
// Clear lock holder. While doing so, determine which waitingState to
// transition waiters to.
var waitState waitingState
replicatedHeld := l.holder.locked && l.holder.holder[lock.Replicated].txn != nil
if replicatedHeld && !force {
lockHolderTxn, _ := l.getLockHolder()
// Note that none of the current waiters can be requests
// from lockHolderTxn.
// Note that none of the current waiters can be requests from lockHolderTxn,
// so they will never be told to waitElsewhere on themselves.
waitState = waitingState{
kind: waitElsewhere,
txn: lockHolderTxn,
Expand All @@ -2018,18 +2040,20 @@ func (l *lockState) tryClearLock(force bool) bool {
} else {
// !replicatedHeld || force. Both are handled as doneWaiting since the
// system is no longer tracking the lock that was possibly held.
l.clearLockHolder()
waitState = waitingState{kind: doneWaiting}
}
l.clearLockHolder()

l.distinguishedWaiter = nil
// Clear reservation.
if l.reservation != nil {
g := l.reservation
g.mu.Lock()
delete(g.mu.locks, l)
g.mu.Unlock()
l.reservation = nil
}

// Clear waitingReaders.
for e := l.waitingReaders.Front(); e != nil; {
g := e.Value.(*lockTableGuardImpl)
curr := e
Expand All @@ -2043,6 +2067,7 @@ func (l *lockState) tryClearLock(force bool) bool {
g.mu.Unlock()
}

// Clear queuedWriters.
waitState.guardAccess = spanset.SpanReadWrite
for e := l.queuedWriters.Front(); e != nil; {
qg := e.Value.(*queuedGuard)
Expand All @@ -2059,6 +2084,12 @@ func (l *lockState) tryClearLock(force bool) bool {
delete(g.mu.locks, l)
g.mu.Unlock()
}

// Clear distinguishedWaiter.
l.distinguishedWaiter = nil

// The lockState must now be empty.
l.assertEmptyLock()
return true
}

Expand Down Expand Up @@ -2344,6 +2375,7 @@ func (l *lockState) lockIsFree() (gc bool) {
}

if l.queuedWriters.Len() == 0 {
l.assertEmptyLock()
return true
}

Expand All @@ -2366,6 +2398,27 @@ func (l *lockState) lockIsFree() (gc bool) {
return false
}

// Delete removes the specified lock from the tree. In test builds, it first
// asserts that the lock is empty: removing a non-empty lock is unsafe because
// requests with stale snapshots of the btree could still enter the lock's
// wait-queue and then never be notified of lock updates (see assertEmptyLock).
// The check is gated on CrdbTestBuild to keep it out of production hot paths.
// REQUIRES: t.mu is locked.
func (t *treeMu) Delete(l *lockState) {
	if buildutil.CrdbTestBuild {
		l.assertEmptyLockUnlocked()
	}
	t.btree.Delete(l)
}

// Reset removes all locks from the tree. In test builds, it first walks the
// tree and asserts that every lockState is empty, for the same reason that
// Delete asserts emptiness: clearing a non-empty lock can leak its wait-queue
// to requests holding stale btree snapshots. The check is gated on
// CrdbTestBuild to keep it out of production hot paths.
// REQUIRES: t.mu is locked.
func (t *treeMu) Reset() {
	if buildutil.CrdbTestBuild {
		iter := t.MakeIter()
		for iter.First(); iter.Valid(); iter.Next() {
			iter.Cur().assertEmptyLockUnlocked()
		}
	}
	t.btree.Reset()
}

func (t *treeMu) nextLockSeqNum() (seqNum uint64, checkMaxLocks bool) {
t.lockIDSeqNum++
checkMaxLocks = t.lockIDSeqNum%t.lockAddMaxLocksCheckInterval == 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ new-request name=reqWaiter txn=txnWaiter ts=20,1
put key=k value=val2
----


sequence req=reqFirstLock
----
[1] sequence reqFirstLock: sequencing request
Expand All @@ -41,7 +40,6 @@ finish req=reqFirstLock
----
[-] finish reqFirstLock: finishing request


sequence req=reqWaiter
----
[2] sequence reqWaiter: sequencing request
Expand Down Expand Up @@ -115,5 +113,173 @@ finish req=reqWaiter
----
[-] finish reqWaiter: finishing request

# ---------------------------------------------------------------------------------
# Exercise the case where the lock table is cleared due to a lock limit while a
# request's lock table guard is holding a tree snapshot. The removed lockStates
# should be emptied and ignored by requests. This is a regression test against
# the bug described in #99635.
#
# To test this, we sequence a read such that it discovers locks at key "k1" and
# "k2" without exceeding the lock limit. We then re-sequence the read such that
# it captures a lock table tree snapshot and blocks on a lock at key "k1" first.
# Next, we instruct a second read to discover another lock that overflows the
# lock table's lock limit, causing the lock table to be cleared. Finally, we
# release the lock at key "k1" and watch whether the read starts waiting on key
# "k2". If it did, it would get stranded and stall indefinitely.
# ---------------------------------------------------------------------------------

debug-set-max-locks n=2
----

new-txn name=txnThreeKeyWriter ts=10,1 epoch=0
----

new-request name=reqThreeKeyWriter txn=txnThreeKeyWriter ts=10,1
put key=k1 value=val1
put key=k2 value=val2
put key=k3 value=val3
----

new-request name=reqTwoKeyWaiter txn=txnWaiter ts=20,1
scan key=k1 endkey=k3
----

new-request name=reqThreeKeyWaiter txn=txnWaiter ts=20,1
scan key=k1 endkey=k4
----

sequence req=reqThreeKeyWriter
----
[6] sequence reqThreeKeyWriter: sequencing request
[6] sequence reqThreeKeyWriter: acquiring latches
[6] sequence reqThreeKeyWriter: scanning lock table for conflicting locks
[6] sequence reqThreeKeyWriter: sequencing complete, returned guard

on-lock-acquired req=reqThreeKeyWriter key=k1 dur=r
----
[-] acquire lock: txn 00000003 @ k1

on-lock-acquired req=reqThreeKeyWriter key=k2 dur=r
----
[-] acquire lock: txn 00000003 @ k2

on-lock-acquired req=reqThreeKeyWriter key=k3 dur=r
----
[-] acquire lock: txn 00000003 @ k3

finish req=reqThreeKeyWriter
----
[-] finish reqThreeKeyWriter: finishing request

debug-lock-table
----
global: num=0
local: num=0

sequence req=reqTwoKeyWaiter
----
[7] sequence reqTwoKeyWaiter: sequencing request
[7] sequence reqTwoKeyWaiter: acquiring latches
[7] sequence reqTwoKeyWaiter: scanning lock table for conflicting locks
[7] sequence reqTwoKeyWaiter: sequencing complete, returned guard

sequence req=reqThreeKeyWaiter
----
[8] sequence reqThreeKeyWaiter: sequencing request
[8] sequence reqThreeKeyWaiter: acquiring latches
[8] sequence reqThreeKeyWaiter: scanning lock table for conflicting locks
[8] sequence reqThreeKeyWaiter: sequencing complete, returned guard

# Simulate that the replicated locks were discovered, so they are added to the
# lock table.
handle-write-intent-error req=reqTwoKeyWaiter lease-seq=1
intent txn=txnThreeKeyWriter key=k1
intent txn=txnThreeKeyWriter key=k2
----
[9] handle write intent error reqTwoKeyWaiter: handled conflicting intents on "k1", "k2", released latches

sequence req=reqTwoKeyWaiter
----
[10] sequence reqTwoKeyWaiter: re-sequencing request
[10] sequence reqTwoKeyWaiter: acquiring latches
[10] sequence reqTwoKeyWaiter: scanning lock table for conflicting locks
[10] sequence reqTwoKeyWaiter: waiting in lock wait-queues
[10] sequence reqTwoKeyWaiter: lock wait-queue event: wait for (distinguished) txn 00000003 holding lock @ key "k1" (queuedWriters: 0, queuedReaders: 1)
[10] sequence reqTwoKeyWaiter: pushing after 0s for: liveness detection = true, deadlock detection = true, timeout enforcement = false, priority enforcement = false
[10] sequence reqTwoKeyWaiter: pushing timestamp of txn 00000003 above 20.000000000,1
[10] sequence reqTwoKeyWaiter: blocked on select in concurrency_test.(*cluster).PushTransaction

debug-lock-table
----
global: num=2
lock: "k1"
holder: txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
waiting readers:
req: 5, txn: 00000002-0000-0000-0000-000000000000
distinguished req: 5
lock: "k2"
holder: txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
local: num=0

# Simulate that the replicated locks were discovered, so they are added to the
# lock table. Keys "k1" and "k2" were previously discovered, but "k3" is new.
handle-write-intent-error req=reqThreeKeyWaiter lease-seq=1
intent txn=txnThreeKeyWriter key=k1
intent txn=txnThreeKeyWriter key=k2
intent txn=txnThreeKeyWriter key=k3
----
[11] handle write intent error reqThreeKeyWaiter: handled conflicting intents on "k1", "k2", "k3", released latches

sequence req=reqThreeKeyWaiter
----
[12] sequence reqThreeKeyWaiter: re-sequencing request
[12] sequence reqThreeKeyWaiter: acquiring latches
[12] sequence reqThreeKeyWaiter: scanning lock table for conflicting locks
[12] sequence reqThreeKeyWaiter: waiting in lock wait-queues
[12] sequence reqThreeKeyWaiter: lock wait-queue event: wait for txn 00000003 holding lock @ key "k1" (queuedWriters: 0, queuedReaders: 2)
[12] sequence reqThreeKeyWaiter: pushing after 0s for: liveness detection = false, deadlock detection = true, timeout enforcement = false, priority enforcement = false
[12] sequence reqThreeKeyWaiter: pushing timestamp of txn 00000003 above 20.000000000,1
[12] sequence reqThreeKeyWaiter: blocked on select in concurrency_test.(*cluster).PushTransaction

debug-lock-table
----
global: num=1
lock: "k1"
holder: txn: 00000003-0000-0000-0000-000000000000, ts: 10.000000000,1, info: repl epoch: 0, seqs: [0]
waiting readers:
req: 6, txn: 00000002-0000-0000-0000-000000000000
req: 5, txn: 00000002-0000-0000-0000-000000000000
distinguished req: 5
local: num=0

# Before #99635 was fixed, reqTwoKeyWaiter would move on to waiting on key k2
# and get stuck in lockTableWaiterImpl.WaitOn. Even after it resolved the intent
# at the pushed timestamp, its lock table guard would not be notified because it
# was enqueued in a leaked lock wait-queue (one attached to a lock that had been
# removed from the lockTable btree).
on-txn-updated txn=txnThreeKeyWriter status=pending ts=20,2
----
[-] update txn: increasing timestamp of txnThreeKeyWriter
[10] sequence reqTwoKeyWaiter: resolving intent "k1" for txn 00000003 with PENDING status
[10] sequence reqTwoKeyWaiter: lock wait-queue event: done waiting
[10] sequence reqTwoKeyWaiter: conflicted with 00000003-0000-0000-0000-000000000000 on "k1" for 0.000s
[10] sequence reqTwoKeyWaiter: acquiring latches
[10] sequence reqTwoKeyWaiter: scanning lock table for conflicting locks
[10] sequence reqTwoKeyWaiter: sequencing complete, returned guard
[12] sequence reqThreeKeyWaiter: resolving intent "k1" for txn 00000003 with PENDING status
[12] sequence reqThreeKeyWaiter: lock wait-queue event: done waiting
[12] sequence reqThreeKeyWaiter: conflicted with 00000003-0000-0000-0000-000000000000 on "k1" for 0.000s
[12] sequence reqThreeKeyWaiter: acquiring latches
[12] sequence reqThreeKeyWaiter: scanning lock table for conflicting locks
[12] sequence reqThreeKeyWaiter: sequencing complete, returned guard

finish req=reqTwoKeyWaiter
----
[-] finish reqTwoKeyWaiter: finishing request

finish req=reqThreeKeyWaiter
----
[-] finish reqThreeKeyWaiter: finishing request

reset
----

0 comments on commit 5c0078b

Please sign in to comment.