-
Notifications
You must be signed in to change notification settings - Fork 9.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
*: allow fully concurrent large read #9384
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,6 +49,11 @@ var ( | |
) | ||
|
||
type Backend interface { | ||
// CommittedReadTx returns a non-blocking read tx that is suitable for large reads. | ||
// CommittedReadTx call itself will not return until the current BatchTx gets committed to | ||
// ensure consistency. | ||
CommittedReadTx() ReadTx | ||
|
||
ReadTx() ReadTx | ||
BatchTx() BatchTx | ||
|
||
|
@@ -97,6 +102,8 @@ type backend struct { | |
|
||
readTx *readTx | ||
|
||
concurrentReadTxCh chan chan ReadTx | ||
|
||
stopc chan struct{} | ||
donec chan struct{} | ||
|
||
|
@@ -165,6 +172,8 @@ func newBackend(bcfg BackendConfig) *backend { | |
buckets: make(map[string]*bolt.Bucket), | ||
}, | ||
|
||
concurrentReadTxCh: make(chan chan ReadTx), | ||
|
||
stopc: make(chan struct{}), | ||
donec: make(chan struct{}), | ||
|
||
|
@@ -184,6 +193,12 @@ func (b *backend) BatchTx() BatchTx { | |
|
||
func (b *backend) ReadTx() ReadTx { return b.readTx } | ||
|
||
func (b *backend) CommittedReadTx() ReadTx { | ||
rch := make(chan ReadTx) | ||
b.concurrentReadTxCh <- rch | ||
return <-rch | ||
} | ||
|
||
// ForceCommit forces the current batching tx to commit. | ||
func (b *backend) ForceCommit() { | ||
b.batchTx.Commit() | ||
|
@@ -301,6 +316,25 @@ func (b *backend) run() { | |
b.batchTx.Commit() | ||
} | ||
t.Reset(b.batchInterval) | ||
b.createConcurrentReadTxs() | ||
} | ||
} | ||
|
||
func (b *backend) createConcurrentReadTxs() { | ||
// do not allow too many concurrent read txs. | ||
// TODO: improve this by having a global pending counter? | ||
for i := 0; i < 100; i++ { | ||
select { | ||
case rch := <-b.concurrentReadTxCh: | ||
rtx, err := b.db.Begin(false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lazily create tx following each commit? |
||
if err != nil { | ||
plog.Fatalf("cannot begin read tx (%s)", err) | ||
} | ||
rch <- &concurrentReadTx{tx: rtx} | ||
default: | ||
// no more to create. | ||
return | ||
} | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// Copyright 2018 The etcd Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package backend | ||
|
||
import ( | ||
bolt "github.com/coreos/bbolt" | ||
) | ||
|
||
type concurrentReadTx struct { | ||
tx *bolt.Tx | ||
} | ||
|
||
func (rt *concurrentReadTx) Lock() {} | ||
func (rt *concurrentReadTx) Unlock() { rt.tx.Rollback() } | ||
|
||
func (rt *concurrentReadTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) { | ||
bucket := rt.tx.Bucket(bucketName) | ||
if bucket == nil { | ||
plog.Fatalf("bucket %s does not exist", bucketName) | ||
} | ||
return unsafeRange(bucket.Cursor(), key, endKey, limit) | ||
} | ||
|
||
func (rt *concurrentReadTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error { | ||
return unsafeForEach(rt.tx, bucketName, visitor) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,10 +91,11 @@ func (ti *treeIndex) keyIndex(keyi *keyIndex) *keyIndex { | |
func (ti *treeIndex) visit(key, end []byte, f func(ki *keyIndex)) { | ||
keyi, endi := &keyIndex{key: key}, &keyIndex{key: end} | ||
|
||
ti.RLock() | ||
defer ti.RUnlock() | ||
ti.Lock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jingyih i think this improvement is still needed? can you make a separate PR for this change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good. |
||
clone := ti.tree.Clone() | ||
ti.Unlock() | ||
|
||
ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool { | ||
clone.AscendGreaterOrEqual(keyi, func(item btree.Item) bool { | ||
if len(endi.key) > 0 && !item.Less(endi) { | ||
return false | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,9 +21,21 @@ import ( | |
"go.uber.org/zap" | ||
) | ||
|
||
const ( | ||
expensiveReadLimit = 1000 | ||
readonly = true | ||
readwrite = false | ||
) | ||
|
||
type storeTxnRead struct { | ||
s *store | ||
tx backend.ReadTx | ||
s *store | ||
tx backend.ReadTx | ||
txlocked bool | ||
|
||
// for creating concurrent read tx when the read is expensive. | ||
b backend.Backend | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it looks like there should be something like |
||
// is the transaction readonly? | ||
ro bool | ||
|
||
firstRev int64 | ||
rev int64 | ||
|
@@ -33,10 +45,15 @@ func (s *store) Read() TxnRead { | |
s.mu.RLock() | ||
tx := s.b.ReadTx() | ||
s.revMu.RLock() | ||
tx.Lock() | ||
firstRev, rev := s.compactMainRev, s.currentRev | ||
s.revMu.RUnlock() | ||
return newMetricsTxnRead(&storeTxnRead{s, tx, firstRev, rev}) | ||
return newMetricsTxnRead(&storeTxnRead{ | ||
s: s, | ||
tx: tx, | ||
b: s.b, | ||
ro: readonly, | ||
firstRev: firstRev, | ||
rev: rev}) | ||
} | ||
|
||
func (tr *storeTxnRead) FirstRev() int64 { return tr.firstRev } | ||
|
@@ -47,7 +64,9 @@ func (tr *storeTxnRead) Range(key, end []byte, ro RangeOptions) (r *RangeResult, | |
} | ||
|
||
func (tr *storeTxnRead) End() { | ||
tr.tx.Unlock() | ||
if tr.txlocked { | ||
tr.tx.Unlock() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this be better encapsulated? Could |
||
} | ||
tr.s.mu.RUnlock() | ||
} | ||
|
||
|
@@ -64,10 +83,15 @@ func (s *store) Write() TxnWrite { | |
tx := s.b.BatchTx() | ||
tx.Lock() | ||
tw := &storeTxnWrite{ | ||
storeTxnRead: storeTxnRead{s, tx, 0, 0}, | ||
tx: tx, | ||
beginRev: s.currentRev, | ||
changes: make([]mvccpb.KeyValue, 0, 4), | ||
storeTxnRead: storeTxnRead{ | ||
s: s, | ||
txlocked: true, | ||
tx: tx, | ||
ro: readwrite, | ||
}, | ||
tx: tx, | ||
beginRev: s.currentRev, | ||
changes: make([]mvccpb.KeyValue, 0, 4), | ||
} | ||
return newMetricsTxnWrite(tw) | ||
} | ||
|
@@ -134,6 +158,15 @@ func (tr *storeTxnRead) rangeKeys(key, end []byte, curRev int64, ro RangeOptions | |
limit = len(revpairs) | ||
} | ||
|
||
if limit > expensiveReadLimit && !tr.txlocked && tr.ro { // first expensive read in a read-only transaction | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. e.g., |
||
// too many keys to range. upgrade the read transaction to concurrent read tx. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As an alternative to upgrading the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nvm, once we perform a |
||
tr.tx = tr.b.CommittedReadTx() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Traced through the code. The original tx is manged by the backend and is rolled back as usual when it finishes the current batch iteration. This looks like it works correctly. |
||
} | ||
if !tr.txlocked { | ||
tr.tx.Lock() | ||
tr.txlocked = true | ||
} | ||
|
||
kvs := make([]mvccpb.KeyValue, limit) | ||
revBytes := newRevBytes() | ||
for i, revpair := range revpairs[:len(kvs)] { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
May want to avoid O(n) broadcast here; can do O(1) by returning
<-chan struct{}
that closes on the next commit, wait on channel close, fetch the latest committed tx, then acquire some semaphore to limit concurrency. Could return a closed channel if the backend is already fully committed, so no need to wait on commit timer.