From 7149270521355d7b9369ac4cb4db4d849010066c Mon Sep 17 00:00:00 2001 From: Xiang Date: Thu, 22 Jun 2017 12:46:56 -0700 Subject: [PATCH] *: add option to skip freelist sync When the database has a lot of freepages, the cost to sync all freepages down to disk is high. If the total database size is small (<10GB), and the application can tolerate ~10 seconds recovery time, then it is reasonable to simply not sync freelist and rescan the db to rebuild freelist on recovery. --- db.go | 53 +++++++++- db_test.go | 33 +++++- freelist.go | 6 ++ simulation_no_freelist_sync_test.go | 47 +++++++++ simulation_test.go | 151 +++++++++++++++------------- tx.go | 56 +++++++---- 6 files changed, 247 insertions(+), 99 deletions(-) create mode 100644 simulation_no_freelist_sync_test.go diff --git a/db.go b/db.go index f352ff14f..dd391a230 100644 --- a/db.go +++ b/db.go @@ -61,6 +61,11 @@ type DB struct { // THIS IS UNSAFE. PLEASE USE WITH CAUTION. NoSync bool + // When true, skips syncing freelist to disk. This improves the database + // write performance under normal operation, but requires a full database + // re-sync during recovery. + NoFreelistSync bool + // When true, skips the truncate call when growing the database. // Setting this to true is only safe on non-ext3/ext4 systems. // Skipping truncation avoids preallocation of hard drive space and @@ -156,6 +161,7 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) { } db.NoGrowSync = options.NoGrowSync db.MmapFlags = options.MmapFlags + db.NoFreelistSync = options.NoFreelistSync // Set default values for later DB operations. db.MaxBatchSize = DefaultMaxBatchSize @@ -232,9 +238,14 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) { return nil, err } - // Read in the freelist. - db.freelist = newFreelist() - db.freelist.read(db.page(db.meta().freelist)) + if db.NoFreelistSync { + db.freelist = newFreelist() + db.freelist.readIDs(db.freepages()) + } else { + // Read in the freelist. 
+ db.freelist = newFreelist() + db.freelist.read(db.page(db.meta().freelist)) + } // Mark the database as opened and return. return db, nil @@ -893,6 +904,38 @@ func (db *DB) IsReadOnly() bool { return db.readOnly } +func (db *DB) freepages() []pgid { + tx, err := db.beginTx() + defer func() { + err = tx.Rollback() + if err != nil { + panic("freepages: failed to rollback tx") + } + }() + if err != nil { + panic("freepages: failed to open read only tx") + } + + reachable := make(map[pgid]*page) + nofreed := make(map[pgid]bool) + ech := make(chan error) + go func() { + for e := range ech { + panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e)) + } + }() + tx.checkBucket(&tx.root, reachable, nofreed, ech) + close(ech) + + var fids []pgid + for i := pgid(2); i < db.meta().pgid; i++ { + if _, ok := reachable[i]; !ok { + fids = append(fids, i) + } + } + return fids +} + // Options represents the options that can be set when opening a database. type Options struct { // Timeout is the amount of time to wait to obtain a file lock. @@ -903,6 +946,10 @@ type Options struct { // Sets the DB.NoGrowSync flag before memory mapping the file. NoGrowSync bool + // Do not sync freelist to disk. This improves the database write performance + // under normal operation, but requires a full database re-sync during recovery. + NoFreelistSync bool + // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to // grab a shared lock (UNIX). ReadOnly bool diff --git a/db_test.go b/db_test.go index ad0e3d47e..7a9afc276 100644 --- a/db_test.go +++ b/db_test.go @@ -1366,15 +1366,35 @@ func validateBatchBench(b *testing.B, db *DB) { // DB is a test wrapper for bolt.DB. type DB struct { *bolt.DB + f string + o *bolt.Options } // MustOpenDB returns a new, open DB at a temporary location. 
func MustOpenDB() *DB { - db, err := bolt.Open(tempfile(), 0666, nil) + f := tempfile() + db, err := bolt.Open(f, 0666, nil) if err != nil { panic(err) } - return &DB{db} + return &DB{ + DB: db, + f: f, + } +} + +// MustOpenWithOption returns a new, open DB at a temporary location with given options. +func MustOpenWithOption(o *bolt.Options) *DB { + f := tempfile() + db, err := bolt.Open(f, 0666, o) + if err != nil { + panic(err) + } + return &DB{ + DB: db, + f: f, + o: o, + } } // Close closes the database and deletes the underlying file. @@ -1399,6 +1419,15 @@ func (db *DB) MustClose() { } } +// MustReopen reopens the database. Panics on error. +func (db *DB) MustReopen() { + indb, err := bolt.Open(db.f, 0666, db.o) + if err != nil { + panic(err) + } + db.DB = indb +} + // PrintStats prints the database stats func (db *DB) PrintStats() { var stats = db.Stats() diff --git a/freelist.go b/freelist.go index aba48f58c..f59dc0747 100644 --- a/freelist.go +++ b/freelist.go @@ -185,6 +185,12 @@ func (f *freelist) read(p *page) { f.reindex() } +// readIDs initializes the freelist from a given list of ids. +func (f *freelist) readIDs(ids []pgid) { + f.ids = ids + f.reindex() +} + // write writes the page ids onto a freelist page. All free and pending ids are // saved to disk since in the event of a program crash, all pending ids will // become free. 
diff --git a/simulation_no_freelist_sync_test.go b/simulation_no_freelist_sync_test.go new file mode 100644 index 000000000..da2031eed --- /dev/null +++ b/simulation_no_freelist_sync_test.go @@ -0,0 +1,47 @@ +package bolt_test + +import ( + "testing" + + "github.com/coreos/bbolt" +) + +func TestSimulateNoFreeListSync_1op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1, 1) +} +func TestSimulateNoFreeListSync_10op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10, 1) +} +func TestSimulateNoFreeListSync_100op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 100, 1) +} +func TestSimulateNoFreeListSync_1000op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1000, 1) +} +func TestSimulateNoFreeListSync_10000op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 1) +} +func TestSimulateNoFreeListSync_10op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10, 10) +} +func TestSimulateNoFreeListSync_100op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 100, 10) +} +func TestSimulateNoFreeListSync_1000op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1000, 10) +} +func TestSimulateNoFreeListSync_10000op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 10) +} +func TestSimulateNoFreeListSync_100op_100p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 100, 100) +} +func TestSimulateNoFreeListSync_1000op_100p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1000, 100) +} +func TestSimulateNoFreeListSync_10000op_100p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 100) +} +func TestSimulateNoFreeListSync_10000op_1000p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 1000) +} diff --git 
a/simulation_test.go b/simulation_test.go index e62415306..a5889c02f 100644 --- a/simulation_test.go +++ b/simulation_test.go @@ -10,25 +10,25 @@ import ( "github.com/coreos/bbolt" ) -func TestSimulate_1op_1p(t *testing.T) { testSimulate(t, 1, 1) } -func TestSimulate_10op_1p(t *testing.T) { testSimulate(t, 10, 1) } -func TestSimulate_100op_1p(t *testing.T) { testSimulate(t, 100, 1) } -func TestSimulate_1000op_1p(t *testing.T) { testSimulate(t, 1000, 1) } -func TestSimulate_10000op_1p(t *testing.T) { testSimulate(t, 10000, 1) } +func TestSimulate_1op_1p(t *testing.T) { testSimulate(t, nil, 1, 1, 1) } +func TestSimulate_10op_1p(t *testing.T) { testSimulate(t, nil, 1, 10, 1) } +func TestSimulate_100op_1p(t *testing.T) { testSimulate(t, nil, 1, 100, 1) } +func TestSimulate_1000op_1p(t *testing.T) { testSimulate(t, nil, 1, 1000, 1) } +func TestSimulate_10000op_1p(t *testing.T) { testSimulate(t, nil, 1, 10000, 1) } -func TestSimulate_10op_10p(t *testing.T) { testSimulate(t, 10, 10) } -func TestSimulate_100op_10p(t *testing.T) { testSimulate(t, 100, 10) } -func TestSimulate_1000op_10p(t *testing.T) { testSimulate(t, 1000, 10) } -func TestSimulate_10000op_10p(t *testing.T) { testSimulate(t, 10000, 10) } +func TestSimulate_10op_10p(t *testing.T) { testSimulate(t, nil, 1, 10, 10) } +func TestSimulate_100op_10p(t *testing.T) { testSimulate(t, nil, 1, 100, 10) } +func TestSimulate_1000op_10p(t *testing.T) { testSimulate(t, nil, 1, 1000, 10) } +func TestSimulate_10000op_10p(t *testing.T) { testSimulate(t, nil, 1, 10000, 10) } -func TestSimulate_100op_100p(t *testing.T) { testSimulate(t, 100, 100) } -func TestSimulate_1000op_100p(t *testing.T) { testSimulate(t, 1000, 100) } -func TestSimulate_10000op_100p(t *testing.T) { testSimulate(t, 10000, 100) } +func TestSimulate_100op_100p(t *testing.T) { testSimulate(t, nil, 1, 100, 100) } +func TestSimulate_1000op_100p(t *testing.T) { testSimulate(t, nil, 1, 1000, 100) } +func TestSimulate_10000op_100p(t *testing.T) { testSimulate(t, 
nil, 1, 10000, 100) } -func TestSimulate_10000op_1000p(t *testing.T) { testSimulate(t, 10000, 1000) } +func TestSimulate_10000op_1000p(t *testing.T) { testSimulate(t, nil, 1, 10000, 1000) } // Randomly generate operations on a given database with multiple clients to ensure consistency and thread safety. -func testSimulate(t *testing.T, threadCount, parallelism int) { +func testSimulate(t *testing.T, openOption *bolt.Options, round, threadCount, parallelism int) { if testing.Short() { t.Skip("skipping test in short mode.") } @@ -42,81 +42,88 @@ func testSimulate(t *testing.T, threadCount, parallelism int) { var versions = make(map[int]*QuickDB) versions[1] = NewQuickDB() - db := MustOpenDB() + db := MustOpenWithOption(openOption) defer db.MustClose() var mutex sync.Mutex // Run n threads in parallel, each with their own operation. var wg sync.WaitGroup - var threads = make(chan bool, parallelism) - var i int - for { - threads <- true - wg.Add(1) - writable := ((rand.Int() % 100) < 20) // 20% writers - - // Choose an operation to execute. - var handler simulateHandler - if writable { - handler = writerHandlers[rand.Intn(len(writerHandlers))] - } else { - handler = readerHandlers[rand.Intn(len(readerHandlers))] - } - - // Execute a thread for the given operation. - go func(writable bool, handler simulateHandler) { - defer wg.Done() - // Start transaction. - tx, err := db.Begin(writable) - if err != nil { - t.Fatal("tx begin: ", err) - } + for n := 0; n < round; n++ { - // Obtain current state of the dataset. - mutex.Lock() - var qdb = versions[tx.ID()] - if writable { - qdb = versions[tx.ID()-1].Copy() - } - mutex.Unlock() + var threads = make(chan bool, parallelism) + var i int + for { + threads <- true + wg.Add(1) + writable := ((rand.Int() % 100) < 20) // 20% writers - // Make sure we commit/rollback the tx at the end and update the state. + // Choose an operation to execute. 
+ var handler simulateHandler if writable { - defer func() { - mutex.Lock() - versions[tx.ID()] = qdb - mutex.Unlock() - - if err := tx.Commit(); err != nil { - t.Fatal(err) - } - }() + handler = writerHandlers[rand.Intn(len(writerHandlers))] } else { - defer func() { _ = tx.Rollback() }() + handler = readerHandlers[rand.Intn(len(readerHandlers))] } - // Ignore operation if we don't have data yet. - if qdb == nil { - return + // Execute a thread for the given operation. + go func(writable bool, handler simulateHandler) { + defer wg.Done() + + // Start transaction. + tx, err := db.Begin(writable) + if err != nil { + t.Fatal("tx begin: ", err) + } + + // Obtain current state of the dataset. + mutex.Lock() + var qdb = versions[tx.ID()] + if writable { + qdb = versions[tx.ID()-1].Copy() + } + mutex.Unlock() + + // Make sure we commit/rollback the tx at the end and update the state. + if writable { + defer func() { + mutex.Lock() + versions[tx.ID()] = qdb + mutex.Unlock() + + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + }() + } else { + defer func() { _ = tx.Rollback() }() + } + + // Ignore operation if we don't have data yet. + if qdb == nil { + return + } + + // Execute handler. + handler(tx, qdb) + + // Release a thread back to the scheduling loop. + <-threads + }(writable, handler) + + i++ + if i > threadCount { + break } + } - // Execute handler. - handler(tx, qdb) - - // Release a thread back to the scheduling loop. - <-threads - }(writable, handler) + // Wait until all threads are done. + wg.Wait() - i++ - if i > threadCount { - break - } + db.MustClose() + db.MustReopen() } - - // Wait until all threads are done. - wg.Wait() } type simulateHandler func(tx *bolt.Tx, qdb *QuickDB) diff --git a/tx.go b/tx.go index 6700308a2..97083142b 100644 --- a/tx.go +++ b/tx.go @@ -169,26 +169,9 @@ func (tx *Tx) Commit() error { // Free the old root bucket. 
tx.meta.root.root = tx.root.root - opgid := tx.meta.pgid - - // Free the freelist and allocate new pages for it. This will overestimate - // the size of the freelist but not underestimate the size (which would be bad). - tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) - p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1) - if err != nil { - tx.rollback() - return err - } - if err := tx.db.freelist.write(p); err != nil { - tx.rollback() - return err - } - tx.meta.freelist = p.id - - // If the high water mark has moved up then attempt to grow the database. - if tx.meta.pgid > opgid { - if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { - tx.rollback() + if !tx.db.NoFreelistSync { + err := tx.commitFreelist() + if err != nil { return err } } @@ -235,6 +218,33 @@ func (tx *Tx) Commit() error { return nil } +func (tx *Tx) commitFreelist() error { + opgid := tx.meta.pgid + + // Free the freelist and allocate new pages for it. This will overestimate + // the size of the freelist but not underestimate the size (which would be bad). + tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) + p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1) + if err != nil { + tx.rollback() + return err + } + if err := tx.db.freelist.write(p); err != nil { + tx.rollback() + return err + } + tx.meta.freelist = p.id + // If the high water mark has moved up then attempt to grow the database. + if tx.meta.pgid > opgid { + if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { + tx.rollback() + return err + } + } + + return nil +} + // Rollback closes the transaction and ignores all previous updates. Read-only // transactions must be rolled back and not committed. 
func (tx *Tx) Rollback() error { @@ -394,8 +404,10 @@ func (tx *Tx) check(ch chan error) { reachable := make(map[pgid]*page) reachable[0] = tx.page(0) // meta0 reachable[1] = tx.page(1) // meta1 - for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { - reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) + if !tx.DB().NoFreelistSync { + for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { + reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) + } } // Recursively check buckets.