diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 8249bb8fc633..3938266afce7 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -59,6 +59,7 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; extern int zfs_dirty_data_sync_percent; +extern unsigned long zfs_txg_quiesce_advance; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; extern int zfs_delay_min_dirty_percent; diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e6..f38f0006c040 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -78,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); +extern void txg_kick(struct dsl_pool *dp, uint64_t txg); /* * Wait until the given transaction group has finished syncing. diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index c770eafa75d8..ce934c1889ee 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -111,6 +111,16 @@ int zfs_dirty_data_max_max_percent = 25; */ int zfs_dirty_data_sync_percent = 20; +/* + * The open txg can be quiesced into the pipeline even if there is a txg still + * syncing. When the dirty data in syncing txg is below + * zfs_txg_quiesce_advance, which also means the sync is about to complete, + * quiesce the open txg into the pipeline. + * 0 means only quiesce the open txg when all the data in the previous txg + * is synced. + */ +unsigned long zfs_txg_quiesce_advance = 0; + /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction.
@@ -899,18 +909,34 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; - uint64_t dirty; mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_total; + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - if (dirty > dirty_min_bytes) - txg_kick(dp); + return (dirty > delay_min_bytes); } +static boolean_t +dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + uint64_t total = dp->dp_dirty_total; + + /* + * Only quiesce new transaction group when previous syncing is + * getting close to completion, so that quiescing completed just + * in time for it. That's the time when the dirty data in + * syncing txg shrinks below zfs_txg_quiesce_advance. + */ + return (dirty > dirty_min_bytes && + total - dirty <= zfs_txg_quiesce_advance); +} + void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { @@ -918,7 +944,11 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } @@ -938,7 +968,16 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; ASSERT3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); + + /* Assuming txg + 1 is in open stage, check if it needs to be synced. */ + boolean_t needsync = dsl_pool_need_dirty_sync(dp, txg + 1); mutex_exit(&dp->dp_lock); + /* + * Pass txg + 1 into txg_kick. 
Inside txg_kick(), it will kick only + * if txg + 1 is actually in open stage. + */ + if (needsync) + txg_kick(dp, txg + 1); } /* ARGSUSED */ @@ -1400,6 +1439,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); +ZFS_MODULE_PARAM(zfs, zfs_, txg_quiesce_advance, ULONG, ZMOD_RW, + "Threshold of the dirty data in syncing txg to quiesce open txg"); + ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, "How quickly delay approaches infinity"); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 497e19dd58eb..17eef9f04741 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -498,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - static boolean_t txg_is_quiescing(dsl_pool_t *dp) { @@ -539,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -551,8 +541,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); @@ -565,9 +554,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. 
*/ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { - if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) - tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (!txg_is_quiescing(dp)) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg + 1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; + cv_broadcast(&tx->tx_quiesce_more_cv); + } txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } @@ -784,22 +775,27 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) } /* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by quiescing the open txg. + * If there isn't a txg quiescing in the pipeline, push the txg + * through the pipeline by quiescing the open txg. + * It is fine if there is a txg still syncing. + * Pass in the txg number of the transaction that should be closed and synced. */ void -txg_kick(dsl_pool_t *dp) +txg_kick(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); + if (txg != tx->tx_open_txg || + tx->tx_quiesce_txg_waiting > tx->tx_open_txg) + return; + mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && + if (txg == tx->tx_open_txg && !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { + tx->tx_sync_txg_waiting <= tx->tx_synced_txg) { tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; cv_broadcast(&tx->tx_quiesce_more_cv); }