diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 31d0aa55b708..3938266afce7 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -59,6 +59,7 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; extern int zfs_dirty_data_sync_percent; +extern unsigned long zfs_txg_quiesce_advance; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; extern int zfs_delay_min_dirty_percent; @@ -171,7 +172,6 @@ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp); boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); -boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag); void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index fcdd9c296a86..73667915df0f 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -38,7 +38,6 @@ #include #include #include -#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); @@ -1056,9 +1055,6 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) txg_rele_to_quiesce(&tx->tx_txgh); - if (dsl_pool_need_dirty_sync(tx->tx_pool, tx->tx_txg)) { - txg_kick(tx->tx_pool, tx->tx_txg); - } return (0); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 42aff2b093cf..ce934c1889ee 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -111,6 +111,16 @@ int zfs_dirty_data_max_max_percent = 25; */ int zfs_dirty_data_sync_percent = 20; +/* + * The open txg can be quiesced into the pipeline even if there is a txg still + * syncing. When the dirty data in syncing txg is below + * zfs_txg_quiesce_advance, which also means the sync is about to complete, + * quiesce the open txg into the pipeline. 
+ * 0 means only quiesce the open txg when all the data in the previous txg + * is synced. + */ +unsigned long zfs_txg_quiesce_advance = 0; + /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. @@ -899,27 +909,32 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty; mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_total; + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); return (dirty > delay_min_bytes); } -boolean_t +static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - uint64_t dirty; + ASSERT(MUTEX_HELD(&dp->dp_lock)); + uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + uint64_t total = dp->dp_dirty_total; - mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; - mutex_exit(&dp->dp_lock); - - return (dirty > dirty_min_bytes); + /* + * Only quiesce a new transaction group when the previous sync is + * getting close to completion, so that quiescing completes just + * in time for it. That's the time when the dirty data in the + * syncing txg shrinks below zfs_txg_quiesce_advance. 
+ */ + return (dirty > dirty_min_bytes && + total - dirty <= zfs_txg_quiesce_advance); } void @@ -929,7 +944,11 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } @@ -949,7 +968,16 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; ASSERT3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); + + /* Assuming txg + 1 is in open stage, check if it needs to be synced. */ + boolean_t needsync = dsl_pool_need_dirty_sync(dp, txg + 1); mutex_exit(&dp->dp_lock); + /* + * Pass txg + 1 into txg_kick. Inside txg_kick(), it will kick only + * if txg + 1 is actually in open stage. + */ + if (needsync) + txg_kick(dp, txg + 1); } /* ARGSUSED */ @@ -1411,6 +1439,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); +ZFS_MODULE_PARAM(zfs, zfs_, txg_quiesce_advance, ULONG, ZMOD_RW, + "Threshold of the dirty data in syncing txg to quiesce open txg"); + ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, "How quickly delay approaches infinity"); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 53915feaed1d..17eef9f04741 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -531,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -543,8 +541,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= 
tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); @@ -557,9 +554,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. */ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { - if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) - tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (!txg_is_quiescing(dp)) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg + 1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; + cv_broadcast(&tx->tx_quiesce_more_cv); + } txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } @@ -779,6 +778,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) * If there isn't a txg quiescing in the pipeline, push the txg * through the pipeline by quiescing the open txg. * It is fine there is a txg still syncing. + * Pass in the txg number of the transaction that should be closed and synced. */ void txg_kick(dsl_pool_t *dp, uint64_t txg) @@ -787,8 +787,11 @@ txg_kick(dsl_pool_t *dp, uint64_t txg) ASSERT(!dsl_pool_config_held(dp)); + if (txg != tx->tx_open_txg || + tx->tx_quiesce_txg_waiting > tx->tx_open_txg) + return; + mutex_enter(&tx->tx_sync_lock); - txg = txg == 0 ? tx->tx_open_txg : txg; if (txg == tx->tx_open_txg && !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&