From 632a5f286f2b50844e41cf9d2023daead0ad9de3 Mon Sep 17 00:00:00 2001
From: jxdking
Date: Fri, 30 Apr 2021 19:55:51 +0000
Subject: [PATCH] Maximize txg size to improve throughput

Added a dp_dirty_peak field to the dsl_pool_t struct. It is used to
track peaks of dp_dirty_total precisely.

The following routine only applies under heavy workloads, when the
delay is kicked in. (Under light load, the txg kick is triggered by
zfs_dirty_data_sync_percent.) The desired txg size is derived from
dp_dirty_peak. An optimal txg size should meet the following criteria:
1. dp_dirty_peak should be able to fit 2 full txgs and 1 partial txg,
   in order to fully utilize the 3 stages of the pipeline.
2. The size of each txg should be as large as possible, to fully
   utilize each txg.

Some other style fixes regarding code review.

Signed-off-by: jxdking
---
 include/sys/dsl_pool.h |  1 +
 module/zfs/dmu_tx.c    |  3 +--
 module/zfs/dsl_pool.c  | 47 +++++++++++++++++++++++++++++++++++++++---
 module/zfs/txg.c       |  3 ++-
 4 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 31d0aa55b708..7c656bf6612f 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -114,6 +114,7 @@ typedef struct dsl_pool {
 	kcondvar_t dp_spaceavail_cv;
 	uint64_t dp_dirty_pertxg[TXG_SIZE];
 	uint64_t dp_dirty_total;
+	uint64_t dp_dirty_peak;	/* historical peak of dp_dirty_total */
 	uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
 	uint64_t dp_mos_used_delta;
 	uint64_t dp_mos_compressed_delta;
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index fcdd9c296a86..5c166afe57b7 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1056,9 +1056,8 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
-	if (dsl_pool_need_dirty_sync(tx->tx_pool, tx->tx_txg)) {
+	if (dsl_pool_need_dirty_sync(tx->tx_pool, tx->tx_txg))
 		txg_kick(tx->tx_pool, tx->tx_txg);
-	}
 
 	return (0);
 }
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 42aff2b093cf..e04c32030883 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -818,6 +818,16 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
+
+	mutex_enter(&dp->dp_lock);
+	/*
+	 * The absolute peak of dp_dirty_total since the pool was loaded
+	 * may not represent the current workload.
+	 * Fade off the dp_dirty_peak value a little each time a txg is
+	 * synced, so that it keeps reflecting the current workload.
+	 */
+	dp->dp_dirty_peak -= dp->dp_dirty_peak / 128;
+	mutex_exit(&dp->dp_lock);
 }
 
 /*
@@ -911,15 +921,45 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 boolean_t
 dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
 {
-	uint64_t dirty;
 	uint64_t dirty_min_bytes =
 	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
 
 	mutex_enter(&dp->dp_lock);
-	dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+	uint64_t total = dp->dp_dirty_total;
+	uint64_t peak = dp->dp_dirty_peak;
 	mutex_exit(&dp->dp_lock);
 
-	return (dirty > dirty_min_bytes);
+	if (dirty < dirty_min_bytes)
+		return (0);
+
+	if (dirty == total) {
+		/*
+		 * All dirty data is in the current txg, which suggests
+		 * there is no dirty data quiescing or syncing.
+		 * Since the dirty data is no less than dirty_min_bytes,
+		 * the current txg should be synced.
+		 */
+		return (1);
+	}
+
+	/*
+	 * Ensure target_txg_size is no less than half of the
+	 * zfs_delay_min_dirty_percent threshold.
+	 */
+	uint64_t target_txg_size =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100 / 2;
+	/*
+	 * The peak suggests the largest possible dp_dirty_total that
+	 * the pool can grow to.
+	 * We want to maximize target_txg_size to fully utilize
+	 * each txg, and also want to fit 2 full txgs and 1 partial
+	 * txg in the peak to fully utilize the pipeline.
+	 * Choosing a target_txg_size slightly below 50% of the peak
+	 * should be about right.
+	 */
+	target_txg_size = MAX(peak * 48 / 100, target_txg_size);
+	return (dirty >= target_txg_size);
 }
 
 void
@@ -948,6 +988,7 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 	ASSERT3U(dp->dp_dirty_total, >=, space);
+	dp->dp_dirty_peak = MAX(dp->dp_dirty_peak, dp->dp_dirty_total);
 	dsl_pool_dirty_delta(dp, -space);
 	mutex_exit(&dp->dp_lock);
 }
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 53915feaed1d..d7977e4393fc 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -779,6 +779,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
 * If there isn't a txg quiescing in the pipeline, push the txg
 * through the pipeline by quiescing the open txg.
 * It is fine there is a txg still syncing.
+ * Pass in the txg number of the transaction group to close and sync.
 */
 void
 txg_kick(dsl_pool_t *dp, uint64_t txg)
@@ -788,7 +789,7 @@ txg_kick(dsl_pool_t *dp, uint64_t txg)
 	ASSERT(!dsl_pool_config_held(dp));
 
 	mutex_enter(&tx->tx_sync_lock);
-	txg = txg == 0 ? tx->tx_open_txg : txg;
+	txg = (txg == 0 ? tx->tx_open_txg : txg);
 	if (txg == tx->tx_open_txg &&
 	    !txg_is_quiescing(dp) &&
 	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
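
For reference, the arithmetic behind the target_txg_size heuristic can be
sketched as a small standalone C program. The inputs below (a 4 GiB
zfs_dirty_data_max, zfs_delay_min_dirty_percent of 60, and the dp_dirty_peak
figure) are illustrative assumptions, not values taken from this patch.

/*
 * Standalone sketch of the target_txg_size calculation used in
 * dsl_pool_need_dirty_sync() above.  All inputs are assumed values,
 * chosen only to show the arithmetic.
 */
#include <stdio.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assumed: 4 GiB */
	uint64_t zfs_delay_min_dirty_percent = 60;	/* assumed default */
	uint64_t dp_dirty_peak = 3ULL << 30;		/* hypothetical peak */

	/* Floor: half of the dirty level at which the write delay starts. */
	uint64_t target_txg_size =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100 / 2;

	/*
	 * Raise the target to just under half of the observed peak, so
	 * roughly 2 full txgs and 1 partial txg fit in dp_dirty_peak.
	 */
	target_txg_size = MAX(dp_dirty_peak * 48 / 100, target_txg_size);

	printf("target txg size: %llu bytes\n",
	    (unsigned long long)target_txg_size);
	return (0);
}

With these numbers the peak-derived term (about 1.4 GiB) exceeds the
delay-derived floor (about 1.2 GiB), so under this workload a txg would be
kicked once roughly 48% of the recent peak has accumulated in the open txg.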