From e7e717b9b726c4540b795f6b5b02695e2d294e3b Mon Sep 17 00:00:00 2001
From: jxdking
Date: Fri, 16 Apr 2021 18:48:20 +0000
Subject: [PATCH 1/2] Add Module Parameters Regarding Log Size Limit

zfs_wrlog_data_max
The upper limit of TX_WRITE log data.
Once it is reached, write operations are blocked until log data is
cleared out after txg sync.
It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.

Add a write-transaction log data counter at the end of the bodies of
zfs_log_write() and zvol_log_write().

Add delay logic to dmu_tx_try_assign().

Signed-off-by: jxdking
---
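Illustration, not part of the diff: the accounting described above can be
modeled outside the kernel.  The sketch below mirrors dsl_pool_wrlog_count(),
dsl_pool_wrlog_over_max() and dsl_pool_wrlog_clear() with plain C11 atomics
standing in for ZFS's aggsum_t scalable counters; the names and the toy
limit are illustrative only.

/*
 * Stand-alone model of the per-txg write-log accounting (C11).
 * Build with: cc -std=c11 wrlog_model.c
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define	TXG_SIZE	4		/* txgs in flight, as in ZFS */
#define	TXG_MASK	(TXG_SIZE - 1)

static atomic_int_least64_t wrlog_pertxg[TXG_SIZE];
static atomic_int_least64_t wrlog_total;
static int64_t wrlog_data_max = 1024;	/* tiny limit, demo only */

/* Log-write path: charge this txg's bucket and the pool-wide total. */
static void
wrlog_count(uint64_t txg, int64_t size)
{
	atomic_fetch_add(&wrlog_pertxg[txg & TXG_MASK], size);
	atomic_fetch_add(&wrlog_total, size);
}

/* Tx-assign path: writers must block while the total is over the max. */
static int
wrlog_over_max(void)
{
	return (atomic_load(&wrlog_total) > wrlog_data_max);
}

/* Sync path: a txg has synced, return its charge to the pool. */
static void
wrlog_clear(uint64_t txg)
{
	int64_t v = atomic_exchange(&wrlog_pertxg[txg & TXG_MASK], 0);

	atomic_fetch_sub(&wrlog_total, v);
}

int
main(void)
{
	wrlog_count(7, 900);
	wrlog_count(8, 300);		/* total 1200 > 1024 */
	printf("over max: %d\n", wrlog_over_max());	/* 1: throttled */
	wrlog_clear(7);			/* txg 7 syncs, 900 bytes drain */
	printf("over max: %d\n", wrlog_over_max());	/* 0: resumed */
	return (0);
}

One deviation worth noting: dsl_pool_wrlog_clear() in the patch drains a
bucket by adding the negated aggsum_value() in two steps rather than one
atomic exchange.  That should be safe there because new counts only land in
the open txg's bucket, while clearing is done on the bucket of a txg that
has already synced.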
 include/sys/dsl_pool.h |  7 ++++++
 module/zfs/arc.c       | 12 +++++++++
 module/zfs/dmu_tx.c    |  5 ++++
 module/zfs/dsl_pool.c  | 55 ++++++++++++++++++++++++++++++++++++++++++
 module/zfs/zfs_log.c   |  5 ++++
 module/zfs/zvol.c      |  7 ++++--
 6 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h
index 8249bb8fc633..44900f8ceb2f 100644
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@@ -40,6 +40,7 @@
 #include <sys/rrwlock.h>
 #include <sys/dsl_synctask.h>
 #include <sys/mmp.h>
+#include <sys/aggsum.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -58,6 +59,7 @@ struct dsl_deadlist;
 
 extern unsigned long zfs_dirty_data_max;
 extern unsigned long zfs_dirty_data_max_max;
+extern unsigned long zfs_wrlog_data_max;
 extern int zfs_dirty_data_sync_percent;
 extern int zfs_dirty_data_max_percent;
 extern int zfs_dirty_data_max_max_percent;
@@ -119,6 +121,9 @@ typedef struct dsl_pool {
 	uint64_t dp_mos_compressed_delta;
 	uint64_t dp_mos_uncompressed_delta;
 
+	aggsum_t dp_wrlog_pertxg[TXG_SIZE];
+	aggsum_t dp_wrlog_total;
+
 	/*
 	 * Time of most recently scheduled (furthest in the future)
 	 * wakeup for delayed transactions.
@@ -158,6 +163,8 @@ int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
 uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
     zfs_space_check_t slop_policy);
+void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
+boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
 void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 394ca1bfe42d..606774a3b46c 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -7969,6 +7969,18 @@ arc_init(void)
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
+
+	if (zfs_wrlog_data_max == 0) {
+
+		/*
+		 * dp_wrlog_total is reduced for each txg at the end of
+		 * spa_sync(). However, dp_dirty_total is reduced every time
+		 * a block being written out. Thus under normal operation,
+		 * dp_wrlog_total could grow 2 times as big as
+		 * zfs_dirty_data_max.
+		 */
+		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
+	}
 }
 
 void
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 0beb983f992f..f9f35b7cd5c2 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -884,6 +884,11 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 		return (SET_ERROR(ERESTART));
 	}
 
+	if (!tx->tx_dirty_delayed &&
+	    dsl_pool_wrlog_over_max(tx->tx_pool)) {
+		return (SET_ERROR(ERESTART));
+	}
+
 	if (!tx->tx_dirty_delayed &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 72f4b86d772e..536a44d13b71 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0;
 int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
+/*
+ * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
+ * Once it is reached, write operations are blocked
+ * until log data is cleared out after txg sync.
+ * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
+ */
+unsigned long zfs_wrlog_data_max = 0;
+
 /*
  * If there's at least this much dirty data (as a percentage of
  * zfs_dirty_data_max), push out a txg.  This should be less than
@@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
+	aggsum_init(&dp->dp_wrlog_total, 0);
+	for (int i = 0; i < TXG_SIZE; i++) {
+		aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
+	}
+
 	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
 	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
 	    TASKQ_THREADS_CPU_PCT);
@@ -416,6 +429,12 @@ dsl_pool_close(dsl_pool_t *dp)
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
+
+	aggsum_fini(&dp->dp_wrlog_total);
+	for (int i = 0; i < TXG_SIZE; i++) {
+		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
+	}
+
 	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_zrele_taskq);
 	if (dp->dp_blkstats != NULL) {
@@ -592,6 +611,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
+void
+dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
+{
+	ASSERT3S(size, >=, 0);
+
+	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size);
+	aggsum_add(&dp->dp_wrlog_total, size);
+
+	/* Choose a value slightly bigger than min dirty sync bytes */
+	uint64_t sync_min =
+	    zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
+	if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
+		txg_kick(dp, txg);
+}
+
+boolean_t
+dsl_pool_wrlog_over_max(dsl_pool_t *dp)
+{
+	return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
+}
+
+static void
+dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
+{
+	int64_t delta;
+	delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
+	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
+	aggsum_add(&dp->dp_wrlog_total, delta);
+}
+
 #ifdef ZFS_DEBUG
 static boolean_t
 dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
@@ -816,6 +865,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
+
+	dsl_pool_wrlog_clear(dp, txg);
+
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
@@ -1405,6 +1457,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
 	"Determines the dirty space limit");
 
+ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
+	"The size limit of write-transaction zil log data");
+
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
 	"zfs_dirty_data_max upper bound in bytes");
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 30d5c4821ae5..0f330ec933aa 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -541,6 +541,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	itx_wr_state_t write_state;
 	uintptr_t fsync_cnt;
 	uint64_t gen = 0;
+	ssize_t size = resid;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
 	    zfs_xattr_owner_unlinked(zp)) {
@@ -626,6 +627,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		off += len;
 		resid -= len;
 	}
+
+	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
+		dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg);
+	}
 }
 
 /*
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 23df0e1541a3..ab3b263870b3 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -84,10 +84,8 @@
 #include <sys/zfs_rlock.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
-
 #include <sys/zvol_impl.h>
-
 unsigned int zvol_inhibit_dev = 0;
 
 unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
@@ -579,6 +577,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	itx_wr_state_t write_state;
+	uint64_t sz = size;
 
 	if (zil_replaying(zilog, tx))
 		return;
@@ -630,6 +629,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 		offset += len;
 		size -= len;
 	}
+
+	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
+		dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
+	}
 }
 
 /*
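A note between the two patches, again illustrative rather than part of the
diffs: dmu_tx_try_assign() returning ERESTART does not fail the write.
With TXG_WAIT semantics, dmu_tx_assign() waits for a txg to sync and
retries, so the new check behaves as a throttle, not an error; it is also
skipped once tx_dirty_delayed is set (e.g. for TXG_NOTHROTTLE callers),
like the dirty-data check next to it.  A stand-alone model of that retry
loop, with a stub in place of the real pool state (Linux errno.h provides
ERESTART):

/*
 * Model of the caller-visible throttle (C11): try_assign() fails with
 * ERESTART while the pool is over its write-log limit, and the
 * TXG_WAIT-style loop waits for a "sync" and retries.
 */
#include <errno.h>
#include <stdio.h>

static int pool_over_wrlog_max = 1;	/* pretend we start throttled */

static int
try_assign(void)
{
	if (pool_over_wrlog_max)
		return (ERESTART);	/* the check patch 1 adds */
	return (0);
}

static void
tx_wait(void)
{
	/* In ZFS this blocks until a txg sync drains log/dirty space. */
	pool_over_wrlog_max = 0;
}

int
main(void)
{
	int err;

	while ((err = try_assign()) == ERESTART)
		tx_wait();
	printf("assigned, err = %d\n", err);
	return (0);
}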
From 48ab427e9dbed2a98ac97e1d62851ee70701b2d5 Mon Sep 17 00:00:00 2001
From: jxdking
Date: Wed, 7 Jul 2021 15:21:35 +0000
Subject: [PATCH 2/2] Resolve code review.

Updated man/man4/zfs.4.
Added dmu_tx_wrlog_over_max to the dmu_tx kstats.
Added ASSERT0 for dp_wrlog_total and dp_wrlog_pertxg[] on
dsl_pool_close().
Fixed some grammar in comments.

Signed-off-by: jxdking
---
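Observability note, not part of the diff: on Linux the dmu_tx kstats are
exposed at /proc/spl/kstat/zfs/dmu_tx, so the new dmu_tx_wrlog_over_max
counter can be watched to tell whether writers are actually hitting this
throttle.  A minimal userland reader:

/* Print the dmu_tx_wrlog_over_max kstat (Linux, userland). */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	FILE *f = fopen("/proc/spl/kstat/zfs/dmu_tx", "r");
	char line[256];

	if (f == NULL) {
		perror("open dmu_tx kstat");
		return (1);
	}
	while (fgets(line, sizeof (line), f) != NULL) {
		if (strncmp(line, "dmu_tx_wrlog_over_max", 21) == 0)
			fputs(line, stdout);
	}
	(void) fclose(f);
	return (0);
}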
 include/sys/dmu_tx.h  |  1 +
 man/man4/zfs.4        | 12 ++++++++++++
 module/zfs/arc.c      |  2 +-
 module/zfs/dmu_tx.c   |  2 ++
 module/zfs/dsl_pool.c |  2 ++
 5 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h
index 60e9ed6e26f5..71a9ac7ca7bf 100644
--- a/include/sys/dmu_tx.h
+++ b/include/sys/dmu_tx.h
@@ -124,6 +124,7 @@ typedef struct dmu_tx_stats {
 	kstat_named_t dmu_tx_dirty_throttle;
 	kstat_named_t dmu_tx_dirty_delay;
 	kstat_named_t dmu_tx_dirty_over_max;
+	kstat_named_t dmu_tx_wrlog_over_max;
 	kstat_named_t dmu_tx_dirty_frees_delay;
 	kstat_named_t dmu_tx_quota;
 } dmu_tx_stats_t;
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6da8d42b42bd..cd6fd052bb9a 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1066,6 +1066,18 @@ Start syncing out a transaction group if there's at least this much dirty data
 This should be less than
 .Sy zfs_vdev_async_write_active_min_dirty_percent .
 .
+.It Sy zfs_wrlog_data_max Ns = Pq int
+The upper limit of write-transaction zil log data size in bytes.
+Once it is reached, write operations are blocked until log data is cleared out
+after transaction group sync.
+Because of some overhead, it should be set at least 2 times the size of
+.Sy zfs_dirty_data_max
+.No to prevent harming normal write throughput.
+It also should be smaller than the size of the slog device, if one is present.
+.Pp
+Defaults to
+.Sy zfs_dirty_data_max*2
+.
 .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
 Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
 preallocated for a file in order to guarantee that later writes will not
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 606774a3b46c..e29f9845bddc 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -7975,7 +7975,7 @@ arc_init(void)
 		/*
 		 * dp_wrlog_total is reduced for each txg at the end of
 		 * spa_sync(). However, dp_dirty_total is reduced every time
-		 * a block being written out. Thus under normal operation,
+		 * a block is written out. Thus under normal operation,
 		 * dp_wrlog_total could grow 2 times as big as
 		 * zfs_dirty_data_max.
 		 */
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index f9f35b7cd5c2..5fa516866668 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = {
 	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_wrlog_over_max",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_frees_delay",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
 };
@@ -886,6 +887,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 
 	if (!tx->tx_dirty_delayed &&
 	    dsl_pool_wrlog_over_max(tx->tx_pool)) {
+		DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
 		return (SET_ERROR(ERESTART));
 	}
 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 536a44d13b71..1350f1329564 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -430,8 +430,10 @@ dsl_pool_close(dsl_pool_t *dp)
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
 
+	ASSERT0(aggsum_value(&dp->dp_wrlog_total));
 	aggsum_fini(&dp->dp_wrlog_total);
 	for (int i = 0; i < TXG_SIZE; i++) {
+		ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
 		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
 	}
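A closing illustration, not part of the series: the zfs.4 text above
recommends keeping zfs_wrlog_data_max at no less than twice
zfs_dirty_data_max.  Assuming the standard Linux module-parameter files
under /sys/module/zfs/parameters/, a userland sketch that checks a live
system against that rule:

/* Sanity-check zfs_wrlog_data_max against the zfs.4 guidance. */
#include <stdio.h>

static unsigned long
read_param(const char *name)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	(void) snprintf(path, sizeof (path),
	    "/sys/module/zfs/parameters/%s", name);
	if ((f = fopen(path, "r")) == NULL)
		return (0);
	if (fscanf(f, "%lu", &val) != 1)
		val = 0;
	(void) fclose(f);
	return (val);
}

int
main(void)
{
	unsigned long dirty = read_param("zfs_dirty_data_max");
	unsigned long wrlog = read_param("zfs_wrlog_data_max");

	printf("zfs_dirty_data_max: %lu\n", dirty);
	printf("zfs_wrlog_data_max: %lu\n", wrlog);
	if (wrlog != 0 && wrlog < 2 * dirty)
		printf("warning: write-log limit is below 2x the dirty "
		    "limit; normal write throughput may suffer\n");
	return (0);
}

Since the parameter defaults to 0 and arc_init() then picks
2 * zfs_dirty_data_max at module load, the warning should only fire for an
explicit undersized setting.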