diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 0283a8c589c1..573048e002b2 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -64,6 +64,8 @@ extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; extern int zfs_delay_min_dirty_percent; extern unsigned long zfs_delay_scale; +extern unsigned long zfs_smoothing_scale; +extern unsigned long zfs_smoothing_write; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE @@ -115,6 +117,7 @@ typedef struct dsl_pool { kcondvar_t dp_spaceavail_cv; uint64_t dp_dirty_pertxg[TXG_SIZE]; uint64_t dp_dirty_total; + hrtime_t dp_last_smooth; uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; uint64_t dp_mos_used_delta; uint64_t dp_mos_compressed_delta; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 01e9c5de445d..27cb4e898e3e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -15,7 +15,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd June 1, 2021 +.Dd January 8, 2021 .Dt ZFS 4 .Os . @@ -944,6 +944,21 @@ This will smoothly handle between ten times and a tenth of this number. .Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . +.It Sy zfs_smoothing_write Ns = Ns Sy 0 Ns s Pq ulong +This controls for how many seconds smoothing should be applied. +The smoothing mechanism is used to add additional transaction delays +after the amount of dirty data drops below +.Sy zfs_delay_min_dirty_percent . +This mechanism may be used to avoid stalls and uneven performance during +heavy write workloads +. +.It Sy zfs_smoothing_scale Ns = Ns Sy 100000 Pq int +Similar to +.Sy zfs_delay_scale , +but for write smoothing. +This variable controls the scale of smoothing curve. +Larger values cause longer delays for a given amount of dirty data. +. .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index fe9860066d31..3e550761a5f7 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -776,14 +776,16 @@ static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ * of zfs_delay_scale to increase the steepness of the curve. */ static void -dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time; - if (dirty <= delay_min_bytes) + now = gethrtime(); + + if (dirty <= delay_min_bytes && last_smooth <= now) return; /* @@ -794,11 +796,20 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) */ ASSERT3U(dirty, <, zfs_dirty_data_max); - now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + smoothing_time = 0; + delay_time = 0; + + if (dirty > delay_min_bytes) { + delay_time = zfs_delay_scale * + (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); + } + if (last_smooth > now) { + smoothing_time = zfs_smoothing_scale * dirty / + (zfs_dirty_data_max - dirty); + } + + min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns); + if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, @@ -808,6 +819,9 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; + if (dirty > delay_min_bytes) { + dp->dp_last_smooth = now + zfs_smoothing_write * NANOSEC; + } mutex_exit(&dp->dp_lock); zfs_sleep_until(wakeup); @@ -1069,7 +1083,7 @@ dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; - hrtime_t before; + hrtime_t before, last_smooth; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); @@ -1089,10 +1103,11 @@ dmu_tx_wait(dmu_tx_t *tx) DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); + last_smooth = dp->dp_last_smooth; dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - dmu_tx_delay(tx, dirty); + dmu_tx_delay(tx, dirty, last_smooth); tx->tx_wait_dirty = B_FALSE; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index c120d19b8bd0..eb745cea0c1a 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -103,6 +103,7 @@ unsigned long zfs_dirty_data_max = 0; unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +unsigned long zfs_smoothing_write = 0; /* * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. @@ -140,6 +141,7 @@ int zfs_delay_min_dirty_percent = 60; * multiply in dmu_tx_delay(). */ unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; +unsigned long zfs_smoothing_scale = 100000; /* * This determines the number of threads used by the dp_sync_taskq. @@ -958,9 +960,10 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) mutex_enter(&dp->dp_lock); uint64_t dirty = dp->dp_dirty_total; + hrtime_t last_delay = dp->dp_last_smooth; mutex_exit(&dp->dp_lock); - return (dirty > delay_min_bytes); + return (dirty > delay_min_bytes || last_delay > gethrtime()); } static boolean_t @@ -1462,6 +1465,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, "The size limit of write-transaction zil log data"); +ZFS_MODULE_PARAM(zfs, zfs_, smoothing_write, ULONG, ZMOD_RW, + "How long should we smooth write after last delay (sec)"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); @@ -1472,6 +1478,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, "How quickly delay approaches infinity"); +ZFS_MODULE_PARAM(zfs, zfs_, smoothing_scale, ULONG, ZMOD_RW, + "Delay smoothing scale"); + ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, "Max percent of CPUs that are used to sync dirty data");