Skip to content

Commit

Permalink
Add Module Parameter Regarding Log Size Limit
Browse files Browse the repository at this point in the history
* Add Module Parameters Regarding Log Size Limit

zfs_wrlog_data_max
The upper limit of TX_WRITE log data. Once it is reached,
write operation is blocked, until log data is cleared out
after txg sync. It only counts TX_WRITE log with WR_COPIED
or WR_NEED_COPY.

Reviewed-by: Prakash Surya <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: jxdking <[email protected]>
Closes openzfs#12284
Signed-off-by: Ameer Hamza <[email protected]>
  • Loading branch information
jxdking authored and ixhamza committed Sep 21, 2022
1 parent 0f517f7 commit 15545f8
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 2 deletions.
1 change: 1 addition & 0 deletions include/sys/dmu_tx.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ typedef struct dmu_tx_stats {
kstat_named_t dmu_tx_dirty_throttle;
kstat_named_t dmu_tx_dirty_delay;
kstat_named_t dmu_tx_dirty_over_max;
kstat_named_t dmu_tx_wrlog_over_max;
kstat_named_t dmu_tx_dirty_frees_delay;
kstat_named_t dmu_tx_quota;
} dmu_tx_stats_t;
Expand Down
7 changes: 7 additions & 0 deletions include/sys/dsl_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <sys/rrwlock.h>
#include <sys/dsl_synctask.h>
#include <sys/mmp.h>
#include <sys/aggsum.h>

#ifdef __cplusplus
extern "C" {
Expand All @@ -58,6 +59,7 @@ struct dsl_deadlist;

extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max;
extern unsigned long zfs_wrlog_data_max;
extern int zfs_dirty_data_sync_percent;
extern int zfs_dirty_data_max_percent;
extern int zfs_dirty_data_max_max_percent;
Expand Down Expand Up @@ -119,6 +121,9 @@ typedef struct dsl_pool {
uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta;

aggsum_t dp_wrlog_pertxg[TXG_SIZE];
aggsum_t dp_wrlog_total;

/*
* Time of most recently scheduled (furthest in the future)
* wakeup for delayed transactions.
Expand Down Expand Up @@ -159,6 +164,8 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
zfs_space_check_t slop_policy);
uint64_t dsl_pool_deferred_space(dsl_pool_t *dp);
void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
Expand Down
12 changes: 12 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -1080,6 +1080,18 @@ Start syncing out a transaction group if there's at least this much dirty data
This should be less than
.Sy zfs_vdev_async_write_active_min_dirty_percent .
.
.It Sy zfs_wrlog_data_max Ns = Pq int
The upper limit of write-transaction ZIL log data size in bytes.
Once it is reached, write operations are blocked until the log data is
cleared out after the transaction group sync.
Because of some overhead, it should be set to at least 2 times the size of
.Sy zfs_dirty_data_max
.No to prevent harming normal write throughput.
It also should be smaller than the size of the slog device if slog is present.
.Pp
Defaults to
.Sy zfs_dirty_data_max*2
.
.It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
preallocated for a file in order to guarantee that later writes will not
Expand Down
12 changes: 12 additions & 0 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -8062,6 +8062,18 @@ arc_init(void)
zfs_dirty_data_max = MIN(zfs_dirty_data_max,
zfs_dirty_data_max_max);
}

if (zfs_wrlog_data_max == 0) {

/*
* dp_wrlog_total is reduced for each txg at the end of
* spa_sync(). However, dp_dirty_total is reduced every time
* a block is written out. Thus under normal operation,
* dp_wrlog_total could grow 2 times as big as
* zfs_dirty_data_max.
*/
zfs_wrlog_data_max = zfs_dirty_data_max * 2;
}
}

void
Expand Down
7 changes: 7 additions & 0 deletions module/zfs/dmu_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = {
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};
Expand Down Expand Up @@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
return (SET_ERROR(ERESTART));
}

if (!tx->tx_dirty_delayed &&
dsl_pool_wrlog_over_max(tx->tx_pool)) {
DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
return (SET_ERROR(ERESTART));
}

if (!tx->tx_dirty_delayed &&
dsl_pool_need_dirty_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE;
Expand Down
57 changes: 57 additions & 0 deletions module/zfs/dsl_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0;
int zfs_dirty_data_max_percent = 10;
int zfs_dirty_data_max_max_percent = 25;

/*
* zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
* Once it is reached, write operations are blocked
* until the log data is cleared out after txg sync.
* Only TX_WRITE log records with WR_COPIED or WR_NEED_COPY are counted.
* A value of 0 is replaced with zfs_dirty_data_max * 2 in arc_init().
*/
unsigned long zfs_wrlog_data_max = 0;

/*
* If there's at least this much dirty data (as a percentage of
* zfs_dirty_data_max), push out a txg. This should be less than
Expand Down Expand Up @@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

aggsum_init(&dp->dp_wrlog_total, 0);
for (int i = 0; i < TXG_SIZE; i++) {
aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
}

dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
TASKQ_THREADS_CPU_PCT);
Expand Down Expand Up @@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp)
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
cv_destroy(&dp->dp_spaceavail_cv);

ASSERT0(aggsum_value(&dp->dp_wrlog_total));
aggsum_fini(&dp->dp_wrlog_total);
for (int i = 0; i < TXG_SIZE; i++) {
ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
aggsum_fini(&dp->dp_wrlog_pertxg[i]);
}

taskq_destroy(dp->dp_unlinked_drain_taskq);
taskq_destroy(dp->dp_zrele_taskq);
if (dp->dp_blkstats != NULL) {
Expand Down Expand Up @@ -592,6 +613,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
cv_signal(&dp->dp_spaceavail_cv);
}

/*
 * Account for `size` bytes of write-log data charged to `txg`.
 *
 * The bytes are added to both the per-txg counter and the pool-wide
 * total.  Once the per-txg counter exceeds a threshold derived from
 * zfs_dirty_data_max and zfs_dirty_data_sync_percent, txg_kick() is
 * called on the pool.
 *
 * `size` must be non-negative (asserted).
 */
void
dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
{
	aggsum_t *txg_wrlog = &dp->dp_wrlog_pertxg[txg & TXG_MASK];
	uint64_t kick_threshold;

	ASSERT3S(size, >=, 0);

	aggsum_add(txg_wrlog, size);
	aggsum_add(&dp->dp_wrlog_total, size);

	/*
	 * The threshold sits 10 percentage points above the minimum
	 * dirty-data sync level.
	 */
	kick_threshold =
	    zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;

	/* Nothing more to do while this txg is at or below the threshold. */
	if (aggsum_compare(txg_wrlog, kick_threshold) <= 0)
		return;

	txg_kick(dp);
}

/*
 * Report whether the pool-wide write-log total is strictly greater
 * than zfs_wrlog_data_max.
 *
 * Returns B_TRUE when the limit is exceeded, B_FALSE otherwise.
 */
boolean_t
dsl_pool_wrlog_over_max(dsl_pool_t *dp)
{
	if (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Drop the write-log accounting recorded for `txg`.
 *
 * The current value of the per-txg counter is subtracted from both
 * that counter and the pool-wide total.
 */
static void
dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
{
	aggsum_t *txg_wrlog = &dp->dp_wrlog_pertxg[txg & TXG_MASK];
	/* Negated snapshot of what this txg slot currently holds. */
	int64_t negated = -(int64_t)aggsum_value(txg_wrlog);

	aggsum_add(txg_wrlog, negated);
	aggsum_add(&dp->dp_wrlog_total, negated);
}

#ifdef ZFS_DEBUG
static boolean_t
dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
Expand Down Expand Up @@ -816,6 +867,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
dmu_buf_rele(ds->ds_dbuf, zilog);
}

dsl_pool_wrlog_clear(dp, txg);

ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

Expand Down Expand Up @@ -1398,6 +1452,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
"Determines the dirty space limit");

ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
"The size limit of write-transaction zil log data");

/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
"zfs_dirty_data_max upper bound in bytes");
Expand Down
5 changes: 5 additions & 0 deletions module/zfs/zfs_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
itx_wr_state_t write_state;
uintptr_t fsync_cnt;
uint64_t gen = 0;
ssize_t size = resid;

if (zil_replaying(zilog, tx) || zp->z_unlinked ||
zfs_xattr_owner_unlinked(zp)) {
Expand Down Expand Up @@ -628,6 +629,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
off += len;
resid -= len;
}

if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg);
}
}

/*
Expand Down
7 changes: 5 additions & 2 deletions module/zfs/zvol.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,8 @@
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>

#include <sys/zvol_impl.h>


unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;

Expand Down Expand Up @@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
itx_wr_state_t write_state;
uint64_t sz = size;

if (zil_replaying(zilog, tx))
return;
Expand Down Expand Up @@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
offset += len;
size -= len;
}

if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
}
}

/*
Expand Down

0 comments on commit 15545f8

Please sign in to comment.