Improve ZFS objset sync parallelism
As part of transaction group commit, dsl_pool_sync() sequentially calls
dsl_dataset_sync() for each dirty dataset, which subsequently calls
dmu_objset_sync().  dmu_objset_sync() in turn uses up to 75% of CPU
cores to run sync_dnodes_task() in taskq threads to sync the dirty
dnodes (files).

There are two problems:

1. Each ZVOL in a pool is a separate dataset/objset having a single
   dnode.  This means the objsets are synchronized serially, which
   leads to a bottleneck of ~330K blocks written per second per pool.

2. In the case of multiple dirty dnodes/files on a dataset/objset on a
   big system, they will be sync'd in parallel taskq threads. However,
   it is inefficient to use 75% of the CPU cores of a big system for
   this, because of (a) bottlenecks on a single write issue taskq, and
   (b) allocation throttling.  In addition, if not for the allocation
   throttling sorting write requests by bookmarks (logical addresses),
   writes for different files could reach the space allocators
   interleaved, leading to unwanted fragmentation.

The solution to both problems is to always sync no more and (if
possible) no fewer dnodes at the same time than there are allocators
in the pool.
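
A minimal sketch of that sizing, mirroring the dsl_pool_open_impl() change
further down in this diff (the taskq is now sized by the pool's allocator
count instead of a CPU percentage):

    /* Bound dnode-sync concurrency by the pool's allocator count. */
    dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
        MIN(spa->spa_alloc_count, boot_ncpus), minclsyspri, 1, INT_MAX, 0);

For example, on a 64-CPU system with the pool's default of four allocators
(an assumed default for spa_alloc_count, not part of this commit), the old
75%-of-CPUs policy created 48 dp_sync_taskq threads, while the new policy
creates MIN(4, 64) = 4, so at most one dnode-sync task runs per allocator
at a time.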

Signed-off-by: Edmund Nadolski <[email protected]>
Edmund Nadolski committed Oct 13, 2023
1 parent 8a74070 commit a9f251a
Showing 6 changed files with 92 additions and 47 deletions.
3 changes: 2 additions & 1 deletion include/sys/dmu_objset.h
@@ -235,7 +235,8 @@ void dmu_objset_evict_dbufs(objset_t *os);
inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);

/* called from dsl */
void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
zio_t *dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
void dmu_objset_sync_sublists_done(zio_t *zio);
boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
objset_t *dmu_objset_create_impl_dnstats(spa_t *spa, struct dsl_dataset *ds,
blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs,
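
With dmu_objset_sync() now returning the objset's sync zio instead of
issuing it internally, callers decide how to wait. Both patterns appear
later in this commit (dsl_dataset.c and dsl_pool.c):

    /* Dirty datasets: issue asynchronously as a child of the pool-wide root zio. */
    zio_nowait(dmu_objset_sync(ds->ds_objset, pio, tx));

    /* MOS: no parent zio, block until the objset sync completes. */
    VERIFY0(zio_wait(dmu_objset_sync(dp->dp_meta_objset, NULL, tx)));
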
6 changes: 5 additions & 1 deletion module/zfs/dbuf.c
@@ -4587,6 +4587,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
}

/* May be called recursively from dbuf_sync_indirect(). */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
@@ -5005,7 +5006,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
}


/* Issue I/O to commit a dirty buffer to disk. */
/*
* Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
 * Caller is responsible for issuing the zio via zio_[no]wait(dr->dr_zio).
*/
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
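
The caller-side half of that dbuf_write() contract is visible later in
this commit: dmu_objset_sync_sublists_done() walks the meta-dnode's dirty
records and issues the zios that dbuf_write() prepared (trimmed sketch):

    /* dr->dr_zio was populated by dbuf_write(); the caller issues it. */
    while ((dr = list_remove_head(list)) != NULL) {
        ASSERT0(dr->dr_dbuf->db_level);
        zio_nowait(dr->dr_zio);
    }
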
92 changes: 70 additions & 22 deletions module/zfs/dmu_objset.c
@@ -1625,6 +1625,7 @@ typedef struct sync_dnodes_arg {
int sda_sublist_idx;
multilist_t *sda_newlist;
dmu_tx_t *sda_tx;
zio_t *sda_cio;
} sync_dnodes_arg_t;

static void
@@ -1639,20 +1640,66 @@ sync_dnodes_task(void *arg)

multilist_sublist_unlock(ms);

zio_nowait(sda->sda_cio);

kmem_free(sda, sizeof (*sda));
}

typedef struct sync_objset_arg {
objset_t *soa_os;
dmu_tx_t *soa_tx;
zio_t *soa_zio;
} sync_objset_arg_t;

static void
sync_dnodes_finish_task(void *arg)
{
sync_objset_arg_t *soa = arg;
objset_t *os = soa->soa_os;
dmu_tx_t *tx = soa->soa_tx;

/* Free intent log blocks up to this tx. (May block.) */
zil_sync(os->os_zil, tx);
os->os_phys->os_zil_header = os->os_zil_header;

zio_nowait(soa->soa_zio);
kmem_free(soa, sizeof (*soa));
}

/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_objset_sync_sublists_done(zio_t *zio)
{
sync_objset_arg_t *soa = zio->io_private;
objset_t *os = soa->soa_os;
dmu_tx_t *tx = soa->soa_tx;
int txgoff = tx->tx_txg & TXG_MASK;
dbuf_dirty_record_t *dr;

list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while ((dr = list_remove_head(list)) != NULL) {
ASSERT0(dr->dr_dbuf->db_level);
zio_nowait(dr->dr_zio);
}

/* Enable dnode backfill if enough objects have been freed. */
if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
os->os_rescan_dnodes = B_TRUE;
os->os_freed_dnodes = 0;
}

/* sync_dnodes_finish_task calls zil_sync on our behalf. */
(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
sync_dnodes_finish_task, soa, TQ_FRONT);
}

/* Nonblocking objset sync. Called from dsl. */
zio_t *
dmu_objset_sync(objset_t *os, zio_t *rio, dmu_tx_t *tx)
{
int txgoff;
zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
list_t *list;
dbuf_dirty_record_t *dr;
int num_sublists;
multilist_t *ml;
blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
@@ -1696,6 +1743,10 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
DMU_OT_OBJSET, NULL, NULL, NULL);
}

/* pio is a child of rio and a parent of zio */
zio_t *pio = zio_null(rio, os->os_spa, NULL, NULL, NULL,
ZIO_FLAG_MUSTSUCCEED);

zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
&zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
@@ -1739,6 +1790,16 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
offsetof(dnode_t, dn_dirty_link[txgoff]));
}

/* sync_dnodes_finish_task executes zio and frees soa */
sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
soa->soa_os = os;
soa->soa_tx = tx;
soa->soa_zio = zio;

/* sio is a child of the arc_write zio and parent of the sda_cio(s). */
zio_t *sio = zio_null(zio, os->os_spa, NULL,
dmu_objset_sync_sublists_done, soa, ZIO_FLAG_MUSTSUCCEED);

ml = &os->os_dirty_dnodes[txgoff];
num_sublists = multilist_get_num_sublists(ml);
for (int i = 0; i < num_sublists; i++) {
@@ -1748,30 +1809,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
sda->sda_list = ml;
sda->sda_sublist_idx = i;
sda->sda_tx = tx;
sda->sda_cio = zio_null(sio, os->os_spa, NULL, NULL, NULL,
ZIO_FLAG_MUSTSUCCEED);
(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
sync_dnodes_task, sda, 0);
/* callback frees sda */
}
taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);

list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while ((dr = list_remove_head(list)) != NULL) {
ASSERT0(dr->dr_dbuf->db_level);
zio_nowait(dr->dr_zio);
}
/* Syncs the meta dnode even if all sublists were empty */
zio_nowait(sio);

/* Enable dnode backfill if enough objects have been freed. */
if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
os->os_rescan_dnodes = B_TRUE;
os->os_freed_dnodes = 0;
}

/*
* Free intent log blocks up to this tx.
*/
zil_sync(os->os_zil, tx);
os->os_phys->os_zil_header = os->os_zil_header;
zio_nowait(zio);
return (pio);
}

boolean_t
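
Taken together, the new dmu_objset_sync() builds roughly the following zio
tree per dirty objset (a sketch assembled from the comments above, not code
from the commit). Completion flows bottom-up, and the arc_write zio is only
issued by sync_dnodes_finish_task() after every sublist has been synced and
zil_sync() has run:

    /*
     * rio                   root zio passed in from dsl_pool_sync()
     *  └── pio              zio_null(), returned to the caller
     *       └── zio         arc_write() of the objset block
     *            └── sio    zio_null(), done: dmu_objset_sync_sublists_done()
     *                 └── sda_cio   one zio_null() per dirty-dnode sublist,
     *                               issued by sync_dnodes_task()
     */
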
1 change: 1 addition & 0 deletions module/zfs/dnode_sync.c
@@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)

/*
* Write out the dnode's dirty buffers.
* Does not wait for zio completions.
*/
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
5 changes: 3 additions & 2 deletions module/zfs/dsl_dataset.c
@@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
return (error);
}

/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
void
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
dsl_dataset_sync(dsl_dataset_t *ds, zio_t *pio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(ds->ds_objset != NULL);
@@ -2098,7 +2099,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
}

dmu_objset_sync(ds->ds_objset, zio, tx);
zio_nowait(dmu_objset_sync(ds->ds_objset, pio, tx));
}

/*
32 changes: 11 additions & 21 deletions module/zfs/dsl_pool.c
@@ -140,11 +140,6 @@ uint_t zfs_delay_min_dirty_percent = 60;
*/
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;

/*
* This determines the number of threads used by the dp_sync_taskq.
*/
static int zfs_sync_taskq_batch_pct = 75;

/*
* These tunables determine the behavior of how zil_itxg_clean() is
* called via zil_clean() in the context of spa_sync(). When an itxg
@@ -215,8 +210,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_sync_task_t, dst_node));

dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
TASKQ_THREADS_CPU_PCT);
MIN(spa->spa_alloc_count, boot_ncpus), minclsyspri, 1, INT_MAX, 0);

dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
zfs_zil_clean_taskq_nthr_pct, minclsyspri,
@@ -580,9 +574,7 @@ dsl_pool_mos_diduse_space(dsl_pool_t *dp,
static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dmu_objset_sync(dp->dp_meta_objset, zio, tx);
VERIFY0(zio_wait(zio));
VERIFY0(zio_wait(dmu_objset_sync(dp->dp_meta_objset, NULL, tx)));
dmu_objset_sync_done(dp->dp_meta_objset, tx);
taskq_wait(dp->dp_sync_taskq);
multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes);
@@ -674,7 +666,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
zio_t *zio;
zio_t *rio; /* root zio for all dirty dataset syncs */
dmu_tx_t *tx;
dsl_dir_t *dd;
dsl_dataset_t *ds;
@@ -704,9 +696,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
}

/*
* Write out all dirty blocks of dirty datasets.
* Write out all dirty blocks of dirty datasets. Note, this could
* create a very large (+10k) zio tree.
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
/*
* We must not sync any non-MOS datasets twice, because
@@ -715,9 +708,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
*/
ASSERT(!list_link_active(&ds->ds_synced_link));
list_insert_tail(&synced_datasets, ds);
dsl_dataset_sync(ds, zio, tx);
dsl_dataset_sync(ds, rio, tx);
}
VERIFY0(zio_wait(zio));
VERIFY0(zio_wait(rio));

/*
* Update the long range free counter after
@@ -748,13 +741,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* user accounting information (and we won't get confused
* about which blocks are part of the snapshot).
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
objset_t *os = ds->ds_objset;

ASSERT(list_link_active(&ds->ds_synced_link));
dmu_buf_rele(ds->ds_dbuf, ds);
dsl_dataset_sync(ds, zio, tx);
dsl_dataset_sync(ds, rio, tx);

/*
* Release any key mappings created by calls to
@@ -767,7 +760,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
}
}
VERIFY0(zio_wait(zio));
VERIFY0(zio_wait(rio));

/*
* Now that the datasets have been completely synced, we can
@@ -1481,9 +1474,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
"How quickly delay approaches infinity");

ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
"Max percent of CPUs that are used to sync dirty data");

ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
"Max percent of CPUs that are used per dp_sync_taskq");

