From a9f251adc1d75d47316098757702bb8380f87feb Mon Sep 17 00:00:00 2001
From: Edmund Nadolski
Date: Wed, 5 Jul 2023 11:49:18 -0600
Subject: [PATCH] Improve ZFS objset sync parallelism

As part of transaction group commit, dsl_pool_sync() sequentially calls
dsl_dataset_sync() for each dirty dataset, which subsequently calls
dmu_objset_sync().  dmu_objset_sync() in turn uses up to 75% of CPU
cores to run sync_dnodes_task() in taskq threads to sync the dirty
dnodes (files).

There are two problems:

1. Each ZVOL in a pool is a separate dataset/objset having a single
   dnode.  This means the objsets are synchronized serially, which
   leads to a bottleneck of ~330K blocks written per second per pool.

2. In the case of multiple dirty dnodes/files on a dataset/objset on a
   big system they will be sync'd in parallel taskq threads.  However,
   it is inefficient to use 75% of CPU cores of a big system to do
   that, because of (a) bottlenecks on a single write issue taskq, and
   (b) allocation throttling.  In addition, if not for the allocation
   throttling sorting write requests by bookmarks (logical address),
   writes for different files may reach space allocators interleaved,
   leading to unwanted fragmentation.

The solution to both problems is to always sync no more and (if
possible) no fewer dnodes at the same time than there are allocators
in the pool.

Signed-off-by: Edmund Nadolski
---
 include/sys/dmu_objset.h |  3 +-
 module/zfs/dbuf.c        |  6 ++-
 module/zfs/dmu_objset.c  | 92 ++++++++++++++++++++++++++++++----------
 module/zfs/dnode_sync.c  |  1 +
 module/zfs/dsl_dataset.c |  5 ++-
 module/zfs/dsl_pool.c    | 32 +++++---------
 6 files changed, 92 insertions(+), 47 deletions(-)

diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index 9f6e0fdd601b..96bff4e4e063 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -235,7 +235,8 @@ void dmu_objset_evict_dbufs(objset_t *os);
 inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
-void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+zio_t *dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+void dmu_objset_sync_sublists_done(zio_t *zio);
 boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
 objset_t *dmu_objset_create_impl_dnstats(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs,
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index c0c2692c113a..6321895c73e8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4587,6 +4587,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	}
 }
 
+/* May be called recursively from dbuf_sync_indirect(). */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
@@ -5005,7 +5006,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index d134d4958f7c..2fe217475132 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1625,6 +1625,7 @@ typedef struct sync_dnodes_arg {
 	int sda_sublist_idx;
 	multilist_t *sda_newlist;
 	dmu_tx_t *sda_tx;
+	zio_t *sda_cio;
 } sync_dnodes_arg_t;
 
 static void
@@ -1639,20 +1640,66 @@ sync_dnodes_task(void *arg)
 
 	multilist_sublist_unlock(ms);
 
+	zio_nowait(sda->sda_cio);
+
 	kmem_free(sda, sizeof (*sda));
 }
 
+typedef struct sync_objset_arg {
+	objset_t *soa_os;
+	dmu_tx_t *soa_tx;
+	zio_t *soa_zio;
+} sync_objset_arg_t;
+
+static void
+sync_dnodes_finish_task(void *arg)
+{
+	sync_objset_arg_t *soa = arg;
+	objset_t *os = soa->soa_os;
+	dmu_tx_t *tx = soa->soa_tx;
+
+	/* Free intent log blocks up to this tx. (May block.) */
+	zil_sync(os->os_zil, tx);
+	os->os_phys->os_zil_header = os->os_zil_header;
+
+	zio_nowait(soa->soa_zio);
+	kmem_free(soa, sizeof (*soa));
+}
 
-/* called from dsl */
 void
-dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
+dmu_objset_sync_sublists_done(zio_t *zio)
+{
+	sync_objset_arg_t *soa = zio->io_private;
+	objset_t *os = soa->soa_os;
+	dmu_tx_t *tx = soa->soa_tx;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	dbuf_dirty_record_t *dr;
+
+	list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+	while ((dr = list_remove_head(list)) != NULL) {
+		ASSERT0(dr->dr_dbuf->db_level);
+		zio_nowait(dr->dr_zio);
+	}
+
+	/* Enable dnode backfill if enough objects have been freed. */
+	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+		os->os_rescan_dnodes = B_TRUE;
+		os->os_freed_dnodes = 0;
+	}
+
+	/* sync_dnodes_finish_task calls zil_sync on our behalf. */
+	(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+	    sync_dnodes_finish_task, soa, TQ_FRONT);
+}
+
+/* Nonblocking objset sync. Called from dsl. */
+zio_t *
+dmu_objset_sync(objset_t *os, zio_t *rio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
-	list_t *list;
-	dbuf_dirty_record_t *dr;
 	int num_sublists;
 	multilist_t *ml;
 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
@@ -1696,6 +1743,10 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		    DMU_OT_OBJSET, NULL, NULL, NULL);
 	}
 
+	/* pio is a child of rio and a parent of zio */
+	zio_t *pio = zio_null(rio, os->os_spa, NULL, NULL, NULL,
+	    ZIO_FLAG_MUSTSUCCEED);
+
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
 	    &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
@@ -1739,6 +1790,16 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
+	/* sync_dnodes_finish_task executes zio and frees soa */
+	sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
+	soa->soa_os = os;
+	soa->soa_tx = tx;
+	soa->soa_zio = zio;
+
+	/* sio is a child of the arc_write zio and parent of the sda_cio(s). 
*/ + zio_t *sio = zio_null(zio, os->os_spa, NULL, + dmu_objset_sync_sublists_done, soa, ZIO_FLAG_MUSTSUCCEED); + ml = &os->os_dirty_dnodes[txgoff]; num_sublists = multilist_get_num_sublists(ml); for (int i = 0; i < num_sublists; i++) { @@ -1748,30 +1809,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) sda->sda_list = ml; sda->sda_sublist_idx = i; sda->sda_tx = tx; + sda->sda_cio = zio_null(sio, os->os_spa, NULL, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, sync_dnodes_task, sda, 0); /* callback frees sda */ } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_remove_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - zio_nowait(dr->dr_zio); - } + /* Syncs the meta dnode even if all sublists were empty */ + zio_nowait(sio); - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; - } - - /* - * Free intent log blocks up to this tx. - */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); + return (pio); } boolean_t diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 8e39af83bb0a..8cffbdb9d20b 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. + * Does not wait for zio completions. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index d6db61729223..0002e8264ab7 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, return (error); } +/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */ void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *pio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); @@ -2098,7 +2099,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } - dmu_objset_sync(ds->ds_objset, zio, tx); + zio_nowait(dmu_objset_sync(ds->ds_objset, pio, tx)); } /* diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 9120fef93c74..1bf6a06f3952 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -140,11 +140,6 @@ uint_t zfs_delay_min_dirty_percent = 60; */ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; -/* - * This determines the number of threads used by the dp_sync_taskq. - */ -static int zfs_sync_taskq_batch_pct = 75; - /* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). 
When an itxg @@ -215,8 +210,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_sync_task_t, dst_node)); dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); + MIN(spa->spa_alloc_count, boot_ncpus), minclsyspri, 1, INT_MAX, 0); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -580,9 +574,7 @@ dsl_pool_mos_diduse_space(dsl_pool_t *dp, static void dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) { - zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(dp->dp_meta_objset, zio, tx); - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(dmu_objset_sync(dp->dp_meta_objset, NULL, tx))); dmu_objset_sync_done(dp->dp_meta_objset, tx); taskq_wait(dp->dp_sync_taskq); multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); @@ -674,7 +666,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { - zio_t *zio; + zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; @@ -704,9 +696,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } /* - * Write out all dirty blocks of dirty datasets. + * Write out all dirty blocks of dirty datasets. Note, this could + * create a very large (+10k) zio tree. */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because @@ -715,9 +708,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Update the long range free counter after @@ -748,13 +741,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to @@ -767,7 +760,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can @@ -1481,9 +1474,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, - "Max percent of CPUs that are used to sync dirty data"); - ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq");
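
For reference, the zio dependency tree this patch builds per dirty objset is:
rio (pool-wide root, waited on in dsl_pool_sync) <- pio (per-objset null zio
returned by dmu_objset_sync) <- the objset arc_write zio <- sio (whose done
callback is dmu_objset_sync_sublists_done) <- one sda_cio per dirty-dnode
sublist.  The toy program below is a minimal, illustrative sketch of that
completion ordering only; it is not ZFS code, and node_t and every other
identifier in it are hypothetical stand-ins.

/*
 * Illustrative only -- not part of the patch.  A single-threaded toy model
 * of the completion ordering in the new zio tree.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct node node_t;
struct node {
	const char *name;
	int pending;			/* children not yet complete */
	node_t *parent;
	void (*done)(node_t *);		/* runs when this node completes */
};

static node_t *
node_create(const char *name, node_t *parent, void (*done)(node_t *))
{
	node_t *n = calloc(1, sizeof (*n));
	n->name = name;
	n->parent = parent;
	n->done = done;
	if (parent != NULL)
		parent->pending++;
	return (n);
}

static void
node_complete(node_t *n)
{
	node_t *parent = n->parent;

	if (n->done != NULL)
		n->done(n);
	printf("%s complete\n", n->name);
	free(n);
	/* A parent completes only after its last child completes. */
	if (parent != NULL && --parent->pending == 0)
		node_complete(parent);
}

/* Stand-in for dmu_objset_sync_sublists_done(): all sublists are synced. */
static void
sublists_done(node_t *n)
{
	(void) n;
	printf("-- every sublist synced; meta-dnode/ZIL work may now issue\n");
}

int
main(void)
{
	/* rio -> pio -> objset arc_write "zio" -> sio -> per-sublist children */
	node_t *rio = node_create("rio (pool root)", NULL, NULL);
	node_t *pio = node_create("pio (per-objset)", rio, NULL);
	node_t *zio = node_create("zio (objset arc_write)", pio, NULL);
	node_t *sio = node_create("sio (sublist barrier)", zio, sublists_done);
	node_t *cio0 = node_create("sda_cio[0]", sio, NULL);
	node_t *cio1 = node_create("sda_cio[1]", sio, NULL);

	/* Leaves may complete in any order; parents fire only afterwards. */
	node_complete(cio1);
	node_complete(cio0);
	return (0);
}

Running it prints the sda_cio leaves first, then the sio barrier (with the
meta-dnode/ZIL message), then zio, pio, and rio, mirroring the order in which
the corresponding zios may complete under this patch.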