diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index 9f6e0fdd601b..96bff4e4e063 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -235,7 +235,8 @@ void dmu_objset_evict_dbufs(objset_t *os);
 inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
-void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+zio_t *dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+void dmu_objset_sync_sublists_done(zio_t *zio);
 boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
 objset_t *dmu_objset_create_impl_dnstats(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs,
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index c0c2692c113a..6321895c73e8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4587,6 +4587,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	}
 }
 
+/* May be called recursively from dbuf_sync_indirect(). */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
@@ -5005,7 +5006,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index d134d4958f7c..2fe217475132 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1625,6 +1625,7 @@ typedef struct sync_dnodes_arg {
 	int sda_sublist_idx;
 	multilist_t *sda_newlist;
 	dmu_tx_t *sda_tx;
+	zio_t *sda_cio;
 } sync_dnodes_arg_t;
 
 static void
@@ -1639,20 +1640,66 @@ sync_dnodes_task(void *arg)
 
 	multilist_sublist_unlock(ms);
 
+	zio_nowait(sda->sda_cio);
+
 	kmem_free(sda, sizeof (*sda));
 }
 
+typedef struct sync_objset_arg {
+	objset_t *soa_os;
+	dmu_tx_t *soa_tx;
+	zio_t *soa_zio;
+} sync_objset_arg_t;
+
+static void
+sync_dnodes_finish_task(void *arg)
+{
+	sync_objset_arg_t *soa = arg;
+	objset_t *os = soa->soa_os;
+	dmu_tx_t *tx = soa->soa_tx;
+
+	/* Free intent log blocks up to this tx. (May block.) */
+	zil_sync(os->os_zil, tx);
+	os->os_phys->os_zil_header = os->os_zil_header;
+
+	zio_nowait(soa->soa_zio);
+	kmem_free(soa, sizeof (*soa));
+}
 
-/* called from dsl */
 void
-dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
+dmu_objset_sync_sublists_done(zio_t *zio)
+{
+	sync_objset_arg_t *soa = zio->io_private;
+	objset_t *os = soa->soa_os;
+	dmu_tx_t *tx = soa->soa_tx;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	dbuf_dirty_record_t *dr;
+
+	list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+	while ((dr = list_remove_head(list)) != NULL) {
+		ASSERT0(dr->dr_dbuf->db_level);
+		zio_nowait(dr->dr_zio);
+	}
+
+	/* Enable dnode backfill if enough objects have been freed. */
+	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+		os->os_rescan_dnodes = B_TRUE;
+		os->os_freed_dnodes = 0;
+	}
+
+	/* sync_dnodes_finish_task calls zil_sync on our behalf. */
+	(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+	    sync_dnodes_finish_task, soa, TQ_FRONT);
+}
+
+/* Nonblocking objset sync. Called from dsl. */
+zio_t *
+dmu_objset_sync(objset_t *os, zio_t *rio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
-	list_t *list;
-	dbuf_dirty_record_t *dr;
 	int num_sublists;
 	multilist_t *ml;
 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
@@ -1696,6 +1743,10 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		    DMU_OT_OBJSET, NULL, NULL, NULL);
 	}
 
+	/* pio is a child of rio and a parent of zio */
+	zio_t *pio = zio_null(rio, os->os_spa, NULL, NULL, NULL,
+	    ZIO_FLAG_MUSTSUCCEED);
+
 	zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy,
 	    os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os), &zp,
 	    dmu_objset_write_ready, NULL, dmu_objset_write_done,
@@ -1739,6 +1790,16 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
+	/* sync_dnodes_finish_task executes zio and frees soa */
+	sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
+	soa->soa_os = os;
+	soa->soa_tx = tx;
+	soa->soa_zio = zio;
+
+	/* sio is a child of the arc_write zio and parent of the sda_cio(s). */
+	zio_t *sio = zio_null(zio, os->os_spa, NULL,
+	    dmu_objset_sync_sublists_done, soa, ZIO_FLAG_MUSTSUCCEED);
+
 	ml = &os->os_dirty_dnodes[txgoff];
 	num_sublists = multilist_get_num_sublists(ml);
 	for (int i = 0; i < num_sublists; i++) {
@@ -1748,30 +1809,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 		sda->sda_list = ml;
 		sda->sda_sublist_idx = i;
 		sda->sda_tx = tx;
+		sda->sda_cio = zio_null(sio, os->os_spa, NULL, NULL, NULL,
+		    ZIO_FLAG_MUSTSUCCEED);
 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
 		    sync_dnodes_task, sda, 0);
 		/* callback frees sda */
 	}
-	taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
 
-	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
-	while ((dr = list_remove_head(list)) != NULL) {
-		ASSERT0(dr->dr_dbuf->db_level);
-		zio_nowait(dr->dr_zio);
-	}
+	/* Syncs the meta dnode even if all sublists were empty */
+	zio_nowait(sio);
 
-	/* Enable dnode backfill if enough objects have been freed. */
-	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
-		os->os_rescan_dnodes = B_TRUE;
-		os->os_freed_dnodes = 0;
-	}
-
-	/*
-	 * Free intent log blocks up to this tx.
-	 */
-	zil_sync(os->os_zil, tx);
-	os->os_phys->os_zil_header = os->os_zil_header;
-	zio_nowait(zio);
+	return (pio);
 }
 
 boolean_t
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 8e39af83bb0a..8cffbdb9d20b 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 
 /*
  * Write out the dnode's dirty buffers.
+ * Does not wait for zio completions.
  */
 void
 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index d6db61729223..0002e8264ab7 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
 	return (error);
 }
 
+/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
 void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *pio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
@@ -2098,7 +2099,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
 	}
 
-	dmu_objset_sync(ds->ds_objset, zio, tx);
+	zio_nowait(dmu_objset_sync(ds->ds_objset, pio, tx));
 }
 
 /*
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 9120fef93c74..1bf6a06f3952 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -140,11 +140,6 @@ uint_t zfs_delay_min_dirty_percent = 60;
  */
 uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 
-/*
- * This determines the number of threads used by the dp_sync_taskq.
- */
-static int zfs_sync_taskq_batch_pct = 75;
-
 /*
  * These tunables determine the behavior of how zil_itxg_clean() is
  * called via zil_clean() in the context of spa_sync(). When an itxg
@@ -215,8 +210,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	    offsetof(dsl_sync_task_t, dst_node));
 
 	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
-	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
-	    TASKQ_THREADS_CPU_PCT);
+	    MIN(spa->spa_alloc_count, boot_ncpus), minclsyspri, 1, INT_MAX, 0);
 
 	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
 	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
@@ -580,9 +574,7 @@ dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 static void
 dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 {
-	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
-	VERIFY0(zio_wait(zio));
+	VERIFY0(zio_wait(dmu_objset_sync(dp->dp_meta_objset, NULL, tx)));
 	dmu_objset_sync_done(dp->dp_meta_objset, tx);
 	taskq_wait(dp->dp_sync_taskq);
 	multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes);
@@ -674,7 +666,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
-	zio_t *zio;
+	zio_t *rio;	/* root zio for all dirty dataset syncs */
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
@@ -704,9 +696,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	}
 
 	/*
-	 * Write out all dirty blocks of dirty datasets.
+	 * Write out all dirty blocks of dirty datasets. Note, this could
+	 * create a very large (+10k) zio tree.
 	 */
-	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
@@ -715,9 +708,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
 		list_insert_tail(&synced_datasets, ds);
-		dsl_dataset_sync(ds, zio, tx);
+		dsl_dataset_sync(ds, rio, tx);
 	}
-	VERIFY0(zio_wait(zio));
+	VERIFY0(zio_wait(rio));
 
 	/*
 	 * Update the long range free counter after
@@ -748,13 +741,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
 	 */
-	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		objset_t *os = ds->ds_objset;
 
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
-		dsl_dataset_sync(ds, zio, tx);
+		dsl_dataset_sync(ds, rio, tx);
 
 		/*
 		 * Release any key mappings created by calls to
@@ -767,7 +760,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 			key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
 		}
 	}
-	VERIFY0(zio_wait(zio));
+	VERIFY0(zio_wait(rio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
@@ -1481,9 +1474,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
	"How quickly delay approaches infinity");
 
-ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
-	"Max percent of CPUs that are used to sync dirty data");
-
 ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
	"Max percent of CPUs that are used per dp_sync_taskq");
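
The standalone sketch below is not part of the patch; it only illustrates, under stated assumptions, the completion chaining the change builds with zio_null()/zio_nowait(): a root (rio) the caller waits on, a per-objset parent (pio), and a fan-out parent (sio) whose done callback fires once every per-sublist child (sda_cio) has completed. All names in the sketch (parent_t, child_done, sublists_done, sync_dnodes, NSUBLISTS) are hypothetical stand-ins, and POSIX threads replace the kernel taskq/zio machinery so the example can be compiled and run on its own (cc -pthread).

/*
 * Minimal model of a parent whose callback runs only after every child
 * has signalled completion, mirroring how sio's done callback
 * (dmu_objset_sync_sublists_done) runs after all sda_cio children.
 */
#include <pthread.h>
#include <stdio.h>

typedef struct parent {
	pthread_mutex_t lock;
	int outstanding;		/* children that have not finished yet */
	void (*done)(struct parent *);	/* runs when outstanding drops to 0 */
} parent_t;

static void
parent_init(parent_t *p, int children, void (*done)(parent_t *))
{
	pthread_mutex_init(&p->lock, NULL);
	p->outstanding = children;
	p->done = done;
}

/* Analogous to a child zio completing: the last completion fires the parent. */
static void
child_done(parent_t *p)
{
	pthread_mutex_lock(&p->lock);
	int remaining = --p->outstanding;
	pthread_mutex_unlock(&p->lock);
	if (remaining == 0)
		p->done(p);
}

static parent_t sio;	/* stand-in for the patch's sio zio_null() */

static void
sublists_done(parent_t *p)
{
	(void) p;
	/* Stand-in for dmu_objset_sync_sublists_done(): issue deferred work. */
	printf("all sublists synced; issuing meta dnode writes\n");
}

/* Stand-in for sync_dnodes_task(): do the work, then signal the child. */
static void *
sync_dnodes(void *arg)
{
	printf("sublist %ld synced\n", (long)arg);
	child_done(&sio);
	return (NULL);
}

int
main(void)
{
	enum { NSUBLISTS = 4 };
	pthread_t tid[NSUBLISTS];

	parent_init(&sio, NSUBLISTS, sublists_done);
	for (long i = 0; i < NSUBLISTS; i++)
		pthread_create(&tid[i], NULL, sync_dnodes, (void *)i);
	for (int i = 0; i < NSUBLISTS; i++)
		pthread_join(tid[i], NULL);
	return (0);
}

In the patch itself the same bookkeeping is done by zio parent/child accounting: each sda_cio is created as a child of sio, so sio's done callback runs only after every sublist task has called zio_nowait() on its sda_cio, which is what lets dmu_objset_sync() return without a blocking taskq_wait().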