Skip to content

Commit

Permalink
Remove ARC/ZIO physdone callbacks.
Browse files Browse the repository at this point in the history
Those callbacks were introduced many years ago as part of a bigger
patch to smoothen the write throttling within a txg. They allow to
account completion of individual physical writes within a logical
one, improving cases when some of physical writes complete much
sooner than others, gradually opening the write throttle.

Few years after that ZFS got allocation throttling, working on a
level of logical writes and limiting number of writes queued to
vdevs at any point, and so limiting latency distribution between
the physical writes and especially writes of multiple copies.
The addition of scheduling deadline I proposed in openzfs#14925 should
further reduce the latency distribution.  Grown memory sizes over
the past 10 years should also reduce importance of the smoothing.

While the use of physdone callback may still in theory provide
some smoother throttling, there are cases where we simply can not
afford it.  Since dirty data accounting is protected by pool-wide
lock, in case of 6-wide RAIDZ, for example, it requires us to take
it 8 times per logical block write, creating huge lock contention.

My tests of this patch show radical reduction of the lock spinning
time on workloads when smaller blocks are written to RAIDZ pools,
when each of the disks receives 8-16KB chunks, but the total rate
reaching 100K+ blocks per second.  Same time attempts to measure
any write time fluctuations didn't show anything noticeable.

Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
  • Loading branch information
amotin committed Jun 6, 2023
1 parent 8653f1d commit dfa8783
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 133 deletions.
5 changes: 2 additions & 3 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,8 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
arc_write_done_func_t *physdone, arc_write_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
arc_write_done_func_t *done, void *priv, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb);

arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
void arc_remove_prune_callback(arc_prune_t *p);
Expand Down
1 change: 0 additions & 1 deletion include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ struct arc_write_callback {
void *awcb_private;
arc_write_done_func_t *awcb_ready;
arc_write_done_func_t *awcb_children_ready;
arc_write_done_func_t *awcb_physdone;
arc_write_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
Expand Down
7 changes: 2 additions & 5 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,6 @@ struct zio {
/* Callback info */
zio_done_func_t *io_ready;
zio_done_func_t *io_children_ready;
zio_done_func_t *io_physdone;
zio_done_func_t *io_done;
void *io_private;
int64_t io_prev_space_delta; /* DMU private */
Expand Down Expand Up @@ -505,7 +504,6 @@ struct zio {
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t io_child_count;
uint64_t io_phys_children;
uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
Expand Down Expand Up @@ -554,9 +552,8 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *priv, zio_priority_t priority, zio_flag_t flags,
const zbookmark_phys_t *zb);
zio_done_func_t *done, void *priv, zio_priority_t priority,
zio_flag_t flags, const zbookmark_phys_t *zb);

extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
Expand Down
22 changes: 4 additions & 18 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6676,18 +6676,6 @@ arc_write_children_ready(zio_t *zio)
callback->awcb_children_ready(zio, buf, callback->awcb_private);
}

/*
* The SPA calls this callback for each physical write that happens on behalf
* of a logical write. See the comment in dbuf_write_physdone() for details.
*/
static void
arc_write_physdone(zio_t *zio)
{
arc_write_callback_t *cb = zio->io_private;
if (cb->awcb_physdone != NULL)
cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
}

static void
arc_write_done(zio_t *zio)
{
Expand Down Expand Up @@ -6777,9 +6765,9 @@ zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
const zio_prop_t *zp, arc_write_done_func_t *ready,
arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
arc_write_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
Expand Down Expand Up @@ -6826,7 +6814,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
Expand Down Expand Up @@ -6863,8 +6850,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
priority, zio_flags, zb);
arc_write_done, callback, priority, zio_flags, zb);

return (zio);
}
Expand Down
94 changes: 9 additions & 85 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -4369,22 +4369,6 @@ dbuf_lightweight_ready(zio_t *zio)
rw_exit(&parent_db->db_rwlock);
}

static void
dbuf_lightweight_physdone(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
ASSERT3U(dr->dr_txg, ==, zio->io_txg);

/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dbuf_lightweight_done().
*/
int delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
}

static void
dbuf_lightweight_done(zio_t *zio)
{
Expand All @@ -4403,16 +4387,8 @@ dbuf_lightweight_done(zio_t *zio)
dsl_dataset_block_born(ds, zio->io_bp, tx);
}

/*
* See comment in dbuf_write_done().
*/
if (zio->io_phys_children == 0) {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted, zio->io_txg);
} else {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
}
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
zio->io_txg);

abd_free(dr->dt.dll.dr_abd);
kmem_free(dr, sizeof (*dr));
Expand Down Expand Up @@ -4446,8 +4422,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
ZIO_PRIORITY_ASYNC_WRITE,
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

zio_nowait(dr->dr_zio);
Expand Down Expand Up @@ -4789,37 +4764,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
}

/*
* The SPA will call this callback several times for each zio - once
* for every physical child i/o (zio->io_phys_children times). This
* allows the DMU to monitor the progress of each logical i/o. For example,
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
* block. There may be a long delay before all copies/fragments are completed,
* so this callback allows us to retire dirty space gradually, as the physical
* i/os complete.
*/
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
(void) buf;
dmu_buf_impl_t *db = arg;
objset_t *os = db->db_objset;
dsl_pool_t *dp = dmu_objset_pool(os);
dbuf_dirty_record_t *dr;
int delta = 0;

dr = db->db_data_pending;
ASSERT3U(dr->dr_txg, ==, zio->io_txg);

/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dbuf_write_done().
*/
delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
}

static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
Expand Down Expand Up @@ -4894,27 +4838,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);

/*
* If we didn't do a physical write in this ZIO and we
* still ended up here, it means that the space of the
* dbuf that we just released (and undirtied) above hasn't
* been marked as undirtied in the pool's accounting.
*
* Thus, we undirty that space in the pool's view of the
* world here. For physical writes this type of update
* happens in dbuf_write_physdone().
*
* If we did a physical write, cleanup any rounding errors
* that came up due to writing multiple copies of a block
* on disk [see dbuf_write_physdone()].
*/
if (zio->io_phys_children == 0) {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted, zio->io_txg);
} else {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
}
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
zio->io_txg);

kmem_free(dr, sizeof (dbuf_dirty_record_t));
}
Expand Down Expand Up @@ -5162,7 +5087,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)

dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, NULL,
dbuf_write_override_ready, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
Expand All @@ -5176,7 +5101,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
dbuf_write_nofill_ready, NULL, NULL,
dbuf_write_nofill_ready, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
Expand All @@ -5195,9 +5120,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
children_ready_cb, dbuf_write_physdone,
dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED, &zb);
children_ready_cb, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}

Expand Down
4 changes: 2 additions & 2 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1698,7 +1698,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));

return (0);
Expand Down Expand Up @@ -1864,7 +1864,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)

zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

return (0);
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -1698,7 +1698,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)

zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
&zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

/*
Expand Down
22 changes: 4 additions & 18 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1162,9 +1162,8 @@ zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, zio_flag_t flags,
const zbookmark_phys_t *zb)
zio_done_func_t *done, void *private, zio_priority_t priority,
zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;

Expand All @@ -1184,7 +1183,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,

zio->io_ready = ready;
zio->io_children_ready = children_ready;
zio->io_physdone = physdone;
zio->io_prop = *zp;

/*
Expand Down Expand Up @@ -1517,16 +1515,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}


zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);

zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
zio->io_logical->io_phys_children++;

return (zio);
}

Expand Down Expand Up @@ -2975,7 +2968,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
has_data ? abd_get_offset(pio->io_abd, pio->io_size -
resid) : NULL, lsize, lsize, &zp,
zio_write_gang_member_ready, NULL, NULL,
zio_write_gang_member_ready, NULL,
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

Expand Down Expand Up @@ -3437,7 +3430,7 @@ zio_ddt_write(zio_t *zio)
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

Expand Down Expand Up @@ -4147,13 +4140,6 @@ zio_vdev_io_assess(zio_t *zio)
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
zio->io_physdone != NULL) {
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
zio->io_physdone(zio->io_logical);
}

return (zio);
}

Expand Down

0 comments on commit dfa8783

Please sign in to comment.