Another set of vdev queue optimizations.
Switch the FIFO queues (SYNC/TRIM) and the active queue of the vdev
queue from time-sorted AVL trees to simple lists.  AVL trees are too
expensive for such a simple task.  To allow changing an I/O's priority
without searching through the trees, add an io_queue_state field to
struct zio.
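For illustration, a minimal sketch of how this enables a cheap priority
change (the real implementation lives in module/zfs/vdev_queue.c, whose
large diff is not rendered below; vdev_queue_class_add() and
vdev_queue_class_remove() are assumed helpers that link a zio into and
out of its class queue):

void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;

	mutex_enter(&vq->vq_lock);
	if (zio->io_queue_state == ZIO_QS_QUEUED) {
		/* Unlink from the old class queue, relink into the new one. */
		vdev_queue_class_remove(vq, zio);
		zio->io_priority = priority;
		vdev_queue_class_add(vq, zio);
	} else if (zio->io_queue_state == ZIO_QS_NONE) {
		/* Not queued anywhere; just retag the I/O. */
		zio->io_priority = priority;
	}
	/* ZIO_QS_ACTIVE: already issued to the device; leave it alone. */
	mutex_exit(&vq->vq_lock);
}

Since io_queue_state says exactly where the zio is linked, no tree
lookup is needed to find it.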

To avoid checking the number of queued I/Os for each priority, add a
vq_cqueued bitmap to struct vdev_queue and update it when adding or
removing I/Os.  Make vq_cactive a separate array instead of a struct
vdev_queue_class member.  Together these avoid many cache misses when
looking for work in vdev_queue_class_to_issue().
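The effect is easiest to see in the selection fast path: one 32-bit
word says which classes have queued work and one small array holds
their active counts, so skipping an idle class is a bit test rather
than a dereference into a tree.  A rough sketch, under the same caveat
that the real code is in the unrendered vdev_queue.c diff
(vdev_queue_class_min_active() stands in for the existing per-class
limit logic, and the real function also does a second pass against the
max-active limits, omitted here):

static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
	uint32_t cq = vq->vq_cqueued;	/* Bit p set: class p has queued I/Os. */

	if (cq == 0)
		return (ZIO_PRIORITY_NUM_QUEUEABLE);	/* Nothing queued. */

	/* Round-robin from the last issued priority to reduce starvation. */
	for (int n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
		zio_priority_t p = (vq->vq_last_prio + 1 + n) %
		    ZIO_PRIORITY_NUM_QUEUEABLE;
		if ((cq & (1U << p)) != 0 &&
		    vq->vq_cactive[p] < vdev_queue_class_min_active(vq, p))
			return (p);
	}
	return (ZIO_PRIORITY_NUM_QUEUEABLE);	/* Every class at its limit. */
}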

Introduce a deadline of ~0.5 s for the LBA-sorted queues.  Before this
change I saw some I/Os waiting in a queue for up to 8 seconds, and
possibly more, due to starvation.  With this change I no longer see it.
I had to make the comparison function slightly more complicated, but
since it accesses all the same cache lines the difference is minimal.
For sequential I/Os the new code in vdev_queue_io_to_issue() can often
use the simpler avl_first(), falling back to avl_find() and
avl_nearest() only when needed.
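The deadline comes almost for free by making the submit time the major
sort key at a very coarse granularity: quantize io_timestamp into
~0.5 s buckets and order by bucket first, LBA second.  Within one
bucket the tree stays purely offset-sorted, but no I/O can be bypassed
for much longer than one bucket interval.  A sketch of such a
comparator (VDQ_T_SHIFT and the use of the TREE_CMP()/TREE_PCMP()
helpers are assumed here, following existing ZFS conventions;
2^29 ns is about 0.54 s):

#define	VDQ_T_SHIFT	29	/* ~0.5 s deadline buckets (2^29 ns). */

static int
vdev_queue_to_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	/* An older deadline bucket always wins... */
	int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
	    z2->io_timestamp >> VDQ_T_SHIFT);
	if (tcmp != 0)
		return (tcmp);

	/* ...ties stay LBA-ordered for sequential issue... */
	int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
	if (ocmp != 0)
		return (ocmp);

	/* ...and pointers break full ties, as AVL requires. */
	return (TREE_PCMP(z1, z2));
}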

Arrange the members of struct zio so that searching through the vdev
queues touches only one cache line.  While there, remove io_alloc_node
and reuse io_queue_node instead; the two are never used at the same
time.
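The union is safe because the two linkages are used at disjoint
pipeline stages: a zio sits in the allocator throttle tree while
waiting for DVA allocation, and in a vdev queue only once issued to a
vdev.  The allocator side keys on the AVL flavor (visible in the
spa_misc.c hunk below); a FIFO class queue would link through the list
flavor, e.g. (sketch only, vq and p assumed in scope):

/* Allocator throttle tree: AVL flavor of the shared node. */
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));

/* FIFO class queue: list flavor of the same node. */
list_create(&vq->vq_class[p].vqc_list, sizeof (zio_t),
    offsetof(zio_t, io_queue_node.l));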

Remove the zfs_vdev_aggregate_trim parameter.  It has been disabled
for the 4 years since it was implemented, while we still wasted time
maintaining the offset-sorted tree of TRIM requests.  Just remove the
tree.

Remove locking from txg_all_lists_empty().  It is racy by design, and
the two pairs of lock/unlock took noticeable time under the vdev queue
lock.

With these changes, in my tests with volblocksize=4KB I measure a 50%
reduction in vdev queue lock spin time on reads and 75% on writes.

Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
amotin committed Jun 10, 2023
1 parent feff9df commit b9abf43
Showing 8 changed files with 204 additions and 171 deletions.
include/sys/vdev.h (3 changes: 2 additions & 1 deletion)

@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
include/sys/vdev_impl.h (15 changes: 6 additions & 9 deletions)

@@ -131,26 +131,23 @@ typedef const struct vdev_ops {
  * Virtual device properties
  */
 typedef struct vdev_queue_class {
-	uint32_t	vqc_active;
-
-	/*
-	 * Sorted by offset or timestamp, depending on if the queue is
-	 * LBA-ordered vs FIFO.
-	 */
-	avl_tree_t	vqc_queued_tree;
+	list_t		vqc_list;
+	avl_tree_t	vqc_tree;
 } vdev_queue_class_t;
 
 struct vdev_queue {
 	vdev_t		*vq_vdev;
 	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
-	avl_tree_t	vq_active_tree;
 	avl_tree_t	vq_read_offset_tree;
 	avl_tree_t	vq_write_offset_tree;
-	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
 	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
+	uint32_t	vq_cqueued;	/* Classes with queued I/Os. */
+	uint32_t	vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+	uint32_t	vq_active;	/* Number of active I/Os. */
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
+	list_t		vq_active_list;	/* List of active I/Os. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
include/sys/zio.h (15 changes: 12 additions & 3 deletions)

@@ -436,6 +436,12 @@ typedef struct zio_link {
 	list_node_t	zl_child_node;
 } zio_link_t;
 
+enum zio_qstate {
+	ZIO_QS_NONE = 0,
+	ZIO_QS_QUEUED,
+	ZIO_QS_ACTIVE,
+};
+
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t io_bookmark;

@@ -480,16 +486,19 @@ struct zio {
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
+	enum zio_qstate	io_queue_state;	/* vdev queue state */
+	union {
+		list_node_t	l;
+		avl_node_t	a;
+	} io_queue_node ____cacheline_aligned;	/* allocator and vdev queues */
+	avl_node_t	io_offset_node;	/* vdev offset queues */
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
-	avl_node_t	io_queue_node;
-	avl_node_t	io_offset_node;
-	avl_node_t	io_alloc_node;
 	zio_alloc_list_t	io_alloc_list;
 
 	/* Internal pipeline state */
man/man4/zfs.4 (6 changes: 0 additions & 6 deletions)

@@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
 Flush dirty data to disk at least every this many seconds (maximum TXG
 duration).
 .
-.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-Allow TRIM I/O operations to be aggregated.
-This is normally not helpful because the extents to be trimmed
-will have been already been aggregated by the metaslab.
-This option is provided for debugging and performance analysis.
-.
 .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 Max vdev I/O aggregation size.
 .
module/zfs/spa_misc.c (2 changes: 1 addition & 1 deletion)

@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
-		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
module/zfs/txg.c (13 changes: 4 additions & 9 deletions)

@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
 boolean_t
 txg_all_lists_empty(txg_list_t *tl)
 {
-	mutex_enter(&tl->tl_lock);
-	for (int i = 0; i < TXG_SIZE; i++) {
-		if (!txg_list_empty_impl(tl, i)) {
-			mutex_exit(&tl->tl_lock);
-			return (B_FALSE);
-		}
-	}
-	mutex_exit(&tl->tl_lock);
-	return (B_TRUE);
+	boolean_t res = B_TRUE;
+	for (int i = 0; i < TXG_SIZE; i++)
+		res &= (tl->tl_head[i] == NULL);
+	return (res);
 }
 
 /*
module/zfs/vdev.c (16 changes: 7 additions & 9 deletions)

@@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 
 	memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
-	for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
-		vsx->vsx_active_queue[t] =
-		    vd->vdev_queue.vq_class[t].vqc_active;
-		vsx->vsx_pend_queue[t] = avl_numnodes(
-		    &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+	for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+		vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+		vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 	}
 }

@@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_enter(&vq->vq_lock);
-	if (avl_numnodes(&vq->vq_active_tree) > 0) {
+	if (vq->vq_active > 0) {
 		spa_t *spa = vd->vdev_spa;
 		zio_t *fio;
 		uint64_t delta;
 
-		zfs_dbgmsg("slow vdev: %s has %lu active IOs",
-		    vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+		zfs_dbgmsg("slow vdev: %s has %u active IOs",
+		    vd->vdev_path, vq->vq_active);
 
 		/*
 		 * Look at the head of all the pending queues,
 		 * if any I/O has been outstanding for longer than
 		 * the spa_deadman_synctime invoke the deadman logic.
 		 */
-		fio = avl_first(&vq->vq_active_tree);
+		fio = list_head(&vq->vq_active_list);
 		delta = gethrtime() - fio->io_timestamp;
 		if (delta > spa_deadman_synctime(spa))
 			zio_deadman(fio, tag);
module/zfs/vdev_queue.c (large diff not rendered by default)