zvol: use multiple taskqs #15992

Merged on Apr 4, 2024 (1 commit)
7 changes: 7 additions & 0 deletions man/man4/zfs.4
@@ -2350,6 +2350,13 @@ The number of requests which can be handled concurrently is controlled by
is ignored when running on a kernel that supports block multiqueue
.Pq Li blk-mq .
.
.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint
Number of zvol taskqs.
If
.Sy 0
(the default) then scaling is done internally to prefer 6 threads per taskq.
This only applies on Linux.
.
.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
The number of system wide threads to use for processing zvol block IOs.
If
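Usage note: since zvol_num_taskqs is registered read-only (0444), it must be set at module load time, for example with a line like "options zfs zvol_num_taskqs=4" in /etc/modprobe.d/zfs.conf or with zfs.zvol_num_taskqs=4 on the kernel command line; the value 4 is purely illustrative.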
102 changes: 92 additions & 10 deletions module/os/linux/zfs/zvol_os.c
@@ -37,6 +37,7 @@
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
@@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
* Switch taskq at a multiple of 512 MB offset. This can be set to a lower value
* to utilize more threads for small files but may affect prefetch hits.
*/
#define ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif
@@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
* read and write tests to a zvol in an NVMe pool (with 16 CPUs).
*/
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
static unsigned int zvol_num_taskqs = 0;
#endif

#ifndef BLKDEV_DEFAULT_RQ
@@ -114,7 +122,11 @@ struct zvol_state_os {
boolean_t use_blk_mq;
};

static taskq_t *zvol_taskq;
typedef struct zv_taskq {
uint_t tqs_cnt;
taskq_t **tqs_taskq;
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;

typedef struct zv_request_stack {
@@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
}

zv_request_task_t *task;
zv_taskq_t *ztqs = &zvol_taskqs;
uint_t blk_mq_hw_queue = 0;
uint_t tq_idx;
uint_t taskq_hash;
#ifdef HAVE_BLK_MQ
if (rq)
blk_mq_hw_queue = rq->mq_hctx->queue_num;
#endif
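/*
* Pick a taskq by hashing the zvol pointer, the 512 MB offset bucket
* (see ZVOL_TASKQ_OFFSET_SHIFT) and, under blk-mq, the hardware queue:
* I/O to the same region of the same zvol stays on one taskq, while
* unrelated streams spread across all of them.
*/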
taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
blk_mq_hw_queue, 0);
tq_idx = taskq_hash % ztqs->tqs_cnt;

if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
@@ -601,15 +624,15 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_discard(&zvr);
} else {
task = zv_request_task_create(zvr);
taskq_dispatch_ent(zvol_taskq,
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_discard_task, task, 0, &task->ent);
}
} else {
if (force_sync) {
zvol_write(&zvr);
} else {
task = zv_request_task_create(zvr);
taskq_dispatch_ent(zvol_taskq,
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_write_task, task, 0, &task->ent);
}
}
@@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_read(&zvr);
} else {
task = zv_request_task_create(zvr);
taskq_dispatch_ent(zvol_taskq,
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_read_task, task, 0, &task->ent);
}
}
@@ -1570,8 +1593,40 @@ zvol_init(void)
zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
}

/*
* Use at least 32 zvol_threads but for many-core systems,
Contributor:

I think we can simplify to make it easier to reason about. One idea is:

  • Systems with fewer than 32 CPUs retain the previous behavior of 1 taskq and 32 threads (or the zvol_threads value).
  • Systems with 32 or more CPUs scale the number of taskqs with CPU count / [6|8] (or some other #define value), and the per-taskq thread count can be 8 (perhaps making this a tunable).

It would be close to the values and behavior you have, and the code would be a little simpler.

Member Author:

Apologies if I misunderstood your reasoning, but it's worth noting that single-taskq lock contention can be observed even on systems with fewer than 32 CPUs.

* prefer 6 threads per taskq, but no more taskqs
* than threads in them on large systems.
*
*                    taskq   total
*    cpus  taskqs  threads threads
*  ------- ------- ------- -------
*       1       1      32       32
*       2       1      32       32
*       4       1      32       32
*       8       2      16       32
*      16       3      11       33
*      32       5       7       35
*      64       8       8       64
*     128      11      12      132
*     256      16      16      256
*/
zv_taskq_t *ztqs = &zvol_taskqs;
uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
if (num_tqs == 0) {
num_tqs = 1 + num_online_cpus() / 6;
while (num_tqs * num_tqs > zvol_actual_threads)
num_tqs--;
}
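/* Split the worker threads across the taskqs, rounding the per-taskq count up. */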
uint_t per_tq_thread = zvol_actual_threads / num_tqs;
if (per_tq_thread * num_tqs < zvol_actual_threads)
per_tq_thread++;
ztqs->tqs_cnt = num_tqs;
ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
error = register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
ztqs->tqs_taskq = NULL;
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
return (error);
}
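To make the table in the comment above easy to verify, here is a small, self-contained userspace sketch (an editorial illustration, not code from this PR) that reproduces the default sizing. It assumes the default thread count of MAX(num_online_cpus(), 32) that applies when zvol_threads is left at 0, which matches the "total threads" column above.

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned int cpus[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256 };

	for (unsigned int i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++) {
		unsigned int ncpus = cpus[i];
		/* Assumed default: zvol_threads == 0 means MAX(ncpus, 32). */
		unsigned int threads = MAX(ncpus, 32);

		/* Prefer roughly one taskq per 6 CPUs ... */
		unsigned int num_tqs = 1 + ncpus / 6;
		/* ... but no more taskqs than threads in each of them. */
		while (num_tqs * num_tqs > threads)
			num_tqs--;

		/* Split the threads across the taskqs, rounding up. */
		unsigned int per_tq = threads / num_tqs;
		if (per_tq * num_tqs < threads)
			per_tq++;

		printf("%4u cpus: %2u taskqs x %2u threads = %3u total\n",
		    ncpus, num_tqs, per_tq, num_tqs * per_tq);
	}
	return (0);
}

Running it prints one line per CPU count and matches the rows of the table above, which is a quick way to sanity-check changes to the 6-threads-per-taskq heuristic.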
@@ -1591,11 +1646,22 @@ zvol_init(void)
1024);
}
#endif
zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
unregister_blkdev(zvol_major, ZVOL_DRIVER);
return (-ENOMEM);
for (uint_t i = 0; i < num_tqs; i++) {
char name[32];
(void) snprintf(name, sizeof (name), "%s_tq-%u",
ZVOL_DRIVER, i);
ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
maxclsyspri, per_tq_thread, INT_MAX,
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (ztqs->tqs_taskq[i] == NULL) {
for (int j = i - 1; j >= 0; j--)
taskq_destroy(ztqs->tqs_taskq[j]);
unregister_blkdev(zvol_major, ZVOL_DRIVER);
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
sizeof (taskq_t *));
ztqs->tqs_taskq = NULL;
return (-ENOMEM);
}
}

zvol_init_impl();
@@ -1606,9 +1672,22 @@ zvol_init(void)
void
zvol_fini(void)
{
zv_taskq_t *ztqs = &zvol_taskqs;
zvol_fini_impl();
unregister_blkdev(zvol_major, ZVOL_DRIVER);
taskq_destroy(zvol_taskq);

if (ztqs->tqs_taskq == NULL) {
ASSERT3U(ztqs->tqs_cnt, ==, 0);
} else {
for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
taskq_destroy(ztqs->tqs_taskq[i]);
}
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
sizeof (taskq_t *));
ztqs->tqs_taskq = NULL;
}

ida_destroy(&zvol_ida);
}

@@ -1629,6 +1708,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_num_taskqs, uint, 0444);
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
