From a91cc99a6b49f87063ddb0ec2917f87b8c967cb4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 14:35:48 +1000 Subject: [PATCH 1/8] linux/zvol_os: tidy and document queue limit/config setup It gets hairier again in Linux 6.11, so I want some actual theory of operation laid out for next time. Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- module/os/linux/zfs/zvol_os.c | 45 +++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 83f80f62aee7..48e49a50a9c3 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2024, Rob Norris * Copyright (c) 2024, Klara, Inc. */ @@ -1089,6 +1090,34 @@ static const struct block_device_operations zvol_ops = { #endif }; +/* + * Since 6.9, Linux has been removing queue limit setters in favour of an + * initial queue_limits struct applied when the device is open. Since 6.11, + * queue_limits is being extended to allow more things to be applied when the + * device is open. Setters are also being removed for this. + * + * For OpenZFS, this means that depending on kernel version, some options may + * be set up before the device is open, and some applied to an open device + * (queue) after the fact. + * + * We manage this complexity by having our own limits struct, + * zvol_queue_limits_t, in which we carry any queue config that we're + * interested in setting. This structure is the same on all kernels. + * + * These limits are then applied to the queue at device open time by the most + * appropriate method for the kernel. + * + * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of + * blk_alloc_disk() exists). This converts our limits struct to a proper Linux + * struct queue_limits, and passes it in. 
Any fields added in later kernels are + * (obviously) not set up here. + * + * zvol_queue_limits_apply() is called on all kernel versions after the queue + * is created, and applies any remaining config. Before 6.9 that will be + * everything, via setter methods. After 6.9 that will be whatever couldn't be + * put into struct queue_limits. (This implies that zvol_queue_limits_apply() + * will always be a no-op on the latest kernel we support). + */ typedef struct zvol_queue_limits { unsigned int zql_max_hw_sectors; unsigned short zql_max_segments; @@ -1175,17 +1204,18 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; } -#else +#endif + static void zvol_queue_limits_apply(zvol_queue_limits_t *limits, struct request_queue *queue) { +#ifndef HAVE_BLK_ALLOC_DISK_2ARG blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); } -#endif static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) @@ -1223,7 +1253,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) } zso->zvo_disk->queue = zso->zvo_queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); @@ -1237,8 +1266,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) } zso->zvo_disk->queue = zso->zvo_queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + + zvol_queue_limits_apply(limits, zso->zvo_queue); + return (0); } @@ -1260,7 +1291,6 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) return (1); } zso->zvo_queue = zso->zvo_disk->queue; - zvol_queue_limits_apply(limits, 
zso->zvo_queue); zso->zvo_disk->minors = ZVOL_MINORS; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; @@ -1291,10 +1321,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); - #endif + + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif + return (0); } From 36f8c0fe34cedde6ca0f1b84315313281b05c60a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Jul 2024 21:40:35 +1000 Subject: [PATCH 2/8] Linux 6.11: enable queue flush through queue limits In 6.11 struct queue_limits gains a 'features' field, where, among other things, flush and write-cache are enabled. Detect it and use it. Along the way, the blk_queue_set_write_cache() compat wrapper gets a little cleanup. Since both flags are always set together, it's now a single bool. Also the very very ancient version that sets q->flush_flags directly couldn't actually turn it off, so I've fixed that. Not that we use it, but still.
Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- config/kernel-make-request-fn.m4 | 21 +++++++++++++ include/os/linux/kernel/linux/blkdev_compat.h | 31 ++++++++++++------- module/os/linux/zfs/zvol_os.c | 12 +++++-- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index 9813ad2fb3f3..4c54bdd6d4a2 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ disk = blk_alloc_disk(lim, NUMA_NO_NODE); ]) + ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [ + #include + ],[ + struct queue_limits *lim = NULL; + lim->features = 0; + ]) + ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [ #include ],[ @@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ AC_MSG_RESULT(yes) AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args]) + dnl # + dnl # Linux 6.11 API change: + dnl # struct queue_limits gains a 'features' field, + dnl # used to set flushing options + dnl # + AC_MSG_CHECKING([whether struct queue_limits has a features field]) + ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1, + [struct queue_limits has a features field]) + ], [ + AC_MSG_RESULT(no) + ]) + dnl # dnl # 5.20 API change, dnl # Removed blk_cleanup_disk(), put_disk() should be used. diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 658f546213de..b7c21f5b317a 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q) #endif /* + * 6.11 API + * Setting the flush flags directly is no longer possible; flush flags are set + * on the queue_limits structure and passed to blk_alloc_disk().
In this case + * we remove this function entirely. + * * 4.7 API, * The blk_queue_write_cache() interface has replaced blk_queue_flush() * interface. However, the new interface is GPL-only thus we implement @@ -68,31 +73,33 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q) * new one is GPL-only. Thus if the GPL-only version is detected we * implement our own trivial helper. */ +#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \ + !defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES) static inline void -blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) +blk_queue_set_write_cache(struct request_queue *q, bool on) { #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY) - if (wc) + if (on) { blk_queue_flag_set(QUEUE_FLAG_WC, q); - else - blk_queue_flag_clear(QUEUE_FLAG_WC, q); - if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_WC, q); blk_queue_flag_clear(QUEUE_FLAG_FUA, q); + } #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE) - blk_queue_write_cache(q, wc, fua); + blk_queue_write_cache(q, on, on); #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) - if (wc) - q->flush_flags |= REQ_FLUSH; - if (fua) - q->flush_flags |= REQ_FUA; + if (on) + q->flush_flags |= REQ_FLUSH | REQ_FUA; + else + q->flush_flags &= ~(REQ_FLUSH | REQ_FUA); #elif defined(HAVE_BLK_QUEUE_FLUSH) - blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0)); + blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0); #else #error "Unsupported kernel" #endif } +#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */ /* * Detect if a device has a write cache. 
Used to set the intial value for the diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 48e49a50a9c3..044e9a35600e 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1203,6 +1203,9 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segments = limits->zql_max_segments; qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; +#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES + qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; +#endif } #endif @@ -1215,6 +1218,9 @@ zvol_queue_limits_apply(zvol_queue_limits_t *limits, blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); +#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES + blk_queue_set_write_cache(queue, B_TRUE); +#endif } static int @@ -1238,6 +1244,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) return (1); } +#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES + blk_queue_set_write_cache(zso->zvo_queue, B_TRUE); +#endif + zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; @@ -1391,8 +1401,6 @@ zvol_alloc(dev_t dev, const char *name) if (ret != 0) goto out_kmem; - blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); - /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); From 179eca837ec393664dbbf2f889c3244ea354bb1c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Jul 2024 22:25:50 +1000 Subject: [PATCH 3/8] Linux 6.11: get backing_dev_info through queue gendisk It's no longer available directly on the request queue, but it's easy to get from the attached disk.
Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- config/kernel-blk-queue.m4 | 28 +++++++++++++++++++ include/os/linux/kernel/linux/blkdev_compat.h | 4 ++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index 2f0b386e6637..a064140f337a 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [ dnl # dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue dnl # 4.12: dynamically allocated bdi in request_queue +dnl # 6.11: bdi no longer available through request_queue, so get it from +dnl # the gendisk attached to the queue dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [ ZFS_LINUX_TEST_SRC([blk_queue_bdi], [ @@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [ ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [ + ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [ + #include + #include + ], [ + struct request_queue q; + struct gendisk disk; + struct backing_dev_info bdi __attribute__ ((unused)); + q.disk = &disk; + q.disk->bdi = &bdi; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [ + AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk]) + ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1, + [backing_dev_info is available through queue gendisk]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 5.9: added blk_queue_update_readahead(), dnl # 5.15: renamed to disk_update_readahead() @@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI + ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE @@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], 
[ ZFS_AC_KERNEL_BLK_QUEUE_PLUG ZFS_AC_KERNEL_BLK_QUEUE_BDI + ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD ZFS_AC_KERNEL_BLK_QUEUE_DISCARD ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index b7c21f5b317a..c2e818b4d4ee 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -133,8 +133,10 @@ blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \ !defined(HAVE_DISK_UPDATE_READAHEAD) -#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC +#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC) q->backing_dev_info->ra_pages = ra_pages; +#elif defined(HAVE_BLK_QUEUE_DISK_BDI) + q->disk->bdi->ra_pages = ra_pages; #else q->backing_dev_info.ra_pages = ra_pages; #endif From 0b543f70bae6cc42b2b1add073d47f1ab699e882 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 12:15:07 +1000 Subject: [PATCH 4/8] Linux 6.11: first arg to proc_handler is now const Detect it, and use a macro to make sure we always match the prototype. 
Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- config/kernel-register_sysctl_table.m4 | 33 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 12 +++++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 index a5e934f56d29..b8a0e0b17332 100644 --- a/config/kernel-register_sysctl_table.m4 +++ b/config/kernel-register_sysctl_table.m4 @@ -25,3 +25,36 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ AC_MSG_RESULT([no]) ]) ]) + +dnl # +dnl # Linux 6.11 makes const the ctl_table arg of proc_handler +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [ + ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [ + #include + + static int test_handler( + const struct ctl_table *ctl __attribute((unused)), + int write __attribute((unused)), + void *buffer __attribute((unused)), + size_t *lenp __attribute((unused)), + loff_t *ppos __attribute((unused))) + { + return (0); + } + ], [ + proc_handler *ph __attribute((unused)) = + &test_handler; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [ + AC_MSG_CHECKING([whether proc_handler ctl_table arg is const]) + ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1, + [proc_handler ctl_table arg is const]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index f282ccd8b9d7..6194c119cca6 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE @@ -319,6 +320,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T 
ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV ZFS_AC_KERNEL_MM_PAGE_SIZE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index f0f929d3ce90..22f587934d25 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -43,6 +43,12 @@ typedef struct ctl_table __no_const spl_ctl_table; typedef struct ctl_table spl_ctl_table; #endif +#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST +#define CONST_CTL_TABLE const struct ctl_table +#else +#define CONST_CTL_TABLE struct ctl_table +#endif + static unsigned long table_min = 0; static unsigned long table_max = ~0; @@ -60,7 +66,7 @@ struct proc_dir_entry *proc_spl_kstat = NULL; #ifdef DEBUG_KMEM static int -proc_domemused(struct ctl_table *table, int write, +proc_domemused(CONST_CTL_TABLE *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; @@ -88,7 +94,7 @@ proc_domemused(struct ctl_table *table, int write, #endif /* DEBUG_KMEM */ static int -proc_doslab(struct ctl_table *table, int write, +proc_doslab(CONST_CTL_TABLE *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; @@ -135,7 +141,7 @@ proc_doslab(struct ctl_table *table, int write, } static int -proc_dohostid(struct ctl_table *table, int write, +proc_dohostid(CONST_CTL_TABLE *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char *end, str[32]; From f618abf1e6d39c5de462e381d6adf1e2ac03453f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 14:48:58 +1000 Subject: [PATCH 5/8] Linux 6.11: IO stats is now a queue feature flag Apply them with the rest of the settings.
Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- module/os/linux/zfs/zvol_os.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 044e9a35600e..5aad4e430c8d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1204,7 +1204,8 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES - qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + qlimits->features = + BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; #endif } #endif @@ -1220,6 +1221,7 @@ zvol_queue_limits_apply(zvol_queue_limits_t *limits, blk_queue_io_opt(queue, limits->zql_io_opt); #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(queue, B_TRUE); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); #endif } @@ -1409,9 +1411,6 @@ zvol_alloc(dev_t dev, const char *name) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } - /* Enable /proc/diskstats */ - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); - zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; From cf941b931c7867de00e5358f49604f6342ed4d8d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 17:22:20 +1000 Subject: [PATCH 6/8] Linux 6.11: add more queue_limit fields with removed setters These fields are very old, so no detection necessary; we just move them into the limit setup functions. 
Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- module/os/linux/zfs/zvol_os.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 5aad4e430c8d..2beec6436bff 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1123,6 +1123,9 @@ typedef struct zvol_queue_limits { unsigned short zql_max_segments; unsigned int zql_max_segment_size; unsigned int zql_io_opt; + unsigned int zql_physical_block_size; + unsigned int zql_max_discard_sectors; + unsigned int zql_discard_granularity; } zvol_queue_limits_t; static void @@ -1191,6 +1194,11 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, } limits->zql_io_opt = zv->zv_volblocksize; + + limits->zql_physical_block_size = zv->zv_volblocksize; + limits->zql_max_discard_sectors = + (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; + limits->zql_discard_granularity = zv->zv_volblocksize; } #ifdef HAVE_BLK_ALLOC_DISK_2ARG @@ -1203,6 +1211,9 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segments = limits->zql_max_segments; qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; + qlimits->physical_block_size = limits->zql_physical_block_size; + qlimits->max_discard_sectors = limits->zql_max_discard_sectors; + qlimits->discard_granularity = limits->zql_discard_granularity; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; @@ -1219,6 +1230,10 @@ zvol_queue_limits_apply(zvol_queue_limits_t *limits, blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); + blk_queue_physical_block_size(queue, limits->zql_physical_block_size); + blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); + 
blk_queue_discard_granularity(queue, limits->zql_discard_granularity); +#endif #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(queue, B_TRUE); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); @@ -1677,14 +1692,6 @@ zvol_os_create_minor(const char *name) set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); - - - blk_queue_physical_block_size(zv->zv_zso->zvo_queue, - zv->zv_volblocksize); - blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, - (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); - blk_queue_discard_granularity(zv->zv_zso->zvo_queue, - zv->zv_volblocksize); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif From ccbacb853b589576c27daa4fca9bcd090dbe52c0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 18:43:39 +1000 Subject: [PATCH 7/8] Linux 6.11: add compat macro for page_mapping() Since the change to folios it has just been a wrapper anyway. Linux has removed their wrapper, so we add one. 
Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- config/kernel-mm-page-size.m4 | 17 ----------- config/kernel-mm-pagemap.m4 | 36 +++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/kernel/linux/mm_compat.h | 7 +++++ module/os/linux/zfs/zfs_vnops_os.c | 1 + 5 files changed, 46 insertions(+), 17 deletions(-) delete mode 100644 config/kernel-mm-page-size.m4 create mode 100644 config/kernel-mm-pagemap.m4 diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 deleted file mode 100644 index d5ebd926986a..000000000000 --- a/config/kernel-mm-page-size.m4 +++ /dev/null @@ -1,17 +0,0 @@ -AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ - ZFS_LINUX_TEST_SRC([page_size], [ - #include - ],[ - unsigned long s; - s = page_size(NULL); - ]) -]) -AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ - AC_MSG_CHECKING([whether page_size() is available]) - ZFS_LINUX_TEST_RESULT([page_size], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-mm-pagemap.m4 b/config/kernel-mm-pagemap.m4 new file mode 100644 index 000000000000..466b6fa07d9a --- /dev/null +++ b/config/kernel-mm-pagemap.m4 @@ -0,0 +1,36 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + + +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [ + ZFS_LINUX_TEST_SRC([page_mapping], [ + #include + ],[ + struct page *p = NULL; + struct address_space *m = page_mapping(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [ + AC_MSG_CHECKING([whether page_mapping() is available]) + ZFS_LINUX_TEST_RESULT([page_mapping], [ + AC_MSG_RESULT(yes) + 
AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 6194c119cca6..d6ea3453292a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -171,6 +171,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE + ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -324,6 +325,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV ZFS_AC_KERNEL_MM_PAGE_SIZE + ZFS_AC_KERNEL_MM_PAGE_MAPPING case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h index 40056c68d6dd..817f6df422de 100644 --- a/include/os/linux/kernel/linux/mm_compat.h +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -21,16 +21,23 @@ /* * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2024, Rob Norris */ #ifndef _ZFS_MM_COMPAT_H #define _ZFS_MM_COMPAT_H #include +#include /* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */ #ifndef HAVE_MM_PAGE_SIZE #define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) #endif +/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */ +#ifndef HAVE_MM_PAGE_MAPPING +#define page_mapping(p) folio_mapping(page_folio(p)) +#endif + #endif /* _ZFS_MM_COMPAT_H */ diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 1cecad9f7755..8061169c3293 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -69,6 +69,7 @@ #include #include #include +#include /* * Programming rules. 
From 50229a1b4edf2987b9a17709c46e4281dc3abce9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 21:39:31 +1000 Subject: [PATCH 8/8] Linux 6.11: avoid passing "end" sentinel to register_sysctl() Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ --- config/kernel-register_sysctl_table.m4 | 26 ++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 41 ++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 index b8a0e0b17332..12ffe9d95142 100644 --- a/config/kernel-register_sysctl_table.m4 +++ b/config/kernel-register_sysctl_table.m4 @@ -26,6 +26,32 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ ]) ]) +dnl # +dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer +dnl # supply a sentinel end-of-table element. 6.6 introduces +dnl # register_sysctl_sz() to enable callers to choose, so we use it if +dnl # available for backward compatibility. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [ + #include + ],[ + struct ctl_table test_table[] __attribute__((unused)) = {0}; + register_sysctl_sz("", test_table, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [ + AC_MSG_CHECKING([whether register_sysctl_sz exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1, + [register_sysctl_sz exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + dnl # dnl # Linux 6.11 makes const the ctl_table arg of proc_handler dnl # diff --git a/config/kernel.m4 b/config/kernel.m4 index d6ea3453292a..4d471358d242 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV @@ -321,6 +322,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 22f587934d25..2c0cdd9febf5 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -22,6 +22,9 @@ * * Solaris Porting Layer (SPL) Proc Implementation. */ +/* + * Copyright (c) 2024, Rob Norris + */ #include #include @@ -694,6 +697,37 @@ static void spl_proc_cleanup(void) } } +#ifndef HAVE_REGISTER_SYSCTL_TABLE + +/* + * Traditionally, struct ctl_table arrays have been terminated by an "empty" + * sentinel element (specifically, one with .procname == NULL). 
+ * + * Linux 6.6 began migrating away from this, adding register_sysctl_sz() so + * that callers could provide the size directly, and redefining + * register_sysctl() to just call register_sysctl_sz() with the array size. It + * retained support for the terminating element so that existing callers would + * continue to work. + * + * Linux 6.11 removed support for the terminating element, instead interpreting + * it as a real malformed element, and rejecting it. + * + * In order to continue to support older kernels, we retain the terminating + * sentinel element for our sysctl tables, but instead detect availability of + * register_sysctl_sz(). If it exists, we pass it the array size -1, stopping + * the kernel from trying to process the terminator. For pre-6.6 kernels that + * don't have register_sysctl_sz(), we just use register_sysctl(), which can + * handle the terminating element as it always has. + */ +#ifdef HAVE_REGISTER_SYSCTL_SZ +#define spl_proc_register_sysctl(p, t) \ + register_sysctl_sz(p, t, ARRAY_SIZE(t)-1) +#else +#define spl_proc_register_sysctl(p, t) \ + register_sysctl(p, t) +#endif +#endif + int spl_proc_init(void) { @@ -704,16 +738,17 @@ spl_proc_init(void) if (spl_header == NULL) return (-EUNATCH); #else - spl_header = register_sysctl("kernel/spl", spl_table); + spl_header = spl_proc_register_sysctl("kernel/spl", spl_table); if (spl_header == NULL) return (-EUNATCH); - spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); + spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table); if (spl_kmem == NULL) { rc = -EUNATCH; goto out; } - spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); + spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat", + spl_kstat_table); if (spl_kstat == NULL) { rc = -EUNATCH; goto out;