From 6c2fc5691615e478d1360101b3ec22906e8ce7a8 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:27:58 -0400 Subject: [PATCH 01/37] Linux 6.5 compat: register_sysctl_table removed Additionally, the .child element of ctl_table has been removed in 6.5. This change adds a new test for the pre-6.5 register_sysctl_table() function, and uses the old code in that case. If it isn't found, then the parentage entries in the tables are removed, and the register_sysctl call is provided the paths of "kernel/spl", "kernel/spl/kmem", and "kernel/spl/kstat" directly, to populate each subdirectory over three calls, as is the new API. Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15098 --- config/kernel-register_sysctl_table.m4 | 27 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 26 ++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 new file mode 100644 index 000000000000..f18316b32b6d --- /dev/null +++ b/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SYSCTL_REGISTER_TABLE, 1, + [sysctl_register_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1487fa2e7793..28bd361d33ff 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,6 +160,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -299,6 +300,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 01f5619e1893..bcc356ae55b6 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + rc = -EUNATCH; + goto out; + } + + if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { From c0f075c06b914b02e175f8de670b7b440630c7bc Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:32:38 -0400 Subject: [PATCH 02/37] Linux 6.5 compat: use disk_check_media_change when it exists When disk_check_media_change() exists, then define zfs_check_media_change() to simply call disk_check_media_change() on the bd_disk member of its argument. Since disk_check_media_change() is newer than when revalidate_disk was present in bops, we should be able to safely do this via a macro, instead of recreating a new implementation of the inline function that forces revalidation. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15101 --- include/os/linux/kernel/linux/blkdev_compat.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index e0f20ba32008..1641dd92a918 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -347,6 +347,7 @@ zfs_check_media_change(struct block_device *bdev) #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) #elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE) #define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk) +#define zfs_check_media_change(bdev) disk_check_media_change(bdev->bd_disk) #else /* * This is encountered if check_disk_change() and bdev_check_media_change() From d76de9fb170d58f81edac5729c365fc3fd60a22f Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:37:20 -0400 Subject: [PATCH 03/37] Linux 6.5 compat: blkdev changes Multiple changes to the blkdev API were introduced in Linux 6.5. This includes passing (void* holder) to blkdev_put, adding a new blk_holder_ops* arg to blkdev_get_by_path, adding a new blk_mode_t type that replaces uses of fmode_t, and removing an argument from the release handler on block_device_operations that we weren't using. The open function definition has also changed to take gendisk* and blk_mode_t, so update it accordingly, too. Implement local wrappers for blkdev_get_by_path() and vdev_blkdev_put() so that the in-line calls are cleaner, and place the conditionally-compiled implementation details inside of both of these local wrappers. Both calls are exclusively used within vdev_disk.c, at this time. Add blk_mode_is_open_write() to test FMODE_WRITE / BLK_OPEN_WRITE The wrapper function is now used for testing using the appropriate method for the kernel, whether the open mode is writable or not. Emphasize fmode_t arg in zvol_release is not used Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15099 --- config/kernel-blkdev.m4 | 84 ++++++++++++++++++- config/kernel-block-device-operations.m4 | 35 +++++++- include/os/linux/kernel/linux/blkdev_compat.h | 6 ++ module/os/linux/zfs/vdev_disk.c | 65 ++++++++++++-- module/os/linux/zfs/zfs_vnops_os.c | 2 +- module/os/linux/zfs/zpl_ctldir.c | 2 +- module/os/linux/zfs/zvol_os.c | 28 ++++++- 7 files changed, 203 insertions(+), 19 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 887acee670ba..e04a2bd2c3b6 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -16,12 +16,63 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ]) ]) +dnl # +dnl # 6.5.x API change, +dnl # blkdev_get_by_path() takes 4 args +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ + ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + bdev = blkdev_get_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ - AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args]) + ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [ + AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1, + [blkdev_get_by_path() exists and takes 4 args]) + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) + ]) +]) + +dnl # +dnl # 6.5.x API change +dnl # blk_mode_t was added as a type to supercede some places where fmode_t +dnl # is used +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [ + ZFS_LINUX_TEST_SRC([blk_mode_t], [ + #include + #include + ], [ + blk_mode_t m __attribute((unused)) = (blk_mode_t)0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [ + AC_MSG_CHECKING([whether blk_mode_t is defined]) + ZFS_LINUX_TEST_RESULT([blk_mode_t], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined]) + ], [ + AC_MSG_RESULT(no) ]) ]) @@ -41,12 +92,35 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ]) ]) +dnl # +dnl # 6.5.x API change. +dnl # blkdev_put() takes (void* holder) as arg 2 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ + ZFS_LINUX_TEST_SRC([blkdev_put_holder], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + void *holder = NULL; + + blkdev_put(bdev, holder); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) + ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, + [blkdev_put() accepts void* as arg 2]) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) ]) ]) @@ -495,7 +569,9 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_PUT + ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV @@ -510,6 +586,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -530,4 +607,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T ]) diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 index 84e39dc8a2f6..d13c1337b1fb 100644 --- a/config/kernel-block-device-operations.m4 +++ b/config/kernel-block-device-operations.m4 @@ -49,12 +49,42 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ ], [], []) ]) +dnl # +dnl # 5.9.x API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [ + ZFS_LINUX_TEST_SRC([block_device_operations_release_void_1arg], [ + #include + + void blk_release(struct gendisk *g) { + (void) g; + return; + } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .open = NULL, + .release = blk_release, + .ioctl = NULL, + .compat_ioctl = NULL, + }; + ], [], []) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ - AC_MSG_CHECKING([whether bops->release() is void]) + AC_MSG_CHECKING([whether bops->release() is void and takes 2 args]) ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ AC_MSG_RESULT(yes) ],[ - ZFS_LINUX_TEST_ERROR([bops->release()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bops->release() is void and takes 1 arg]) + ZFS_LINUX_TEST_RESULT([block_device_operations_release_void_1arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [1], + [Define if release() in block_device_operations takes 1 arg]) + ],[ + ZFS_LINUX_TEST_ERROR([bops->release()]) + ]) ]) ]) @@ -92,6 +122,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 1641dd92a918..f111e648ccf7 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -398,6 +398,12 @@ vdev_lookup_bdev(const char *path, dev_t *dev) #endif } +#if defined(HAVE_BLK_MODE_T) +#define blk_mode_is_open_write(flag) ((flag) & BLK_OPEN_WRITE) +#else +#define blk_mode_is_open_write(flag) ((flag) & FMODE_WRITE) +#endif + /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 925ee9d9fe9c..48ac55f07034 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -80,9 +80,22 @@ typedef struct dio_request { static unsigned int zfs_vdev_failfast_mask = 1; +#ifdef HAVE_BLK_MODE_T +static blk_mode_t +#else static fmode_t +#endif vdev_bdev_mode(spa_mode_t spa_mode) { +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= BLK_OPEN_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= BLK_OPEN_WRITE; +#else fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) @@ -90,6 +103,7 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; +#endif return (mode); } @@ -197,12 +211,47 @@ vdev_disk_kobj_evt_post(vdev_t *v) } } +#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) +/* + * Define a dummy struct blk_holder_ops for kernel versions + * prior to 6.5. + */ +struct blk_holder_ops {}; +#endif + +static struct block_device * +vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ +#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); +#else + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | FMODE_EXCL, holder)); +#endif +} + +static void +vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) +{ +#ifdef HAVE_BLKDEV_PUT_HOLDER + return (blkdev_put(bdev, holder)); +#else + return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); +#endif +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#else fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -252,15 +301,15 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); } if (reread_part) { - bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(disk_name, mode, + zfs_vdev_holder, NULL); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -305,8 +354,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { - bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(v->vdev_path, mode, + zfs_vdev_holder, NULL); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { /* * There is no point of waiting since device is removed @@ -382,8 +431,8 @@ vdev_disk_close(vdev_t *v) return; if (vd->vd_bdev != NULL) { - blkdev_put(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); + vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa), + zfs_vdev_holder); } rw_destroy(&vd->vd_lock); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 234c4d5ef0e0..33baac9db06b 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -186,7 +186,7 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) return (error); /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index 68a7de78f471..7786444fea35 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -42,7 +42,7 @@ static int zpl_common_open(struct inode *ip, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) + if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 38bc8e2c4eeb..7a95b54bdf0d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -671,7 +671,11 @@ zvol_request(struct request_queue *q, struct bio *bio) } static int +#ifdef HAVE_BLK_MODE_T +zvol_open(struct gendisk *disk, blk_mode_t flag) +#else zvol_open(struct block_device *bdev, fmode_t flag) +#endif { zvol_state_t *zv; int error = 0; @@ -686,10 +690,14 @@ zvol_open(struct block_device *bdev, fmode_t flag) /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting - * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free() + * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ +#ifdef HAVE_BLK_MODE_T + zv = disk->private_data; +#else zv = bdev->bd_disk->private_data; +#endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); @@ -769,14 +777,15 @@ zvol_open(struct block_device *bdev, fmode_t flag) } } - error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if ((blk_mode_is_open_write(flag)) && + (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); @@ -791,14 +800,25 @@ zvol_open(struct block_device *bdev, fmode_t flag) rw_exit(&zv->zv_suspend_lock); if (error == 0) +#ifdef HAVE_BLK_MODE_T + disk_check_media_change(disk); +#else zfs_check_media_change(bdev); +#endif return (error); } static void -zvol_release(struct gendisk *disk, fmode_t mode) +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG +zvol_release(struct gendisk *disk) +#else +zvol_release(struct gendisk *disk, fmode_t unused) +#endif { +#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) + (void) unused; +#endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; From 0bf2c5365ed3afa65545393d8da2317699f18b30 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Sun, 23 Jul 2023 01:34:29 -0400 Subject: [PATCH 04/37] Linux 6.4 compat: iter_iov() function now used to get old iov member The iov_iter->iov member is now iov_iter->__iov and must be accessed via the accessor function iter_iov(). Create a wrapper that is conditionally compiled to use the access method appropriate for the target kernel version. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15100 --- config/kernel-vfs-iov_iter.m4 | 23 +++++++++++++++++++++++ include/os/linux/spl/sys/uio.h | 6 ++++++ module/os/linux/zfs/zpl_file.c | 8 +++----- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index e0617faab02c..cc5a7ab0c237 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -93,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ struct iov_iter iter = { 0 }; __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) + + ZFS_LINUX_TEST_SRC([iter_iov], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + __attribute__((unused)) const struct iovec *iov = iter_iov(&iter); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ @@ -201,4 +209,19 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_DEFINE(HAVE_VFS_IOV_ITER, 1, [All required iov_iter interfaces are available]) ]) + + dnl # + dnl # Kernel 6.5 introduces the iter_iov() function that returns the + dnl # __iov member of an iov_iter*. The iov member was renamed to this + dnl # __iov member, and is intended to be accessed via the helper + dnl # function now. + dnl # + AC_MSG_CHECKING([whether iter_iov() is available]) + ZFS_LINUX_TEST_RESULT([iter_iov], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ITER_IOV, 1, + [iter_iov() is available]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index fe2b5c07a018..082e930e46b3 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -173,4 +173,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, } #endif +#if defined(HAVE_ITER_IOV) +#define zfs_uio_iter_iov(iter) iter_iov((iter)) +#else +#define zfs_uio_iter_iov(iter) (iter)->iov +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 73526db731c4..aedafd6002b9 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -300,17 +300,15 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#else -#ifdef HAVE_IOV_ITER_TYPE - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, +#elif defined(HAVE_IOV_ITER_TYPE) + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #else - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif -#endif } static ssize_t From 8be6308e85cffb2247753033fbfc3641f731af51 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Sun, 30 Jul 2023 15:23:47 -0400 Subject: [PATCH 05/37] Linux 4.20 compat: wrapper function for iov_iter type access An iov_iter_type() function to access the "type" member of the struct iov_iter was added at one point. Move the conditional logic to decide which method to use for accessing it into a macro and simplify the zpl_uio_init code. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15100 --- include/os/linux/spl/sys/uio.h | 6 ++++++ module/os/linux/zfs/zpl_file.c | 7 ++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 082e930e46b3..cce097e16fbc 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -179,4 +179,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, #define zfs_uio_iter_iov(iter) (iter)->iov #endif +#if defined(HAVE_IOV_ITER_TYPE) +#define zfs_uio_iov_iter_type(iter) iov_iter_type((iter)) +#else +#define zfs_uio_iov_iter_type(iter) (iter)->type +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index aedafd6002b9..f6af2ebd1163 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -300,13 +300,10 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#elif defined(HAVE_IOV_ITER_TYPE) - zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, - count, skip); #else zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, + zfs_uio_iov_iter_type(to) & ITER_KVEC ? + UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif } From 3a68f3c50f82ec87d74b255e3a0db504ff366211 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 1 Aug 2023 17:48:19 -0400 Subject: [PATCH 06/37] Revert "Linux 6.5 compat: register_sysctl_table removed" This reverts commit b35374fd6474603170fd9a3c7503da6eb13ac712 as there are error messages when loading the SPL module. Errors seemed to be tied to duplicate a duplicate entry. Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #15134 --- config/kernel-register_sysctl_table.m4 | 27 -------------------------- config/kernel.m4 | 2 -- module/os/linux/spl/spl-proc.c | 26 +++---------------------- 3 files changed, 3 insertions(+), 52 deletions(-) delete mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 deleted file mode 100644 index f18316b32b6d..000000000000 --- a/config/kernel-register_sysctl_table.m4 +++ /dev/null @@ -1,27 +0,0 @@ -dnl # -dnl # Linux 6.5 removes register_sysctl_table -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ - ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ - #include - - static struct ctl_table dummy_table[] = { - {} - }; - - ],[ - struct ctl_table_header *h - __attribute((unused)) = register_sysctl_table(dummy_table); - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ - AC_MSG_CHECKING([whether register_sysctl_table exists]) - ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_SYSCTL_REGISTER_TABLE, 1, - [sysctl_register_table exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 28bd361d33ff..1487fa2e7793 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,7 +160,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED - ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -300,7 +299,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED - ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index bcc356ae55b6..01f5619e1893 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,7 +624,6 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, -#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -635,11 +634,9 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, -#endif {}, }; -#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -651,38 +648,21 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; -#endif int spl_proc_init(void) { int rc = 0; -#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); -#else - spl_header = register_sysctl("kernel/spl", spl_table); - if (spl_header == NULL) - return (-EUNATCH); - - if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { - rc = -EUNATCH; - goto out; - } - - if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { - rc = -EUNATCH; - goto out; - } -#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { From 31a4673c05ea942498a278d9dd519f251b501db1 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Wed, 2 Aug 2023 17:05:46 -0400 Subject: [PATCH 07/37] Linux 6.5 compat: register_sysctl_table removed Additionally, the .child element of ctl_table has been removed in 6.5. This change adds a new test for the pre-6.5 register_sysctl_table() function, and uses the old code in that case. If it isn't found, then the parentage entries in the tables are removed, and the register_sysctl call is provided the paths of "kernel/spl", "kernel/spl/kmem", and "kernel/spl/kstat" directly, to populate each subdirectory over three calls, as is the new API. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15138 --- config/kernel-register_sysctl_table.m4 | 27 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 26 ++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 new file mode 100644 index 000000000000..a5e934f56d29 --- /dev/null +++ b/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_SYSCTL_TABLE, 1, + [register_sysctl_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1487fa2e7793..28bd361d33ff 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,6 +160,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -299,6 +300,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 01f5619e1893..bcc356ae55b6 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + rc = -EUNATCH; + goto out; + } + + if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { From 5a22de144abf2829bc8112e17a7a7e542da53dc5 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Mon, 7 Aug 2023 18:47:46 -0400 Subject: [PATCH 08/37] Linux 6.5 compat: replace generic_file_splice_read with filemap_splice_read The generic_file_splice_read function was removed in Linux 6.5 in favor of filemap_splice_read. Add an autoconf test for filemap_splice_read and use it if it is found as the handler for .splice_read in the file_operations struct. Additionally, ITER_PIPE was removed in 6.5. This change removes the ITER_* macros that OpenZFS doesn't use from being tested in config/kernel-vfs-iov_iter.m4. The removal of ITER_PIPE was causing the test to fail, which also affected the code responsible for setting the .splice_read handler, above. That behavior caused run-time panics on Linux 6.5. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15155 --- config/kernel-filemap-splice-read.m4 | 25 +++++++++++++++++++++++++ config/kernel-vfs-iov_iter.m4 | 3 +-- config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file.c | 4 ++++ 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 config/kernel-filemap-splice-read.m4 diff --git a/config/kernel-filemap-splice-read.m4 b/config/kernel-filemap-splice-read.m4 new file mode 100644 index 000000000000..5199b7373e4d --- /dev/null +++ b/config/kernel-filemap-splice-read.m4 @@ -0,0 +1,25 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ], [ + dnl # + dnl # Kernel 6.5 - generic_file_splice_read was removed in favor + dnl # of filemap_splice_read for the .splice_read member of the + dnl # file_operations struct. + dnl # + ZFS_LINUX_TEST_SRC([has_filemap_splice_read], [ + #include + + struct file_operations fops __attribute__((unused)) = { + .splice_read = filemap_splice_read, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILEMAP_SPLICE_READ], [ + AC_MSG_CHECKING([whether filemap_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_filemap_splice_read], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILEMAP_SPLICE_READ, 1, + [filemap_splice_read exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index cc5a7ab0c237..ff560ff3eef0 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -6,8 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ #include #include ],[ - int type __attribute__ ((unused)) = - ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE; + int type __attribute__ ((unused)) = ITER_KVEC; ]) ZFS_LINUX_TEST_SRC([iov_iter_advance], [ diff --git a/config/kernel.m4 b/config/kernel.m4 index 28bd361d33ff..309f1819be48 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -161,6 +161,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -301,6 +302,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_FILEMAP_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index f6af2ebd1163..24cc1064a8fc 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1323,7 +1323,11 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER +#ifdef HAVE_FILEMAP_SPLICE_READ + .splice_read = filemap_splice_read, +#else .splice_read = generic_file_splice_read, +#endif .splice_write = iter_file_splice_write, #endif #else From 58a707375fef23697cb029fbad0e7e92bc78025b Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 8 Aug 2023 18:42:32 -0400 Subject: [PATCH 09/37] Linux 6.5 compat: Use copy_splice_read instead of filemap_splice_read Using the filemap_splice_read function for the splice_read handler was leading to occasional data corruption under certain circumstances. Favor using copy_splice_read instead, which does not demonstrate the same erroneous behavior under the tested failure cases. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15164 --- config/kernel-filemap-splice-read.m4 | 18 +++++++++--------- config/kernel.m4 | 4 ++-- module/os/linux/zfs/zpl_file.c | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/config/kernel-filemap-splice-read.m4 b/config/kernel-filemap-splice-read.m4 index 5199b7373e4d..4c83b31d738a 100644 --- a/config/kernel-filemap-splice-read.m4 +++ b/config/kernel-filemap-splice-read.m4 @@ -1,24 +1,24 @@ -AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ], [ +AC_DEFUN([ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ], [ dnl # dnl # Kernel 6.5 - generic_file_splice_read was removed in favor - dnl # of filemap_splice_read for the .splice_read member of the + dnl # of copy_splice_read for the .splice_read member of the dnl # file_operations struct. dnl # - ZFS_LINUX_TEST_SRC([has_filemap_splice_read], [ + ZFS_LINUX_TEST_SRC([has_copy_splice_read], [ #include struct file_operations fops __attribute__((unused)) = { - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, }; ],[]) ]) -AC_DEFUN([ZFS_AC_KERNEL_FILEMAP_SPLICE_READ], [ - AC_MSG_CHECKING([whether filemap_splice_read() exists]) - ZFS_LINUX_TEST_RESULT([has_filemap_splice_read], [ +AC_DEFUN([ZFS_AC_KERNEL_COPY_SPLICE_READ], [ + AC_MSG_CHECKING([whether copy_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_copy_splice_read], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FILEMAP_SPLICE_READ, 1, - [filemap_splice_read exists]) + AC_DEFINE(HAVE_COPY_SPLICE_READ, 1, + [copy_splice_read exists]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 309f1819be48..df194ec72207 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -161,7 +161,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ + ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -302,7 +302,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_FILEMAP_SPLICE_READ + ZFS_AC_KERNEL_COPY_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 24cc1064a8fc..3caa0fc6c214 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1323,8 +1323,8 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER -#ifdef HAVE_FILEMAP_SPLICE_READ - .splice_read = filemap_splice_read, +#ifdef HAVE_COPY_SPLICE_READ + .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif From c7ee59a160f74049de1e459b3c3c63671784703f Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 2 Sep 2023 02:21:40 +0200 Subject: [PATCH 10/37] Linux 6.5 compat: safe cleanup in spl_proc_fini() If we fail to create a proc entry in spl_proc_init() we may end up calling unregister_sysctl_table() twice: one in the failure path of spl_proc_init() and another time during spl_proc_fini(). Avoid the double call to unregister_sysctl_table() and while at it refactor the code a bit to reduce code duplication. This was accidentally introduced when the spl code was updated for Linux 6.5 compatibility. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Andrea Righi Closes #15234 Closes #15235 --- module/os/linux/spl/spl-proc.c | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index bcc356ae55b6..5cb5a6dadb05 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -659,6 +659,21 @@ static struct ctl_table spl_root[] = { }; #endif +static void spl_proc_cleanup(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + + if (spl_header) { + unregister_sysctl_table(spl_header); + spl_header = NULL; + } +} + int spl_proc_init(void) { @@ -723,15 +738,8 @@ spl_proc_init(void) goto out; } out: - if (rc) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - unregister_sysctl_table(spl_header); - } + if (rc) + spl_proc_cleanup(); return (rc); } @@ -739,13 +747,5 @@ spl_proc_init(void) void spl_proc_fini(void) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - - ASSERT(spl_header != NULL); - unregister_sysctl_table(spl_header); + spl_proc_cleanup(); } From cacc599aa20b6aba0ac8173386cea2f8b435068e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 7 Sep 2023 23:36:32 +0200 Subject: [PATCH 11/37] Linux 6.5 compat: spl: properly unregister sysctl entries When register_sysctl_table() is unavailable we fail to properly unregister sysctl entries under "kernel/spl". This leads to errors like the following when spl is unloaded/reloaded, making impossible to properly reload the spl module: [ 746.995704] sysctl duplicate entry: /kernel/spl/kmem/slab_kvmem_total Fix by cleaning up all the sub-entries inside "kernel/spl" when the spl module is unloaded. Reviewed-by: Alexander Motin Reviewed-by: Brian Atkinson Signed-off-by: Andrea Righi Closes #15239 --- module/os/linux/spl/spl-proc.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 5cb5a6dadb05..f0f929d3ce90 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -47,6 +47,10 @@ static unsigned long table_min = 0; static unsigned long table_max = ~0; static struct ctl_table_header *spl_header = NULL; +#ifndef HAVE_REGISTER_SYSCTL_TABLE +static struct ctl_table_header *spl_kmem = NULL; +static struct ctl_table_header *spl_kstat = NULL; +#endif static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; @@ -668,6 +672,16 @@ static void spl_proc_cleanup(void) remove_proc_entry("taskq", proc_spl); remove_proc_entry("spl", NULL); +#ifndef HAVE_REGISTER_SYSCTL_TABLE + if (spl_kstat) { + unregister_sysctl_table(spl_kstat); + spl_kstat = NULL; + } + if (spl_kmem) { + unregister_sysctl_table(spl_kmem); + spl_kmem = NULL; + } +#endif if (spl_header) { unregister_sysctl_table(spl_header); spl_header = NULL; @@ -688,12 +702,13 @@ spl_proc_init(void) if (spl_header == NULL) return (-EUNATCH); - if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); + if (spl_kmem == NULL) { rc = -EUNATCH; goto out; } - - if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); + if (spl_kstat == NULL) { rc = -EUNATCH; goto out; } From c011ef8c917473405a769d2b0bab1ead2e49dcc1 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 12 Sep 2023 12:51:11 -0700 Subject: [PATCH 12/37] Linux 6.5 compat: META (#15265) Update the META file to reflect compatibility with the 6.5 kernel. Signed-off-by: Tony Hutter Reviewed-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 0953cc51922f..9ffe90458dbd 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: rc4 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.4 +Linux-Maximum: 6.5 Linux-Minimum: 3.10 From 11943656f9233086260236e9eef5d752d9abe84c Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 19 Sep 2023 02:06:35 +0200 Subject: [PATCH 13/37] Update the MOS directory on spa_upgrade_errlog() spa_upgrade_errlog() does not update the MOS directory when the head_errlog feature is enabled. In this case if spa_errlog_sync() is not called, the MOS dir references the old errlog_last and errlog_sync objects. Thus when doing a scrub a panic will occur: Call Trace: dump_stack+0x6d/0x8b panic+0x101/0x2e3 spl_panic+0xcf/0x102 [spl] delete_errlog+0x124/0x130 [zfs] spa_errlog_sync+0x256/0x260 [zfs] spa_sync_iterate_to_convergence+0xe5/0x250 [zfs] spa_sync+0x2f7/0x670 [zfs] txg_sync_thread+0x22d/0x2d0 [zfs] thread_generic_wrapper+0x83/0xa0 [spl] kthread+0x104/0x140 ret_from_fork+0x1f/0x40 Fix this by updating the related MOS directory objects in spa_upgrade_errlog(). Reviewed-by: Mark Maybee Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #15279 Closes #15277 --- module/zfs/spa_errlog.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 2e5c22c11490..5dd08f597f33 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -930,12 +930,21 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) if (spa->spa_errlog_last != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); spa->spa_errlog_last = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); } if (spa->spa_errlog_scrub != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); spa->spa_errlog_scrub = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); } + mutex_exit(&spa->spa_errlog_lock); } From b9b9cdcdb148b50b8b647eb4bc300ec6c4ffd916 Mon Sep 17 00:00:00 2001 From: ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com> Date: Sat, 9 Sep 2023 11:23:29 -0600 Subject: [PATCH 14/37] update max_variance limit in zdb_block_size_histogram test for CI Commit 2d7843401a628ef8c483229742dd58bca70bc27e had previously updated this hardcoded limit to allow for CI testing. As there is no deterministic pass/fail value, the need has arisen for one more small increase. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Edmund Nadolski Closes #15252 --- .../functional/cli_root/zdb/zdb_block_size_histogram.ksh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh index 0a4d24fa695a..cfa26f54b11f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh @@ -204,11 +204,11 @@ function histo_check_test_pool # 4096 blocksize count for asize. For verification we stick # to just lsize counts. # - # The max_variance is hard-coded here at 12% to leave us some - # margin. Testing has shown this normally to be in the range - # of 2%-8%, but it may be as large as 11%. + # Variances are expected since this test does not account for + # metadata. The hardcoded limit here is empirical and should + # not be construed as deterministic. ################### - let max_variance=12 + let max_variance=15 let fail_value=0 let error_count=0 log_note "Comparisons for ${pool}" From 228b064d1b0acb991b7e27502f59f2ce22de8c4b Mon Sep 17 00:00:00 2001 From: Laura Hild Date: Mon, 11 Sep 2023 17:58:19 -0400 Subject: [PATCH 15/37] Remove implication that child `disk`s aren't vdevs in zpoolconcepts(7) Reviewed-by: Brian Behlendorf Signed-off-by: Laura Hild Closes #15247 --- man/man7/zpoolconcepts.7 | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index db3fd4926236..98f3ee7cd660 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -203,11 +203,9 @@ For more information, see the section. .El .Pp -Virtual devices cannot be nested, so a mirror or raidz virtual device can only -contain files or disks. -Mirrors of mirrors -.Pq or other combinations -are not allowed. +Virtual devices cannot be nested arbitrarily. +A mirror, raidz or draid virtual device can only be created with files or disks. +Mirrors of mirrors or other such combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as From 0ce7a068e9ec894cabef8f87c8603a06df055dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Mon, 18 Sep 2023 18:08:41 +0200 Subject: [PATCH 16/37] check-zstd-symbols: also ignore __pfx_ symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b341b20d648bb7e9a3307c33163e7399f0913e66 Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #15282 Closes #15284 --- module/Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/Makefile.in b/module/Makefile.in index 5b71e1abf79e..9b34b3dfaec7 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -168,4 +168,4 @@ gen-zstd-symbols: for obj in $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)); do echo; echo "/* $${obj#zstd/}: */"; @OBJDUMP@ -t $$obj | awk '$$2 == "g" && !/ zfs_/ {print "#define\t" $$6 " zfs_" $$6}' | sort; done >> zstd/include/zstd_compat_wrapper.h check-zstd-symbols: - @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && !/ zfs_/ {++ret; print} END {exit ret}' + @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && (!/ zfs_/ && !/ __pfx_zfs_/) {++ret; print} END {exit ret}' From 54c6fbd378eaa402eff34acf6a91c02d6cf9da11 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 18 Sep 2023 16:25:58 -0700 Subject: [PATCH 17/37] zed: Allow autoreplace and fault LEDs for removed vdevs Allow zed to autoreplace vdevs marked as REMOVED. Also update statechange-led zedlet to toggle fault LEDs for REMOVED vdevs. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15281 --- cmd/zed/agents/zfs_mod.c | 1 + cmd/zed/zed.d/statechange-led.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index a8d084bb4bd3..2f040ff7582c 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -372,6 +372,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) /* Only autoreplace bad disks */ if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && + (vs->vs_state != VDEV_STATE_REMOVED) && (vs->vs_state != VDEV_STATE_CANT_OPEN)) { zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " "a bad state (currently %llu)", vs->vs_state); diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh index 46bfc1b866f1..40cb61f17307 100755 --- a/cmd/zed/zed.d/statechange-led.sh +++ b/cmd/zed/zed.d/statechange-led.sh @@ -121,7 +121,7 @@ state_to_val() { state="$1" case "$state" in - FAULTED|DEGRADED|UNAVAIL) + FAULTED|DEGRADED|UNAVAIL|REMOVED) echo 1 ;; ONLINE) From f7a07d76ee5a6b698540c6873f194350539a7065 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 19 Sep 2023 01:53:33 +0200 Subject: [PATCH 18/37] Retire z_nr_znodes Added in ab26409db753 ("Linux 3.1 compat, super_block->s_shrink"), with the only consumer which needed the count getting retired in 066e82522101 ("Linux compat: Minimum kernel version 3.10"). The counter gets in the way of not maintaining the list to begin with. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Mateusz Guzik Closes #15274 --- include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 1 - include/os/linux/zfs/sys/zfs_vfsops_os.h | 1 - module/os/freebsd/zfs/zfs_vfsops.c | 8 +++----- module/os/freebsd/zfs/zfs_znode.c | 2 -- module/os/linux/zfs/zfs_ctldir.c | 1 - module/os/linux/zfs/zfs_vfsops.c | 7 +++---- module/os/linux/zfs/zfs_znode.c | 2 -- 7 files changed, 6 insertions(+), 16 deletions(-) diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index f765d38dbac8..24bb03575f33 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -93,7 +93,6 @@ struct zfsvfs { zfs_teardown_lock_t z_teardown_lock; zfs_teardown_inactive_lock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index e320b8de4222..b4d5db21f5e5 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -105,7 +105,6 @@ struct zfsvfs { rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ unsigned long z_snap_defer_time; /* last snapshot unmount deferral */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 33759fa26169..e8b9ada1316b 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -1154,7 +1154,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs) mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); - ASSERT3U(zfsvfs->z_nr_znodes, ==, 0); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); @@ -1558,12 +1557,11 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index c4f2b722ef4e..0d4c94555c6b 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -537,7 +537,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); @@ -1286,7 +1285,6 @@ zfs_znode_free(znode_t *zp) mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1300139 diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index c45a3eb5a4eb..02cb379ea840 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -537,7 +537,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 464c12e1108d..a1db5c57c18b 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1330,12 +1330,11 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 335ae3460c58..52c8e51df659 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -390,7 +390,6 @@ zfs_inode_destroy(struct inode *ip) mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; } mutex_exit(&zfsvfs->z_znodes_lock); @@ -641,7 +640,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) From 472335781e50d7864c802184cfaaf35ad60d6b5d Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Thu, 24 Aug 2023 20:11:59 +0500 Subject: [PATCH 19/37] Do not enable zfs-share.service zfs-share.service executes `zfs share` on every boot to share any filesystem/s, that are shared over SMB and/or NFS using the sharesmb and sharenfs properties. Since we do not rely on these properties to share over SMB and NFS and the service fails on boot on TrueNAS if sharesmb and/or sharenfs properties are set, and we rely on middleware to control the SMB and NFS shares, zfs-share.service should be disabled for TrueNAS SCALE. Signed-off-by: Umer Saleem --- contrib/debian/rules.in | 1 - etc/systemd/system/50-zfs.preset | 2 +- etc/systemd/system/zfs-share.service.in | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 9ff28b663b5f..e8cb5aa0ce3d 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -168,7 +168,6 @@ override_dh_installinit: dh_installinit -r --no-restart-after-upgrade --name zfs-import dh_installinit -r --no-restart-after-upgrade --name zfs-mount dh_installinit -r --no-restart-after-upgrade --name zfs-load-key - dh_installinit -R --name zfs-share dh_installinit -R --name zfs-zed override_dh_installsystemd: diff --git a/etc/systemd/system/50-zfs.preset b/etc/systemd/system/50-zfs.preset index e4056a92cd98..d2da85d330eb 100644 --- a/etc/systemd/system/50-zfs.preset +++ b/etc/systemd/system/50-zfs.preset @@ -3,7 +3,7 @@ enable zfs-import-cache.service disable zfs-import-scan.service enable zfs-import.target enable zfs-mount.service -enable zfs-share.service +disable zfs-share.service enable zfs-zed.service enable zfs-volume-wait.service enable zfs.target diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 1a6342a06fec..7c90f74a684a 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -17,4 +17,4 @@ EnvironmentFile=-@initconfdir@/zfs ExecStart=@sbindir@/zfs share -a [Install] -WantedBy=zfs.target +WantedBy=multi-user.target From 86c1cb8c82477876a2a462341687bb0e7328cc5f Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 5 Sep 2023 13:27:53 +0500 Subject: [PATCH 20/37] Update the behavior of mountpoint property There are some inconsistencies in the handling of mountpoint property. This commit updates the behavior and makes it consistent. If mountpoint property is set when dataset is unmounted, this would update the mountpoint property. The mountpoint could be valid or invalid in this case. Setting the mountpoint property would result in success in this case. Dataset would still be unmounted here. On the other hand, if dataset is mounted and mountpoint property is updated to something invalid where mount cannot be successful, for example, setting the mountpoint inside a readonly directory. This would unmount the dataset, set the mountpoint property to requested value and tries to mount the dataset. The mount operation returns error and this error is treated as overall failure of setting the property while the property is actually set. To make the behavior consistent in case dataset is mounted or unmounted, we should try to mount the dataset whenever mountpoint property is updated. This would result in mounting the datasets if canmount property is set to on, regardless if the dataset was previously unmounted. The failure in mount operation while setting the mountpoint property should not be treated as failure, since the property is actually set now to user requested value. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15240 --- cmd/zfs/zfs_main.c | 7 ++++--- lib/libzfs/libzfs_changelist.c | 8 ++++---- .../cli_root/zfs_mount/zfs_mount_006_pos.ksh | 2 +- .../cli_root/zfs_mount/zfs_mount_008_pos.ksh | 4 ++-- .../cli_root/zfs_mount/zfs_mount_012_pos.ksh | 15 +++++++-------- .../cli_root/zfs_set/mountpoint_002_pos.ksh | 12 +++++++++--- .../cli_root/zfs_set/zfs_set_003_neg.ksh | 10 +++++++--- 7 files changed, 34 insertions(+), 24 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 5ed25d1ea720..b0f5d7024951 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4198,8 +4198,9 @@ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; + int ret = zfs_prop_set_list(zhp, props); - if (zfs_prop_set_list(zhp, props) != 0) { + if (ret != 0 || libzfs_errno(g_zfs) != EZFS_SUCCESS) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " @@ -4208,11 +4209,11 @@ set_callback(zfs_handle_t *zhp, void *data) case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); + ret = 1; break; } - return (1); } - return (0); + return (ret); } static int diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index dd14c570ec03..4b0f66964346 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -244,13 +244,13 @@ changelist_postfix(prop_changelist_t *clp) zfs_is_mounted(cn->cn_handle, NULL); if (!mounted && !needs_key && (cn->cn_mounted || - ((sharenfs || sharesmb || clp->cl_waslegacy) && + (((clp->cl_prop == ZFS_PROP_MOUNTPOINT && + clp->cl_prop == clp->cl_realprop) || + sharenfs || sharesmb || clp->cl_waslegacy) && (zfs_prop_get_int(cn->cn_handle, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { - if (zfs_mount(cn->cn_handle, NULL, 0) != 0) - errors++; - else + if (zfs_mount(cn->cn_handle, NULL, 0) == 0) mounted = TRUE; } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh index 2a2466f65c02..e9ab472795eb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh @@ -94,7 +94,7 @@ while (( depth < MAXDEPTH )); do done log_must zfs set mountpoint=$mtpt $TESTPOOL/$TESTFS -log_must zfs $mountcmd $TESTPOOL/$TESTFS +log_must ismounted $TESTPOOL/$TESTFS log_must zfs set overlay=off $TESTPOOL/$TESTFS if ! is_illumos; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh index 2c1029d551cf..0437c61a2c40 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh @@ -71,7 +71,7 @@ log_must mkfile 1M $testfile $testfile1 log_must zfs unmount $fs1 log_must zfs set mountpoint=$mntpnt $fs1 -log_must zfs mount $fs1 +log_must ismounted $fs1 log_must zfs unmount $fs1 log_must zfs mount -O $fs1 @@ -85,7 +85,7 @@ log_must ls $mntpnt/$TESTFILE1 $mntpnt/$TESTFILE2 # Verify $TESTFILE2 was created in $fs1, rather than $fs log_must zfs unmount $fs1 log_must zfs set mountpoint=$mntpnt1 $fs1 -log_must zfs mount $fs1 +log_must ismounted $fs1 log_must ls $testfile1 $mntpnt1/$TESTFILE2 # Verify $TESTFILE2 was not created in $fs, and $fs is accessible again. diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh index 5ff094d2c479..66958f2f0884 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh @@ -25,13 +25,12 @@ # STRATEGY: # 1. Unmount the dataset # 2. Create a new empty directory -# 3. Set the dataset's mountpoint -# 4. Attempt to mount the dataset -# 5. Verify the mount succeeds -# 6. Unmount the dataset -# 7. Create a file in the directory created in step 2 -# 8. Attempt to mount the dataset -# 9. Verify the mount succeeds +# 3. Set the dataset's mountpoint, this should mount the dataset +# 4. Verify the mount succeeds +# 5. Unmount the dataset +# 6. Create a file in the directory created in step 2 +# 7. Attempt to mount the dataset +# 8. Verify the mount succeeds # verify_runnable "both" @@ -43,7 +42,7 @@ fs=$TESTPOOL/$TESTFS log_must zfs umount $fs log_must mkdir -p $TESTDIR log_must zfs set mountpoint=$TESTDIR $fs -log_must zfs mount $fs +log_must ismounted $fs log_must zfs umount $fs log_must touch $TESTDIR/testfile.$$ log_must zfs mount $fs diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh index a5785226e02e..c227a6fb8aa8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh @@ -35,7 +35,9 @@ # # DESCRIPTION: # If ZFS is currently managing the file system but it is currently unmounted, -# and the mountpoint property is changed, the file system remains unmounted. +# and the mountpoint property is changed, the file system should be mounted +# if it is a valid mountpoint and canmount allows to mount, otherwise it +# should not be mounted. # # STRATEGY: # 1. Setup a pool and create fs, ctr within it. @@ -62,7 +64,7 @@ function cleanup } log_assert "Setting a valid mountpoint for an unmounted file system, \ - it remains unmounted." + it gets mounted." log_onexit cleanup old_fs_mpt=$(get_prop mountpoint $TESTPOOL/$TESTFS) @@ -83,7 +85,11 @@ while (( i < ${#dataset[@]} )); do while (( j < ${#values[@]} )); do set_n_check_prop "${values[j]}" "mountpoint" \ "${dataset[i]}" - log_mustnot ismounted ${dataset[i]} + if [ "${dataset[i]}" = "$TESTPOOL/$TESTFS" ]; then + log_must ismounted ${dataset[i]} + else + log_mustnot ismounted ${dataset[i]} + fi (( j += 1 )) done cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh index 3afb0eb7010e..5901ba7dc461 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh @@ -33,7 +33,9 @@ # # DESCRIPTION: -# 'zfs set mountpoint/sharenfs' should fail when the mountpoint is invalid +# 'zfs set mountpoint/sharenfs' should set the property when mountpoint +# is invalid. Setting the property should be successful, but dataset +# should not be mounted, as mountpoint is invalid. # # STRATEGY: # 1. Create invalid scenarios @@ -62,10 +64,12 @@ longpath=$(gen_dataset_name 1030 "abcdefg") log_must zfs create -o mountpoint=legacy $TESTPOOL/foo # Do the negative testing about "property may be set but unable to remount filesystem" -log_mustnot eval "zfs set mountpoint=$badpath $TESTPOOL/foo >/dev/null 2>&1" +set_n_check_prop "$badpath" "mountpoint" "$TESTPOOL/foo" +log_mustnot ismounted $TESTPOOL/foo # Do the negative testing about "property may be set but unable to reshare filesystem" -log_mustnot eval "zfs set sharenfs=on $TESTPOOL/foo >/dev/null 2>&1" +set_n_check_prop "on" "sharenfs" "$TESTPOOL/foo" +log_mustnot ismounted $TESTPOOL/foo # Do the negative testing about "sharenfs property can not be set to null" log_mustnot eval "zfs set sharenfs= $TESTPOOL/foo >/dev/null 2>&1" From 22da5ae59fbf8902f4897c2d417f5673591cc8e7 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 5 Sep 2023 13:33:58 +0500 Subject: [PATCH 21/37] Improve the handling of sharesmb,sharenfs properties For sharesmb and sharenfs properties, the status of setting the property is tied with whether we succeed to share the dataset or not. In case sharing the dataset is not successful, this is treated as overall failure of setting the property. In this case, if we check the property after the failure, it is set to on. This commit updates this behavior and the status of setting the share properties is not returned as failure, when we fail to share the dataset. For sharenfs property, if access list is provided, the syntax errors in access list/host adresses are not validated until after setting the property during postfix phase while trying to share the dataset. This is not correct, since the property has already been set when we reach there. Syntax errors in access list/host addresses are validated while validating the property list, before setting the property and failure is returned to user in this case when there are errors in access list. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15240 --- cmd/zfs/zfs_main.c | 1 - lib/libshare/os/freebsd/nfs.c | 3 ++- lib/libshare/os/linux/nfs.c | 47 +++++++++++++++++++++++++++++++--- lib/libzfs/libzfs_changelist.c | 11 ++++---- 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index b0f5d7024951..22c28740e41b 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4209,7 +4209,6 @@ set_callback(zfs_handle_t *zhp, void *data) case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); - ret = 1; break; } } diff --git a/lib/libshare/os/freebsd/nfs.c b/lib/libshare/os/freebsd/nfs.c index 521631c51f07..d9fc66106369 100644 --- a/lib/libshare/os/freebsd/nfs.c +++ b/lib/libshare/os/freebsd/nfs.c @@ -161,7 +161,8 @@ nfs_is_shared(sa_share_impl_t impl_share) static int nfs_validate_shareopts(const char *shareopts) { - (void) shareopts; + if (strlen(shareopts) == 0) + return (SA_SYNTAX_ERR); return (SA_OK); } diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c index c27e5564c1e1..004946b0cfe4 100644 --- a/lib/libshare/os/linux/nfs.c +++ b/lib/libshare/os/linux/nfs.c @@ -319,12 +319,49 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) "wdelay" }; char **plinux_opts = (char **)cookie; + char *host, *val_dup, *literal, *next; - /* host-specific options, these are taken care of elsewhere */ - if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0 || - strcmp(key, "sec") == 0) + if (strcmp(key, "sec") == 0) return (SA_OK); + if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0) { + if (value == NULL || strlen(value) == 0) + return (SA_OK); + val_dup = strdup(value); + host = val_dup; + if (host == NULL) + return (SA_NO_MEMORY); + do { + if (*host == '[') { + host++; + literal = strchr(host, ']'); + if (literal == NULL) { + free(val_dup); + return (SA_SYNTAX_ERR); + } + if (literal[1] == '\0') + next = NULL; + else if (literal[1] == '/') { + next = strchr(literal + 2, ':'); + if (next != NULL) + ++next; + } else if (literal[1] == ':') + next = literal + 2; + else { + free(val_dup); + return (SA_SYNTAX_ERR); + } + } else { + next = strchr(host, ':'); + if (next != NULL) + ++next; + } + host = next; + } while (host != NULL); + free(val_dup); + return (SA_OK); + } + if (strcmp(key, "anon") == 0) key = "anonuid"; @@ -472,6 +509,10 @@ static int nfs_validate_shareopts(const char *shareopts) { char *linux_opts = NULL; + + if (strlen(shareopts) == 0) + return (SA_SYNTAX_ERR); + int error = get_linux_shareopts(shareopts, &linux_opts); if (error != SA_OK) return (error); diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 4b0f66964346..efe1c0c06035 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -174,7 +174,6 @@ changelist_postfix(prop_changelist_t *clp) prop_changenode_t *cn; uu_avl_walk_t *walk; char shareopts[ZFS_MAXPROPLEN]; - int errors = 0; boolean_t commit_smb_shares = B_FALSE; boolean_t commit_nfs_shares = B_FALSE; @@ -262,19 +261,19 @@ changelist_postfix(prop_changelist_t *clp) const enum sa_protocol nfs[] = {SA_PROTOCOL_NFS, SA_NO_PROTOCOL}; if (sharenfs && mounted) { - errors += zfs_share(cn->cn_handle, nfs); + zfs_share(cn->cn_handle, nfs); commit_nfs_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { - errors += zfs_unshare(cn->cn_handle, NULL, nfs); + zfs_unshare(cn->cn_handle, NULL, nfs); commit_nfs_shares = B_TRUE; } const enum sa_protocol smb[] = {SA_PROTOCOL_SMB, SA_NO_PROTOCOL}; if (sharesmb && mounted) { - errors += zfs_share(cn->cn_handle, smb); + zfs_share(cn->cn_handle, smb); commit_smb_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { - errors += zfs_unshare(cn->cn_handle, NULL, smb); + zfs_unshare(cn->cn_handle, NULL, smb); commit_smb_shares = B_TRUE; } } @@ -288,7 +287,7 @@ changelist_postfix(prop_changelist_t *clp) zfs_commit_shares(proto); uu_avl_walk_end(walk); - return (errors ? -1 : 0); + return (0); } /* From 62677576a75e94396e945c4ecd9372f5d34e50cb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 20 Sep 2023 14:17:11 -0400 Subject: [PATCH 22/37] ZIL: Fix potential race on flush deferring. zil_lwb_set_zio_dependency() can not set write ZIO dependency on previous LWB's write ZIO if one is already in done handler and set state to LWB_STATE_WRITE_DONE. So theoretically done handler of next LWB's write ZIO may run before done handler of previous LWB write ZIO completes. In such case we can not defer flushes, since the flush issue process is not locked. This may fix some reported assertions of lwb_vdev_tree not being empty inside zil_free_lwb(). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15278 --- module/zfs/zil.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index b30676b42d88..9e9c9c22549d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1550,7 +1550,16 @@ zil_lwb_write_done(zio_t *zio) lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; + + /* + * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not + * called for it yet, and when it will be, it won't be able to make + * its write ZIO a parent this ZIO. In such case we can not defer + * our flushes or below may be a race between the done callbacks. + */ nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) From 1c2aee7a5222b1b65fea07dd7ee18ae14c3f7be2 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 20 Sep 2023 01:48:02 +1000 Subject: [PATCH 23/37] tests: install missing PAM tests 'pam_change_unmounted' and 'pam_recursive' both exist and are referenced by the test run config, but weren't being installed and so are excluded. This gets them installed so they will run as expected. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15291 --- tests/zfs-tests/tests/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 66aff5026f8f..1a58e6f774e9 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1588,7 +1588,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/online_offline/setup.ksh \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ + functional/pam/pam_change_unmounted.ksh \ functional/pam/pam_nounmount.ksh \ + functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ functional/pam/setup.ksh \ functional/pool_checkpoint/checkpoint_after_rewind.ksh \ From cc75c816c539d22c56aa912d5db19827a16dcbf3 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 19 Sep 2023 08:58:14 -0700 Subject: [PATCH 24/37] Fix l2arc_apply_transforms ztest crash In #13375 we modified the allocation size of the buffer that we use to apply l2arc transforms to be the size of the arc hdr we're using, rather than the allocation size that will be in place on the disk, because sometimes the hdr size is larger. Unfortunately, sometimes the allocation size is larger, which means that we overflow the buffer in that case. This change modifies the allocation to be the max of the two values Reviewed-by: Mark Maybee Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #15177 Closes #15248 --- module/zfs/arc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7023f448182a..22dc0ed5e3b6 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -9092,15 +9092,16 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * write things before deciding to fail compression in nearly * every case.) */ - cabd = abd_alloc_for_io(size, ismd); - tmp = abd_borrow_buf(cabd, size); + uint64_t bufsize = MAX(size, asize); + cabd = abd_alloc_for_io(bufsize, ismd); + tmp = abd_borrow_buf(cabd, bufsize); psize = zio_compress_data(compress, to_write, &tmp, size, hdr->b_complevel); if (psize >= asize) { psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, size); + abd_return_buf_copy(cabd, tmp, bufsize); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); @@ -9110,9 +9111,9 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) - memset((char *)tmp + psize, 0, asize - psize); + memset((char *)tmp + psize, 0, bufsize - psize); psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, size); + abd_return_buf_copy(cabd, tmp, bufsize); to_write = cabd; } From 9aa1a2878ea98f707e871833743d074a59ff2115 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 19 Sep 2023 09:02:23 -0700 Subject: [PATCH 25/37] Fix incorrect expected error in ztest There is an occasional ztest failure that looks like ztest: attach (/var/tmp/zloop-run/ztest.13a 570425344, draid1-1-0 532152320, 1) returned 22, expected 95. This is because the value that we return is EINVAL, but expected_error is set incorrectly. Change the expected_error value to match both the comment and the actual error value. Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #15295 --- cmd/ztest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 398c519cfc35..59c4be225f93 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -3767,7 +3767,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) - expected_error = ENOTSUP; + expected_error = EINVAL; else expected_error = 0; From 3af63683fe07465fd96827cab0d946cbdaba6f73 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 20 Sep 2023 02:06:47 +1000 Subject: [PATCH 26/37] cmd: add 'help' subcommand to zpool and zfs 'program help subcommand' is a reasonably common pattern for multifunction command-line programs. This commit adds support for that style to the zpool and zfs commands. When run as 'zpool help []' or 'zfs help []', executes the 'man' program on the PATH with the most likely manpage name for the requested topic: "zpool-" or "zfs-" for subcommands, or "zpool" or "zfs" for the "concepts" and "props" topics. If no topic is supplied, uses the top "zpool" or "zfs" pages. Reviewed-by: Brian Behlendorf Reviewed-by: Kay Pedersen Signed-off-by: Rob Norris Closes #15288 --- cmd/zfs/zfs_main.c | 30 ++++++++++++++++++ cmd/zpool/zpool_main.c | 31 +++++++++++++++++++ .../cli_root/zfs_program/zfs_program_json.ksh | 4 ++- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 5ed25d1ea720..62802de23e5f 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -132,6 +132,8 @@ static int zfs_do_zone(int argc, char **argv); static int zfs_do_unzone(int argc, char **argv); #endif +static int zfs_do_help(int argc, char **argv); + /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ @@ -606,6 +608,9 @@ usage(boolean_t requested) (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); + (void) fprintf(fp, + gettext("\nFor further help on a command or topic, " + "run: %s\n"), "zfs help []"); } /* @@ -8726,6 +8731,25 @@ zfs_do_version(int argc, char **argv) return (zfs_version_print() != 0); } +/* Display documentation */ +static int +zfs_do_help(int argc, char **argv) +{ + char page[MAXNAMELEN]; + if (argc < 3 || strcmp(argv[2], "zfs") == 0) + strcpy(page, "zfs"); + else if (strcmp(argv[2], "concepts") == 0 || + strcmp(argv[2], "props") == 0) + snprintf(page, sizeof (page), "zfs%s", argv[2]); + else + snprintf(page, sizeof (page), "zfs-%s", argv[2]); + + execlp("man", "man", page, NULL); + + fprintf(stderr, "couldn't run man program: %s", strerror(errno)); + return (-1); +} + int main(int argc, char **argv) { @@ -8781,6 +8805,12 @@ main(int argc, char **argv) if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zfs_do_version(argc, argv)); + /* + * Special case 'help' + */ + if (strcmp(cmdname, "help") == 0) + return (zfs_do_help(argc, argv)); + if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 6d0dae8d8b05..d64fdfa5ba4c 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -126,6 +126,8 @@ static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); +static int zpool_do_help(int argc, char **argv); + static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); @@ -538,6 +540,10 @@ usage(boolean_t requested) (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } + + (void) fprintf(fp, + gettext("\nFor further help on a command or topic, " + "run: %s\n"), "zpool help []"); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); @@ -11051,6 +11057,25 @@ zpool_do_version(int argc, char **argv) return (zfs_version_print() != 0); } +/* Display documentation */ +static int +zpool_do_help(int argc, char **argv) +{ + char page[MAXNAMELEN]; + if (argc < 3 || strcmp(argv[2], "zpool") == 0) + strcpy(page, "zpool"); + else if (strcmp(argv[2], "concepts") == 0 || + strcmp(argv[2], "props") == 0) + snprintf(page, sizeof (page), "zpool%s", argv[2]); + else + snprintf(page, sizeof (page), "zpool-%s", argv[2]); + + execlp("man", "man", page, NULL); + + fprintf(stderr, "couldn't run man program: %s", strerror(errno)); + return (-1); +} + /* * Do zpool_load_compat() and print error message on failure */ @@ -11118,6 +11143,12 @@ main(int argc, char **argv) if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); + /* + * Special case 'help' + */ + if (strcmp(cmdname, "help") == 0) + return (zpool_do_help(argc, argv)); + if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh index b0265c5ee4a1..2241b77bf806 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh @@ -117,7 +117,9 @@ usage: For the property list, run: zfs set|get -For the delegated permission list, run: zfs allow|unallow") +For the delegated permission list, run: zfs allow|unallow + +For further help on a command or topic, run: zfs help []") cnt=0 for cmd in ${neg_cmds[@]}; do log_mustnot zfs program $cmd $TESTPOOL $TESTZCP $TESTDS 2>&1 From 729507d309d4573868855eca04f8522cc120f7d5 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 20 Sep 2023 16:39:38 -0700 Subject: [PATCH 27/37] Fix occasional rsend test crashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have occasional crashes in the rsend tests. Debugging revealed that this is because the send_worker thread is getting EINTR from splice(). This happens when a non-fatal signal is received during the syscall. We should retry the syscall, rather than exiting failure. Tweak the loop to only break if the splice is finished or we receive a non-EINTR error. Reviewed-by: Brian Behlendorf Reviewed-by: Ahelenia Ziemiańska Signed-off-by: Paul Dagnelie Closes #15273 --- lib/libzfs_core/libzfs_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index c63a16de5ab6..01d803e21db0 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -650,10 +650,12 @@ send_worker(void *arg) unsigned int bufsiz = max_pipe_buffer(ctx->from); ssize_t rd; - while ((rd = splice(ctx->from, NULL, ctx->to, NULL, bufsiz, - SPLICE_F_MOVE | SPLICE_F_MORE)) > 0) - ; - + for (;;) { + rd = splice(ctx->from, NULL, ctx->to, NULL, bufsiz, + SPLICE_F_MOVE | SPLICE_F_MORE); + if ((rd == -1 && errno != EINTR) || rd == 0) + break; + } int err = (rd == -1) ? errno : 0; close(ctx->from); return ((void *)(uintptr_t)err); From a199cac6cdd751c02e1bd99fb02cd4623356140d Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Sep 2023 09:56:45 +1000 Subject: [PATCH 28/37] status: report pool suspension state under failmode=continue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When failmode=continue is set and the pool suspends, both 'zpool status' and the 'zfs/pool/state' kstat ignore it and report the normal vdev tree state. There's no clear indicator that the pool is suspended. This is unlike suspend in failmode=wait, or suspend due to MMP check failure, which both report "SUSPENDED" explicitly. This commit changes it so SUSPENDED is reported for failmode=continue the same as for other modes. Rationale: The historical behaviour of failmode=continue is roughly, "press on as though all is well". To this end, the fact that the pool had suspended was not shown, to maintain the façade that all is well. Its unclear why hiding this information was considered appropriate. One possibility is that it was expected that a true pool fault would always be reported as DEGRADED or FAULTED, and that the pool could not suspend without these happening. That is not necessarily true, as vdev health and suspend state are only loosely connected, such that a pool in (apparent) good health can be suspended for good reasons, and of course a degraded pool does not lead to suspension. Even if that expectation were true, there's still a difference in urgency - a degraded pool may not need to be attended to for hours, while a suspended pool is most often unusable until an operator intervenes. An operator that has set failmode=continue has presumably done so because their workload is one that can continue to operate in a useful way when the pool suspends. In this case the operator still needs a clear indicator that there is a problem that needs attending to. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15297 --- lib/libzfs/libzfs_pool.c | 3 ++- module/zfs/spa_misc.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 85564edfd862..4ebd112f452f 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -29,7 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2023, Klara Inc. */ #include @@ -255,6 +255,7 @@ zpool_get_state_str(zpool_handle_t *zhp) if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || + status == ZPOOL_STATUS_IO_FAILURE_CONTINUE || status == ZPOOL_STATUS_IO_FAILURE_MMP) { str = gettext("SUSPENDED"); } else { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 3b355e0debcc..72b690162d64 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -27,6 +27,7 @@ * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. + * Copyright (c) 2023, Klara Inc. */ #include @@ -2756,8 +2757,7 @@ spa_state_to_name(spa_t *spa) vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; - if (spa_suspended(spa) && - (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) + if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { From 5f30698670dec3fc2085663677021f0a592cf081 Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 22 Sep 2023 10:54:15 +1000 Subject: [PATCH 29/37] tests/block_cloning: try harder to stay on same txg in fallback test We've observed this test failing intermittently. When it does, the "same block" check shows that both files have the same content, that is, the file was cloned. The only way this could have happened is if the open txg moved between the dd and clonefile calls. That's possible because although we set zfs_txg_timeout to be large, that only affects the wait time in the sync thread at the start of a new txg; it doesn't change anything if its currently waiting or working. So here we just force the txgs to move immediately before, which should get both operations onto the same txg as intented. Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Rob Norris Closes #15303 --- .../block_cloning_copyfilerange_fallback_same_txg.ksh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index a10545bc0769..74c5a5bece60 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -51,6 +51,7 @@ log_onexit cleanup log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS log_must set_tunable64 TXG_TIMEOUT 5000 +log_must sync_pool $TESTPOOL true log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 From 0aabd6b48228845462090c9137a340ca062a3cb9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Sep 2023 21:40:13 -0400 Subject: [PATCH 30/37] ZIL: Avoid dbuf_read() in ztest_get_data() While working on similar patches for zfs and zvol in #15153 I've forgot about ztest. Update it also so that we test the same code paths as use in production. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15301 --- cmd/ztest.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 59c4be225f93..9c5a6944035c 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2457,8 +2457,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; From 0ce1b2ca19302df59a8c4fa8383f35cf549874de Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 22 Sep 2023 16:08:51 -0700 Subject: [PATCH 31/37] Invoke zdb by guid to avoid import errors The problem that was occurring is basically that a device was removed by ztest and replaced with another device. It was then reguided. The import then failed because there were two possible imports with the same name; one with the new guid, and one with the old. This can happen because the label writes from the device removal/replacement can be subject to ztest's error injection. The other ways to fix this would be to change the error injection to not trigger on removals (which may not be technically feasible), or to change the import code to not report configurations that are so short on devices (which would potentially have unpleasant end-user effects when trying to recover from data losses/device configuration issues). Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: George Melikov Signed-off-by: Paul Dagnelie Closes #15298 --- cmd/ztest.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 9c5a6944035c..8cfbdfe1c2e2 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6378,6 +6378,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; uint64_t orig, load; int error; + ztest_shared_t *zs = ztest_shared; if (ztest_opts.zo_mmp_test) return; @@ -6387,6 +6388,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_change_guid(spa); + zs->zs_guid = spa_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); if (error != 0) @@ -6916,7 +6918,7 @@ ztest_trim(ztest_ds_t *zd, uint64_t id) * Verify pool integrity by running zdb. */ static void -ztest_run_zdb(const char *pool) +ztest_run_zdb(uint64_t guid) { int status; char *bin; @@ -6940,13 +6942,13 @@ ztest_run_zdb(const char *pool) free(set_gvars_args); size_t would = snprintf(zdb, len, - "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", + "%s -bcc%s%s -G -d -Y -e -y %s -p %s %"PRIu64, bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", set_gvars_args_joined, ztest_opts.zo_dir, - pool); + guid); ASSERT3U(would, <, len); umem_free(set_gvars_args_joined, strlen(set_gvars_args_joined) + 1); @@ -7524,14 +7526,15 @@ ztest_import(ztest_shared_t *zs) VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + zs->zs_guid = spa_guid(spa); spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { - ztest_run_zdb(ztest_opts.zo_pool); + ztest_run_zdb(zs->zs_guid); ztest_freeze(); - ztest_run_zdb(ztest_opts.zo_pool); + ztest_run_zdb(zs->zs_guid); } (void) pthread_rwlock_destroy(&ztest_name_lock); @@ -7602,7 +7605,6 @@ ztest_run(ztest_shared_t *zs) dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, B_TRUE, FTAG); /* @@ -7873,14 +7875,15 @@ ztest_init(ztest_shared_t *zs) VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + zs->zs_guid = spa_guid(spa); spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { - ztest_run_zdb(ztest_opts.zo_pool); + ztest_run_zdb(zs->zs_guid); ztest_freeze(); - ztest_run_zdb(ztest_opts.zo_pool); + ztest_run_zdb(zs->zs_guid); } (void) pthread_rwlock_destroy(&ztest_name_lock); @@ -8303,7 +8306,7 @@ main(int argc, char **argv) } if (!ztest_opts.zo_mmp_test) - ztest_run_zdb(ztest_opts.zo_pool); + ztest_run_zdb(zs->zs_guid); } if (ztest_opts.zo_verbose >= 1) { From 8526b12f3d6f751ec575ab582fd925bf0084a27e Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 25 Sep 2023 11:14:00 -0700 Subject: [PATCH 32/37] Set timeout before creating pool in test Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Paul Dagnelie Closes #15309 --- .../block_cloning_copyfilerange_fallback_same_txg.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 74c5a5bece60..17ce5c9ba784 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -48,11 +48,11 @@ function cleanup log_onexit cleanup -log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS - log_must set_tunable64 TXG_TIMEOUT 5000 log_must sync_pool $TESTPOOL true +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 From ba4dbbdae7d8bc168f4cb247db187d46c41c7742 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 25 Sep 2023 11:15:32 -0700 Subject: [PATCH 33/37] ZTS: Add additional exceptions "zfs_share_concurrent_shares" may fail on FreeBSD and some Linux distributions (fedora). Move it to the common list. "zfs_allow_010_pos" has been observed to fail on FreeBSD 13. Reviewed-by: George Melikov Reviewed-by: Kay Pedersen Reviewed-by: Umer Saleem Signed-off-by: Brian Behlendorf Closes #15306 --- tests/test-runner/bin/zts-report.py.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index e1bbe063ab4c..558e4b57279d 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -214,6 +214,7 @@ maybe = { 'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', 5479], 'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', known_reason], 'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', known_reason], + 'cli_root/zfs_share/zfs_share_concurrent_shares': ['FAIL', known_reason], 'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason], 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], @@ -259,10 +260,9 @@ if sys.platform.startswith('freebsd'): maybe.update({ 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], - 'cli_root/zfs_share/zfs_share_concurrent_shares': - ['FAIL', known_reason], 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], + 'delegate/zfs_allow_010_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', 11829], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], From 99dc1fc340670a070aba58aa660d0d72c774093c Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 26 Sep 2023 14:37:28 -0700 Subject: [PATCH 34/37] ZTS: Fix introduced test bug in block_cloning_copyfilerange Reviewed-by: John Wren Kennedy Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #15316 --- .../block_cloning_copyfilerange_fallback_same_txg.ksh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 17ce5c9ba784..2cd2f4763a73 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -49,7 +49,6 @@ function cleanup log_onexit cleanup log_must set_tunable64 TXG_TIMEOUT 5000 -log_must sync_pool $TESTPOOL true log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS From d38f4664a6ef5cfff8f005016e10b9583d04de6b Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 27 Sep 2023 12:06:24 -0700 Subject: [PATCH 35/37] Reduce trim min size even lower for tests to reduce flakiness Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Paul Dagnelie Closes #15315 --- .../tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh | 2 +- .../cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh | 2 +- tests/zfs-tests/tests/functional/trim/autotrim_config.ksh | 2 +- tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh | 2 +- .../zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh | 2 +- tests/zfs-tests/tests/functional/trim/trim_config.ksh | 2 +- tests/zfs-tests/tests/functional/trim/trim_integrity.ksh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh index bdbf3db53336..9342cbe880ae 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh @@ -60,7 +60,7 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 64 # Minimum trim size is decreased to verify all trim sizes. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 log_must mkdir "$TESTDIR" log_must truncate -s $LARGESIZE "$LARGEFILE" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh index d5aaf49aebc5..41d1decd13ce 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh @@ -52,7 +52,7 @@ LARGEFILE="$TESTDIR/largefile" # Reduce trim size to allow for tighter tolerance below when checking. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 log_must mkdir "$TESTDIR" log_must truncate -s $LARGESIZE "$LARGEFILE" diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index c97772585737..4dba4616e947 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -57,7 +57,7 @@ log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 # Reduced TRIM_TXG_BATCH to make trimming more frequent. typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh index e25390339b6c..e139809105df 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh @@ -54,7 +54,7 @@ log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 # Reduced TRIM_TXG_BATCH to make trimming more frequent. typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh index ae7ad8d73dd8..53de485b1a8e 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh @@ -55,7 +55,7 @@ log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 # Reduced TRIM_TXG_BATCH to make trimming more frequent. typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index 6a187a05b579..c08b5edb5698 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -57,7 +57,7 @@ log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 # Reduced TRIM_TXG_BATCH to make trimming more frequent. typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) diff --git a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh index 2dff0924f7b1..c67697521ce8 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh @@ -54,7 +54,7 @@ log_onexit cleanup # Minimum trim size is decreased to verify all trim sizes. typeset trim_extent_bytes_min=$(get_tunable TRIM_EXTENT_BYTES_MIN) -log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 +log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 512 # Reduced TRIM_TXG_BATCH to make trimming more frequent. typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) From 9e36c5769fe142873c8a2984335d11007fd66987 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 28 Sep 2023 14:08:52 -0700 Subject: [PATCH 36/37] Don't allocate from new metaslabs Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #15307 Closes #15308 --- module/zfs/metaslab.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 20dc934593f1..cdf599b17924 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -3208,6 +3208,15 @@ metaslab_segment_weight(metaslab_t *msp) static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { + /* + * This case will usually but not always get caught by the checks below; + * metaslabs can be loaded by various means, including the trim and + * initialize code. Once that happens, without this check they are + * allocatable even before they finish their first txg sync. + */ + if (unlikely(msp->ms_new)) + return (B_FALSE); + /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. If it's not, the ms_max_size is a lower bound (once From 229ca7d738ccbf4c55076977467ee93e20b6f01b Mon Sep 17 00:00:00 2001 From: Akash B Date: Fri, 29 Sep 2023 02:40:07 +0530 Subject: [PATCH 37/37] Fix ENOSPC for extended quota When unlinking multiple files from a pool at 100% capacity, it was possible for ENOSPC to be returned after the first few unlinks. This issue was fixed previously by PR #13172 but then this was again introduced by PR #13839. This is resolved using the existing mechanism of returning ERESTART when over quota as long as we know enough space will shortly be available after processing the pending deferred frees. Also, updated the existing testcase which reliably reproduced the issue without this patch. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Dipak Ghosh Signed-off-by: Akash B Closes #15312 --- module/zfs/dsl_dir.c | 22 +++++++------------ .../tests/functional/no_space/enospc_rm.ksh | 12 +++++++++- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index bbe6a03d620f..baf970121a61 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ #include @@ -1358,30 +1359,23 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, ext_quota = 0; if (used_on_disk >= quota) { + if (retval == ENOSPC && (used_on_disk - quota) < + dsl_pool_deferred_space(dd->dd_pool)) { + retval = SET_ERROR(ERESTART); + } /* Quota exceeded */ mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (retval); } else if (used_on_disk + est_inflight >= quota + ext_quota) { - if (est_inflight > 0 || used_on_disk < quota) { - retval = SET_ERROR(ERESTART); - } else { - ASSERT3U(used_on_disk, >=, quota); - - if (retval == ENOSPC && (used_on_disk - quota) < - dsl_pool_deferred_space(dd->dd_pool)) { - retval = SET_ERROR(ERESTART); - } - } - dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " - "quota=%lluK tr=%lluK err=%d\n", + "quota=%lluK tr=%lluK\n", (u_longlong_t)used_on_disk>>10, (u_longlong_t)est_inflight>>10, - (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval); + (u_longlong_t)quota>>10, (u_longlong_t)asize>>10); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); - return (retval); + return (SET_ERROR(ERESTART)); } /* We need to up our estimated delta before dropping dd_lock */ diff --git a/tests/zfs-tests/tests/functional/no_space/enospc_rm.ksh b/tests/zfs-tests/tests/functional/no_space/enospc_rm.ksh index d0f4ff4a08fe..d9582fbe2a76 100755 --- a/tests/zfs-tests/tests/functional/no_space/enospc_rm.ksh +++ b/tests/zfs-tests/tests/functional/no_space/enospc_rm.ksh @@ -17,6 +17,7 @@ # # Copyright (c) 2014, 2016 by Delphix. All rights reserved. # Copyright (c) 2022 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2023 Hewlett Packard Enterprise Development LP. # . $STF_SUITE/include/libtest.shlib @@ -51,11 +52,20 @@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS log_must zfs set compression=off $TESTPOOL/$TESTFS -log_note "Writing files until ENOSPC." +log_note "Writing Big(1G) files until ENOSPC." log_mustnot_expect "No space left on device" fio --name=test \ --fallocate=none --rw=write --bs=1M --size=1G --numjobs=4 \ --sync=1 --directory=$TESTDIR/ --group_reporting +log_must rm $TESTDIR/test.* +log_must test -z "$(ls -A $TESTDIR)" +sync_pool $TESTPOOL true + +log_note "Writing small(10M) files until ENOSPC." +log_mustnot_expect "No space left on device" fio --name=test \ + --fallocate=none --rw=write --bs=1M --size=10M --numjobs=200 \ + --sync=1 --directory=$TESTDIR/ --group_reporting + log_must rm $TESTDIR/test.* log_must test -z "$(ls -A $TESTDIR)"