From 90ff732358390ac395999577b703dcb4d3e0df59 Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Thu, 21 Mar 2024 04:08:50 +1100
Subject: [PATCH 001/116] freebsd: fix missing headers in distribution tarball

arc_os.h and freebsd_event.h aren't included in release tarballs, so the
build fails on FreeBSD. This fixes it.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #15963
---
 include/os/freebsd/Makefile.am | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am
index 9819e534b7f6..551f75f42a20 100644
--- a/include/os/freebsd/Makefile.am
+++ b/include/os/freebsd/Makefile.am
@@ -80,7 +80,9 @@ noinst_HEADERS = \
 	%D%/spl/sys/zmod.h \
 	%D%/spl/sys/zone.h \
 	\
+	%D%/zfs/sys/arc_os.h \
 	%D%/zfs/sys/freebsd_crypto.h \
+	%D%/zfs/sys/freebsd_event.h \
 	%D%/zfs/sys/vdev_os.h \
 	%D%/zfs/sys/zfs_bootenv_os.h \
 	%D%/zfs/sys/zfs_context_os.h \

From ef08a4d4065d21414d7fedccac20da6bfda4dfd0 Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Thu, 21 Mar 2024 10:46:15 +1100
Subject: [PATCH 002/116] Linux 6.8 compat: use splice_copy_file_range() for
 fallback

Linux 6.8 removes generic_copy_file_range(), which had been reduced to a
simple wrapper around splice_copy_file_range(). Detect that function
directly and use it if generic_ is not available.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #15930
Closes #15931
---
 config/kernel-vfs-file_range.m4      | 27 +++++++++++++++++++++++++++
 config/kernel.m4                     |  2 ++
 module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++--
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
index cc96404d8bbe..8a5cbe2eeeed 100644
--- a/config/kernel-vfs-file_range.m4
+++ b/config/kernel-vfs-file_range.m4
@@ -16,6 +16,9 @@ dnl #
 dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
 dnl #      generic_copy_file_range() added to support it
 dnl #
+dnl # 6.8: generic_copy_file_range() removed, replaced by
+dnl #      splice_copy_file_range()
+dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
 	ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
 		#include <linux/fs.h>
@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
 	])
 ])
 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
+	ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
+		#include <linux/splice.h>
+	], [
+		struct file *src_file __attribute__ ((unused)) = NULL;
+		loff_t src_off __attribute__ ((unused)) = 0;
+		struct file *dst_file __attribute__ ((unused)) = NULL;
+		loff_t dst_off __attribute__ ((unused)) = 0;
+		size_t len __attribute__ ((unused)) = 0;
+		splice_copy_file_range(src_file, src_off, dst_file, dst_off,
+		    len);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
+	AC_MSG_CHECKING([whether splice_copy_file_range() is available])
+	ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
+		    [splice_copy_file_range() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
 	ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
 		#include <linux/fs.h>
diff --git a/config/kernel.m4 b/config/kernel.m4
index e3f8645774c5..1d0c5a27fc7f 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
 	ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_VFS_IOV_ITER
 	ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
+	ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
index 3065d54fa9da..64728fdb1187 100644
--- a/module/os/linux/zfs/zpl_file_range.c
+++ b/module/os/linux/zfs/zpl_file_range.c
@@ -26,6 +26,9 @@
 #include <linux/compat.h>
 #endif
 #include <linux/fs.h>
+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
+#include <linux/splice.h>
+#endif
 #include <sys/file.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_vnops.h>
@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 	ret = zpl_clone_file_range_impl(src_file, src_off,
 	    dst_file, dst_off, len);
 
-#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
 	/*
 	 * Since Linux 5.3 the filesystem driver is responsible for executing
 	 * an appropriate fallback, and a generic fallback function is provided.
@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 	    ret == -EAGAIN)
 		ret = generic_copy_file_range(src_file, src_off, dst_file,
 		    dst_off, len, flags);
+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
+	/*
+	 * Since 6.8 the fallback function is called splice_copy_file_range
+	 * and has a slightly different signature.
+	 */
+	if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
+	    ret == -EAGAIN)
+		ret = splice_copy_file_range(src_file, src_off, dst_file,
+		    dst_off, len);
 #else
 	/*
 	 * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 	 */
 	if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
 		ret = -EOPNOTSUPP;
-#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
 
 	return (ret);
 }

From 45e23abed55cc1c7216e98df28f1b6c6f172b790 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 20 Mar 2024 20:22:36 -0400
Subject: [PATCH 003/116] Update resume token at object receive.

Before this change, the resume token was updated only on data receive.
Usually that is enough to resume replication without much overlap.
But we've got a report of a curious case, where the replication source
was traversed with a recursive grep, which, with atime enabled,
modified every object without modifying any data.  It produced
several gigabytes of replication traffic without a single data
write and so without a single resume point.

While the resume token was not designed to resume from an object,
I've found that the send implementation always sends object before
any data. So by requesting resume from offset 0 we are effectively
resuming from the object, followed (or not) by the data at offset
0, just as we need it.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15927
---
 module/zfs/dmu_recv.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 54aa60259ea1..2cf10909738b 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
 		dmu_buf_rele(db, FTAG);
 		dnode_rele(dn, FTAG);
 	}
+
+	/*
+	 * If the receive fails, we want the resume stream to start with the
+	 * same record that we last successfully received. There is no way to
+	 * request resume from the object record, but we can benefit from the
+	 * fact that sender always sends object record before anything else,
+	 * after which it will "resend" data at offset 0 and resume normally.
+	 */
+	save_resume_state(rwa, drro->drr_object, 0, tx);
+
 	dmu_tx_commit(tx);
 
 	return (0);

From c9d8f6c59a268f65075bb9e510a58b1eec8015f7 Mon Sep 17 00:00:00 2001
From: Cameron Harr <harr1@llnl.gov>
Date: Thu, 21 Mar 2024 09:00:29 -0700
Subject: [PATCH 004/116] Fix option string, adding -e and fixing order

The recently added '-e' option (PR #15769) was not added to the
online `zpool status` help text. This adds the option and reorders
a couple of the other options that were not listed
alphabetically.

Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Cameron Harr <harr1@llnl.gov>
Closes #16008
---
 cmd/zpool/zpool_main.c  | 39 +++++++++++++++++++--------------------
 man/man8/zpool-status.8 | 18 +++++++++---------
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 0783271f4734..987d44062865 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -413,7 +413,7 @@ get_usage(zpool_help_t idx)
 		    "[<device> ...]\n"));
 	case HELP_STATUS:
 		return (gettext("\tstatus [--power] [-c [script1,script2,...]] "
-		    "[-igLpPstvxD]  [-T d|u] [pool] ... \n"
+		    "[-DegiLpPstvx] [-T d|u] [pool] ...\n"
 		    "\t    [interval [count]]\n"));
 	case HELP_UPGRADE:
 		return (gettext("\tupgrade\n"
@@ -9177,22 +9177,22 @@ status_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ...
+ * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ...
  *              [pool] [interval [count]]
  *
  *	-c CMD	For each vdev, run command CMD
+ *	-D	Display dedup status (undocumented)
  *	-e	Display only unhealthy vdevs
- *	-i	Display vdev initialization status.
  *	-g	Display guid for individual vdev name.
+ *	-i	Display vdev initialization status.
  *	-L	Follow links when resolving vdev path name.
  *	-p	Display values in parsable (exact) format.
  *	-P	Display full path for vdev name.
  *	-s	Display slow IOs column.
- *	-v	Display complete error logs
- *	-x	Display only pools with potential problems
- *	-D	Display dedup status (undocumented)
  *	-t	Display vdev TRIM status.
  *	-T	Display a timestamp in date(1) or Unix format
+ *	-v	Display complete error logs
+ *	-x	Display only pools with potential problems
  *	--power	Display vdev enclosure slot power status
  *
  * Describes the health status of all pools or some subset.
@@ -9213,7 +9213,7 @@ zpool_do_status(int argc, char **argv)
 	};
 
 	/* check options */
-	while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
+	while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options,
 	    NULL)) != -1) {
 		switch (c) {
 		case 'c':
@@ -9240,15 +9240,18 @@ zpool_do_status(int argc, char **argv)
 			}
 			cmd = optarg;
 			break;
+		case 'D':
+			cb.cb_dedup_stats = B_TRUE;
+			break;
 		case 'e':
 			cb.cb_print_unhealthy = B_TRUE;
 			break;
-		case 'i':
-			cb.cb_print_vdev_init = B_TRUE;
-			break;
 		case 'g':
 			cb.cb_name_flags |= VDEV_NAME_GUID;
 			break;
+		case 'i':
+			cb.cb_print_vdev_init = B_TRUE;
+			break;
 		case 'L':
 			cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
 			break;
@@ -9261,21 +9264,18 @@ zpool_do_status(int argc, char **argv)
 		case 's':
 			cb.cb_print_slow_ios = B_TRUE;
 			break;
-		case 'v':
-			cb.cb_verbose = B_TRUE;
-			break;
-		case 'x':
-			cb.cb_explain = B_TRUE;
-			break;
-		case 'D':
-			cb.cb_dedup_stats = B_TRUE;
-			break;
 		case 't':
 			cb.cb_print_vdev_trim = B_TRUE;
 			break;
 		case 'T':
 			get_timestamp_arg(*optarg);
 			break;
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			break;
+		case 'x':
+			cb.cb_explain = B_TRUE;
+			break;
 		case POWER_OPT:
 			cb.cb_print_power = B_TRUE;
 			break;
@@ -9315,7 +9315,6 @@ zpool_do_status(int argc, char **argv)
 
 		if (cb.vcdl != NULL)
 			free_vdev_cmd_data_list(cb.vcdl);
-
 		if (argc == 0 && cb.cb_count == 0)
 			(void) fprintf(stderr, gettext("no pools available\n"));
 		else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8
index 24ad6e643cae..bbe7a45aa0c6 100644
--- a/man/man8/zpool-status.8
+++ b/man/man8/zpool-status.8
@@ -36,7 +36,7 @@
 .Sh SYNOPSIS
 .Nm zpool
 .Cm status
-.Op Fl DeigLpPstvx
+.Op Fl DegiLpPstvx
 .Op Fl T Sy u Ns | Ns Sy d
 .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
 .Oo Ar pool Oc Ns …
@@ -69,14 +69,20 @@ See the
 option of
 .Nm zpool Cm iostat
 for complete details.
+.It Fl D
+Display a histogram of deduplication statistics, showing the allocated
+.Pq physically present on disk
+and referenced
+.Pq logically referenced in the pool
+block counts and sizes by reference count.
 .It Fl e
 Only show unhealthy vdevs (not-ONLINE or with errors).
-.It Fl i
-Display vdev initialization status.
 .It Fl g
 Display vdev GUIDs instead of the normal device names
 These GUIDs can be used in place of device names for the zpool
 detach/offline/remove/replace commands.
+.It Fl i
+Display vdev initialization status.
 .It Fl L
 Display real paths for vdevs resolving all symbolic links.
 This can be used to look up the current block device name regardless of the
@@ -90,12 +96,6 @@ the path.
 This can be used in conjunction with the
 .Fl L
 flag.
-.It Fl D
-Display a histogram of deduplication statistics, showing the allocated
-.Pq physically present on disk
-and referenced
-.Pq logically referenced in the pool
-block counts and sizes by reference count.
 .It Fl s
 Display the number of leaf vdev slow I/O operations.
 This is the number of I/O operations that didn't complete in

From 5c4a4f82c850be6540076ff794d25defd826dddf Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Fri, 22 Mar 2024 06:10:04 +1100
Subject: [PATCH 005/116] zio: update ZIO type x stage documentation

- add column for TRIM ZIOs
- remove R from ZIO_STAGE_ISSUE_ASYNC, never happened
- remove I from ZIO_STAGE_VDEV_IO_DONE, never happened

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #15959
---
 include/sys/zio_impl.h  | 56 +++++++++++++++++++++-------------------
 man/man8/zpool-events.8 | 57 +++++++++++++++++++++--------------------
 2 files changed, 58 insertions(+), 55 deletions(-)

diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index febe0a87b428..1c0a44059d24 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -25,6 +25,7 @@
 
 /*
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara Inc.
  */
 
 #ifndef _ZIO_IMPL_H
@@ -39,7 +40,7 @@ extern "C" {
  *
  * The ZFS I/O pipeline is comprised of various stages which are defined
  * in the zio_stage enum below. The individual stages are used to construct
- * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ * these basic I/O operations: Read, Write, Free, Claim, Ioctl and Trim.
  *
  * I/O operations: (XXX - provide detail for each of the operations)
  *
@@ -48,6 +49,7 @@ extern "C" {
  * Free:
  * Claim:
  * Ioctl:
+ * Trim:
  *
  * Although the most common pipeline are used by the basic I/O operations
  * above, there are some helper pipelines (one could consider them
@@ -120,43 +122,43 @@ extern "C" {
  * zio pipeline stage definitions
  */
 enum zio_stage {
-	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCI */
+	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCIT */
 
-	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R---- */
-	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W--- */
-	ZIO_STAGE_FREE_BP_INIT		= 1 << 3,	/* --F-- */
-	ZIO_STAGE_ISSUE_ASYNC		= 1 << 4,	/* RWF-- */
-	ZIO_STAGE_WRITE_COMPRESS	= 1 << 5,	/* -W--- */
+	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R----- */
+	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W---- */
+	ZIO_STAGE_FREE_BP_INIT		= 1 << 3,	/* --F--- */
+	ZIO_STAGE_ISSUE_ASYNC		= 1 << 4,	/* -WF--T */
+	ZIO_STAGE_WRITE_COMPRESS	= 1 << 5,	/* -W---- */
 
-	ZIO_STAGE_ENCRYPT		= 1 << 6,	/* -W--- */
-	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 7,	/* -W--- */
+	ZIO_STAGE_ENCRYPT		= 1 << 6,	/* -W---- */
+	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 7,	/* -W---- */
 
-	ZIO_STAGE_NOP_WRITE		= 1 << 8,	/* -W--- */
+	ZIO_STAGE_NOP_WRITE		= 1 << 8,	/* -W---- */
 
-	ZIO_STAGE_BRT_FREE		= 1 << 9,	/* --F-- */
+	ZIO_STAGE_BRT_FREE		= 1 << 9,	/* --F--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 10,	/* R---- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 11,	/* R---- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 12,	/* -W--- */
-	ZIO_STAGE_DDT_FREE		= 1 << 13,	/* --F-- */
+	ZIO_STAGE_DDT_READ_START	= 1 << 10,	/* R----- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 11,	/* R----- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 12,	/* -W---- */
+	ZIO_STAGE_DDT_FREE		= 1 << 13,	/* --F--- */
 
-	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 14,	/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE		= 1 << 15,	/* RWFC- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 14,	/* RWFC-- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 15,	/* RWFC-- */
 
-	ZIO_STAGE_DVA_THROTTLE		= 1 << 16,	/* -W--- */
-	ZIO_STAGE_DVA_ALLOCATE		= 1 << 17,	/* -W--- */
-	ZIO_STAGE_DVA_FREE		= 1 << 18,	/* --F-- */
-	ZIO_STAGE_DVA_CLAIM		= 1 << 19,	/* ---C- */
+	ZIO_STAGE_DVA_THROTTLE		= 1 << 16,	/* -W---- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 17,	/* -W---- */
+	ZIO_STAGE_DVA_FREE		= 1 << 18,	/* --F--- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 19,	/* ---C-- */
 
-	ZIO_STAGE_READY			= 1 << 20,	/* RWFCI */
+	ZIO_STAGE_READY			= 1 << 20,	/* RWFCIT */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--IT */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW---T */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--IT */
 
-	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 24,	/* R---- */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 24,	/* R----- */
 
-	ZIO_STAGE_DONE			= 1 << 25	/* RWFCI */
+	ZIO_STAGE_DONE			= 1 << 25	/* RWFCIT */
 };
 
 #define	ZIO_ROOT_PIPELINE			\
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index e1436f6ded57..a7a9e33442da 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -25,8 +25,9 @@
 .\" Copyright (c) 2018 George Melikov. All Rights Reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" Copyright (c) 2024, Klara Inc.
 .\"
-.Dd July 11, 2023
+.Dd February 28, 2024
 .Dt ZPOOL-EVENTS 8
 .Os
 .
@@ -363,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data.
 .Sh I/O STAGES
 The ZFS I/O pipeline is comprised of various stages which are defined below.
 The individual stages are used to construct these basic I/O
-operations: Read, Write, Free, Claim, and Ioctl.
+operations: Read, Write, Free, Claim, Ioctl and Trim.
 These stages may be
 set on an event to describe the life cycle of a given I/O request.
 .Pp
@@ -372,43 +373,43 @@ tab(:);
 l l l .
 Stage:Bit Mask:Operations
 _:_:_
-ZIO_STAGE_OPEN:0x00000001:RWFCI
+ZIO_STAGE_OPEN:0x00000001:RWFCIT
 
-ZIO_STAGE_READ_BP_INIT:0x00000002:R----
-ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W---
-ZIO_STAGE_FREE_BP_INIT:0x00000008:--F--
-ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF--
-ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W---
+ZIO_STAGE_READ_BP_INIT:0x00000002:R-----
+ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W----
+ZIO_STAGE_FREE_BP_INIT:0x00000008:--F---
+ZIO_STAGE_ISSUE_ASYNC:0x00000010:-WF--T
+ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W----
 
-ZIO_STAGE_ENCRYPT:0x00000040:-W---
-ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W---
+ZIO_STAGE_ENCRYPT:0x00000040:-W----
+ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W----
 
-ZIO_STAGE_NOP_WRITE:0x00000100:-W---
+ZIO_STAGE_NOP_WRITE:0x00000100:-W----
 
-ZIO_STAGE_BRT_FREE:0x00000200:--F--
+ZIO_STAGE_BRT_FREE:0x00000200:--F---
 
-ZIO_STAGE_DDT_READ_START:0x00000400:R----
-ZIO_STAGE_DDT_READ_DONE:0x00000800:R----
-ZIO_STAGE_DDT_WRITE:0x00001000:-W---
-ZIO_STAGE_DDT_FREE:0x00002000:--F--
+ZIO_STAGE_DDT_READ_START:0x00000400:R-----
+ZIO_STAGE_DDT_READ_DONE:0x00000800:R-----
+ZIO_STAGE_DDT_WRITE:0x00001000:-W----
+ZIO_STAGE_DDT_FREE:0x00002000:--F---
 
-ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC-
-ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC-
+ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC--
+ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC--
 
-ZIO_STAGE_DVA_THROTTLE:0x00010000:-W---
-ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W---
-ZIO_STAGE_DVA_FREE:0x00040000:--F--
-ZIO_STAGE_DVA_CLAIM:0x00080000:---C-
+ZIO_STAGE_DVA_THROTTLE:0x00010000:-W----
+ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W----
+ZIO_STAGE_DVA_FREE:0x00040000:--F---
+ZIO_STAGE_DVA_CLAIM:0x00080000:---C--
 
-ZIO_STAGE_READY:0x00100000:RWFCI
+ZIO_STAGE_READY:0x00100000:RWFCIT
 
-ZIO_STAGE_VDEV_IO_START:0x00200000:RW--I
-ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--I
-ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--I
+ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT
+ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW---T
+ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT
 
-ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----
+ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----
 
-ZIO_STAGE_DONE:0x02000000:RWFCI
+ZIO_STAGE_DONE:0x02000000:RWFCIT
 .TE
 .
 .Sh I/O FLAGS

From 2c01cae8b9faca5766629aa45b2bfabaeae92e4d Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 21 Mar 2024 18:42:21 -0400
Subject: [PATCH 006/116] BRT: Change brt_pending_tree sorting order

It does not look important how exactly brt_pending_tree is sorted.
When cloning a large file, it is quite likely that all of its blocks
have identical physical birth times, so comparing them first does
not provide useful entropy, while accessing an additional cache line.
In most cases the combination of vdev and offset provides a unique
result and the physical birth time comparison is not even needed.
Meanwhile, when traversing the tree inside brt_pending_apply(), it can
be beneficial for dbuf cache and CPU cache hits to group processing
by vdev and so by the per-VDEV BRT ZAPs.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15954
---
 module/zfs/brt.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 225ddaca1e54..3d565cd1397c 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -1420,13 +1420,14 @@ brt_pending_entry_compare(const void *x1, const void *x2)
 	const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
 	int cmp;
 
-	cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
+	cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
+	    DVA_GET_VDEV(&bp2->blk_dva[0]));
 	if (cmp == 0) {
-		cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
-		    DVA_GET_VDEV(&bp2->blk_dva[0]));
-		if (cmp == 0) {
-			cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
-			    DVA_GET_OFFSET(&bp2->blk_dva[0]));
+		cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
+		    DVA_GET_OFFSET(&bp2->blk_dva[0]));
+		if (unlikely(cmp == 0)) {
+			cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1),
+			    BP_PHYSICAL_BIRTH(bp2));
 		}
 	}
 

From f1b368359b3970f7995a6dcb088fdadb31840f4d Mon Sep 17 00:00:00 2001
From: Fabian-Gruenbichler <f.gruenbichler@proxmox.com>
Date: Fri, 22 Mar 2024 00:38:24 +0100
Subject: [PATCH 007/116] udev: correctly handle partition #16 and later
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a zvol has more than 15 partitions, the minor device number exhausts
the slot count reserved for partitions next to the zvol itself. As a
result, the minor number cannot be used to determine the partition
number for the higher partitions, and doing so results in wrongly named
symlinks being generated by udev.

Since the partition number is encoded in the block device name anyway,
let's just extract it from there instead.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Closes #15904
Closes #15970
---
 udev/zvol_id.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/udev/zvol_id.c b/udev/zvol_id.c
index 5960b978787a..609349594767 100644
--- a/udev/zvol_id.c
+++ b/udev/zvol_id.c
@@ -51,7 +51,7 @@ const char *__asan_default_options(void) {
 int
 main(int argc, const char *const *argv)
 {
-	if (argc != 2) {
+	if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) {
 		fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]);
 		return (1);
 	}
@@ -72,9 +72,10 @@ main(int argc, const char *const *argv)
 		return (1);
 	}
 
-	unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS;
-	if (dev_part != 0)
-		sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part);
+	const char *dev_part = strrchr(dev_name, 'p');
+	if (dev_part != NULL) {
+		sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1);
+	}
 
 	for (size_t i = 0; i < strlen(zvol_name); ++i)
 		if (isblank(zvol_name[i]))

From c28f94f32ef0f104b731be0e44c5e61bbdf3b9b7 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 21 Mar 2024 19:43:53 -0400
Subject: [PATCH 008/116] ZAP: Some cleanups/micro-optimizations

- Remove custom zap_memset(), use regular memset().
- Use PANIC() instead of opaque cmn_err(CE_PANIC).
- Provide entry parameter to zap_leaf_rehash_entry().
- Reduce branching in zap_leaf_array_create() inner loop.
- Remove signedness where it should not be.

There should be no functional changes.

Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15976
---
 include/sys/zap_leaf.h |  8 ++---
 module/zfs/zap_leaf.c  | 77 +++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 47 deletions(-)

diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h
index ebc67c2bf465..d563edd7ba59 100644
--- a/include/sys/zap_leaf.h
+++ b/include/sys/zap_leaf.h
@@ -47,7 +47,7 @@ struct zap_stats;
  * entries - header space (2*chunksize)
  */
 #define	ZAP_LEAF_NUMCHUNKS_BS(bs) \
-	(((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
+	(((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
 	ZAP_LEAF_CHUNKSIZE - 2)
 
 #define	ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs)))
@@ -80,7 +80,7 @@ struct zap_stats;
  * chunks per entry (3).
  */
 #define	ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5)
-#define	ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
+#define	ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs))
 #define	ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs)))
 #define	ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs)))
 
@@ -163,7 +163,7 @@ typedef struct zap_leaf {
 	dmu_buf_user_t l_dbu;
 	krwlock_t l_rwlock;
 	uint64_t l_blkid;		/* 1<<ZAP_BLOCK_SHIFT byte block off */
-	int l_bs;			/* block size shift */
+	uint_t l_bs;			/* block size shift */
 	dmu_buf_t *l_dbuf;
 } zap_leaf_t;
 
@@ -243,7 +243,7 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
  */
 
 extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
-extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t len);
 extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
 extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
     struct zap_stats *zs);
diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c
index e6afb1c58c95..032aca92695e 100644
--- a/module/zfs/zap_leaf.c
+++ b/module/zfs/zap_leaf.c
@@ -41,7 +41,8 @@
 #include <sys/zap_leaf.h>
 #include <sys/arc.h>
 
-static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le,
+    uint16_t entry);
 
 #define	CHAIN_END 0xffff /* end of the chunk chain */
 
@@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
 #define	LEAF_HASH_ENTPTR(l, h)	(&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
 
-static void
-zap_memset(void *a, int c, size_t n)
-{
-	char *cp = a;
-	char *cpend = cp + n;
-
-	while (cp < cpend)
-		*cp++ = c;
-}
-
 static void
 stv(int len, void *addr, uint64_t value)
 {
@@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value)
 		*(uint64_t *)addr = value;
 		return;
 	default:
-		cmn_err(CE_PANIC, "bad int len %d", len);
+		PANIC("bad int len %d", len);
 	}
 }
 
@@ -96,13 +87,13 @@ ldv(int len, const void *addr)
 	case 8:
 		return (*(uint64_t *)addr);
 	default:
-		cmn_err(CE_PANIC, "bad int len %d", len);
+		PANIC("bad int len %d", len);
 	}
 	return (0xFEEDFACEDEADBEEFULL);
 }
 
 void
-zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size)
 {
 	zap_leaf_t l;
 	dmu_buf_t l_dbuf;
@@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
 	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
 	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
 
-	for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+	for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
 
-	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+	for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
 		struct zap_leaf_entry *le;
 
@@ -160,11 +151,11 @@ void
 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
 {
 	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
-	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+	memset(&zap_leaf_phys(l)->l_hdr, 0,
 	    sizeof (struct zap_leaf_header));
-	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
 	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
-	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+	for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
 	}
@@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l)
 {
 	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
 
-	int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
+	uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
 
@@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf,
 {
 	uint16_t chunk_head;
 	uint16_t *chunkp = &chunk_head;
-	int byten = 0;
+	int byten = integer_size;
 	uint64_t value = 0;
 	int shift = (integer_size - 1) * 8;
 	int len = num_integers;
 
 	ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
 
+	if (len > 0)
+		value = ldv(integer_size, buf);
 	while (len > 0) {
 		uint16_t chunk = zap_leaf_chunk_alloc(l);
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 
 		la->la_type = ZAP_CHUNK_ARRAY;
 		for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
-			if (byten == 0)
-				value = ldv(integer_size, buf);
 			la->la_array[i] = value >> shift;
 			value <<= 8;
-			if (++byten == integer_size) {
-				byten = 0;
-				buf += integer_size;
+			if (--byten == 0) {
 				if (--len == 0)
 					break;
+				byten = integer_size;
+				buf += integer_size;
+				value = ldv(integer_size, buf);
 			}
 		}
 
@@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
 	*chunkp = CHAIN_END;
 
 	while (chunk != CHAIN_END) {
-		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+		uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
 		    ZAP_CHUNK_ARRAY);
 		zap_leaf_chunk_free(l, chunk);
@@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
 
 static boolean_t
 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
-    int chunk, int array_numints)
+    uint_t chunk, int array_numints)
 {
 	int bseen = 0;
 
@@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
 
 	uint64_t valuelen = integer_size * num_integers;
 
-	int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+	uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
 	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
 		return (SET_ERROR(E2BIG));
@@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
 
 	/* link it into the hash chain */
 	/* XXX if we did the search above, we could just use that */
-	uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
+	uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk);
 
 	zap_leaf_phys(l)->l_hdr.lh_nentries++;
 
@@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
  */
 
 static uint16_t *
-zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry)
 {
-	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
 	struct zap_leaf_entry *le2;
 	uint16_t *chunkp;
 
@@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
 		struct zap_leaf_array *la =
 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
-		int nextchunk = la->la_next;
+		uint_t nextchunk = la->la_next;
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
@@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
 }
 
 static void
-zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
 {
 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
@@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
 	struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
 	*nle = *le; /* structure assignment */
 
-	(void) zap_leaf_rehash_entry(nl, chunk);
+	(void) zap_leaf_rehash_entry(nl, nle, chunk);
 
 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
 	nle->le_value_chunk =
@@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
 void
 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 {
-	int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+	uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* set new prefix and prefix_len */
 	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
@@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* break existing hash chains */
-	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
 	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
 	if (sort)
@@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 	 * but this accesses memory more sequentially, and when we're
 	 * called, the block is usually pretty full.
 	 */
-	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+	for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
 		if (le->le_type != ZAP_CHUNK_ENTRY)
 			continue;
@@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 		if (le->le_hash & (1ULL << bit))
 			zap_leaf_transfer_entry(l, i, nl);
 		else
-			(void) zap_leaf_rehash_entry(l, i);
+			(void) zap_leaf_rehash_entry(l, le, i);
 	}
 }
 
 void
 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
 {
-	int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+	uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_leafs_with_2n_pointers[n]++;
@@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_n_tenths_full[n]++;
 
-	for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
-		int nentries = 0;
-		int chunk = zap_leaf_phys(l)->l_hash[i];
+	for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+		uint_t nentries = 0;
+		uint_t chunk = zap_leaf_phys(l)->l_hash[i];
 
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_entry *le =

From 102b468b5e190973fbaee6fe682727eb33079811 Mon Sep 17 00:00:00 2001
From: Robert Evans <rrevans@gmail.com>
Date: Mon, 25 Mar 2024 17:56:49 -0400
Subject: [PATCH 009/116] Fix corruption caused by mmap flushing problems

1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
   already in writeback unless data-integrity sync is requested.

2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
   skipped due to DMU pushing back on TX assign.

3) Add missing mmap flush when doing block cloning.

4) While here, pass errors from putpage to writepage/writepages.

This change fixes corruption edge cases, but unfortunately adds
synchronous ZIL flushes for dirty mmap pages to llseek and bclone
operations. It may be possible to avoid these sync writes later,
but that would require trickier refactoring of the writeback code.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Robert Evans <evansr@google.com>
Closes #15933
Closes #16019
---
 module/os/linux/zfs/zfs_vnops_os.c | 5 +----
 module/os/linux/zfs/zpl_file.c     | 8 ++++----
 module/zfs/zfs_vnops.c             | 6 +++++-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index a32307c39331..1cecad9f7755 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3795,11 +3795,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
-	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
-		if (err == ERESTART)
-			dmu_tx_wait(tx);
-
 		dmu_tx_abort(tx);
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 3caa0fc6c214..9dec52215c7c 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
 	boolean_t *for_sync = data;
 	fstrans_cookie_t cookie;
+	int ret;
 
 	ASSERT(PageLocked(pp));
 	ASSERT(!PageWriteback(pp));
 
 	cookie = spl_fstrans_mark();
-	(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
+	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
 	spl_fstrans_unmark(cookie);
 
-	return (0);
+	return (ret);
 }
 
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
 {
-	(void) zpl_putpage(&pp->page, wbc, data);
-	return (0);
+	return (zpl_putpage(&pp->page, wbc, data));
 }
 #endif
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 7f39ad6fc775..babb07ca25a9 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -123,7 +123,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
 
 	/* Flush any mmap()'d data to disk */
 	if (zn_has_cached_data(zp, 0, file_sz - 1))
-		zn_flush_cached_data(zp, B_FALSE);
+		zn_flush_cached_data(zp, B_TRUE);
 
 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
 	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
@@ -1187,6 +1187,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}
 	}
 
+	/* Flush any mmap()'d data to disk */
+	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
+		zn_flush_cached_data(inzp, B_TRUE);
+
 	/*
 	 * Maintain predictable lock order.
 	 */

From bf8f72359d1bf0cdb6a4b31ccfc7bbef0f948ca4 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 25 Mar 2024 17:58:04 -0400
Subject: [PATCH 010/116] BRT: Skip duplicate BRT prefetches

If there is a pending entry for this block, then we've already
issued a BRT prefetch for it within this TXG, so don't do it again.
The BRT vdev lookup and the following zap_prefetch_uint64() call can
be pretty expensive and should be avoided when not necessary.

Reviewed-by: Pawel Jakub Dawidek <pawel@dawidek.net>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15941
---
 module/zfs/brt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 3d565cd1397c..7ddec0b4b9bb 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -1472,10 +1472,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 		kmem_cache_free(brt_pending_entry_cache, newbpe);
 	} else {
 		ASSERT(bpe == NULL);
-	}
 
-	/* Prefetch BRT entry, as we will need it in the syncing context. */
-	brt_prefetch(brt, bp);
+		/* Prefetch BRT entry for the syncing context. */
+		brt_prefetch(brt, bp);
+	}
 }
 
 void

From 80cc516295fef1a429542fcfeea369c6bbb85ce4 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 25 Mar 2024 17:58:50 -0400
Subject: [PATCH 011/116] ZAP: Massively switch to _by_dnode() interfaces

Before this change ZAP called dnode_hold() for almost every block
access, which was clearly visible in the profiler under heavy load,
such as BRT.  This patch makes it always hold the dnode reference
between zap_lockdir() and zap_unlockdir(), which avoids most dnode
operations between those calls.  It also adds several new _by_dnode()
APIs to ZAP and uses them in the BRT code, and adds a
dmu_prefetch_by_dnode() variant that is used in the ZAP code.

After this, only one call to dmu_buf_dnode_enter() remains, and it
appears to be unneeded, so remove that call along with the
dmu_buf_dnode_enter() and dmu_buf_dnode_exit() functions.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15951
---
 include/sys/dmu.h      |   4 +-
 include/sys/zap.h      |   8 ++
 include/sys/zap_impl.h |   1 +
 module/zfs/brt.c       |  72 +++-----------
 module/zfs/dbuf.c      |  15 ---
 module/zfs/dmu.c       |  18 +++-
 module/zfs/dmu_recv.c  |   7 +-
 module/zfs/zap.c       |  43 ++++-----
 module/zfs/zap_micro.c | 206 +++++++++++++++++++++++++++++------------
 9 files changed, 202 insertions(+), 172 deletions(-)

diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 921f51f27a20..b5fed64da4ad 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -752,8 +752,6 @@ void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 objset_t *dmu_buf_get_objset(dmu_buf_t *db);
-dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
-void dmu_buf_dnode_exit(dmu_buf_t *db);
 
 /* Block until any in-progress dmu buf user evictions complete. */
 void dmu_buf_user_evict_wait(void);
@@ -902,6 +900,8 @@ extern uint_t zfs_max_recordsize;
  */
 void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	uint64_t len, enum zio_priority pri);
+void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
+	uint64_t len, enum zio_priority pri);
 void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
 
 typedef struct dmu_object_info {
diff --git a/include/sys/zap.h b/include/sys/zap.h
index 308a7c7284d7..96ddcc324b65 100644
--- a/include/sys/zap.h
+++ b/include/sys/zap.h
@@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key,
 int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
+int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx);
 
 /*
  * Set the attribute with the given name to the given value.  If an
@@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
 int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
 
 /*
  * Get the length (in integers) and the integer size of the specified
@@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
 int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
 int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx);
+int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, dmu_tx_t *tx);
 
 /*
  * Returns (in *count) the number of attributes in the specified zap
diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h
index 74853f5faceb..2959aa9b2ca4 100644
--- a/include/sys/zap_impl.h
+++ b/include/sys/zap_impl.h
@@ -145,6 +145,7 @@ typedef struct zap {
 	dmu_buf_user_t zap_dbu;
 	objset_t *zap_objset;
 	uint64_t zap_object;
+	dnode_t *zap_dnode;
 	struct dmu_buf *zap_dbuf;
 	krwlock_t zap_rwlock;
 	boolean_t zap_ismicro;
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 7ddec0b4b9bb..5e10df9dfe56 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
 	if (mos_entries == 0)
 		return;
 
-	BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
-	    (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
-	    (u_longlong_t)bre->bre_offset);
 	(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
 	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
 }
 
-static int
-brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
-{
-	int error;
-
-	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
-	ASSERT(brtvd->bv_mos_entries != 0);
-	ASSERT(bre->bre_refcount > 0);
-
-	error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
-	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
-	    sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
-	BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
-	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
-	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
-	    (u_longlong_t)bre->bre_refcount, error);
-
-	return (error);
-}
-
-static int
-brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
-{
-	int error;
-
-	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
-	ASSERT(brtvd->bv_mos_entries != 0);
-	ASSERT0(bre->bre_refcount);
-
-	error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
-	    (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
-	BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
-	    "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
-	    (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
-	    (u_longlong_t)bre->bre_refcount, error);
-
-	return (error);
-}
-
 /*
  * Return TRUE if we _can_ have BRT entry for this bp. It might be false
  * positive, but gives us quick answer if we should look into BRT, which
@@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
 }
 
 static void
-brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 {
-
-	ASSERT(RW_WRITE_HELD(&brt->brt_lock));
-	ASSERT(brtvd->bv_mos_entries != 0);
-
 	if (bre->bre_refcount == 0) {
-		int error;
-
-		error = brt_entry_remove(brt, brtvd, bre, tx);
-		ASSERT(error == 0 || error == ENOENT);
-		/*
-		 * If error == ENOENT then zfs_clone_range() was done from a
-		 * removed (but opened) file (open(), unlink()).
-		 */
-		ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
+		int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
+		    BRT_KEY_WORDS, tx);
+		VERIFY(error == 0 || error == ENOENT);
 	} else {
-		VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
+		VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
+		    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
+		    &bre->bre_refcount, tx));
 	}
 }
 
@@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
 {
 	brt_vdev_t *brtvd;
 	brt_entry_t *bre;
+	dnode_t *dn;
 	uint64_t vdevid;
 	void *c;
 
@@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
 		if (brtvd->bv_mos_brtvdev == 0)
 			brt_vdev_create(brt, brtvd, tx);
 
+		VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
+		    FTAG, &dn));
+
 		c = NULL;
 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
-			brt_sync_entry(brt, brtvd, bre, tx);
+			brt_sync_entry(dn, bre, tx);
 			brt_entry_free(bre);
 			ASSERT(brt->brt_nentries > 0);
 			brt->brt_nentries--;
 		}
 
+		dnode_rele(dn, FTAG);
+
 		brt_vdev_sync(brt, brtvd, tx);
 
 		if (brtvd->bv_totalcount == 0)
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 324bf8cbc276..6798fc2d5bdc 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4174,21 +4174,6 @@ dmu_buf_get_objset(dmu_buf_t *db)
 	return (dbi->db_objset);
 }
 
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
-	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-	DB_DNODE_ENTER(dbi);
-	return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
-	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-	DB_DNODE_EXIT(dbi);
-}
-
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index d82211e6d4c7..8986f55e792a 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
-	int64_t level2 = level;
-	uint64_t start, end, start2, end2;
 
 	if (dmu_prefetch_max == 0 || len == 0) {
 		dmu_prefetch_dnode(os, object, pri);
@@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return;
 
+	dmu_prefetch_by_dnode(dn, level, offset, len, pri);
+
+	dnode_rele(dn, FTAG);
+}
+
+void
+dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
+{
+	int64_t level2 = level;
+	uint64_t start, end, start2, end2;
+
 	/*
 	 * Depending on len we may do two prefetches: blocks [start, end) at
 	 * level, and following blocks [start2, end2) at higher level2.
@@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	for (uint64_t i = start2; i < end2; i++)
 		dbuf_prefetch(dn, level2, i, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
-
-	dnode_rele(dn, FTAG);
 }
 
 /*
@@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
+EXPORT_SYMBOL(dmu_prefetch_by_dnode);
+EXPORT_SYMBOL(dmu_prefetch_dnode);
 EXPORT_SYMBOL(dmu_free_range);
 EXPORT_SYMBOL(dmu_free_long_range);
 EXPORT_SYMBOL(dmu_free_long_object);
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 2cf10909738b..9f1c25f866f7 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -2353,7 +2353,6 @@ receive_process_write_record(struct receive_writer_arg *rwa,
 	if (rwa->heal) {
 		blkptr_t *bp;
 		dmu_buf_t *dbp;
-		dnode_t *dn;
 		int flags = DB_RF_CANFAIL;
 
 		if (rwa->raw)
@@ -2385,19 +2384,15 @@ receive_process_write_record(struct receive_writer_arg *rwa,
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
-		dn = dmu_buf_dnode_enter(dbp);
 		/* Make sure the on-disk block and recv record sizes match */
-		if (drrw->drr_logical_size !=
-		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) {
+		if (drrw->drr_logical_size != dbp->db_size) {
 			err = ENOTSUP;
-			dmu_buf_dnode_exit(dbp);
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Get the block pointer for the corrupted block */
 		bp = dmu_buf_get_blkptr(dbp);
 		err = do_corrective_recv(rwa, drrw, rrd, bp);
-		dmu_buf_dnode_exit(dbp);
 		dmu_buf_rele(dbp, FTAG);
 		return (err);
 	}
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index dde05d7005c2..da86defb445c 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 	 * set up block 1 - the first leaf
 	 */
 	dmu_buf_t *db;
-	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db, tx);
 
@@ -182,7 +182,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
 		ASSERT0(tbl->zt_blks_copied);
-		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
 		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
@@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 
 	uint64_t b = tbl->zt_blks_copied;
 	dmu_buf_t *db_old;
-	int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
 
 	/* first half of entries in old[b] go to new[2*b+0] */
 	dmu_buf_t *db_new;
-	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func(db_old->db_data, db_new->db_data, hepb);
 	dmu_buf_rele(db_new, FTAG);
 
 	/* second half of entries in old[b] go to new[2*b+1] */
-	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func((uint64_t *)db_old->db_data + hepb,
@@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
 	uint64_t off = idx & ((1<<(bs-3))-1);
 
 	dmu_buf_t *db;
-	int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
@@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
 		uint64_t off2 = idx2 & ((1<<(bs-3))-1);
 		dmu_buf_t *db2;
 
-		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
 		    DMU_READ_NO_PREFETCH);
 		if (err != 0) {
@@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 	uint64_t blk = idx >> (bs-3);
 	uint64_t off = idx & ((1<<(bs-3))-1);
 
-	/*
-	 * Note: this is equivalent to dmu_buf_hold(), but we use
-	 * _dnode_enter / _by_dnode because it's faster because we don't
-	 * have to hold the dnode.
-	 */
-	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
 	dmu_buf_t *db;
-	int err = dmu_buf_hold_by_dnode(dn,
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
-	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err != 0)
 		return (err);
 	*valp = ((uint64_t *)db->db_data)[off];
@@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 		 */
 		blk = (idx*2) >> (bs-3);
 
-		dn = dmu_buf_dnode_enter(zap->zap_dbuf);
-		err = dmu_buf_hold_by_dnode(dn,
+		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
-		dmu_buf_dnode_exit(zap->zap_dbuf);
 		if (err == 0)
 			dmu_buf_rele(db, FTAG);
 	}
@@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 
 		uint64_t newblk = zap_allocate_blocks(zap, 1);
 		dmu_buf_t *db_new;
-		int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
 		    DMU_READ_NO_PREFETCH);
 		if (err != 0)
@@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 	l->l_blkid = zap_allocate_blocks(zap, 1);
 	l->l_dbuf = NULL;
 
-	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
 	    DMU_READ_NO_PREFETCH));
 	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
@@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
 		return (SET_ERROR(ENOENT));
 
 	int bs = FZAP_BLOCK_SHIFT(zap);
-	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
-	int err = dmu_buf_hold_by_dnode(dn,
+	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
-	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err != 0)
 		return (err);
 
@@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn)
 	if (zap_idx_to_blk(zap, idx, &blk) != 0)
 		return;
 	int bs = FZAP_BLOCK_SHIFT(zap);
-	dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+	dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
 	    ZIO_PRIORITY_SYNC_READ);
 }
 
@@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 	 */
 	if (zc->zc_hash == 0 && zap_iterate_prefetch &&
 	    zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
-		dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+		dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
 		    zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
 		    ZIO_PRIORITY_ASYNC_READ);
 	}
@@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
 	} else {
-		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
 		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
 		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
 		    ZIO_PRIORITY_SYNC_READ);
@@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 			dmu_buf_t *db;
 			int err;
 
-			err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+			err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
 			    FTAG, &db, DMU_READ_NO_PREFETCH);
 			if (err == 0) {
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 085d9cd8b4b6..d806988af96d 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -415,7 +415,7 @@ mze_destroy(zap_t *zap)
 }
 
 static zap_t *
-mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+mzap_open(dmu_buf_t *db)
 {
 	zap_t *winner;
 	uint64_t *zap_hdr = (uint64_t *)db->db_data;
@@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
 	rw_enter(&zap->zap_rwlock, RW_WRITER);
-	zap->zap_objset = os;
-	zap->zap_object = obj;
+	zap->zap_objset = dmu_buf_get_objset(db);
+	zap->zap_object = db->db_object;
 	zap->zap_dbuf = db;
 
 	if (zap_block_type != ZBT_MICRO) {
@@ -518,7 +518,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
  * have the specified tag.
  */
 static int
-zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
+zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	ASSERT0(db->db_offset);
@@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
 
 	*zapp = NULL;
 
-	dmu_object_info_from_db(db, &doi);
+	dmu_object_info_from_dnode(dn, &doi);
 	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
 		return (SET_ERROR(EINVAL));
 
 	zap_t *zap = dmu_buf_get_user(db);
 	if (zap == NULL) {
-		zap = mzap_open(os, obj, db);
+		zap = mzap_open(db);
 		if (zap == NULL) {
 			/*
 			 * mzap_open() didn't like what it saw on-disk.
@@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
 	}
 
 	zap->zap_objset = os;
+	zap->zap_dnode = dn;
 
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
@@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
     zap_t **zapp)
 {
 	dmu_buf_t *db;
+	int err;
 
-	int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
-	if (err != 0) {
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
 		return (err);
-	}
-#ifdef ZFS_DEBUG
-	{
-		dmu_object_info_t doi;
-		dmu_object_info_from_db(db, &doi);
-		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
-	}
-#endif
-
-	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
-	if (err != 0) {
+	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0)
 		dmu_buf_rele(db, tag);
-	}
+	else
+		VERIFY(dnode_add_ref(dn, tag));
 	return (err);
 }
 
@@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp)
 {
+	dnode_t *dn;
 	dmu_buf_t *db;
+	int err;
 
-	int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	err = dnode_hold(os, obj, tag, &dn);
 	if (err != 0)
 		return (err);
-#ifdef ZFS_DEBUG
-	{
-		dmu_object_info_t doi;
-		dmu_object_info_from_db(db, &doi);
-		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		dnode_rele(dn, tag);
+		return (err);
 	}
-#endif
-	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
-	if (err != 0)
+	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0) {
 		dmu_buf_rele(db, tag);
+		dnode_rele(dn, tag);
+	}
 	return (err);
 }
 
@@ -645,6 +641,7 @@ void
 zap_unlockdir(zap_t *zap, const void *tag)
 {
 	rw_exit(&zap->zap_rwlock);
+	dnode_rele(zap->zap_dnode, tag);
 	dmu_buf_rele(zap->zap_dbuf, tag);
 }
 
@@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 	if (flags != 0) {
 		zap_t *zap;
 		/* Only fat zap supports flags; upgrade immediately. */
-		VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
+		VERIFY(dnode_add_ref(dn, FTAG));
+		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
 		    B_FALSE, B_FALSE, &zap));
 		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
 		zap_unlockdir(zap, FTAG);
@@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key,
 	return (err);
 }
 
+static int
+zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx, const void *tag)
+{
+	int err;
+
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+	zap = zn->zn_zap;	/* fzap_add() may change zap */
+	zap_name_free(zn);
+	if (zap != NULL)	/* may be NULL if fzap_add() failed */
+		zap_unlockdir(zap, tag);
+	return (err);
+}
+
 int
 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
@@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
-	zap = zn->zn_zap;	/* fzap_add() may change zap */
-	zap_name_free(zn);
-	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap, FTAG);
+	err = zap_add_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
@@ -1396,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	return (err);
 }
 
-int
-zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
-    int key_numints,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+static int
+zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
+    const void *tag)
 {
-	zap_t *zap;
+	int err;
 
-	int err =
-	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
-	if (err != 0)
-		return (err);
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
+	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
 	zap = zn->zn_zap;	/* fzap_update() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap, FTAG);
+		zap_unlockdir(zap, tag);
+	return (err);
+}
+
+int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints, int integer_size, uint64_t num_integers, const void *val,
+    dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_update_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_update_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_update_uint64_impl(zap, key, key_numints,
+	    integer_size, num_integers, val, tx, FTAG);
+	/* zap_update_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
@@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
 	return (err);
 }
 
+static int
+zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+    dmu_tx_t *tx, const void *tag)
+{
+	int err;
+
+	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+	err = fzap_remove(zn, tx);
+	zap_name_free(zn);
+	zap_unlockdir(zap, tag);
+	return (err);
+}
+
 int
 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx)
@@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
-	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
-		return (SET_ERROR(ENOTSUP));
-	}
-	err = fzap_remove(zn, tx);
-	zap_name_free(zn);
-	zap_unlockdir(zap, FTAG);
+	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
+	/* zap_remove_uint64_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+    dmu_tx_t *tx)
+{
+	zap_t *zap;
+
+	int err =
+	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
+	/* zap_remove_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
@@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64);
 EXPORT_SYMBOL(zap_add);
 EXPORT_SYMBOL(zap_add_by_dnode);
 EXPORT_SYMBOL(zap_add_uint64);
+EXPORT_SYMBOL(zap_add_uint64_by_dnode);
 EXPORT_SYMBOL(zap_update);
 EXPORT_SYMBOL(zap_update_uint64);
+EXPORT_SYMBOL(zap_update_uint64_by_dnode);
 EXPORT_SYMBOL(zap_length);
 EXPORT_SYMBOL(zap_length_uint64);
 EXPORT_SYMBOL(zap_remove);
 EXPORT_SYMBOL(zap_remove_by_dnode);
 EXPORT_SYMBOL(zap_remove_norm);
 EXPORT_SYMBOL(zap_remove_uint64);
+EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
 EXPORT_SYMBOL(zap_count);
 EXPORT_SYMBOL(zap_value_search);
 EXPORT_SYMBOL(zap_join);

From 4616b96a643c941e96ee0d1d816c573df9f0de28 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 25 Mar 2024 17:59:55 -0400
Subject: [PATCH 012/116] BRT: Relax brt_pending_apply() locking

Since brt_pending_apply() is running in syncing context, no other
brt_pending_tree accesses are possible for the TXG.  We don't need
to acquire brt_pending_lock here.

Reviewed-by: Pawel Jakub Dawidek <pawel@dawidek.net>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15955
---
 module/zfs/brt.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 5e10df9dfe56..416caeb11c7e 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -1473,26 +1473,23 @@ brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 void
 brt_pending_apply(spa_t *spa, uint64_t txg)
 {
-	brt_t *brt;
+	brt_t *brt = spa->spa_brt;
 	brt_pending_entry_t *bpe;
 	avl_tree_t *pending_tree;
-	kmutex_t *pending_lock;
 	void *c;
 
 	ASSERT3U(txg, !=, 0);
 
-	brt = spa->spa_brt;
+	/*
+	 * We are in syncing context, so no other brt_pending_tree accesses
+	 * are possible for the TXG. Don't need to acquire brt_pending_lock.
+	 */
 	pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
-	pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
-
-	mutex_enter(pending_lock);
 
 	c = NULL;
 	while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
 		boolean_t added_to_ddt;
 
-		mutex_exit(pending_lock);
-
 		for (int i = 0; i < bpe->bpe_count; i++) {
 			/*
 			 * If the block has DEDUP bit set, it means that it
@@ -1510,10 +1507,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
 		}
 
 		kmem_cache_free(brt_pending_entry_cache, bpe);
-		mutex_enter(pending_lock);
 	}
-
-	mutex_exit(pending_lock);
 }
 
 static void

From 493fcce9be165bd751434879d2478938cd5bb926 Mon Sep 17 00:00:00 2001
From: George Wilson <george.wilson@delphix.com>
Date: Mon, 25 Mar 2024 18:01:54 -0400
Subject: [PATCH 013/116] Provide macros for setting and getting blkptr birth
 times

There exist a couple of macros that are used to update the blkptr birth
times but they can often be confusing. For example, the
BP_PHYSICAL_BIRTH() macro will provide either the physical birth time
if it is set or else return back the logical birth time. The
complement to this macro is BP_SET_BIRTH() which will set the logical
birth time and set the physical birth time if they are not the same.
Consumers may get confused when they are trying to get the physical
birth time and use the BP_PHYSICAL_BIRTH() macro only to find out that
the logical birth time is what is actually returned.

This change cleans up these macros and makes them symmetrical. The same
functionality is preserved but the name is changed. Instead of calling
BP_PHYSICAL_BIRTH(), consumers can now call BP_GET_BIRTH(). In
addition to cleaning up these naming conventions, two new sets of
macros are introduced -- BP_[SET|GET]_LOGICAL_BIRTH() and
BP_[SET|GET]_PHYSICAL_BIRTH().  These new macros allow the consumer to
get and set the specific birth time.

As part of the cleanup, the unused GRID macros have been removed and
that portion of the blkptr is currently unused.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: George Wilson <gwilson@delphix.com>
Closes #15962
---
 cmd/zdb/zdb.c                 | 19 +++++------
 cmd/zdb/zdb_il.c              | 12 +++----
 cmd/zhack.c                   |  4 +--
 include/sys/spa.h             | 59 ++++++++++++++++++-----------------
 include/sys/uberblock_impl.h  |  2 +-
 lib/libzdb/libzdb.c           |  4 +--
 module/zfs/arc.c              | 16 +++++-----
 module/zfs/bpobj.c            |  7 +++--
 module/zfs/brt.c              |  3 +-
 module/zfs/dbuf.c             | 22 +++++++------
 module/zfs/ddt.c              |  4 +--
 module/zfs/dmu.c              | 19 +++++------
 module/zfs/dmu_recv.c         | 11 ++++---
 module/zfs/dmu_send.c         |  6 ++--
 module/zfs/dmu_traverse.c     | 11 ++++---
 module/zfs/dnode.c            |  4 +--
 module/zfs/dsl_bookmark.c     |  3 +-
 module/zfs/dsl_dataset.c      | 21 +++++++------
 module/zfs/dsl_deadlist.c     |  7 ++---
 module/zfs/dsl_destroy.c      | 13 +++++---
 module/zfs/dsl_pool.c         |  2 +-
 module/zfs/dsl_scan.c         | 31 +++++++++---------
 module/zfs/metaslab.c         | 11 ++++---
 module/zfs/spa.c              |  9 +++---
 module/zfs/spa_errlog.c       | 26 +++++----------
 module/zfs/spa_log_spacemap.c |  2 +-
 module/zfs/uberblock.c        |  2 +-
 module/zfs/vdev_mirror.c      |  2 +-
 module/zfs/vdev_raidz.c       | 13 ++++----
 module/zfs/zil.c              | 14 ++++-----
 module/zfs/zio.c              | 45 +++++++++++++-------------
 module/zfs/zio_checksum.c     |  2 +-
 32 files changed, 209 insertions(+), 197 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 4880c8048726..449b6bf2ccb3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -199,7 +199,8 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
 					break;
 				sublivelist_verify_block_t svb = {
 				    .svb_dva = bp->blk_dva[i],
-				    .svb_allocated_txg = bp->blk_birth
+				    .svb_allocated_txg =
+				    BP_GET_LOGICAL_BIRTH(bp)
 				};
 
 				if (zfs_btree_find(&sv->sv_leftover, &svb,
@@ -2340,7 +2341,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
 		    (int)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
-		    (u_longlong_t)bp->blk_birth);
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 		return;
 	}
 
@@ -2358,7 +2359,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
 		    buflen - strlen(blkbuf),
 		    "%llxL B=%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
-		    (u_longlong_t)bp->blk_birth);
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
@@ -2366,8 +2367,8 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
-		    (u_longlong_t)bp->blk_birth,
-		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
+		    (u_longlong_t)BP_GET_BIRTH(bp));
 		if (bp_freed)
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
@@ -2417,7 +2418,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
 {
 	int err = 0;
 
-	if (bp->blk_birth == 0)
+	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
 		return (0);
 
 	print_indirect(spa, bp, zb, dnp);
@@ -2605,7 +2606,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 	(void) arg, (void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
-	if (bp->blk_birth != 0) {
+	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("\t%s\n", blkbuf);
 	}
@@ -2646,7 +2647,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 	(void) arg, (void) tx;
 	char blkbuf[BP_SPRINTF_LEN];
 
-	ASSERT(bp->blk_birth != 0);
+	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
 	(void) printf("\t%s\n", blkbuf);
 	return (0);
@@ -5788,7 +5789,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
-	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 63d95ddedc3b..e3caaeb70e14 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -173,8 +173,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", tab_prefix,
-		    !BP_IS_HOLE(bp) &&
-		    bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+		    spa_min_claim_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, tab_prefix);
 
@@ -186,7 +186,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
 			(void) printf("%s<hole>\n", tab_prefix);
 			return;
 		}
-		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+		if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
 			(void) printf("%s<block already committed>\n",
 			    tab_prefix);
 			return;
@@ -237,8 +237,8 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", tab_prefix,
-		    !BP_IS_HOLE(bp) &&
-		    bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
+		    spa_min_claim_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, tab_prefix);
 	}
@@ -473,7 +473,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 
 	if (claim_txg != 0)
 		claim = "already claimed";
-	else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
+	else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
 		claim = "will claim";
 	else
 		claim = "won't claim";
diff --git a/cmd/zhack.c b/cmd/zhack.c
index 44611887dd25..f15a6ece538c 100644
--- a/cmd/zhack.c
+++ b/cmd/zhack.c
@@ -612,8 +612,8 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
 	 * Uberblock root block pointer has valid birth TXG.
 	 * Copying it to the label NVlist
 	 */
-	if (ub->ub_rootbp.blk_birth != 0) {
-		const uint64_t txg = ub->ub_rootbp.blk_birth;
+	if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
+		const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
 		ub->ub_txg = txg;
 
 		if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) {
diff --git a/include/sys/spa.h b/include/sys/spa.h
index cada3c841037..fb4c93431a31 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -125,15 +125,15 @@ typedef struct zio_cksum_salt {
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 0	|  pad  |	  vdev1         | GRID  |	  ASIZE		|
+ * 0	|  pad  |	  vdev1         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 2	|  pad  |	  vdev2         | GRID  |	  ASIZE		|
+ * 2	|  pad  |	  vdev2         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 4	|  pad  |	  vdev3         | GRID  |	  ASIZE		|
+ * 4	|  pad  |	  vdev3         | pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -165,7 +165,6 @@ typedef struct zio_cksum_salt {
  * LSIZE	logical size
  * PSIZE	physical size (after compression)
  * ASIZE	allocated size (including RAID-Z parity and gang block headers)
- * GRID		RAID-Z layout information (reserved for future use)
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
@@ -190,11 +189,11 @@ typedef struct zio_cksum_salt {
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 0	|		vdev1		| GRID  |	  ASIZE		|
+ * 0	|		vdev1		| pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 2	|		vdev2		| GRID  |	  ASIZE		|
+ * 2	|		vdev2		| pad   |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -355,7 +354,7 @@ typedef enum bp_embedded_type {
 #define	BPE_NUM_WORDS 14
 #define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
 #define	BPE_IS_PAYLOADWORD(bp, wp) \
-	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+	((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1]))
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
@@ -374,8 +373,7 @@ typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
 	uint64_t	blk_pad[2];	/* Extra space for the future	    */
-	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
-	uint64_t	blk_birth;	/* transaction group at birth	    */
+	uint64_t	blk_birth_word[2];
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;
@@ -395,9 +393,6 @@ typedef struct blkptr {
 	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
 	SPA_MINBLOCKSHIFT, 0, x)
 
-#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
-#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
-
 #define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
 #define	DVA_SET_VDEV(dva, x)	\
 	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
@@ -480,15 +475,23 @@ typedef struct blkptr {
 #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
 #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)
 
-#define	BP_PHYSICAL_BIRTH(bp)		\
-	(BP_IS_EMBEDDED(bp) ? 0 : \
-	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+#define	BP_GET_LOGICAL_BIRTH(bp)	(bp)->blk_birth_word[1]
+#define	BP_SET_LOGICAL_BIRTH(bp, x)	((bp)->blk_birth_word[1] = (x))
+
+#define	BP_GET_PHYSICAL_BIRTH(bp)	(bp)->blk_birth_word[0]
+#define	BP_SET_PHYSICAL_BIRTH(bp, x)	((bp)->blk_birth_word[0] = (x))
+
+#define	BP_GET_BIRTH(bp)					\
+	(BP_IS_EMBEDDED(bp) ? 0 : 				\
+	BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) :	\
+	BP_GET_LOGICAL_BIRTH(bp))
 
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
 	ASSERT(!BP_IS_EMBEDDED(bp));		\
-	(bp)->blk_birth = (logical);		\
-	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+	BP_SET_LOGICAL_BIRTH(bp, logical);	\
+	BP_SET_PHYSICAL_BIRTH(bp, 		\
+	    ((logical) == (physical) ? 0 : (physical))); \
 }
 
 #define	BP_GET_FILL(bp)				\
@@ -541,8 +544,8 @@ typedef struct blkptr {
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
 #define	BP_EQUAL(bp1, bp2)	\
-	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
-	(bp1)->blk_birth == (bp2)->blk_birth &&			\
+	(BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) &&	\
+	BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@@ -581,8 +584,8 @@ typedef struct blkptr {
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
-	(bp)->blk_phys_birth = 0;		\
-	(bp)->blk_birth = 0;			\
+	(bp)->blk_birth_word[0] = 0;		\
+	(bp)->blk_birth_word[1] = 0;		\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
@@ -631,7 +634,7 @@ typedef struct blkptr {
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth);			\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));		\
 	} else if (BP_IS_EMBEDDED(bp)) {				\
 		len = func(buf + len, size - len,			\
 		    "EMBEDDED [L%llu %s] et=%u %s "			\
@@ -642,14 +645,14 @@ typedef struct blkptr {
 		    compress,						\
 		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
 		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth);			\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));		\
 	} else if (BP_IS_REDACTED(bp)) {				\
 		len += func(buf + len, size - len,			\
 		    "REDACTED [L%llu %s] size=%llxL birth=%lluL",	\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth);			\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));		\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
@@ -691,8 +694,8 @@ typedef struct blkptr {
 		    ws,							\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
-		    (u_longlong_t)bp->blk_birth,			\
-		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
+		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),		\
+		    (u_longlong_t)BP_GET_BIRTH(bp),			\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
@@ -1142,9 +1145,9 @@ extern const char *spa_state_to_name(spa_t *spa);
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb,
-    const uint64_t *birth);
+    const uint64_t birth);
 extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb,
-    const uint64_t *birth);
+    uint64_t birth);
 extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
 extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h
index d3a71cc8f84b..1736b32cd3c6 100644
--- a/include/sys/uberblock_impl.h
+++ b/include/sys/uberblock_impl.h
@@ -165,7 +165,7 @@ struct uberblock {
 	 * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
 	 * the value of the field is used to determine which ZIL blocks have
 	 * been allocated according to the ms_sm when we are rewinding to a
-	 * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+	 * checkpoint. Specifically, if logical birth > ub_checkpoint_txg, then
 	 * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
 	 */
 	uint64_t	ub_checkpoint_txg;
diff --git a/lib/libzdb/libzdb.c b/lib/libzdb/libzdb.c
index 9989fa1eb80f..12144dc65e75 100644
--- a/lib/libzdb/libzdb.c
+++ b/lib/libzdb/libzdb.c
@@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg)
 	 * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
 	 * it's possible the offsets are equal. In that case, sort by txg
 	 */
-	if (l->blk_birth < r->blk_birth) {
+	if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) {
 		return (-1);
-	} else if (l->blk_birth > r->blk_birth) {
+	} else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) {
 		return (+1);
 	}
 	return (0);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 3bcffb3c7ede..b1bcac6c44bc 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1014,7 +1014,7 @@ static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
+	uint64_t birth = BP_GET_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *hdr;
@@ -2183,7 +2183,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 		 * (and generate an ereport) before leaving the ARC.
 		 */
 		ret = SET_ERROR(EIO);
-		spa_log_error(spa, zb, &buf->b_hdr->b_birth);
+		spa_log_error(spa, zb, buf->b_hdr->b_birth);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 		    spa, NULL, zb, NULL, 0);
 	}
@@ -5251,7 +5251,7 @@ arc_read_done(zio_t *zio)
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
-		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+		ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
@@ -5354,7 +5354,7 @@ arc_read_done(zio_t *zio)
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb,
-				    &zio->io_bp->blk_birth);
+				    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
@@ -5639,7 +5639,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 				 */
 				rc = SET_ERROR(EIO);
 				if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
-					spa_log_error(spa, zb, &hdr->b_birth);
+					spa_log_error(spa, zb, hdr->b_birth);
 					(void) zfs_ereport_post(
 					    FM_EREPORT_ZFS_AUTHENTICATION,
 					    spa, NULL, zb, NULL, 0);
@@ -5686,12 +5686,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 			 * embedded data.
 			 */
 			arc_buf_hdr_t *exists = NULL;
-			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+			hdr = arc_hdr_alloc(guid, psize, lsize,
 			    BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
 
 			if (!embedded_bp) {
 				hdr->b_dva = *BP_IDENTITY(bp);
-				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+				hdr->b_birth = BP_GET_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
@@ -6557,7 +6557,7 @@ arc_write_done(zio_t *zio)
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+			hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
 		}
 	} else {
 		ASSERT(HDR_EMPTY(hdr));
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index e772caead29b..96e1601c4e9c 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -893,7 +893,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
 		 */
 		memset(&stored_bp, 0, sizeof (stored_bp));
 		stored_bp.blk_prop = bp->blk_prop;
-		stored_bp.blk_birth = bp->blk_birth;
+		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
 	} else if (!BP_GET_DEDUP(bp)) {
 		/* The bpobj will compress better without the checksum */
 		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
@@ -953,7 +953,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 	(void) bp_freed, (void) tx;
 	struct space_range_arg *sra = arg;
 
-	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
+	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
 		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
 			sra->used += bp_get_dsize_sync(sra->spa, bp);
 		else
@@ -985,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 
 /*
  * Return the amount of space in the bpobj which is:
- * mintxg < blk_birth <= maxtxg
+ * mintxg < logical birth <= maxtxg
  */
 int
 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 416caeb11c7e..0b5a09df3724 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -1384,8 +1384,7 @@ brt_pending_entry_compare(const void *x1, const void *x2)
 		cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
 		    DVA_GET_OFFSET(&bp2->blk_dva[0]));
 		if (unlikely(cmp == 0)) {
-			cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1),
-			    BP_PHYSICAL_BIRTH(bp2));
+			cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
 		}
 	}
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 6798fc2d5bdc..4e190c131e1d 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1217,7 +1217,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 					ASSERT0(bp->blk_pad[1]);
 					ASSERT(!BP_IS_EMBEDDED(bp));
 					ASSERT(BP_IS_HOLE(bp));
-					ASSERT0(bp->blk_phys_birth);
+					ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 				}
 			}
 		}
@@ -1457,7 +1457,7 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 		    dn->dn_datablksz : BP_GET_LSIZE(dbbp));
 		BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
 		BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
-		BP_SET_BIRTH(bp, dbbp->blk_birth, 0);
+		BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
 	}
 }
 
@@ -1486,7 +1486,7 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 		memset(db->db.db_data, 0, db->db.db_size);
 
 		if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
-		    bp->blk_birth != 0) {
+		    BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			dbuf_handle_indirect_hole(db, dn, bp);
 		}
 		db->db_state = DB_CACHED;
@@ -1633,7 +1633,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	 * If this is not true it indicates tampering and we report an error.
 	 */
 	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
-		spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth);
+		spa_log_error(db->db_objset->os_spa, &zb,
+		    BP_GET_LOGICAL_BIRTH(bpp));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}
@@ -2832,7 +2833,7 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
 	dl = &dr->dt.dl;
 	dl->dr_overridden_by = *bp;
 	dl->dr_override_state = DR_OVERRIDDEN;
-	dl->dr_overridden_by.blk_birth = dr->dr_txg;
+	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 boolean_t
@@ -2909,7 +2910,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
 	dl->dr_override_state = DR_OVERRIDDEN;
-	dl->dr_overridden_by.blk_birth = dr->dr_txg;
+	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 void
@@ -4712,7 +4713,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
-	if (bp->blk_birth != 0) {
+	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
@@ -4999,7 +5000,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	drica.drica_os = dn->dn_objset;
-	drica.drica_blk_birth = bp->blk_birth;
+	drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
@@ -5014,7 +5015,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 		if (dn->dn_objset != spa_meta_objset(spa)) {
 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-			    bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+			    BP_GET_LOGICAL_BIRTH(bp) >
+			    ds->ds_dir->dd_origin_txg) {
 				ASSERT(!BP_IS_EMBEDDED(bp));
 				ASSERT(dsl_dir_is_clone(ds->ds_dir));
 				ASSERT(spa_feature_is_enabled(spa,
@@ -5136,7 +5138,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
-	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
 	ASSERT(pio);
 
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index de8640e58a2c..4c53cb0a2f9b 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -437,7 +437,7 @@ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
 
 	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
 		ddp->ddp_dva[d] = bp->blk_dva[d];
-	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+	ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
 }
 
 void
@@ -485,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
-		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+		    BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
 			return (ddp);
 	}
 	return (NULL);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 8986f55e792a..b88cf447d296 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1627,7 +1627,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
-		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
+		    BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1658,7 +1658,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
@@ -2285,11 +2285,11 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
-		if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
-		if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
@@ -2364,13 +2364,14 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		dl->dr_brtwrite = B_TRUE;
 		dl->dr_override_state = DR_OVERRIDDEN;
 		if (BP_IS_HOLE(bp)) {
-			dl->dr_overridden_by.blk_birth = 0;
-			dl->dr_overridden_by.blk_phys_birth = 0;
+			BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, 0);
+			BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, 0);
 		} else {
-			dl->dr_overridden_by.blk_birth = dr->dr_txg;
+			BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
+			    dr->dr_txg);
 			if (!BP_IS_EMBEDDED(bp)) {
-				dl->dr_overridden_by.blk_phys_birth =
-				    BP_PHYSICAL_BIRTH(bp);
+				BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by,
+				    BP_GET_BIRTH(bp));
 			}
 		}
 
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 9f1c25f866f7..680aed4513bc 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1352,8 +1352,10 @@ corrective_read_done(zio_t *zio)
 {
 	cr_cb_data_t *data = zio->io_private;
 	/* Corruption corrected; update error log if needed */
-	if (zio->io_error == 0)
-		spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth);
+	if (zio->io_error == 0) {
+		spa_remove_error(data->spa, &data->zb,
+		    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+	}
 	kmem_free(data, sizeof (cr_cb_data_t));
 	abd_free(zio->io_abd);
 }
@@ -1480,8 +1482,9 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
 	}
 	rrd->abd = abd;
 
-	io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd,
-	    BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb);
+	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+	    abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
+	    &zb);
 
 	ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
 	    abd_get_size(abd) == BP_GET_PSIZE(bp));
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 37c68528bf95..b6cc2f0a5e91 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -619,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
 
 	/* See comment in dump_dnode() for full details */
 	if (zfs_send_unmodified_spill_blocks &&
-	    (bp->blk_birth <= dscp->dsc_fromtxg)) {
+	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
 	}
 
@@ -804,7 +804,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
 	 */
 	if (zfs_send_unmodified_spill_blocks &&
 	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
-	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
+	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
 		struct send_range record;
 		blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
 
@@ -1123,7 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	 */
 	if (sta->os->os_encrypted &&
 	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
-		spa_log_error(spa, zb, &bp->blk_birth);
+		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EIO));
 	}
 
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 809f7f6165f9..15cc2885e805 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 	if (BP_IS_HOLE(bp))
 		return (0);
 
-	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+	if (claim_txg == 0 &&
+	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
 		return (-1);
 
 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -108,7 +109,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		if (BP_IS_HOLE(bp))
 			return (0);
 
-		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 			return (0);
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -192,7 +193,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
 	 */
 	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
 		return (B_FALSE);
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
 		return (B_FALSE);
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return (B_FALSE);
@@ -235,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 		ASSERT(0);
 	}
 
-	if (bp->blk_birth == 0) {
+	if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
 		/*
 		 * Since this block has a birth time of 0 it must be one of
 		 * two things: a hole created before the
@@ -263,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
 		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
 			return (0);
-	} else if (bp->blk_birth <= td->td_min_txg) {
+	} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
 		return (0);
 	}
 
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index ba28aa06a91f..a703fd414f87 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -2557,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	}
 
 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
-	    db->db_blkptr->blk_birth <= txg ||
+	    BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
 	    BP_IS_HOLE(db->db_blkptr))) {
 		/*
 		 * This can only happen when we are searching up the tree
@@ -2605,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 		    i >= 0 && i < epb; i += inc) {
 			if (BP_GET_FILL(&bp[i]) >= minfill &&
 			    BP_GET_FILL(&bp[i]) <= maxfill &&
-			    (hole || bp[i].blk_birth > txg))
+			    (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
 				break;
 			if (inc > 0 || *offset > 0)
 				*offset += inc;
diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c
index 4faefecbadbb..5fd8bc2a2682 100644
--- a/module/zfs/dsl_bookmark.c
+++ b/module/zfs/dsl_bookmark.c
@@ -1520,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		 * If the block was live (referenced) at the time of this
 		 * bookmark, add its space to the bookmark's FBN.
 		 */
-		if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
+		if (BP_GET_LOGICAL_BIRTH(bp) <=
+		    dbn->dbn_phys.zbm_creation_txg &&
 		    (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 			mutex_enter(&dbn->dbn_lock);
 			dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 62a1649d3786..b4de0e7ff073 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -156,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		return;
 	}
 
-	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
@@ -190,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
@@ -236,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
 		mutex_exit(&ds->ds_remap_deadlist_lock);
 
 		BP_ZERO(&fakebp);
-		fakebp.blk_birth = birth;
+		BP_SET_LOGICAL_BIRTH(&fakebp, birth);
 		DVA_SET_VDEV(dva, vdev);
 		DVA_SET_OFFSET(dva, offset);
 		DVA_SET_ASIZE(dva, size);
@@ -259,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(bp->blk_birth <= tx->tx_txg);
+	ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
 
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
@@ -277,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
@@ -285,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		bplist_append(&ds->ds_dir->dd_pending_frees, bp);
 	}
 
-	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object);
@@ -317,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
-		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+		/* if (logical birth > prev prev snap txg) prev unique += bs */
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
-		    ds->ds_object && bp->blk_birth >
+		    ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
 		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
 			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
-		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+		if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
@@ -2895,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 	if (snap == NULL)
 		return (B_FALSE);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+	birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
 		objset_t *os, *os_snap;
diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c
index e6c8d4be13b4..eff1f7de7731 100644
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@@ -474,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
 	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
 	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
 
-	dle_tofind.dle_mintxg = bp->blk_birth;
+	dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
@@ -483,7 +483,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
 
 	if (dle == NULL) {
 		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
-		    bp, (longlong_t)bp->blk_birth);
+		    bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 		dle = avl_first(&dl->dl_tree);
 	}
 
@@ -1039,8 +1039,7 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
 		ASSERT3U(BP_GET_CHECKSUM(bp), ==,
 		    BP_GET_CHECKSUM(&found->le_bp));
-		ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==,
-		    BP_PHYSICAL_BIRTH(&found->le_bp));
+		ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp));
 	}
 	if (bp_freed) {
 		if (found == NULL) {
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index d9d88a981e05..d4a6e5b6e9fd 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -132,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) <=
+	    dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
-		    bp->blk_birth >
+		    BP_GET_LOGICAL_BIRTH(bp) >
 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
 			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 			    bp_get_dsize_sync(dp->dp_spa, bp);
@@ -313,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
+	    tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
 
@@ -727,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
 		ASSERT(zilog == NULL);
-		ASSERT3U(bp->blk_birth, >,
+		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
 		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
@@ -1017,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 	ASSERT(ds->ds_prev == NULL ||
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
+	    tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 370c6a010dca..342ec5c15c79 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -1047,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 		 * will be wrong.
 		 */
 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+		ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		/* The origin doesn't get attached to itself */
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 060a5cc36d70..55e89b89f06a 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -429,8 +429,8 @@ sio2bp(const scan_io_t *sio, blkptr_t *bp)
 {
 	memset(bp, 0, sizeof (*bp));
 	bp->blk_prop = sio->sio_blk_prop;
-	bp->blk_phys_birth = sio->sio_phys_birth;
-	bp->blk_birth = sio->sio_birth;
+	BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
+	BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
 	bp->blk_fill = 1;	/* we always only work with data pointers */
 	bp->blk_cksum = sio->sio_cksum;
 
@@ -444,8 +444,8 @@ static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
 	sio->sio_blk_prop = bp->blk_prop;
-	sio->sio_phys_birth = bp->blk_phys_birth;
-	sio->sio_birth = bp->blk_birth;
+	sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
+	sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
 	sio->sio_cksum = bp->blk_cksum;
 	sio->sio_nr_dvas = BP_GET_NDVAS(bp);
 
@@ -1721,7 +1721,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 	zbookmark_phys_t zb;
 
 	ASSERT(!BP_IS_REDACTED(bp));
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+	if (BP_IS_HOLE(bp) ||
+	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
@@ -1730,7 +1731,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 	 * (on-disk) even if it hasn't been claimed (even though for
 	 * scrub there's nothing to do to it).
 	 */
-	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
+	if (claim_txg == 0 &&
+	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
 		return (0);
 
 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1756,7 +1758,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 
 		ASSERT(!BP_IS_REDACTED(bp));
 		if (BP_IS_HOLE(bp) ||
-		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 			return (0);
 
 		/*
@@ -1764,7 +1766,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		 * already txg sync'ed (but this log block contains
 		 * other records that are not synced)
 		 */
-		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 			return (0);
 
 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -1903,7 +1905,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
 	if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
 		return;
 
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+	if (BP_IS_HOLE(bp) ||
+	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET))
 		return;
@@ -2174,7 +2177,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 	if (dnp != NULL &&
 	    dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, &bp->blk_birth);
+		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -2270,7 +2273,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		 * by arc_read() for the cases above.
 		 */
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, &bp->blk_birth);
+		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -2347,7 +2350,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
-	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
 	}
@@ -2373,7 +2376,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
-	if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+	if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
 		scn->scn_gt_max_this_txg++;
 		return;
 	}
@@ -4714,7 +4717,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	uint64_t phys_birth = BP_GET_BIRTH(bp);
 	size_t psize = BP_GET_PSIZE(bp);
 	boolean_t needs_io = B_FALSE;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 7237fa8eeb59..c4aa98ced433 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5495,8 +5495,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
 	    DVA_GET_VDEV(&bp->blk_dva[0]));
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
-	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
@@ -5845,8 +5846,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
-	ASSERT(bp->blk_birth == 0);
-	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
+	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
@@ -5900,7 +5901,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
 
 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
@@ -5918,7 +5919,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
-	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index b144d0652930..30c528a53049 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -2655,8 +2655,8 @@ spa_claim_notify(zio_t *zio)
 		return;
 
 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
-	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
-		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+	if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
+		spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
 	mutex_exit(&spa->spa_props_lock);
 }
 
@@ -6266,7 +6266,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	nvlist_t *nvl;
 
 	if (props == NULL ||
-	    nvlist_lookup_string(props, "tname", &poolname) != 0)
+	    nvlist_lookup_string(props,
+	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
 		poolname = (char *)pool;
 
 	/*
@@ -9801,7 +9802,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 		 * don't want to rely on that here).
 		 */
 		if (pass == 1 &&
-		    spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 		    !dmu_objset_is_dirty(mos, txg)) {
 			/*
 			 * Nothing changed on the first pass, therefore this
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 244b4d264212..62d7b4fa2df2 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -180,7 +180,7 @@ static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds)
  * during spa_errlog_sync().
  */
 void
-spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
+spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth)
 {
 	spa_error_entry_t search;
 	spa_error_entry_t *new;
@@ -223,13 +223,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
 		new->se_zep.zb_object = zb->zb_object;
 		new->se_zep.zb_level = zb->zb_level;
 		new->se_zep.zb_blkid = zb->zb_blkid;
-
-		/*
-		 * birth may end up being NULL, e.g. in zio_done(). We
-		 * will handle this in process_error_block().
-		 */
-		if (birth != NULL)
-			new->se_zep.zb_birth = *birth;
+		new->se_zep.zb_birth = birth;
 	}
 
 	avl_insert(tree, new, where);
@@ -258,7 +252,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
 	if (error == 0 && BP_IS_HOLE(&bp))
 		error = SET_ERROR(ENOENT);
 
-	*birth_txg = bp.blk_birth;
+	*birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
@@ -535,7 +529,7 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
 		 */
 		zbookmark_phys_t zb;
 		zep_to_zb(head_ds, zep, &zb);
-		spa_remove_error(spa, &zb, &zep->zb_birth);
+		spa_remove_error(spa, &zb, zep->zb_birth);
 	}
 
 	return (error);
@@ -563,7 +557,7 @@ spa_get_last_errlog_size(spa_t *spa)
  */
 static void
 spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
-    const uint64_t *birth)
+    const uint64_t birth)
 {
 	char name[NAME_MAX_LEN];
 
@@ -618,11 +612,7 @@ spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
 	healed_zep.zb_object = healed_zb->zb_object;
 	healed_zep.zb_level = healed_zb->zb_level;
 	healed_zep.zb_blkid = healed_zb->zb_blkid;
-
-	if (birth != NULL)
-		healed_zep.zb_birth = *birth;
-	else
-		healed_zep.zb_birth = 0;
+	healed_zep.zb_birth = birth;
 
 	errphys_to_name(&healed_zep, name, sizeof (name));
 
@@ -742,7 +732,7 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
  * later in spa_remove_healed_errors().
  */
 void
-spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth)
+spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth)
 {
 	spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth);
 	spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth);
@@ -890,7 +880,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
 		if (error == EACCES)
 			error = 0;
 		else if (!error)
-			zep.zb_birth = bp.blk_birth;
+			zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
 
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index 873089a53e34..32158e8c592c 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -783,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 	 * request of flushing everything before we attempt to return
 	 * immediately.
 	 */
-	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+	if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
 	    !spa_flush_all_logs_requested(spa))
 		return;
diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c
index 1921be107660..22ee8036c473 100644
--- a/module/zfs/uberblock.c
+++ b/module/zfs/uberblock.c
@@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
 	}
 	ub->ub_checkpoint_txg = 0;
 
-	return (ub->ub_rootbp.blk_birth == txg);
+	return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg);
 }
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index f9a01c9f53f4..102eacb03349 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -531,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio)
 	uint64_t txg = zio->io_txg;
 	int c, lowest_load;
 
-	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
 
 	lowest_load = INT_MAX;
 	mm->mm_preferred_cnt = 0;
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 9d0b8763f16f..b03331ec69c6 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2190,12 +2190,11 @@ vdev_raidz_close(vdev_t *vd)
 
 /*
  * Return the logical width to use, given the txg in which the allocation
- * happened.  Note that BP_PHYSICAL_BIRTH() is usually the txg in which the
+ * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
  * BP was allocated.  Remapped BP's (that were relocated due to device
- * removal, see remap_blkptr_cb()), will have a more recent
- * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can
- * ignore these because they can't be on RAIDZ (device removal doesn't
- * support RAIDZ).
+ * removal, see remap_blkptr_cb()), will have a more recent physical birth
+ * which reflects when the BP was relocated, but we can ignore these because
+ * they can't be on RAIDZ (device removal doesn't support RAIDZ).
  */
 static uint64_t
 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
@@ -2295,7 +2294,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
-	    BP_PHYSICAL_BIRTH(zio->io_bp));
+	    BP_GET_BIRTH(zio->io_bp));
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
@@ -2518,7 +2517,7 @@ vdev_raidz_io_start(zio_t *zio)
 	raidz_map_t *rm;
 
 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
-	    BP_PHYSICAL_BIRTH(zio->io_bp));
+	    BP_GET_BIRTH(zio->io_bp));
 	if (logical_width != vdrz->vd_physical_width) {
 		zfs_locked_range_t *lr = NULL;
 		uint64_t synced_offset = UINT64_MAX;
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e549e1895f39..1af357c58006 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -557,7 +557,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
-	if (bp->blk_birth >= first_txg)
+	if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
 		return (-1);
 
 	if (zil_bp_tree_add(zilog, bp) != 0)
@@ -583,7 +583,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
-	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
 	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
@@ -608,7 +608,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
-	if (lr->lr_blkptr.blk_birth >= first_txg) {
+	if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
 		error = zil_read_log_data(zilog, lr, NULL);
 		if (error != 0)
 			return (error);
@@ -655,7 +655,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
 		 * just in case lets be safe and just stop here now instead of
 		 * corrupting the pool.
 		 */
-		if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
+		if (BP_GET_BIRTH(bp) >= first_txg)
 			return (SET_ERROR(ENOENT));
 
 		/*
@@ -710,8 +710,8 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
-	if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
-	    !BP_IS_HOLE(bp)) {
+	if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
+	    zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 	}
 
@@ -1965,7 +1965,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 		    &slog);
 	}
 	if (error == 0) {
-		ASSERT3U(bp->blk_birth, ==, txg);
+		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
 		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
 		    ZIO_CHECKSUM_ZILOG);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 213fe5c483f2..e96bbda35a04 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -613,7 +613,7 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 		zio->io_error = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			spa_log_error(spa, &zio->io_bookmark,
-			    &zio->io_bp->blk_birth);
+			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    spa, NULL, &zio->io_bookmark, zio, 0);
 		}
@@ -1052,8 +1052,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
 	    (long long)bp->blk_prop,
 	    (long long)bp->blk_pad[0],
 	    (long long)bp->blk_pad[1],
-	    (long long)bp->blk_phys_birth,
-	    (long long)bp->blk_birth,
+	    (long long)BP_GET_PHYSICAL_BIRTH(bp),
+	    (long long)BP_GET_LOGICAL_BIRTH(bp),
 	    (long long)bp->blk_fill,
 	    (long long)bp->blk_cksum.zc_word[0],
 	    (long long)bp->blk_cksum.zc_word[1],
@@ -1156,10 +1156,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
 	/*
 	 * Pool-specific checks.
 	 *
-	 * Note: it would be nice to verify that the blk_birth and
-	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
-	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
-	 * that are in the log) to be arbitrarily large.
+	 * Note: it would be nice to verify that the logical birth
+	 * and physical birth are not too large.  However,
+	 * spa_freeze() allows the birth time of log blocks (and
+	 * dmu_sync()-ed blocks that are in the log) to be arbitrarily
+	 * large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
@@ -1246,7 +1247,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 {
 	zio_t *zio;
 
-	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+	zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
@@ -1435,7 +1436,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	 * starts allocating blocks -- so that nothing is allocated twice.
 	 * If txg == 0 we just verify that the block is claimable.
 	 */
-	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+	ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
 	    spa_min_claim_txg(spa));
 	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(8) */
@@ -1731,7 +1732,7 @@ zio_write_bp_init(zio_t *zio)
 		blkptr_t *bp = zio->io_bp;
 		zio_prop_t *zp = &zio->io_prop;
 
-		ASSERT(bp->blk_birth != zio->io_txg);
+		ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
 
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -1819,7 +1820,7 @@ zio_write_compress(zio_t *zio)
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
-	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
@@ -1866,7 +1867,7 @@ zio_write_compress(zio_t *zio)
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			zio_buf_free(cbuf, lsize);
-			bp->blk_birth = zio->io_txg;
+			BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
@@ -1947,7 +1948,7 @@ zio_write_compress(zio_t *zio)
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
-	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		VERIFY3U(psize, !=, 0);
@@ -1961,7 +1962,7 @@ zio_write_compress(zio_t *zio)
 	}
 
 	if (psize == 0) {
-		if (zio->io_bp_orig.blk_birth != 0 &&
+		if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
@@ -3539,7 +3540,7 @@ zio_ddt_write(zio_t *zio)
 		else
 			ddt_phys_addref(ddp);
 	} else if (zio->io_bp_override) {
-		ASSERT(bp->blk_birth == txg);
+		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
 		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 		ddt_phys_fill(ddp, bp);
 		ddt_phys_addref(ddp);
@@ -3810,11 +3811,13 @@ zio_dva_claim(zio_t *zio)
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
-	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+	ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
-	if (!BP_IS_HOLE(bp))
-		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+	if (!BP_IS_HOLE(bp)) {
+		metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
+		    B_TRUE);
+	}
 
 	if (gn != NULL) {
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -4555,8 +4558,8 @@ zio_ready(zio_t *zio)
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
-		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
-		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
+		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
+		    BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
@@ -4852,7 +4855,7 @@ zio_done(zio_t *zio)
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark,
-			    &zio->io_bp->blk_birth);
+			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
 		}
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index e511b31fee6d..ce6772a40c8b 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -272,7 +272,7 @@ static void
 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+	uint64_t txg = BP_GET_BIRTH(bp);
 
 	ASSERT(BP_IS_GANG(bp));
 

From f68bde7236699353b89de176fd35f7fa92bfc30b Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 25 Mar 2024 18:02:38 -0400
Subject: [PATCH 014/116] BRT: Make BRT block sizes configurable

Similar to DDT, make BRT data and indirect block sizes configurable
via module parameters.  I am not sure yet what values would be best,
but, as with DDT, 4KB blocks kill all chances of compression on
vdevs with ashift=12 or more, which in my tests reaches 3x.

While here, fix documentation for respective DDT parameters.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15967
---
 man/man4/zfs.4   | 17 +++++++++++++++--
 module/zfs/brt.c | 22 +++++++++++-----------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 30c168253f96..759a68784aca 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -244,12 +244,25 @@ For blocks that could be forced to be a gang block (due to
 .Sy metaslab_force_ganging ) ,
 force this many of them to be gang blocks.
 .
-.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
+.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
+Controls prefetching BRT records for blocks which are going to be cloned.
+.
+.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
+Default BRT ZAP data block size as a power of 2. Note that changing this after
+creating a BRT on the pool will not affect existing BRTs, only newly created
+ones.
+.
+.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
+Default BRT ZAP indirect block size as a power of 2. Note that changing this
+after creating a BRT on the pool will not affect existing BRTs, only newly
+created ones.
+.
+.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP data block size as a power of 2. Note that changing this after
 creating a DDT on the pool will not affect existing DDTs, only newly created
 ones.
 .
-.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
+.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP indirect block size as a power of 2. Note that changing this
 after creating a DDT on the pool will not affect existing DDTs, only newly
 created ones.
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 0b5a09df3724..5d1f4728b645 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache;
 /*
  * Enable/disable prefetching of BRT entries that we are going to modify.
  */
-int zfs_brt_prefetch = 1;
+static int brt_zap_prefetch = 1;
 
 #ifdef ZFS_DEBUG
 #define	BRT_DEBUG(...)	do {						\
@@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1;
 #define	BRT_DEBUG(...)	do { } while (0)
 #endif
 
-int brt_zap_leaf_blockshift = 12;
-int brt_zap_indirect_blockshift = 12;
+static int brt_zap_default_bs = 12;
+static int brt_zap_default_ibs = 12;
 
 static kstat_t	*brt_ksp;
 
@@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
 
 	brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
-	    brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
-	    0, tx);
+	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
 	VERIFY(brtvd->bv_mos_entries != 0);
 	BRT_DEBUG("MOS entries created, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_entries);
@@ -1363,7 +1362,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp)
 
 	ASSERT(bp != NULL);
 
-	if (!zfs_brt_prefetch)
+	if (!brt_zap_prefetch)
 		return;
 
 	brt_entry_fill(bp, &bre, &vdevid);
@@ -1679,9 +1678,10 @@ brt_unload(spa_t *spa)
 }
 
 /* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
-    "Enable prefetching of BRT entries");
-#ifdef ZFS_BRT_DEBUG
-ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
-#endif
+ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
+	"Enable prefetching of BRT ZAP entries");
+ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
+	"BRT ZAP leaf blockshift");
+ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
+	"BRT ZAP indirect blockshift");
 /* END CSTYLED */

From df04efe321a49c650f1fbaa6fd701fa2928cbe21 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 13 Nov 2023 17:55:29 +1100
Subject: [PATCH 015/116] linux 5.4 compat: page_size()

Before 5.4 we have to do a little math.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 config/kernel-mm-page-size.m4             | 17 +++++++++++
 config/kernel.m4                          |  2 ++
 include/os/linux/Makefile.am              |  1 +
 include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++
 4 files changed, 56 insertions(+)
 create mode 100644 config/kernel-mm-page-size.m4
 create mode 100644 include/os/linux/kernel/linux/mm_compat.h

diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4
new file mode 100644
index 000000000000..d5ebd926986a
--- /dev/null
+++ b/config/kernel-mm-page-size.m4
@@ -0,0 +1,17 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
+	ZFS_LINUX_TEST_SRC([page_size], [
+		#include <linux/mm.h>
+	],[
+		unsigned long s;
+		s = page_size(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
+	AC_MSG_CHECKING([whether page_size() is available])
+	ZFS_LINUX_TEST_RESULT([page_size], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 1d0c5a27fc7f..548905ccd04d 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
+	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
+	ZFS_AC_KERNEL_MM_PAGE_SIZE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
index 3830d198dfff..51c27132b4ef 100644
--- a/include/os/linux/Makefile.am
+++ b/include/os/linux/Makefile.am
@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
 	%D%/kernel/linux/compiler_compat.h \
 	%D%/kernel/linux/dcache_compat.h \
 	%D%/kernel/linux/kmap_compat.h \
+	%D%/kernel/linux/mm_compat.h \
 	%D%/kernel/linux/mod_compat.h \
 	%D%/kernel/linux/page_compat.h \
 	%D%/kernel/linux/percpu_compat.h \
diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h
new file mode 100644
index 000000000000..40056c68d6dd
--- /dev/null
+++ b/include/os/linux/kernel/linux/mm_compat.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#ifndef _ZFS_MM_COMPAT_H
+#define	_ZFS_MM_COMPAT_H
+
+#include <linux/mm.h>
+
+/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
+#ifndef HAVE_MM_PAGE_SIZE
+#define	page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
+#endif
+
+#endif /* _ZFS_MM_COMPAT_H */

From 390b448726c580999dd337be7a40b0e95cf1d50b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Mon, 11 Dec 2023 16:05:54 +1100
Subject: [PATCH 016/116] abd: add page iterator

The regular ABD iterators yield data buffers, so they have to map and
unmap pages into kernel memory. If the caller only wants to count
chunks, or can use page pointers directly, then the map/unmap is just
unnecessary overhead.

This adds abd_iterate_page_func, which yields unmapped struct page
instead.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 include/sys/abd.h              |   7 +++
 include/sys/abd_impl.h         |  26 ++++++++-
 module/os/freebsd/zfs/abd_os.c |   4 +-
 module/os/linux/zfs/abd_os.c   | 104 ++++++++++++++++++++++++++++++---
 module/zfs/abd.c               |  42 +++++++++++++
 5 files changed, 169 insertions(+), 14 deletions(-)

diff --git a/include/sys/abd.h b/include/sys/abd.h
index b48dc36423f7..3a500e2c9ae7 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -79,6 +79,9 @@ typedef struct abd {
 
 typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
 typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
+#if defined(__linux__) && defined(_KERNEL)
+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
+#endif
 
 extern int zfs_abd_scatter_enabled;
 
@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
 int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
 int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
     abd_iter_func2_t *, void *);
+#if defined(__linux__) && defined(_KERNEL)
+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
+    void *);
+#endif
 void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
 void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
 void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
index 40546d4af137..f88ea25e245d 100644
--- a/include/sys/abd_impl.h
+++ b/include/sys/abd_impl.h
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 #ifndef _ABD_IMPL_H
@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
 	ABDSTAT_DECR  /* Decrease abdstat values */
 } abd_stats_op_t;
 
-struct scatterlist; /* forward declaration */
+/* forward declarations */
+struct scatterlist;
+struct page;
 
 struct abd_iter {
 	/* public interface */
-	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
-	size_t		iter_mapsize;	/* length of data valid at mapaddr */
+	union {
+		/* for abd_iter_map()/abd_iter_unmap() */
+		struct {
+			/* addr corresponding to iter_pos */
+			void		*iter_mapaddr;
+			/* length of data valid at mapaddr */
+			size_t		iter_mapsize;
+		};
+		/* for abd_iter_page() */
+		struct {
+			/* current page */
+			struct page	*iter_page;
+			/* offset of data in page */
+			size_t		iter_page_doff;
+			/* size of data in page */
+			size_t		iter_page_dsize;
+		};
+	};
 
 	/* private */
 	abd_t		*iter_abd;	/* ABD being iterated through */
@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
 void abd_iter_advance(struct abd_iter *, size_t);
 void abd_iter_map(struct abd_iter *);
 void abd_iter_unmap(struct abd_iter *);
+void abd_iter_page(struct abd_iter *);
 
 /*
  * Helper macros
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index 58a37df62b69..3b812271f98b 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 {
 	ASSERT(!abd_is_gang(abd));
 	abd_verify(abd);
+	memset(aiter, 0, sizeof (struct abd_iter));
 	aiter->iter_abd = abd;
-	aiter->iter_pos = 0;
-	aiter->iter_mapaddr = NULL;
-	aiter->iter_mapsize = 0;
 }
 
 /*
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 24390fbbf125..dae1280121da 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 /*
@@ -59,6 +60,7 @@
 #include <sys/zfs_znode.h>
 #ifdef _KERNEL
 #include <linux/kmap_compat.h>
+#include <linux/mm_compat.h>
 #include <linux/scatterlist.h>
 #endif
 
@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 {
 	ASSERT(!abd_is_gang(abd));
 	abd_verify(abd);
+	memset(aiter, 0, sizeof (struct abd_iter));
 	aiter->iter_abd = abd;
-	aiter->iter_mapaddr = NULL;
-	aiter->iter_mapsize = 0;
-	aiter->iter_pos = 0;
-	if (abd_is_linear(abd)) {
-		aiter->iter_offset = 0;
-		aiter->iter_sg = NULL;
-	} else {
+	if (!abd_is_linear(abd)) {
 		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
 		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
 	}
@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 boolean_t
 abd_iter_at_end(struct abd_iter *aiter)
 {
+	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
 	return (aiter->iter_pos == aiter->iter_abd->abd_size);
 }
 
@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
 void
 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 {
+	/*
+	 * Ensure that the last chunk is not in use. abd_iterate_*() must clear
+	 * this state (directly or via abd_iter_unmap()) before advancing.
+	 */
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
+	ASSERT3P(aiter->iter_page, ==, NULL);
+	ASSERT0(aiter->iter_page_doff);
+	ASSERT0(aiter->iter_page_dsize);
 
 	/* There's nothing left to advance to, so do nothing */
 	if (abd_iter_at_end(aiter))
@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
 }
 
 #if defined(_KERNEL)
+/*
+ * Yield the next page struct and data offset and size within it, without
+ * mapping it into the address space.
+ */
+void
+abd_iter_page(struct abd_iter *aiter)
+{
+	if (abd_iter_at_end(aiter)) {
+		aiter->iter_page = NULL;
+		aiter->iter_page_doff = 0;
+		aiter->iter_page_dsize = 0;
+		return;
+	}
+
+	struct page *page;
+	size_t doff, dsize;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+
+		/* memory address at iter_pos */
+		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
+
+		/* struct page for address */
+		page = is_vmalloc_addr(paddr) ?
+		    vmalloc_to_page(paddr) : virt_to_page(paddr);
+
+		/* offset of address within the page */
+		doff = offset_in_page(paddr);
+
+		/* total data remaining in abd from this position */
+		dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
+	} else {
+		ASSERT(!abd_is_gang(aiter->iter_abd));
+
+		/* current scatter page */
+		page = sg_page(aiter->iter_sg);
+
+		/* position within page */
+		doff = aiter->iter_offset;
+
+		/* remaining data in scatterlist */
+		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+	}
+	ASSERT(page);
+
+	if (PageTail(page)) {
+		/*
+		 * This page is part of a "compound page", which is a group of
+		 * pages that can be referenced from a single struct page *.
+		 * It's organised as a "head" page, followed by a series of
+		 * "tail" pages.
+		 *
+		 * In OpenZFS, compound pages are allocated using the
+		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
+		 * vmalloc slabs (ie >16K allocations). So a great many of the
+		 * IO buffers we get are going to be of this type.
+		 *
+		 * The tail pages are just regular PAGE_SIZE pages, and can be
+		 * safely used as-is. However, the head page has length
+		 * covering itself and all the tail pages. If this ABD chunk
+		 * spans multiple pages, then we can use the head page and a
+		 * >PAGE_SIZE length, which is far more efficient.
+		 *
+		 * To do this, we need to adjust the offset to be counted from
+		 * the head page. struct page for compound pages are stored
+		 * contiguously, so we can just adjust by a simple offset.
+		 */
+		struct page *head = compound_head(page);
+		doff += ((page - head) * PAGESIZE);
+		page = head;
+	}
+
+	/* final page and position within it */
+	aiter->iter_page = page;
+	aiter->iter_page_doff = doff;
+
+	/* amount of data in the chunk, up to the end of the page */
+	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
+}
+
 /*
  * bio_nr_pages for ABD.
  * @off is the offset in @abd
@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
 module_param(zfs_abd_scatter_max_order, uint, 0644);
 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
 	"Maximum order allocation used for a scatter ABD.");
-#endif
+
+#endif /* _KERNEL */
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 0a2411a2d572..2c0cda25dbc6 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
 	return (ret);
 }
 
+#if defined(__linux__) && defined(_KERNEL)
+int
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
+    abd_iter_page_func_t *func, void *private)
+{
+	struct abd_iter aiter;
+	int ret = 0;
+
+	if (size == 0)
+		return (0);
+
+	abd_verify(abd);
+	ASSERT3U(off + size, <=, abd->abd_size);
+
+	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
+
+	while (size > 0) {
+		IMPLY(abd_is_gang(abd), c_abd != NULL);
+
+		abd_iter_page(&aiter);
+
+		size_t len = MIN(aiter.iter_page_dsize, size);
+		ASSERT3U(len, >, 0);
+
+		ret = func(aiter.iter_page, aiter.iter_page_doff,
+		    len, private);
+
+		aiter.iter_page = NULL;
+		aiter.iter_page_doff = 0;
+		aiter.iter_page_dsize = 0;
+
+		if (ret != 0)
+			break;
+
+		size -= len;
+		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
+	}
+
+	return (ret);
+}
+#endif
+
 struct buf_arg {
 	void *arg_buf;
 };

From f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 12:12:56 +1100
Subject: [PATCH 017/116] vdev_disk: rename existing functions to
 vdev_classic_*

This is just renaming the existing functions we're about to replace and
grouping them together to make the next commits easier to follow.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 include/sys/abd.h               |   2 +
 module/os/linux/zfs/abd_os.c    |   5 +
 module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++---------------
 3 files changed, 120 insertions(+), 102 deletions(-)

diff --git a/include/sys/abd.h b/include/sys/abd.h
index 3a500e2c9ae7..19fe96292d5f 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -220,6 +220,8 @@ void abd_fini(void);
 
 /*
  * Linux ABD bio functions
+ * Note: these are only needed to support vdev_classic. See comment in
+ * vdev_disk.c.
  */
 #if defined(__linux__) && defined(_KERNEL)
 unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index dae1280121da..3fe01c0b7d77 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter)
 	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
 }
 
+/*
+ * Note: ABD BIO functions are only needed to support vdev_classic. See
+ * comments in vdev_disk.c.
+ */
+
 /*
  * bio_nr_pages for ABD.
  * @off is the offset in @abd
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index b0bda5fa2012..957619b87afd 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
  */
 #define	EFI_MIN_RESV_SIZE	(16 * 1024)
 
-/*
- * Virtual device vector for disks.
- */
-typedef struct dio_request {
-	zio_t			*dr_zio;	/* Parent ZIO */
-	atomic_t		dr_ref;		/* References */
-	int			dr_error;	/* Bio error */
-	int			dr_bio_count;	/* Count of bio's */
-	struct bio		*dr_bio[];	/* Attached bio's */
-} dio_request_t;
-
 /*
  * BIO request failfast mask.
  */
@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v)
 	v->vdev_tsd = NULL;
 }
 
-static dio_request_t *
-vdev_disk_dio_alloc(int bio_count)
-{
-	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
-	    sizeof (struct bio *) * bio_count, KM_SLEEP);
-	atomic_set(&dr->dr_ref, 0);
-	dr->dr_bio_count = bio_count;
-	dr->dr_error = 0;
-
-	for (int i = 0; i < dr->dr_bio_count; i++)
-		dr->dr_bio[i] = NULL;
-
-	return (dr);
-}
-
-static void
-vdev_disk_dio_free(dio_request_t *dr)
-{
-	int i;
-
-	for (i = 0; i < dr->dr_bio_count; i++)
-		if (dr->dr_bio[i])
-			bio_put(dr->dr_bio[i]);
-
-	kmem_free(dr, sizeof (dio_request_t) +
-	    sizeof (struct bio *) * dr->dr_bio_count);
-}
-
-static void
-vdev_disk_dio_get(dio_request_t *dr)
-{
-	atomic_inc(&dr->dr_ref);
-}
-
-static void
-vdev_disk_dio_put(dio_request_t *dr)
-{
-	int rc = atomic_dec_return(&dr->dr_ref);
-
-	/*
-	 * Free the dio_request when the last reference is dropped and
-	 * ensure zio_interpret is called only once with the correct zio
-	 */
-	if (rc == 0) {
-		zio_t *zio = dr->dr_zio;
-		int error = dr->dr_error;
-
-		vdev_disk_dio_free(dr);
-
-		if (zio) {
-			zio->io_error = error;
-			ASSERT3S(zio->io_error, >=, 0);
-			if (zio->io_error)
-				vdev_disk_error(zio);
-
-			zio_delay_interrupt(zio);
-		}
-	}
-}
-
-BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
-{
-	dio_request_t *dr = bio->bi_private;
-
-	if (dr->dr_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
-		dr->dr_error = BIO_END_IO_ERROR(bio);
-#else
-		if (error)
-			dr->dr_error = -(error);
-		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-			dr->dr_error = EIO;
-#endif
-	}
-
-	/* Drop reference acquired by __vdev_disk_physio */
-	vdev_disk_dio_put(dr);
-}
-
 static inline void
 vdev_submit_bio_impl(struct bio *bio)
 {
@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
 	return (bio);
 }
 
+/* ========== */
+
+/*
+ * This is the classic, battle-tested BIO submission code.
+ *
+ * These functions have been renamed to vdev_classic_* to make it clear what
+ * they belong to, but their implementations are unchanged.
+ */
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+	zio_t			*dr_zio;	/* Parent ZIO */
+	atomic_t		dr_ref;		/* References */
+	int			dr_error;	/* Bio error */
+	int			dr_bio_count;	/* Count of bio's */
+	struct bio		*dr_bio[];	/* Attached bio's */
+} dio_request_t;
+
+static dio_request_t *
+vdev_classic_dio_alloc(int bio_count)
+{
+	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
+	    sizeof (struct bio *) * bio_count, KM_SLEEP);
+	atomic_set(&dr->dr_ref, 0);
+	dr->dr_bio_count = bio_count;
+	dr->dr_error = 0;
+
+	for (int i = 0; i < dr->dr_bio_count; i++)
+		dr->dr_bio[i] = NULL;
+
+	return (dr);
+}
+
+static void
+vdev_classic_dio_free(dio_request_t *dr)
+{
+	int i;
+
+	for (i = 0; i < dr->dr_bio_count; i++)
+		if (dr->dr_bio[i])
+			bio_put(dr->dr_bio[i]);
+
+	kmem_free(dr, sizeof (dio_request_t) +
+	    sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_classic_dio_get(dio_request_t *dr)
+{
+	atomic_inc(&dr->dr_ref);
+}
+
+static void
+vdev_classic_dio_put(dio_request_t *dr)
+{
+	int rc = atomic_dec_return(&dr->dr_ref);
+
+	/*
+	 * Free the dio_request when the last reference is dropped and
+	 * ensure zio_delay_interrupt is called only once with the correct zio
+	 */
+	if (rc == 0) {
+		zio_t *zio = dr->dr_zio;
+		int error = dr->dr_error;
+
+		vdev_classic_dio_free(dr);
+
+		if (zio) {
+			zio->io_error = error;
+			ASSERT3S(zio->io_error, >=, 0);
+			if (zio->io_error)
+				vdev_disk_error(zio);
+
+			zio_delay_interrupt(zio);
+		}
+	}
+}
+
+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
+{
+	dio_request_t *dr = bio->bi_private;
+
+	if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+		dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+		if (error)
+			dr->dr_error = -(error);
+		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			dr->dr_error = EIO;
+#endif
+	}
+
+	/* Drop reference acquired by vdev_classic_physio */
+	vdev_classic_dio_put(dr);
+}
+
 static inline unsigned int
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 {
 	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
 	    bio_size, abd_offset);
@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 }
 
 static int
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+vdev_classic_physio(struct block_device *bdev, zio_t *zio,
     size_t io_size, uint64_t io_offset, int rw, int flags)
 {
 	dio_request_t *dr;
@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 	}
 
 retry:
-	dr = vdev_disk_dio_alloc(bio_count);
+	dr = vdev_classic_dio_alloc(bio_count);
 
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 	    zio->io_vd->vdev_failfast == B_TRUE) {
@@ -771,23 +780,23 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 		 * this should be rare - see the comment above.
 		 */
 		if (dr->dr_bio_count == i) {
-			vdev_disk_dio_free(dr);
+			vdev_classic_dio_free(dr);
 			bio_count *= 2;
 			goto retry;
 		}
 
-		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
+		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
 		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
 		if (unlikely(dr->dr_bio[i] == NULL)) {
-			vdev_disk_dio_free(dr);
+			vdev_classic_dio_free(dr);
 			return (SET_ERROR(ENOMEM));
 		}
 
-		/* Matching put called by vdev_disk_physio_completion */
-		vdev_disk_dio_get(dr);
+		/* Matching put called by vdev_classic_physio_completion */
+		vdev_classic_dio_get(dr);
 
 		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
-		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
 		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
 
@@ -801,7 +810,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 	}
 
 	/* Extra reference to protect dio_request during vdev_submit_bio */
-	vdev_disk_dio_get(dr);
+	vdev_classic_dio_get(dr);
 
 	if (dr->dr_bio_count > 1)
 		blk_start_plug(&plug);
@@ -815,11 +824,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 	if (dr->dr_bio_count > 1)
 		blk_finish_plug(&plug);
 
-	vdev_disk_dio_put(dr);
+	vdev_classic_dio_put(dr);
 
 	return (error);
 }
 
+/* ========== */
+
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
 {
 	zio_t *zio = bio->bi_private;
@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio)
 	}
 
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
-	error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
+	error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
 	    zio->io_size, zio->io_offset, rw, 0);
 	rw_exit(&vd->vd_lock);
 

From 867178ae1db28e73051c8a7ce662f2f2f81cd8e6 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 12:23:30 +1100
Subject: [PATCH 018/116] vdev_disk: reorganise vdev_disk_io_start

Light reshuffle to make it a bit more linear to read and get rid of a
bunch of args that aren't needed in all cases.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 957619b87afd..51e7cef2fc78 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 }
 
 static int
-vdev_classic_physio(struct block_device *bdev, zio_t *zio,
-    size_t io_size, uint64_t io_offset, int rw, int flags)
+vdev_classic_physio(zio_t *zio)
 {
+	vdev_t *v = zio->io_vd;
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+	size_t io_size = zio->io_size;
+	uint64_t io_offset = zio->io_offset;
+	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
+	int flags = 0;
+
 	dio_request_t *dr;
 	uint64_t abd_offset;
 	uint64_t bio_offset;
@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
-	int rw, error;
+	int error;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio)
 		rw_exit(&vd->vd_lock);
 		zio_execute(zio);
 		return;
-	case ZIO_TYPE_WRITE:
-		rw = WRITE;
-		break;
-
-	case ZIO_TYPE_READ:
-		rw = READ;
-		break;
 
 	case ZIO_TYPE_TRIM:
 		zio->io_error = vdev_disk_io_trim(zio);
@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio)
 #endif
 		return;
 
-	default:
+	case ZIO_TYPE_READ:
+	case ZIO_TYPE_WRITE:
+		zio->io_target_timestamp = zio_handle_io_delay(zio);
+		error = vdev_classic_physio(zio);
 		rw_exit(&vd->vd_lock);
-		zio->io_error = SET_ERROR(ENOTSUP);
-		zio_interrupt(zio);
+		if (error) {
+			zio->io_error = error;
+			zio_interrupt(zio);
+		}
 		return;
-	}
 
-	zio->io_target_timestamp = zio_handle_io_delay(zio);
-	error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
-	    zio->io_size, zio->io_offset, rw, 0);
-	rw_exit(&vd->vd_lock);
+	default:
+		/*
+		 * Getting here means our parent vdev has made a very strange
+		 * request of us, and shouldn't happen. Assert here to force a
+		 * crash in dev builds, but in production return the IO
+		 * unhandled. The pool will likely suspend anyway but that's
+		 * nicer than crashing the kernel.
+		 */
+		ASSERT3S(zio->io_type, ==, -1);
 
-	if (error) {
-		zio->io_error = error;
+		rw_exit(&vd->vd_lock);
+		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_interrupt(zio);
 		return;
 	}
+
+	__builtin_unreachable();
 }
 
 static void

From c4a13ba483f08a81aa47479d2f763a470d95b2b0 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 12:29:19 +1100
Subject: [PATCH 019/116] vdev_disk: make read/write IO function configurable

This is just setting up for the next couple of commits, which will add a
new IO function and a parameter to select it.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 51e7cef2fc78..de4dba72fa3c 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio)
 #endif
 }
 
+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
+
 static void
 vdev_disk_io_start(zio_t *zio)
 {
@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio)
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
 		zio->io_target_timestamp = zio_handle_io_delay(zio);
-		error = vdev_classic_physio(zio);
+		error = vdev_disk_io_rw_fn(zio);
 		rw_exit(&vd->vd_lock);
 		if (error) {
 			zio->io_error = error;
@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd)
 	/* XXX: Implement me as a vnode rele for the device */
 }
 
+/*
+ * At first use vdev use, set the submission function from the default value if
+ * it hasn't been set already.
+ */
+static int
+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+	(void) spa;
+	(void) nv;
+	(void) tsd;
+
+	if (vdev_disk_io_rw_fn == NULL)
+		vdev_disk_io_rw_fn = vdev_classic_physio;
+
+	return (0);
+}
+
 vdev_ops_t vdev_disk_ops = {
-	.vdev_op_init = NULL,
+	.vdev_op_init = vdev_disk_init,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_disk_open,
 	.vdev_op_close = vdev_disk_close,

From 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 18 Jul 2023 11:11:29 +1000
Subject: [PATCH 020/116] vdev_disk: rewrite BIO filling machinery to avoid
 split pages

This commit tackles a number of issues in the way BIOs (`struct bio`)
are constructed for submission to the Linux block layer.

The kernel has a hard upper limit on the number of pages/segments that
can be added to a BIO, as well as a separate limit for each device
(related to its queue depth and other scheduling characteristics).

ZFS counts the number of memory pages in the request ABD
(`abd_nr_pages_off()`), and then uses that as the number of segments to
put into the BIO, up to the hard upper limit. If it requires more than
the limit, it will create multiple BIOs.

Leaving aside the fact that the page count method is wrong (see below),
not limiting to the device segment max means that the device driver will
need to split the BIO in half. This alone is not necessarily a problem,
but it interacts with another issue to cause a much larger problem.

The kernel function to add a segment to a BIO (`bio_add_page()`) takes a
`struct page` pointer, and offset+len within it. `struct page` can
represent a run of contiguous memory pages (known as a "compound page").
It can be of arbitrary length.

The ZFS functions that count ABD pages and load them into the BIO
(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
page` is for multiple pages. In this case, it will load the same `struct
page` into the BIO multiple times, with the offset adjusted each time.

With a sufficiently large ABD, this can easily lead to the BIO being
entirely filled much earlier than it could have been. This also
further contributes to the problem caused by the incorrect segment limit
calculation, as it's much easier to go past the device limit, and so
require a split.

Again, this is not a problem on its own.

The logic for "never submit more than `PAGE_SIZE`" is actually a little
more subtle. It will actually never submit a buffer that crosses a 4K
page boundary.

In practice, this is fine, as most ABDs are scattered, that is a list of
complete 4K pages, and so are loaded in as such.

Linear ABDs are typically allocated from slabs, and for small sizes they
are frequently not aligned to page boundaries. For example, a 12K
allocation can span four pages, eg:

     -- 4K -- -- 4K -- -- 4K -- -- 4K --
    |        |        |        |        |
          :## ######## ######## ######:    [1K, 4K, 4K, 3K]

Such an allocation would be loaded into a BIO as you see:

    [1K, 4K, 4K, 3K]

This tends not to be a problem in practice, because even if the BIO were
filled and needed to be split, each half would still have either a start
or end aligned to the logical block size of the device (assuming 4K at
least).

---

In ideal circumstances, these shortcomings don't cause any particular
problems. It's when they start to interact with other ZFS features that
things get interesting.

Aggregation will create a "gang" ABD, which is simply a list of other
ABDs. Iterating over a gang ABD is just iterating over each ABD within
it in turn.

Because the segments are simply loaded in order, we can end up with
uneven segments either side of the "gap" between the two ABDs. For
example, two 12K ABDs might be aggregated and then loaded as:

    [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]

Should a split occur, each individual BIO can end up either having a
start or end offset that is not aligned to the logical block size, which
some drivers (eg SCSI) will reject. However, this tends not to happen
because the default aggregation limit usually keeps the BIO small enough
to not require more than one split, and most pages are actually full 4K
pages, so hitting an uneven gap is very rare anyway.

If the pool is under particular memory pressure, then an IO can be
broken down into a "gang block", a 512-byte block composed of a header
and up to three block pointers. Each points to a fragment of the
original write, or in turn, another gang block, breaking the original
data up over and over until space can be found in the pool for each of
them.

Each gang header is a separate 512-byte memory allocation from a slab,
that needs to be written down to disk. When the gang header is added to
the BIO, it's a single 512-byte segment.

Pulling all this together, consider a large aggregated write of gang
blocks. This results in a BIO containing lots of 512-byte segments.
Given our tendency to overfill the BIO, a split is likely, and most
possible split points will yield a pair of BIOs that are misaligned.
Drivers that care, like the SCSI driver, will reject them.

---

This commit is a substantial refactor and rewrite of much of `vdev_disk`
to sort all this out.

`vdev_bio_max_segs()` now returns the ideal maximum size for the device,
if available. There's also a tuneable `zfs_vdev_disk_max_segs` to
override this, to assist with testing.

We scan the ABD up front to count the number of pages within it, and to
confirm that if we submitted all those pages to one or more BIOs, it
could be split at any point without creating a misaligned BIO. If the
pages in the BIO are not usable (as in any of the above situations), the
ABD is linearised, and then checked again. This is the same technique
used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size
and allocator quirks.

`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
idea is simply that it can hold all the state needed to create, submit
and return multiple BIOs, including all the refcounts, the ABD copy if
it was needed, and so on. Apart from what I hope is a clearer interface,
the major difference is that because we know how many BIOs we'll need up
front, we don't need the old overflow logic that would grow the BIO
array, throw away all the old work and restart. We can get it right from
the start.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 include/os/linux/kernel/linux/mod_compat.h |   1 +
 man/man4/zfs.4                             |  10 +-
 module/os/linux/zfs/vdev_disk.c            | 439 ++++++++++++++++++++-
 3 files changed, 447 insertions(+), 3 deletions(-)

diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
index 8e20a9613539..039865b703ef 100644
--- a/include/os/linux/kernel/linux/mod_compat.h
+++ b/include/os/linux/kernel/linux/mod_compat.h
@@ -68,6 +68,7 @@ enum scope_prefix_types {
 	zfs_trim,
 	zfs_txg,
 	zfs_vdev,
+	zfs_vdev_disk,
 	zfs_vdev_file,
 	zfs_vdev_mirror,
 	zfs_vnops,
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 759a68784aca..61f1df9c81d5 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2,6 +2,7 @@
 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
+.\" Copyright (c) 2023, 2024 Klara, Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
@@ -15,7 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd July 21, 2023
+.Dd January 9, 2024
 .Dt ZFS 4
 .Os
 .
@@ -1375,6 +1376,13 @@ _
 	4	Driver	No driver retries on driver errors.
 .TE
 .
+.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
+Maximum number of segments to add to a BIO (min 4).
+If this is higher than the maximum allowed by the device queue or the kernel
+itself, it will be clamped.
+Setting it to zero will cause the kernel's ideal size to be used.
+This parameter only applies on Linux.
+.
 .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
 Time before expiring
 .Pa .zfs/snapshot .
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index de4dba72fa3c..0ccb9ad96fa5 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -24,6 +24,7 @@
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
@@ -66,6 +67,13 @@ typedef struct vdev_disk {
 	krwlock_t			vd_lock;
 } vdev_disk_t;
 
+/*
+ * Maximum number of segments to add to a bio (min 4). If this is higher than
+ * the maximum allowed by the device queue or the kernel itself, it will be
+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
+ */
+uint_t zfs_vdev_disk_max_segs = 0;
+
 /*
  * Unique identifier for the exclusive vdev holder.
  */
@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
 	return (bio);
 }
 
+static inline uint_t
+vdev_bio_max_segs(struct block_device *bdev)
+{
+	/*
+	 * Smallest of the device max segs and the tuneable max segs. Minimum
+	 * 4, so there's room to finish split pages if they come up.
+	 */
+	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
+	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
+	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
+	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
+
+#ifdef HAVE_BIO_MAX_SEGS
+	return (bio_max_segs(max_segs));
+#else
+	return (MIN(max_segs, BIO_MAX_PAGES));
+#endif
+}
+
+static inline uint_t
+vdev_bio_max_bytes(struct block_device *bdev)
+{
+	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
+}
+
+
+/*
+ * Virtual block IO object (VBIO)
+ *
+ * Linux block IO (BIO) objects have a limit on how many data segments (pages)
+ * they can hold. Depending on how they're allocated and structured, a large
+ * ZIO can require more than one BIO to be submitted to the kernel, which then
+ * all have to complete before we can return the completed ZIO back to ZFS.
+ *
+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
+ * translate a ZIO down into the kernel block layer and back again.
+ *
+ * Note that these are only used for data ZIOs (read/write). Meta-operations
+ * (flush/trim) don't need multiple BIOs and so can just make the call
+ * directly.
+ */
+typedef struct {
+	zio_t		*vbio_zio;	/* parent zio */
+
+	struct block_device *vbio_bdev;	/* blockdev to submit bios to */
+
+	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
+
+	atomic_t	vbio_ref;	/* bio refcount */
+	int		vbio_error;	/* error from failed bio */
+
+	uint_t		vbio_max_segs;	/* max segs per bio */
+
+	uint_t		vbio_max_bytes;	/* max bytes per bio */
+	uint_t		vbio_lbs_mask;	/* logical block size mask */
+
+	uint64_t	vbio_offset;	/* start offset of next bio */
+
+	struct bio	*vbio_bio;	/* pointer to the current bio */
+	struct bio	*vbio_bios;	/* list of all bios */
+} vbio_t;
+
+static vbio_t *
+vbio_alloc(zio_t *zio, struct block_device *bdev)
+{
+	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
+
+	vbio->vbio_zio = zio;
+	vbio->vbio_bdev = bdev;
+	atomic_set(&vbio->vbio_ref, 0);
+	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
+	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
+	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
+	vbio->vbio_offset = zio->io_offset;
+
+	return (vbio);
+}
+
+static int
+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
+{
+	struct bio *bio;
+	uint_t ssize;
+
+	while (size > 0) {
+		bio = vbio->vbio_bio;
+		if (bio == NULL) {
+			/* New BIO, allocate and set up */
+			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
+			    vbio->vbio_max_segs);
+			if (unlikely(bio == NULL))
+				return (SET_ERROR(ENOMEM));
+			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
+
+			bio->bi_next = vbio->vbio_bios;
+			vbio->vbio_bios = vbio->vbio_bio = bio;
+		}
+
+		/*
+		 * Only load as much of the current page data as will fit in
+		 * the space left in the BIO, respecting lbs alignment. Older
+		 * kernels will error if we try to overfill the BIO, while
+		 * newer ones will accept it and split the BIO. This ensures
+		 * everything works on older kernels, and avoids an additional
+		 * overhead on the new.
+		 */
+		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
+		    vbio->vbio_lbs_mask);
+		if (ssize > 0 &&
+		    bio_add_page(bio, page, ssize, offset) == ssize) {
+			/* Accepted, adjust and load any remaining. */
+			size -= ssize;
+			offset += ssize;
+			continue;
+		}
+
+		/* No room, set up for a new BIO and loop */
+		vbio->vbio_offset += BIO_BI_SIZE(bio);
+
+		/* Signal new BIO allocation wanted */
+		vbio->vbio_bio = NULL;
+	}
+
+	return (0);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
+static void vbio_put(vbio_t *vbio);
+
+static void
+vbio_submit(vbio_t *vbio, int flags)
+{
+	ASSERT(vbio->vbio_bios);
+	struct bio *bio = vbio->vbio_bios;
+	vbio->vbio_bio = vbio->vbio_bios = NULL;
+
+	/*
+	 * We take a reference for each BIO as we submit it, plus one to
+	 * protect us from BIOs completing before we're done submitting them
+	 * all, causing vbio_put() to free vbio out from under us and/or the
+	 * zio to be returned before all its IO has completed.
+	 */
+	atomic_set(&vbio->vbio_ref, 1);
+
+	/*
+	 * If we're submitting more than one BIO, inform the block layer so
+	 * it can batch them if it wants.
+	 */
+	struct blk_plug plug;
+	boolean_t do_plug = (bio->bi_next != NULL);
+	if (do_plug)
+		blk_start_plug(&plug);
+
+	/* Submit all the BIOs */
+	while (bio != NULL) {
+		atomic_inc(&vbio->vbio_ref);
+
+		struct bio *next = bio->bi_next;
+		bio->bi_next = NULL;
+
+		bio->bi_end_io = vdev_disk_io_rw_completion;
+		bio->bi_private = vbio;
+		bio_set_op_attrs(bio,
+		    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
+		    WRITE : READ, flags);
+
+		vdev_submit_bio(bio);
+
+		bio = next;
+	}
+
+	/* Finish the batch */
+	if (do_plug)
+		blk_finish_plug(&plug);
+
+	/* Release the extra reference */
+	vbio_put(vbio);
+}
+
+static void
+vbio_return_abd(vbio_t *vbio)
+{
+	zio_t *zio = vbio->vbio_zio;
+	if (vbio->vbio_abd == NULL)
+		return;
+
+	/*
+	 * If we copied the ABD before issuing it, clean up and return the copy
+	 * to the ADB, with changes if appropriate.
+	 */
+	void *buf = abd_to_buf(vbio->vbio_abd);
+	abd_free(vbio->vbio_abd);
+	vbio->vbio_abd = NULL;
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+	else
+		abd_return_buf(zio->io_abd, buf, zio->io_size);
+}
+
+static void
+vbio_free(vbio_t *vbio)
+{
+	VERIFY0(atomic_read(&vbio->vbio_ref));
+
+	vbio_return_abd(vbio);
+
+	kmem_free(vbio, sizeof (vbio_t));
+}
+
+static void
+vbio_put(vbio_t *vbio)
+{
+	if (atomic_dec_return(&vbio->vbio_ref) > 0)
+		return;
+
+	/*
+	 * This was the last reference, so the entire IO is completed. Clean
+	 * up and submit it for processing.
+	 */
+
+	/*
+	 * Get any data buf back to the original ABD, if necessary. We do this
+	 * now so we can get the ZIO into the pipeline as quickly as possible,
+	 * and then do the remaining cleanup after.
+	 */
+	vbio_return_abd(vbio);
+
+	zio_t *zio = vbio->vbio_zio;
+
+	/*
+	 * Set the overall error. If multiple BIOs returned an error, only the
+	 * first will be taken; the others are dropped (see
+	 * vdev_disk_io_rw_completion()). Its pretty much impossible for
+	 * multiple IOs to the same device to fail with different errors, so
+	 * there's no real risk.
+	 */
+	zio->io_error = vbio->vbio_error;
+	if (zio->io_error)
+		vdev_disk_error(zio);
+
+	/* All done, submit for processing */
+	zio_delay_interrupt(zio);
+
+	/* Finish cleanup */
+	vbio_free(vbio);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
+{
+	vbio_t *vbio = bio->bi_private;
+
+	if (vbio->vbio_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+		vbio->vbio_error = BIO_END_IO_ERROR(bio);
+#else
+		if (error)
+			vbio->vbio_error = -(error);
+		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			vbio->vbio_error = EIO;
+#endif
+	}
+
+	/*
+	 * Destroy the BIO. This is safe to do; the vbio owns its data and the
+	 * kernel won't touch it again after the completion function runs.
+	 */
+	bio_put(bio);
+
+	/* Drop this BIOs reference acquired by vbio_submit() */
+	vbio_put(vbio);
+}
+
+/*
+ * Iterator callback to count ABD pages and check their size & alignment.
+ *
+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
+ * the data within that page. A page can be arbitrarily large ("compound"
+ * pages) but we still have to ensure the data portion is correctly sized and
+ * aligned to the logical block size, to ensure that if the kernel wants to
+ * split the BIO, the two halves will still be properly aligned.
+ */
+typedef struct {
+	uint_t  bmask;
+	uint_t  npages;
+	uint_t  end;
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+	vdev_disk_check_pages_t *s = priv;
+
+	/*
+	 * If we didn't finish on a block size boundary last time, then there
+	 * would be a gap if we tried to use this ABD as-is, so abort.
+	 */
+	if (s->end != 0)
+		return (1);
+
+	/*
+	 * Note if we're taking less than a full block, so we can check it
+	 * above on the next call.
+	 */
+	s->end = len & s->bmask;
+
+	/* All blocks after the first must start on a block size boundary. */
+	if (s->npages != 0 && (off & s->bmask) != 0)
+		return (1);
+
+	s->npages++;
+	return (0);
+}
+
+/*
+ * Check if we can submit the pages in this ABD to the kernel as-is. Returns
+ * the number of pages, or 0 if it can't be submitted like this.
+ */
+static boolean_t
+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
+{
+	vdev_disk_check_pages_t s = {
+	    .bmask = bdev_logical_block_size(bdev)-1,
+	    .npages = 0,
+	    .end = 0,
+	};
+
+	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/* Iterator callback to submit ABD pages to the vbio. */
+static int
+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+	vbio_t *vbio = priv;
+	return (vbio_add_page(vbio, page, len, off));
+}
+
+static int
+vdev_disk_io_rw(zio_t *zio)
+{
+	vdev_t *v = zio->io_vd;
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+	int flags = 0;
+
+	/*
+	 * Accessing outside the block device is never allowed.
+	 */
+	if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
+		vdev_dbgmsg(zio->io_vd,
+		    "Illegal access %llu size %llu, device size %llu",
+		    (u_longlong_t)zio->io_offset,
+		    (u_longlong_t)zio->io_size,
+		    (u_longlong_t)i_size_read(bdev->bd_inode));
+		return (SET_ERROR(EIO));
+	}
+
+	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
+	    v->vdev_failfast == B_TRUE) {
+		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
+		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
+	}
+
+	/*
+	 * Check alignment of the incoming ABD. If any part of it would require
+	 * submitting a page that is not aligned to the logical block size,
+	 * then we take a copy into a linear buffer and submit that instead.
+	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
+	 * usually requiring abnormally-small data blocks (eg gang blocks)
+	 * mixed into the same ABD as larger ones (eg aggregated).
+	 */
+	abd_t *abd = zio->io_abd;
+	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
+		void *buf;
+		if (zio->io_type == ZIO_TYPE_READ)
+			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+		else
+			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+
+		/*
+		 * Wrap the copy in an abd_t, so we can use the same iterators
+		 * to count and fill the vbio later.
+		 */
+		abd = abd_get_from_buf(buf, zio->io_size);
+
+		/*
+		 * False here would mean the borrowed copy has an invalid
+		 * alignment too, which would mean we've somehow been passed a
+		 * linear ABD with an interior page that has a non-zero offset
+		 * or a size not a multiple of PAGE_SIZE. This is not possible.
+		 * It would mean either zio_buf_alloc() or its underlying
+		 * allocators have done something extremely strange, or our
+		 * math in vdev_disk_check_pages() is wrong. In either case,
+		 * something in seriously wrong and its not safe to continue.
+		 */
+		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
+	}
+
+	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
+	int error = 0;
+	vbio_t *vbio = vbio_alloc(zio, bdev);
+	if (abd != zio->io_abd)
+		vbio->vbio_abd = abd;
+
+	/* Fill it with pages */
+	error = abd_iterate_page_func(abd, 0, zio->io_size,
+	    vdev_disk_fill_vbio_cb, vbio);
+	if (error != 0) {
+		vbio_free(vbio);
+		return (error);
+	}
+
+	vbio_submit(vbio, flags);
+	return (0);
+}
+
 /* ========== */
 
 /*
- * This is the classic, battle-tested BIO submission code.
+ * This is the classic, battle-tested BIO submission code. Until we're totally
+ * sure that the new code is safe and correct in all cases, this will remain
+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
+ * load time.
  *
  * These functions have been renamed to vdev_classic_* to make it clear what
  * they belong to, but their implementations are unchanged.
@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 	(void) tsd;
 
 	if (vdev_disk_io_rw_fn == NULL)
-		vdev_disk_io_rw_fn = vdev_classic_physio;
+		/* XXX make configurable */
+		vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
 
 	return (0);
 }
@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
+
+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
+	"Maximum number of data segments to add to an IO request (min 4)");

From df2169d141aadc0c2cc728c5c5261d6f5c2a27f7 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 9 Jan 2024 13:28:57 +1100
Subject: [PATCH 021/116] vdev_disk: add module parameter to select BIO
 submission method

This makes the submission method selectable at module load time via the
`zfs_vdev_disk_classic` parameter, allowing this change to be backported
to 2.2 safely, and disabled in favour of the "classic" submission method
if new problems come up.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 man/man4/zfs.4                  | 16 ++++++++++++++++
 module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 61f1df9c81d5..cacb214d1dc1 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1382,6 +1382,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
 itself, it will be clamped.
 Setting it to zero will cause the kernel's ideal size to be used.
 This parameter only applies on Linux.
+This parameter is ignored if
+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
+.
+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
+and earlier.
+This "classic" method has known issues with highly fragmented IO requests and
+is slower on many workloads, but it has been in use for many years and is known
+to be very stable.
+If you set this parameter, please also open a bug report why you did so,
+including the workload involved and any error messages.
+.Pp
+This parameter and the classic submission method will be removed once we have
+total confidence in the new method.
+.Pp
+This parameter only applies on Linux, and can only be set at module load time.
 .
 .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
 Time before expiring
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 0ccb9ad96fa5..a9110623ace0 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
 	/* XXX: Implement me as a vnode rele for the device */
 }
 
+/*
+ * BIO submission method. See comment above about vdev_classic.
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
+ */
+static uint_t zfs_vdev_disk_classic = 0;	/* default new */
+
+/* Set submission function from module parameter */
+static int
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
+{
+	int err = param_set_uint(buf, kp);
+	if (err < 0)
+		return (SET_ERROR(err));
+
+	vdev_disk_io_rw_fn =
+	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
+
+	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
+	    zfs_vdev_disk_classic ? "classic" : "new");
+
+	return (0);
+}
+
 /*
  * At first use vdev use, set the submission function from the default value if
  * it hasn't been set already.
@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 	(void) tsd;
 
 	if (vdev_disk_io_rw_fn == NULL)
-		/* XXX make configurable */
-		vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
+		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
+		    vdev_classic_physio : vdev_disk_io_rw;
 
 	return (0);
 }
@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
 	"Maximum number of data segments to add to an IO request (min 4)");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
+    vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
+	"Use classic BIO submission method");

From 72fd834c47558cb10d847948d1a4615e894c77c3 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 21 Feb 2024 11:07:21 +1100
Subject: [PATCH 022/116] vdev_disk: use bio_chain() to submit multiple BIOs

Simplifies our code a lot, so we don't have to wait for each BIO to
complete and reassemble the results ourselves.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 module/os/linux/zfs/vdev_disk.c | 231 +++++++++++---------------------
 1 file changed, 80 insertions(+), 151 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index a9110623ace0..36468fc21132 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v)
 	if (v->vdev_reopening || vd == NULL)
 		return;
 
-	if (vd->vd_bdh != NULL) {
+	if (vd->vd_bdh != NULL)
 		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
 		    zfs_vdev_holder);
-	}
 
 	rw_destroy(&vd->vd_lock);
 	kmem_free(vd, sizeof (vdev_disk_t));
@@ -663,9 +662,6 @@ typedef struct {
 
 	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
 
-	atomic_t	vbio_ref;	/* bio refcount */
-	int		vbio_error;	/* error from failed bio */
-
 	uint_t		vbio_max_segs;	/* max segs per bio */
 
 	uint_t		vbio_max_bytes;	/* max bytes per bio */
@@ -674,43 +670,52 @@ typedef struct {
 	uint64_t	vbio_offset;	/* start offset of next bio */
 
 	struct bio	*vbio_bio;	/* pointer to the current bio */
-	struct bio	*vbio_bios;	/* list of all bios */
+	int		vbio_flags;	/* bio flags */
 } vbio_t;
 
 static vbio_t *
-vbio_alloc(zio_t *zio, struct block_device *bdev)
+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
 {
 	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
 
 	vbio->vbio_zio = zio;
 	vbio->vbio_bdev = bdev;
-	atomic_set(&vbio->vbio_ref, 0);
+	vbio->vbio_abd = NULL;
 	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
 	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
 	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
 	vbio->vbio_offset = zio->io_offset;
+	vbio->vbio_bio = NULL;
+	vbio->vbio_flags = flags;
 
 	return (vbio);
 }
 
+BIO_END_IO_PROTO(vbio_completion, bio, error);
+
 static int
 vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 {
-	struct bio *bio;
+	struct bio *bio = vbio->vbio_bio;
 	uint_t ssize;
 
 	while (size > 0) {
-		bio = vbio->vbio_bio;
 		if (bio == NULL) {
 			/* New BIO, allocate and set up */
 			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
 			    vbio->vbio_max_segs);
-			if (unlikely(bio == NULL))
-				return (SET_ERROR(ENOMEM));
+			VERIFY(bio);
+
 			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
+			bio_set_op_attrs(bio,
+			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
+			    WRITE : READ, vbio->vbio_flags);
 
-			bio->bi_next = vbio->vbio_bios;
-			vbio->vbio_bios = vbio->vbio_bio = bio;
+			if (vbio->vbio_bio) {
+				bio_chain(vbio->vbio_bio, bio);
+				vdev_submit_bio(vbio->vbio_bio);
+			}
+			vbio->vbio_bio = bio;
 		}
 
 		/*
@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 		vbio->vbio_offset += BIO_BI_SIZE(bio);
 
 		/* Signal new BIO allocation wanted */
-		vbio->vbio_bio = NULL;
+		bio = NULL;
 	}
 
 	return (0);
 }
 
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
-static void vbio_put(vbio_t *vbio);
+/* Iterator callback to submit ABD pages to the vbio. */
+static int
+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+	vbio_t *vbio = priv;
+	return (vbio_add_page(vbio, page, len, off));
+}
 
+/* Create some BIOs, fill them with data and submit them */
 static void
-vbio_submit(vbio_t *vbio, int flags)
+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
-	ASSERT(vbio->vbio_bios);
-	struct bio *bio = vbio->vbio_bios;
-	vbio->vbio_bio = vbio->vbio_bios = NULL;
-
-	/*
-	 * We take a reference for each BIO as we submit it, plus one to
-	 * protect us from BIOs completing before we're done submitting them
-	 * all, causing vbio_put() to free vbio out from under us and/or the
-	 * zio to be returned before all its IO has completed.
-	 */
-	atomic_set(&vbio->vbio_ref, 1);
+	ASSERT(vbio->vbio_bdev);
 
 	/*
-	 * If we're submitting more than one BIO, inform the block layer so
-	 * it can batch them if it wants.
+	 * We plug so we can submit the BIOs as we go and only unplug them when
+	 * they are fully created and submitted. This is important; if we don't
+	 * plug, then the kernel may start executing earlier BIOs while we're
+	 * still creating and executing later ones, and if the device goes
+	 * away while that's happening, older kernels can get confused and
+	 * trample memory.
 	 */
 	struct blk_plug plug;
-	boolean_t do_plug = (bio->bi_next != NULL);
-	if (do_plug)
-		blk_start_plug(&plug);
+	blk_start_plug(&plug);
 
-	/* Submit all the BIOs */
-	while (bio != NULL) {
-		atomic_inc(&vbio->vbio_ref);
+	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
+	ASSERT(vbio->vbio_bio);
 
-		struct bio *next = bio->bi_next;
-		bio->bi_next = NULL;
+	vbio->vbio_bio->bi_end_io = vbio_completion;
+	vbio->vbio_bio->bi_private = vbio;
 
-		bio->bi_end_io = vdev_disk_io_rw_completion;
-		bio->bi_private = vbio;
-		bio_set_op_attrs(bio,
-		    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
-		    WRITE : READ, flags);
+	vdev_submit_bio(vbio->vbio_bio);
 
-		vdev_submit_bio(bio);
-
-		bio = next;
-	}
-
-	/* Finish the batch */
-	if (do_plug)
-		blk_finish_plug(&plug);
+	blk_finish_plug(&plug);
 
-	/* Release the extra reference */
-	vbio_put(vbio);
+	vbio->vbio_bio = NULL;
+	vbio->vbio_bdev = NULL;
 }
 
-static void
-vbio_return_abd(vbio_t *vbio)
+/* IO completion callback */
+BIO_END_IO_PROTO(vbio_completion, bio, error)
 {
+	vbio_t *vbio = bio->bi_private;
 	zio_t *zio = vbio->vbio_zio;
-	if (vbio->vbio_abd == NULL)
-		return;
-
-	/*
-	 * If we copied the ABD before issuing it, clean up and return the copy
-	 * to the ADB, with changes if appropriate.
-	 */
-	void *buf = abd_to_buf(vbio->vbio_abd);
-	abd_free(vbio->vbio_abd);
-	vbio->vbio_abd = NULL;
-
-	if (zio->io_type == ZIO_TYPE_READ)
-		abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
-	else
-		abd_return_buf(zio->io_abd, buf, zio->io_size);
-}
 
-static void
-vbio_free(vbio_t *vbio)
-{
-	VERIFY0(atomic_read(&vbio->vbio_ref));
-
-	vbio_return_abd(vbio);
+	ASSERT(zio);
 
-	kmem_free(vbio, sizeof (vbio_t));
-}
+	/* Capture and log any errors */
+#ifdef HAVE_1ARG_BIO_END_IO_T
+	zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+	zio->io_error = 0;
+	if (error)
+		zio->io_error = -(error);
+	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		zio->io_error = EIO;
+#endif
+	ASSERT3U(zio->io_error, >=, 0);
 
-static void
-vbio_put(vbio_t *vbio)
-{
-	if (atomic_dec_return(&vbio->vbio_ref) > 0)
-		return;
+	if (zio->io_error)
+		vdev_disk_error(zio);
 
-	/*
-	 * This was the last reference, so the entire IO is completed. Clean
-	 * up and submit it for processing.
-	 */
+	/* Return the BIO to the kernel */
+	bio_put(bio);
 
 	/*
-	 * Get any data buf back to the original ABD, if necessary. We do this
-	 * now so we can get the ZIO into the pipeline as quickly as possible,
-	 * and then do the remaining cleanup after.
+	 * If we copied the ABD before issuing it, clean up and return the copy
+	 * to the ADB, with changes if appropriate.
 	 */
-	vbio_return_abd(vbio);
+	if (vbio->vbio_abd != NULL) {
+		void *buf = abd_to_buf(vbio->vbio_abd);
+		abd_free(vbio->vbio_abd);
+		vbio->vbio_abd = NULL;
 
-	zio_t *zio = vbio->vbio_zio;
+		if (zio->io_type == ZIO_TYPE_READ)
+			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+		else
+			abd_return_buf(zio->io_abd, buf, zio->io_size);
+	}
 
-	/*
-	 * Set the overall error. If multiple BIOs returned an error, only the
-	 * first will be taken; the others are dropped (see
-	 * vdev_disk_io_rw_completion()). Its pretty much impossible for
-	 * multiple IOs to the same device to fail with different errors, so
-	 * there's no real risk.
-	 */
-	zio->io_error = vbio->vbio_error;
-	if (zio->io_error)
-		vdev_disk_error(zio);
+	/* Final cleanup */
+	kmem_free(vbio, sizeof (vbio_t));
 
 	/* All done, submit for processing */
 	zio_delay_interrupt(zio);
-
-	/* Finish cleanup */
-	vbio_free(vbio);
-}
-
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
-{
-	vbio_t *vbio = bio->bi_private;
-
-	if (vbio->vbio_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
-		vbio->vbio_error = BIO_END_IO_ERROR(bio);
-#else
-		if (error)
-			vbio->vbio_error = -(error);
-		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-			vbio->vbio_error = EIO;
-#endif
-	}
-
-	/*
-	 * Destroy the BIO. This is safe to do; the vbio owns its data and the
-	 * kernel won't touch it again after the completion function runs.
-	 */
-	bio_put(bio);
-
-	/* Drop this BIOs reference acquired by vbio_submit() */
-	vbio_put(vbio);
 }
 
 /*
@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
 	return (B_TRUE);
 }
 
-/* Iterator callback to submit ABD pages to the vbio. */
-static int
-vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
-{
-	vbio_t *vbio = priv;
-	return (vbio_add_page(vbio, page, len, off));
-}
-
 static int
 vdev_disk_io_rw(zio_t *zio)
 {
@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
 	}
 
 	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
-	int error = 0;
-	vbio_t *vbio = vbio_alloc(zio, bdev);
+	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
 	if (abd != zio->io_abd)
 		vbio->vbio_abd = abd;
 
-	/* Fill it with pages */
-	error = abd_iterate_page_func(abd, 0, zio->io_size,
-	    vdev_disk_fill_vbio_cb, vbio);
-	if (error != 0) {
-		vbio_free(vbio);
-		return (error);
-	}
-
-	vbio_submit(vbio, flags);
+	/* Fill it with data pages and submit it to the kernel */
+	vbio_submit(vbio, abd, zio->io_size);
 	return (0);
 }
 

From c6be6ce1755a3d9a3cbe70256cd8958ef83d8542 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 14 Mar 2024 10:57:30 +1100
Subject: [PATCH 023/116] abd_iter_page: don't use compound heads on Linux <4.5

Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
in a compound page were refcounted separately. This means that using the
head page without taking a reference to it could see it cleaned up later
before we're finished with it. Specifically, bio_add_page() would take a
reference, and drop its reference after the bio completion callback
returns.

If the zio is executed immediately from the completion callback, this is
usually ok, as any data is referenced through the tail page referenced
by the ABD, and so becomes "live" that way. If there's a delay in zio
execution (high load, error injection), then the head page can be freed,
along with any dirty flags or other indicators that the underlying
memory is used. Later, when the zio completes and that memory is
accessed, it's either unmapped and an unhandled fault takes down the
entire system, or it is mapped and we end up messing around in someone
else's memory. Both of these are very bad.

The solution on these older kernels is to take a reference to the head
page when we use it, and release it when we're done. There's not really
a sensible way under our current structure to do this; the "best" would
be to keep a list of head page references in the ABD, and release them
when the ABD is freed.

Since this additional overhead is totally unnecessary on 4.5+, where
head and tail pages share refcounts, I've opted to simply not use the
compound head in ABD page iteration on kernels older than 4.5. This is
theoretically less efficient (though cleaning up head page references
would add overhead), but it's safe, and we still get the other benefits
of not mapping pages before adding them to a bio and not mis-splitting
pages.

There doesn't appear to be an obvious symbol name or config option we
can match on to discover this behaviour in configure (and the mm/page
APIs have changed a lot since then anyway), so I've gone with a simple
version check.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
---
 module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 3fe01c0b7d77..d3255dcbc0f7 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -62,6 +62,7 @@
 #include <linux/kmap_compat.h>
 #include <linux/mm_compat.h>
 #include <linux/scatterlist.h>
+#include <linux/version.h>
 #endif
 
 #ifdef _KERNEL
@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
 	}
 	ASSERT(page);
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
 	if (PageTail(page)) {
 		/*
 		 * This page is part of a "compound page", which is a group of
@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
 		 * To do this, we need to adjust the offset to be counted from
 		 * the head page. struct page for compound pages are stored
 		 * contiguously, so we can just adjust by a simple offset.
+		 *
+		 * Before kernel 4.5, compound page heads were refcounted
+		 * separately, such that moving back to the head page would
+		 * require us to take a reference to it and releasing it once
+		 * we're completely finished with it. In practice, that means
+		 * when our caller is done with the ABD, which we have no
+		 * insight into from here. Rather than contort this API to
+		 * track head page references on such ancient kernels, we just
+		 * compile this block out and use the tail pages directly. This
+		 * is slightly less efficient, but makes everything far
+		 * simpler.
 		 */
 		struct page *head = compound_head(page);
 		doff += ((page - head) * PAGESIZE);
 		page = head;
 	}
+#endif
 
 	/* final page and position within it */
 	aiter->iter_page = page;

From 8cd8ccca5383dcdd9bf55d4d22921a6b43b4ebe1 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 25 Mar 2024 20:13:45 -0400
Subject: [PATCH 024/116] BRT: Skip getting length in brt_entry_lookup()

Unlike DDT, where ZAP values may have different lengths due to
compression, all BRT entries are identical 8-byte counters.  It
does not make sense to first fetch the length only to assert it.
zap_lookup_uint64() is specifically designed to work with counters of
different sizes and should return an error if something odd is found.
Calling it directly saves some measurable CPU time.

Reviewed-by: Pawel Jakub Dawidek <pawel@dawidek.net>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15950
---
 module/zfs/brt.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 5d1f4728b645..ea8c0735c4b7 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -900,7 +900,6 @@ static int
 brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
 {
 	uint64_t mos_entries;
-	uint64_t one, physsize;
 	int error;
 
 	ASSERT(RW_LOCK_HELD(&brt->brt_lock));
@@ -918,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
 
 	brt_unlock(brt);
 
-	error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
-	    BRT_KEY_WORDS, &one, &physsize);
-	if (error == 0) {
-		ASSERT3U(one, ==, 1);
-		ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
-
-		error = zap_lookup_uint64(brt->brt_mos, mos_entries,
-		    &bre->bre_offset, BRT_KEY_WORDS, 1,
-		    sizeof (bre->bre_refcount), &bre->bre_refcount);
-		BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
-		    "count=%llu error=%d", (u_longlong_t)mos_entries,
-		    (u_longlong_t)brtvd->bv_vdevid,
-		    (u_longlong_t)bre->bre_offset,
-		    error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
-	}
+	error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
+	    BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);
 
 	brt_wlock(brt);
 

From a89d209bb60c3f32881da7624bd01d28023da4f4 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 18 Mar 2024 14:19:53 -0400
Subject: [PATCH 025/116] BRT: Fix holes cloning.

 - When reading L0 block pointers, handle buffers without block
pointers and without dirty records as holes.  Those appear when the
dnode size was increased, but the end was never written, so there are
no new indirection levels to store the pointers.  It makes no sense to
return EAGAIN here, since sync won't create new indirection levels
until there are actual writes.
 - When cloning blocks, set the destination hole's logical birth time
to the current TXG.  Otherwise, if we are cloning over existing
data, newly created holes may not be properly replicated later.
Use BP_SET_BIRTH() when possible to avoid duplicating its logic.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15994
Closes #16007
---
 module/zfs/dmu.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b88cf447d296..753dde6d5205 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2265,11 +2265,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 
 		if (bp == NULL) {
 			/*
-			 * The block was created in this transaction group,
-			 * so it has no BP yet.
+			 * The file size was increased, but the block was never
+			 * written, otherwise we would either have the block
+			 * pointer or the dirty record and would not get here.
+			 * It is effectively a hole, so report it as such.
 			 */
-			error = SET_ERROR(EAGAIN);
-			goto out;
+			BP_ZERO(&bps[i]);
+			continue;
 		}
 		/*
 		 * Make sure we clone only data blocks.
@@ -2361,19 +2363,17 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 		dl = &dr->dt.dl;
 		dl->dr_overridden_by = *bp;
-		dl->dr_brtwrite = B_TRUE;
-		dl->dr_override_state = DR_OVERRIDDEN;
-		if (BP_IS_HOLE(bp)) {
-			BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, 0);
-			BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, 0);
-		} else {
-			BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
-			    dr->dr_txg);
+		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
-				BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by,
+				BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
 				    BP_GET_BIRTH(bp));
+			} else {
+				BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
+				    dr->dr_txg);
 			}
 		}
+		dl->dr_brtwrite = B_TRUE;
+		dl->dr_override_state = DR_OVERRIDDEN;
 
 		mutex_exit(&db->db_mtx);
 

From b4034276247bfe430a7ff8d8ef9b06826e83cb9d Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 19 Mar 2024 12:25:14 -0400
Subject: [PATCH 026/116] BRT: Fix tests to work on non-empty pools

It should not normally happen, but if it does, better to not fail
everything for no good reason, or it may be hard to debug.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16007
---
 .../functional/bclone/bclone_common.kshlib    | 47 ++++++++++---------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib
index 3b8eaea5bb54..84b92b4dcdc9 100644
--- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib
+++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib
@@ -97,20 +97,19 @@ function verify_pool_prop_eq
 
 function verify_pool_props
 {
-    typeset -r dsize=$1
-    typeset -r ratio=$2
+    typeset -r oused=$1
+    typeset -r osaved=$2
+    typeset dsize=$3
+    typeset ratio=$4
 
     if [[ $dsize -eq 0 ]]; then
-        verify_pool_prop_eq bcloneused 0
-        verify_pool_prop_eq bclonesaved 0
-        verify_pool_prop_eq bcloneratio 1.00
-    else
-        if [[ $ratio -eq 1 ]]; then
-            verify_pool_prop_eq bcloneused 0
-        else
-            verify_pool_prop_eq bcloneused $dsize
-        fi
-        verify_pool_prop_eq bclonesaved $((dsize*(ratio-1)))
+        ratio=1
+    elif [[ $ratio -eq 1 ]]; then
+        dsize=0
+    fi
+    verify_pool_prop_eq bcloneused $(($oused+$dsize))
+    verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1)))
+    if [[ $oused -eq 0 ]]; then
         verify_pool_prop_eq bcloneratio "${ratio}.00"
     fi
 }
@@ -124,16 +123,22 @@ function bclone_test
     typeset -r srcdir=$4
     typeset -r dstdir=$5
     typeset dsize
+    typeset oused
+    typeset osaved
 
     typeset -r original="${srcdir}/original"
     typeset -r clone="${dstdir}/clone"
 
     log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded"
 
+    # Save current block cloning stats for later use.
+    sync_pool $TESTPOOL
+    oused=$(get_pool_prop bcloneused $TESTPOOL)
+    osaved=$(get_pool_prop bclonesaved $TESTPOOL)
+
     # Create a test file with known content.
     case $datatype in
         random|text)
-            sync_pool $TESTPOOL
             if [[ $datatype = "random" ]]; then
                 dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null
             else
@@ -146,13 +151,13 @@ function bclone_test
             sync_pool $TESTPOOL
             # It is hard to predict block sizes that will be used,
             # so just do one clone and take it from bcloneused.
-            filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL)
+            dsize=$(get_pool_prop bcloneused $TESTPOOL)
+            dsize=$(($dsize-$oused))
             if [[ $embedded = "false" ]]; then
-                log_must test $filesize -gt 0
+                log_must test $dsize -gt 0
             fi
             rm -f "${clone}-tmp"
             sync_pool $TESTPOOL
-            dsize=$filesize
             ;;
         hole)
             log_must truncate_test -s $filesize -f $original
@@ -217,7 +222,7 @@ function bclone_test
     test_file_integrity $original_checksum "${clone}4" $filesize
     test_file_integrity $original_checksum "${clone}5" $filesize
 
-    verify_pool_props $dsize 7
+    verify_pool_props $oused $osaved $dsize 7
 
     # Clear cache and test after fresh import.
     log_must zpool export $TESTPOOL
@@ -240,7 +245,7 @@ function bclone_test
 
     sync_pool $TESTPOOL
 
-    verify_pool_props $dsize 11
+    verify_pool_props $oused $osaved $dsize 11
 
     log_must zpool export $TESTPOOL
     log_must zpool import $TESTPOOL
@@ -268,7 +273,7 @@ function bclone_test
     test_file_integrity $original_checksum "${clone}8" $filesize
     test_file_integrity $original_checksum "${clone}9" $filesize
 
-    verify_pool_props $dsize 6
+    verify_pool_props $oused $osaved $dsize 6
 
     rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9"
 
@@ -276,11 +281,11 @@ function bclone_test
 
     test_file_integrity $original_checksum "${clone}6" $filesize
 
-    verify_pool_props $dsize 1
+    verify_pool_props $oused $osaved $dsize 1
 
     rm -f "${clone}6"
 
     sync_pool $TESTPOOL
 
-    verify_pool_props $dsize 1
+    verify_pool_props $oused $osaved $dsize 1
 }

From 0c8eb974ff3bba965d7303d3fa7db2007ef4bdfa Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 19 Mar 2024 13:08:05 -0400
Subject: [PATCH 027/116] BRT: Check pool clone stats in more tests

This should allow us to catch some leaks, if those happen.

While there, fix some cosmetic issues.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16007
---
 .../bclone/bclone_corner_cases.kshlib         | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib
index ddfbfc999c4e..aeb8efe91715 100644
--- a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib
+++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib
@@ -66,7 +66,7 @@ function bclone_corner_cases_init
     export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0)
     export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1)
     export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2)
-    export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest)
+    export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest)
     export FIRST_HALF_CHECKSUM=""
     export SECOND_HALF_CHECKSUM=""
 }
@@ -210,6 +210,8 @@ function bclone_corner_cases_test
     typeset -r dstdir=$2
     typeset limit=$3
     typeset -i count=0
+    typeset oused
+    typeset osaved
 
     if [[ $srcdir != "count" ]]; then
         if [[ -n "$limit" ]]; then
@@ -217,6 +219,11 @@ function bclone_corner_cases_test
             limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs)
         fi
         bclone_corner_cases_init $srcdir $dstdir
+
+        # Save current block cloning stats for later use.
+        sync_pool $TESTPOOL
+        oused=$(get_pool_prop bcloneused $TESTPOOL)
+        osaved=$(get_pool_prop bclonesaved $TESTPOOL)
     fi
 
     #
@@ -285,21 +292,24 @@ function bclone_corner_cases_test
                                     overwrite_clone "$second_overwrite"
 
                                     if checksum_compare $read_after; then
-                                        log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after"
+                                        log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after"
                                     else
-                                        log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after"
+                                        log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after"
                                     fi
 
                                     log_must zpool export $TESTPOOL
                                     log_must zpool import $TESTPOOL
 
                                     if checksum_compare "yes"; then
-                                        log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg"
+                                        log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg"
                                     else
-                                        log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg"
+                                        log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg"
                                     fi
 
                                     rm -f "$CLONE"
+                                    sync_pool $TESTPOOL
+                                    verify_pool_prop_eq bcloneused $oused
+                                    verify_pool_prop_eq bclonesaved $osaved
                                 done
                             done
                         done

From e39e20b6dc73bd7df1f097c23b5297bcc989ed53 Mon Sep 17 00:00:00 2001
From: Robert Evans <evansr@google.com>
Date: Wed, 27 Mar 2024 17:59:16 -0400
Subject: [PATCH 028/116] ZTS: fix flakiness in cp_files_002_pos

Fix RANDOM to not return zero.

Overwriting with `dd ... count=0` does not test anything.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Robert Evans <evansr@google.com>
Closes #16029
---
 .../tests/functional/cp_files/cp_files_002_pos.ksh          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
index 60817449ab03..4db968ffae05 100755
--- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
@@ -76,7 +76,7 @@ log_onexit cleanup
 
 SRC_FILE=src.data
 DST_FILE=dst.data
-SRC_SIZE=$(($RANDOM % 2048))
+SRC_SIZE=$((1024 + $RANDOM % 1024))
 
 # A smaller recordsize is used merely to speed up the test.
 RECORDSIZE=4096
@@ -120,7 +120,7 @@ for mode in "never" "auto" "always"; do
 	# Overwrite a random range of an existing file and immediately copy it.
 	sync_pool $TESTPOOL
 	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
-            seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
+            seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc
 	if [[ "$mode" == "always" ]]; then
 		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
 		log_must ls -l $CP_TESTDIR
@@ -152,7 +152,7 @@ for mode in "never" "auto" "always"; do
 
 	# Overwrite a random range of an existing file and immediately copy it.
 	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
-            seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
+            seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc
 	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
 	verify_copy $SRC_FILE $DST_FILE
 	log_must rm -f $SRC_FILE $DST_FILE

From b1e46f869e773086c23c565d7d5b261577023cfb Mon Sep 17 00:00:00 2001
From: George Wilson <george.wilson@delphix.com>
Date: Fri, 29 Mar 2024 15:15:56 -0400
Subject: [PATCH 029/116] Add ashift validation when adding devices to a pool

Currently, zpool add allows users to add top-level vdevs that have
different ashifts but doing so prevents users from being able to
perform a top-level vdev removal. Oftentimes consumers may not realize
that they have mismatched ashifts until the top-level removal fails.

This feature adds ashift validation to the zpool add command and will
fail the operation if the sector size of the specified vdev does not
match the existing pool. This behavior can be disabled by using the -f
flag. In addition, new flags have been added to provide fine-grained
control to disable specific checks. These flags
are:

--allow-in-use
--allow-ashift-mismatch
--allow-replication-mismatch

The force flag will disable all of these checks.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Mark Maybee <mmaybee@delphix.com>
Signed-off-by: George Wilson <gwilson@delphix.com>
Closes #15509
---
 cmd/zpool/zpool_main.c                        | 76 ++++++++++++++-----
 cmd/ztest.c                                   |  8 +-
 include/libzfs.h                              |  5 +-
 include/sys/fs/zfs.h                          |  3 +-
 include/sys/spa.h                             |  4 +-
 lib/libzfs/libzfs.abi                         | 76 +++++++++++++++----
 lib/libzfs/libzfs_pool.c                      |  5 +-
 lib/libzfs/libzfs_util.c                      |  8 +-
 man/man8/zpool-add.8                          | 18 ++++-
 module/zfs/spa.c                              | 14 +++-
 module/zfs/zfs_ioctl.c                        |  4 +-
 tests/runfiles/common.run                     |  3 +-
 tests/zfs-tests/tests/Makefile.am             |  1 +
 .../cli_root/zpool_add/add-o_ashift.ksh       | 17 ++++-
 .../cli_root/zpool_add/add_prop_ashift.ksh    | 16 +++-
 .../zpool_add--allow-ashift-mismatch.ksh      |  0
 .../cli_root/zpool_add/zpool_add_002_pos.ksh  | 11 +++
 .../cli_root/zpool_add/zpool_add_004_pos.ksh  |  2 +-
 .../cli_root/zpool_add/zpool_add_005_pos.ksh  |  2 +
 .../cli_root/zpool_add/zpool_add_009_neg.ksh  |  2 +
 .../cli_root/zpool_add/zpool_add_010_pos.ksh  |  2 +-
 21 files changed, 219 insertions(+), 58 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 987d44062865..c85a5f285154 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
  * Copyright (c) 2012 by Cyril Plisko. All rights reserved.
  * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
@@ -131,6 +131,13 @@ static int zpool_do_help(int argc, char **argv);
 static zpool_compat_status_t zpool_do_load_compat(
     const char *, boolean_t *);
 
+enum zpool_options {
+	ZPOOL_OPTION_POWER = 1024,
+	ZPOOL_OPTION_ALLOW_INUSE,
+	ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH,
+	ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH
+};
+
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
  * debugging facilities.
@@ -347,7 +354,7 @@ get_usage(zpool_help_t idx)
 {
 	switch (idx) {
 	case HELP_ADD:
-		return (gettext("\tadd [-fgLnP] [-o property=value] "
+		return (gettext("\tadd [-afgLnP] [-o property=value] "
 		    "<pool> <vdev> ...\n"));
 	case HELP_ATTACH:
 		return (gettext("\tattach [-fsw] [-o property=value] "
@@ -1009,8 +1016,9 @@ add_prop_list_default(const char *propname, const char *propval,
 }
 
 /*
- * zpool add [-fgLnP] [-o property=value] <pool> <vdev> ...
+ * zpool add [-afgLnP] [-o property=value] <pool> <vdev> ...
  *
+ *	-a	Disable the ashift validation checks
  *	-f	Force addition of devices, even if they appear in use
  *	-g	Display guid for individual vdev name.
  *	-L	Follow links when resolving vdev path name.
@@ -1026,8 +1034,11 @@ add_prop_list_default(const char *propname, const char *propval,
 int
 zpool_do_add(int argc, char **argv)
 {
-	boolean_t force = B_FALSE;
+	boolean_t check_replication = B_TRUE;
+	boolean_t check_inuse = B_TRUE;
 	boolean_t dryrun = B_FALSE;
+	boolean_t check_ashift = B_TRUE;
+	boolean_t force = B_FALSE;
 	int name_flags = 0;
 	int c;
 	nvlist_t *nvroot;
@@ -1038,8 +1049,18 @@ zpool_do_add(int argc, char **argv)
 	nvlist_t *props = NULL;
 	char *propval;
 
+	struct option long_options[] = {
+		{"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE},
+		{"allow-replication-mismatch", no_argument, NULL,
+		    ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH},
+		{"allow-ashift-mismatch", no_argument, NULL,
+		    ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH},
+		{0, 0, 0, 0}
+	};
+
 	/* check options */
-	while ((c = getopt(argc, argv, "fgLno:P")) != -1) {
+	while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL))
+	    != -1) {
 		switch (c) {
 		case 'f':
 			force = B_TRUE;
@@ -1069,6 +1090,15 @@ zpool_do_add(int argc, char **argv)
 		case 'P':
 			name_flags |= VDEV_NAME_PATH;
 			break;
+		case ZPOOL_OPTION_ALLOW_INUSE:
+			check_inuse = B_FALSE;
+			break;
+		case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH:
+			check_replication = B_FALSE;
+			break;
+		case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH:
+			check_ashift = B_FALSE;
+			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -1089,6 +1119,19 @@ zpool_do_add(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
+	if (force) {
+		if (!check_inuse || !check_replication || !check_ashift) {
+			(void) fprintf(stderr, gettext("'-f' option is not "
+			    "allowed with '--allow-replication-mismatch', "
+			    "'--allow-ashift-mismatch', or "
+			    "'--allow-in-use'\n"));
+			usage(B_FALSE);
+		}
+		check_inuse = B_FALSE;
+		check_replication = B_FALSE;
+		check_ashift = B_FALSE;
+	}
+
 	poolname = argv[0];
 
 	argc--;
@@ -1119,8 +1162,8 @@ zpool_do_add(int argc, char **argv)
 	}
 
 	/* pass off to make_root_vdev for processing */
-	nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
-	    argc, argv);
+	nvroot = make_root_vdev(zhp, props, !check_inuse,
+	    check_replication, B_FALSE, dryrun, argc, argv);
 	if (nvroot == NULL) {
 		zpool_close(zhp);
 		return (1);
@@ -1224,7 +1267,7 @@ zpool_do_add(int argc, char **argv)
 
 		ret = 0;
 	} else {
-		ret = (zpool_add(zhp, nvroot) != 0);
+		ret = (zpool_add(zhp, nvroot, check_ashift) != 0);
 	}
 
 	nvlist_free(props);
@@ -7081,7 +7124,6 @@ zpool_do_split(int argc, char **argv)
 	return (ret);
 }
 
-#define	POWER_OPT 1024
 
 /*
  * zpool online [--power] <pool> <device> ...
@@ -7099,7 +7141,7 @@ zpool_do_online(int argc, char **argv)
 	int flags = 0;
 	boolean_t is_power_on = B_FALSE;
 	struct option long_options[] = {
-		{"power", no_argument, NULL, POWER_OPT},
+		{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
 		{0, 0, 0, 0}
 	};
 
@@ -7109,7 +7151,7 @@ zpool_do_online(int argc, char **argv)
 		case 'e':
 			flags |= ZFS_ONLINE_EXPAND;
 			break;
-		case POWER_OPT:
+		case ZPOOL_OPTION_POWER:
 			is_power_on = B_TRUE;
 			break;
 		case '?':
@@ -7222,7 +7264,7 @@ zpool_do_offline(int argc, char **argv)
 	boolean_t is_power_off = B_FALSE;
 
 	struct option long_options[] = {
-		{"power", no_argument, NULL, POWER_OPT},
+		{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
 		{0, 0, 0, 0}
 	};
 
@@ -7235,7 +7277,7 @@ zpool_do_offline(int argc, char **argv)
 		case 't':
 			istmp = B_TRUE;
 			break;
-		case POWER_OPT:
+		case ZPOOL_OPTION_POWER:
 			is_power_off = B_TRUE;
 			break;
 		case '?':
@@ -7335,7 +7377,7 @@ zpool_do_clear(int argc, char **argv)
 	char *pool, *device;
 
 	struct option long_options[] = {
-		{"power", no_argument, NULL, POWER_OPT},
+		{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
 		{0, 0, 0, 0}
 	};
 
@@ -7352,7 +7394,7 @@ zpool_do_clear(int argc, char **argv)
 		case 'X':
 			xtreme_rewind = B_TRUE;
 			break;
-		case POWER_OPT:
+		case ZPOOL_OPTION_POWER:
 			is_power_on = B_TRUE;
 			break;
 		case '?':
@@ -9208,7 +9250,7 @@ zpool_do_status(int argc, char **argv)
 	char *cmd = NULL;
 
 	struct option long_options[] = {
-		{"power", no_argument, NULL, POWER_OPT},
+		{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
 		{0, 0, 0, 0}
 	};
 
@@ -9276,7 +9318,7 @@ zpool_do_status(int argc, char **argv)
 		case 'x':
 			cb.cb_explain = B_TRUE;
 			break;
-		case POWER_OPT:
+		case ZPOOL_OPTION_POWER:
 			cb.cb_print_power = B_TRUE;
 			break;
 		case '?':
diff --git a/cmd/ztest.c b/cmd/ztest.c
index 1d414a9f6fd5..684ab586bb93 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
@@ -3375,7 +3375,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 		    "log" : NULL, raidz_children, zs->zs_mirrors,
 		    1);
 
-		error = spa_vdev_add(spa, nvroot);
+		error = spa_vdev_add(spa, nvroot, B_FALSE);
 		fnvlist_free(nvroot);
 
 		switch (error) {
@@ -3438,7 +3438,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
 	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
 	    class, raidz_children, zs->zs_mirrors, 1);
 
-	error = spa_vdev_add(spa, nvroot);
+	error = spa_vdev_add(spa, nvroot, B_FALSE);
 	fnvlist_free(nvroot);
 
 	if (error == ENOSPC)
@@ -3545,7 +3545,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 		 */
 		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
 		    (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
-		error = spa_vdev_add(spa, nvroot);
+		error = spa_vdev_add(spa, nvroot, B_FALSE);
 
 		switch (error) {
 		case 0:
diff --git a/include/libzfs.h b/include/libzfs.h
index 4f06b5d3c24c..2823b8845827 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright Joyent, Inc.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2016, Intel Corporation.
@@ -158,6 +158,7 @@ typedef enum zfs_error {
 	EZFS_RESUME_EXISTS,	/* Resume on existing dataset without force */
 	EZFS_SHAREFAILED,	/* filesystem share failed */
 	EZFS_RAIDZ_EXPAND_IN_PROGRESS,	/* a raidz is currently expanding */
+	EZFS_ASHIFT_MISMATCH,   /* can't add vdevs with different ashifts */
 	EZFS_UNKNOWN
 } zfs_error_t;
 
@@ -261,7 +262,7 @@ _LIBZFS_H boolean_t zpool_skip_pool(const char *);
 _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
     nvlist_t *, nvlist_t *);
 _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *);
-_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *);
+_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift);
 
 typedef struct splitflags {
 	/* do not split, but return the config that would be split off */
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 025567e2183f..21f99bacccf3 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
@@ -1603,6 +1603,7 @@ typedef enum {
 	ZFS_ERR_RESUME_EXISTS,
 	ZFS_ERR_CRYPTO_NOTSUP,
 	ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
+	ZFS_ERR_ASHIFT_MISMATCH,
 } zfs_errno_t;
 
 /*
diff --git a/include/sys/spa.h b/include/sys/spa.h
index fb4c93431a31..b969f05afe48 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -785,7 +785,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 #define	SPA_ASYNC_DETACH_SPARE			0x4000
 
 /* device manipulation */
-extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing, int rebuild);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index cdd2f04c2629..2bbaae6345ab 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -1112,14 +1112,11 @@
         <var-decl name='prev' type-id='b03eadb4' visibility='default'/>
       </data-member>
     </class-decl>
-    <class-decl name='list' size-in-bits='256' is-struct='yes' visibility='default' id='e824dae9'>
+    <class-decl name='list' size-in-bits='192' is-struct='yes' visibility='default' id='e824dae9'>
       <data-member access='public' layout-offset-in-bits='0'>
-        <var-decl name='list_size' type-id='b59d7dce' visibility='default'/>
-      </data-member>
-      <data-member access='public' layout-offset-in-bits='64'>
         <var-decl name='list_offset' type-id='b59d7dce' visibility='default'/>
       </data-member>
-      <data-member access='public' layout-offset-in-bits='128'>
+      <data-member access='public' layout-offset-in-bits='64'>
         <var-decl name='list_head' type-id='b0b5e45e' visibility='default'/>
       </data-member>
     </class-decl>
@@ -2832,6 +2829,9 @@
     </function-type>
   </abi-instr>
   <abi-instr address-size='64' path='lib/libzfs/libzfs_crypto.c' language='LANG_C99'>
+    <array-type-def dimensions='1' type-id='38b51b3c' size-in-bits='832' id='02b72c00'>
+      <subrange length='13' type-id='7359adad' id='487fded1'/>
+    </array-type-def>
     <array-type-def dimensions='1' type-id='fb7c6451' size-in-bits='256' id='64177143'>
       <subrange length='32' type-id='7359adad' id='ae5bde82'/>
     </array-type-def>
@@ -2844,6 +2844,10 @@
     <class-decl name='_IO_codecvt' is-struct='yes' visibility='default' is-declaration-only='yes' id='a4036571'/>
     <class-decl name='_IO_marker' is-struct='yes' visibility='default' is-declaration-only='yes' id='010ae0b9'/>
     <class-decl name='_IO_wide_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='79bd3751'/>
+    <class-decl name='__locale_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='23de8b96'/>
+    <array-type-def dimensions='1' type-id='80f4b756' size-in-bits='832' id='39e6f84a'>
+      <subrange length='13' type-id='7359adad' id='487fded1'/>
+    </array-type-def>
     <array-type-def dimensions='1' type-id='95e97e5e' size-in-bits='896' id='47394ee0'>
       <subrange length='28' type-id='7359adad' id='3db583d7'/>
     </array-type-def>
@@ -2964,6 +2968,24 @@
     <typedef-decl name='__clock_t' type-id='bd54fe1a' id='4d66c6d7'/>
     <typedef-decl name='__ssize_t' type-id='bd54fe1a' id='41060289'/>
     <typedef-decl name='FILE' type-id='ec1ed955' id='aa12d1ba'/>
+    <class-decl name='__locale_struct' size-in-bits='1856' is-struct='yes' visibility='default' id='90cc1ce3'>
+      <data-member access='public' layout-offset-in-bits='0'>
+        <var-decl name='__locales' type-id='02b72c00' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='832'>
+        <var-decl name='__ctype_b' type-id='31347b7a' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='896'>
+        <var-decl name='__ctype_tolower' type-id='6d60f45d' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='960'>
+        <var-decl name='__ctype_toupper' type-id='6d60f45d' visibility='default'/>
+      </data-member>
+      <data-member access='public' layout-offset-in-bits='1024'>
+        <var-decl name='__names' type-id='39e6f84a' visibility='default'/>
+      </data-member>
+    </class-decl>
+    <typedef-decl name='__locale_t' type-id='f01e1813' id='b7ac9b5f'/>
     <class-decl name='__sigset_t' size-in-bits='1024' is-struct='yes' naming-typedef-id='b9c97942' visibility='default' id='2616147f'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='__val' type-id='d2baa450' visibility='default'/>
@@ -2979,6 +3001,7 @@
       </data-member>
     </union-decl>
     <typedef-decl name='__sigval_t' type-id='a094b870' id='eabacd01'/>
+    <typedef-decl name='locale_t' type-id='b7ac9b5f' id='973a4f8d'/>
     <class-decl name='siginfo_t' size-in-bits='1024' is-struct='yes' naming-typedef-id='cb681f62' visibility='default' id='d8149419'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='si_signo' type-id='95e97e5e' visibility='default'/>
@@ -3214,9 +3237,13 @@
     <pointer-type-def type-id='bb4788fa' size-in-bits='64' id='cecf4ea7'/>
     <pointer-type-def type-id='010ae0b9' size-in-bits='64' id='e4c6fa61'/>
     <pointer-type-def type-id='79bd3751' size-in-bits='64' id='c65a1f29'/>
+    <pointer-type-def type-id='23de8b96' size-in-bits='64' id='38b51b3c'/>
+    <pointer-type-def type-id='90cc1ce3' size-in-bits='64' id='f01e1813'/>
     <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/>
     <qualified-type-def type-id='80f4b756' restrict='yes' id='9d26089a'/>
     <pointer-type-def type-id='80f4b756' size-in-bits='64' id='7d3cd834'/>
+    <qualified-type-def type-id='95e97e5e' const='yes' id='2448a865'/>
+    <pointer-type-def type-id='2448a865' size-in-bits='64' id='6d60f45d'/>
     <qualified-type-def type-id='aca3bac8' const='yes' id='2498fd78'/>
     <pointer-type-def type-id='2498fd78' size-in-bits='64' id='eed6c816'/>
     <qualified-type-def type-id='eed6c816' restrict='yes' id='a431a9da'/>
@@ -3249,6 +3276,7 @@
     <class-decl name='_IO_codecvt' is-struct='yes' visibility='default' is-declaration-only='yes' id='a4036571'/>
     <class-decl name='_IO_marker' is-struct='yes' visibility='default' is-declaration-only='yes' id='010ae0b9'/>
     <class-decl name='_IO_wide_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='79bd3751'/>
+    <class-decl name='__locale_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='23de8b96'/>
     <function-decl name='zpool_get_prop_int' mangled-name='zpool_get_prop_int' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_get_prop_int'>
       <parameter type-id='4c81de99'/>
       <parameter type-id='5d0c23fb'/>
@@ -3353,6 +3381,10 @@
     <function-decl name='dlerror' visibility='default' binding='global' size-in-bits='64'>
       <return type-id='26a90f95'/>
     </function-decl>
+    <function-decl name='uselocale' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='973a4f8d'/>
+      <return type-id='973a4f8d'/>
+    </function-decl>
     <function-decl name='PKCS5_PBKDF2_HMAC_SHA1' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='80f4b756'/>
       <parameter type-id='95e97e5e'/>
@@ -3436,8 +3468,9 @@
       <parameter type-id='80f4b756'/>
       <return type-id='26a90f95'/>
     </function-decl>
-    <function-decl name='strerror' visibility='default' binding='global' size-in-bits='64'>
+    <function-decl name='strerror_l' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='95e97e5e'/>
+      <parameter type-id='973a4f8d'/>
       <return type-id='26a90f95'/>
     </function-decl>
     <function-decl name='tcgetattr' visibility='default' binding='global' size-in-bits='64'>
@@ -3794,12 +3827,18 @@
     <qualified-type-def type-id='9c313c2d' const='yes' id='c3b7ba7d'/>
     <pointer-type-def type-id='c3b7ba7d' size-in-bits='64' id='713a56f5'/>
     <pointer-type-def type-id='01a1b934' size-in-bits='64' id='566b3f52'/>
+    <qualified-type-def type-id='566b3f52' restrict='yes' id='c878edd6'/>
+    <pointer-type-def type-id='566b3f52' size-in-bits='64' id='82d4e9e8'/>
+    <qualified-type-def type-id='82d4e9e8' restrict='yes' id='aa19c230'/>
     <pointer-type-def type-id='7e291ce6' size-in-bits='64' id='ca64ff60'/>
     <pointer-type-def type-id='9da381c4' size-in-bits='64' id='cb785ebf'/>
     <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
     <pointer-type-def type-id='8e0af06e' size-in-bits='64' id='053457bd'/>
     <pointer-type-def type-id='857bb57e' size-in-bits='64' id='75be733c'/>
     <pointer-type-def type-id='a63d15a3' size-in-bits='64' id='a195f4a3'/>
+    <qualified-type-def type-id='a195f4a3' restrict='yes' id='33518961'/>
+    <pointer-type-def type-id='a195f4a3' size-in-bits='64' id='e80ff3ab'/>
+    <qualified-type-def type-id='e80ff3ab' restrict='yes' id='8f2c7109'/>
     <pointer-type-def type-id='eae6431d' size-in-bits='64' id='0d41d328'/>
     <pointer-type-def type-id='7a6844eb' size-in-bits='64' id='18c91f9e'/>
     <pointer-type-def type-id='dddf6ca2' size-in-bits='64' id='d915a820'/>
@@ -4232,9 +4271,13 @@
       <parameter type-id='9d424d31'/>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='getgrnam' visibility='default' binding='global' size-in-bits='64'>
-      <parameter type-id='80f4b756'/>
-      <return type-id='566b3f52'/>
+    <function-decl name='getgrnam_r' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='c878edd6'/>
+      <parameter type-id='266fe297'/>
+      <parameter type-id='b59d7dce'/>
+      <parameter type-id='aa19c230'/>
+      <return type-id='95e97e5e'/>
     </function-decl>
     <function-decl name='hasmntopt' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='48bea5ec'/>
@@ -4258,9 +4301,13 @@
       <parameter type-id='18c91f9e'/>
       <return type-id='95e97e5e'/>
     </function-decl>
-    <function-decl name='getpwnam' visibility='default' binding='global' size-in-bits='64'>
-      <parameter type-id='80f4b756'/>
-      <return type-id='a195f4a3'/>
+    <function-decl name='getpwnam_r' visibility='default' binding='global' size-in-bits='64'>
+      <parameter type-id='9d26089a'/>
+      <parameter type-id='33518961'/>
+      <parameter type-id='266fe297'/>
+      <parameter type-id='b59d7dce'/>
+      <parameter type-id='8f2c7109'/>
+      <return type-id='95e97e5e'/>
     </function-decl>
     <function-decl name='strtol' visibility='default' binding='global' size-in-bits='64'>
       <parameter type-id='9d26089a'/>
@@ -6315,6 +6362,7 @@
     <function-decl name='zpool_add' mangled-name='zpool_add' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_add'>
       <parameter type-id='4c81de99' name='zhp'/>
       <parameter type-id='5ce45b60' name='nvroot'/>
+      <parameter type-id='c19b74c3' name='ashift_check'/>
       <return type-id='95e97e5e'/>
     </function-decl>
     <function-decl name='zpool_export' mangled-name='zpool_export' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_export'>
@@ -6778,7 +6826,7 @@
       <enumerator name='LZC_SEND_FLAG_RAW' value='8'/>
       <enumerator name='LZC_SEND_FLAG_SAVED' value='16'/>
     </enum-decl>
-    <class-decl name='ddt_key' size-in-bits='320' is-struct='yes' visibility='default' id='e0a4a1cb'>
+    <class-decl name='ddt_key_t' size-in-bits='320' is-struct='yes' naming-typedef-id='67f6d2cf' visibility='default' id='5fae1718'>
       <data-member access='public' layout-offset-in-bits='0'>
         <var-decl name='ddk_cksum' type-id='39730d0b' visibility='default'/>
       </data-member>
@@ -6786,7 +6834,7 @@
         <var-decl name='ddk_prop' type-id='9c313c2d' visibility='default'/>
       </data-member>
     </class-decl>
-    <typedef-decl name='ddt_key_t' type-id='e0a4a1cb' id='67f6d2cf'/>
+    <typedef-decl name='ddt_key_t' type-id='5fae1718' id='67f6d2cf'/>
     <enum-decl name='dmu_object_type' id='04b3b0b9'>
       <underlying-type type-id='9cac1fee'/>
       <enumerator name='DMU_OT_NONE' value='0'/>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 402c14a6baee..b42e93e3db5d 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  * Copyright (c) 2018 Datto Inc.
  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
@@ -1724,7 +1724,7 @@ zpool_discard_checkpoint(zpool_handle_t *zhp)
  * necessary verification to ensure that the vdev specification is well-formed.
  */
 int
-zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
+zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift)
 {
 	zfs_cmd_t zc = {"\0"};
 	int ret;
@@ -1756,6 +1756,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
 
 	zcmd_write_conf_nvlist(hdl, &zc, nvroot);
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_flags = check_ashift;
 
 	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
 		switch (errno) {
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index 8e70af2e5830..73ae0950ccb6 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2020 Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2020 The FreeBSD Foundation
@@ -319,6 +319,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
 		    "dataset without force"));
 	case EZFS_RAIDZ_EXPAND_IN_PROGRESS:
 		return (dgettext(TEXT_DOMAIN, "raidz expansion in progress"));
+	case EZFS_ASHIFT_MISMATCH:
+		return (dgettext(TEXT_DOMAIN, "adding devices with "
+		    "different physical sector sizes is not allowed"));
 	case EZFS_UNKNOWN:
 		return (dgettext(TEXT_DOMAIN, "unknown error"));
 	default:
@@ -768,6 +771,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
 	case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS:
 		zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap);
 		break;
+	case ZFS_ERR_ASHIFT_MISMATCH:
+		zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
+		break;
 	default:
 		zfs_error_aux(hdl, "%s", zfs_strerror(error));
 		zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8
index 8ccdcccc7b06..60b35f1a511a 100644
--- a/man/man8/zpool-add.8
+++ b/man/man8/zpool-add.8
@@ -24,8 +24,9 @@
 .\" Copyright (c) 2018 George Melikov. All Rights Reserved.
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\" Copyright (c) 2024 by Delphix. All Rights Reserved.
 .\"
-.Dd March 16, 2022
+.Dd March 8, 2024
 .Dt ZPOOL-ADD 8
 .Os
 .
@@ -36,6 +37,7 @@
 .Nm zpool
 .Cm add
 .Op Fl fgLnP
+.Op Fl -allow-in-use -allow-replication-mismatch -allow-ashift-mismatch
 .Oo Fl o Ar property Ns = Ns Ar value Oc
 .Ar pool vdev Ns …
 .
@@ -56,7 +58,8 @@ subcommand.
 .It Fl f
 Forces use of
 .Ar vdev Ns s ,
-even if they appear in use or specify a conflicting replication level.
+even if they appear in use, have conflicting ashift values, or specify
+a conflicting replication level.
 Not all devices can be overridden in this manner.
 .It Fl g
 Display
@@ -91,6 +94,17 @@ See the
 manual page for a list of valid properties that can be set.
 The only property supported at the moment is
 .Sy ashift .
+.It Fl -allow-ashift-mismatch
+Disable the ashift validation, allowing mismatched ashift values in the
+pool.
+Adding top-level
+.Ar vdev Ns s
+with different sector sizes will prohibit future device removal operations, see
+.Xr zpool-remove 8 .
+.It Fl -allow-in-use
+Allow vdevs to be added even if they might be in use in another pool.
+.It Fl -allow-replication-mismatch
+Allow vdevs with conflicting replication levels to be added to the pool.
 .El
 .
 .Sh EXAMPLES
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 30c528a53049..3704ffd08820 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -7083,7 +7083,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
  * Add a device to a storage pool.
  */
 int
-spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
 {
 	uint64_t txg, ndraid = 0;
 	int error;
@@ -7174,6 +7174,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 		}
 	}
 
+	if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
+		for (int c = 0; c < vd->vdev_children; c++) {
+			tvd = vd->vdev_child[c];
+			if (tvd->vdev_ashift != spa->spa_max_ashift) {
+				return (spa_vdev_exit(spa, vd, txg,
+				    ZFS_ERR_ASHIFT_MISMATCH));
+			}
+		}
+	}
+
 	for (int c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index b2b06881bdd4..dca15f4b826d 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -27,7 +27,7 @@
  * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
@@ -1886,7 +1886,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
 	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config);
 	if (error == 0) {
-		error = spa_vdev_add(spa, config);
+		error = spa_vdev_add(spa, config, zc->zc_flags);
 		nvlist_free(config);
 	}
 	spa_close(spa, FTAG);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 502b4de2bae9..d4c5a21828a1 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -372,7 +372,8 @@ tags = ['functional', 'cli_root', 'zpool']
 tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos',
     'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
     'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos',
-    'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output']
+    'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output',
+    'zpool_add--allow-ashift-mismatch']
 tags = ['functional', 'cli_root', 'zpool_add']
 
 [tests/functional/cli_root/zpool_attach]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index fe9c92108725..866ea5b9e7ec 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -988,6 +988,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_add/add_prop_ashift.ksh \
 	functional/cli_root/zpool_add/cleanup.ksh \
 	functional/cli_root/zpool_add/setup.ksh \
+	functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \
 	functional/cli_root/zpool_add/zpool_add_001_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_002_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_003_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh
index 7ecaf849e44b..51871934dd22 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh
@@ -22,7 +22,7 @@
 
 #
 # Copyright 2017, loli10K. All rights reserved.
-# Copyright (c) 2020 by Delphix. All rights reserved.
+# Copyright (c) 2020, 2024 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -60,12 +60,23 @@ log_must mkfile $SIZE $disk2
 logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT)
 orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
 max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT)
+opt=""
 
 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
 do
+	#
+	# Need to add the --allow-ashift-mismatch option to disable the
+	# ashift mismatch checks in zpool add.
+	#
+	if [[ $ashift -eq $orig_ashift ]]; then
+		opt=""
+	else
+		opt="--allow-ashift-mismatch"
+	fi
+
 	log_must zpool create $TESTPOOL $disk1
-	log_must zpool add -o ashift=$ashift $TESTPOOL $disk2
+	log_must zpool add $opt -o ashift=$ashift $TESTPOOL $disk2
 	log_must verify_ashift $disk2 $ashift
 
 	# clean things for the next run
@@ -78,7 +89,7 @@ do
 	#
 	log_must zpool create $TESTPOOL $disk1
 	log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT $ashift
-	log_must zpool add $TESTPOOL $disk2
+	log_must zpool add $opt $TESTPOOL $disk2
 	exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift ))
 	log_must verify_ashift $disk2 $exp
 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh
index 228f62232aae..6a3283d0618f 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh
@@ -22,7 +22,7 @@
 
 #
 # Copyright 2017, loli10K. All rights reserved.
-# Copyright (c) 2020 by Delphix. All rights reserved.
+# Copyright (c) 2020, 2024 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
@@ -68,8 +68,13 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16
 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
 do
+	if [ $ashift -eq $orig_ashift ];then
+		opt=""
+	else
+		opt="--allow-ashift-mismatch"
+	fi
 	log_must zpool create -o ashift=$ashift $TESTPOOL $disk1
-	log_must zpool add $TESTPOOL $disk2
+	log_must zpool add $opt $TESTPOOL $disk2
 	log_must verify_ashift $disk2 $ashift
 
 	# clean things for the next run
@@ -82,8 +87,13 @@ for ashift in ${ashifts[@]}
 do
 	for cmdval in ${ashifts[@]}
 	do
+		if [ $ashift -eq $cmdval ];then
+			opt=""
+		else
+			opt="--allow-ashift-mismatch"
+		fi
 		log_must zpool create -o ashift=$ashift $TESTPOOL $disk1
-		log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2
+		log_must zpool add $opt -o ashift=$cmdval $TESTPOOL $disk2
 		log_must verify_ashift $disk2 $cmdval
 
 		# clean things for the next run
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh
new file mode 100755
index 000000000000..e69de29bb2d1
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh
index c5c06f76340b..afee34a33469 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh
@@ -65,4 +65,15 @@ log_mustnot vdevs_in_pool $TESTPOOL $DISK2
 log_must zpool add -f $TESTPOOL $DISK2
 log_must vdevs_in_pool $TESTPOOL $DISK2
 
+log_must zpool destroy $TESTPOOL
+
+create_pool $TESTPOOL mirror $DISK0 $DISK1
+log_must poolexists $TESTPOOL
+
+log_mustnot zpool add $TESTPOOL $DISK2
+log_mustnot vdevs_in_pool $TESTPOOL $DISK2
+
+log_must zpool add --allow-replication-mismatch $TESTPOOL $DISK2
+log_must vdevs_in_pool $TESTPOOL $DISK2
+
 log_pass "'zpool add -f <pool> <vdev> ...' executes successfully."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh
index 646edc1a4557..cecda56ab125 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh
@@ -70,7 +70,7 @@ if is_freebsd; then
 	recursive=$(get_tunable VOL_RECURSIVE)
 	log_must set_tunable64 VOL_RECURSIVE 1
 fi
-log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL
+log_must zpool add --allow-ashift-mismatch $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL
 
 log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL"
 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh
index 4990ef9d29b0..0e9d9f5f030f 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh
@@ -75,7 +75,9 @@ log_must poolexists $TESTPOOL1
 
 unset NOINUSE_CHECK
 log_mustnot zpool add -f $TESTPOOL $DISK1
+log_mustnot zpool add --allow-in-use $TESTPOOL $DISK1
 log_mustnot zpool add -f $TESTPOOL $mnttab_dev
+log_mustnot zpool add --allow-in-use $TESTPOOL $mnttab_dev
 if is_linux; then
        log_mustnot zpool add $TESTPOOL $vfstab_dev
 else
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh
index d7f3a900e8fd..a13a27160e76 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh
@@ -64,7 +64,9 @@ log_mustnot zpool add -f $TESTPOOL $DISK0
 for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache"
 do
 	log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1
+	log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1
 	log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1
+	log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1
 done
 
 log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh
index b8b25db1b9f9..22860e9caf1d 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh
@@ -138,7 +138,7 @@ function zpool_create_forced_add
 		while ((j < ${#add_args[@]})); do
 			log_must zpool create $TESTPOOL1 ${create_args[$i]}
 			log_mustnot zpool add $TESTPOOL1 ${add_args[$j]}
-			log_must zpool add -f $TESTPOOL1 ${add_args[$j]}
+			log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]}
 			log_must zpool destroy -f $TESTPOOL1
 
 			((j += 1))

From c0aab8b8f91f5ecb2c625a8fa7265f26c260e10a Mon Sep 17 00:00:00 2001
From: Fabian-Gruenbichler <f.gruenbichler@proxmox.com>
Date: Fri, 29 Mar 2024 22:37:40 +0100
Subject: [PATCH 030/116] zvols: prevent overflow of minor device numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

currently, the linux kernel allows 2^20 minor devices per major device
number.  ZFS reserves blocks of 2^4 minors per zvol: 1 for the zvol
itself, the other 15 for the first partitions of that zvol. as a result,
only 2^16 such blocks are available for use.

there are no checks in place to avoid overflowing into the major device
number when more than 2^16 zvols are allocated (with volmode=dev or
default). instead of ignoring this limit, which comes with all sorts of
weird knock-on effects, detect this situation and simply fail allocating
the zvol block device early on.

without this safeguard, the kernel will reject the attempt to create an
already existing block device, but ZFS doesn't handle this error and
gets confused about which zvol occupies which minor slot, potentially
resulting in kernel NULL derefs and other issues later on.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Closes #16006
---
 module/os/linux/zfs/zvol_os.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 8d5d1f06fce9..26cc63d426eb 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1314,6 +1314,13 @@ zvol_os_create_minor(const char *name)
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
+	if (MINOR(minor) != minor) {
+		/* too many zvols can overflow the minor number */
+		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
+		    name, minor, MINOR(minor));
+		ida_simple_remove(&zvol_ida, idx);
+		return (SET_ERROR(EINVAL));
+	}
 
 	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
 	if (zv) {

From cfb96c772b8448dca6eaeb66a540b8bb39c9908c Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Sat, 30 Mar 2024 08:51:33 +1100
Subject: [PATCH 031/116] vdev_disk: clean up spa/bdev mode conversion

43e8f6e37 introduced a subtle API misuse, in that it passed the output
from vdev_bdev_mode() back into itself. Fortunately, the
SPA_MODE_(READ|WRITE) bit values exactly map to the FMODE_(READ|WRITE) &
BLK_OPEN_(READ|WRITE) bit values, so it didn't result in a bug, but it
was hard to read and understand, so I cleaned it up.

In doing so, I noticed that the only call to vdev_bdev_mode() without
the "exclusive" flag set was in that misuse, and actually, we never do a
non-exclusive blkdev_get_by_path(). So I've just made exclusive be
always-on.


Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #15995
---
 module/os/linux/zfs/vdev_disk.c | 81 ++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 42 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 36468fc21132..ac8fe6cb1bf9 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -97,38 +97,41 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
 
 static unsigned int zfs_vdev_failfast_mask = 1;
 
+/*
+ * Convert SPA mode flags into bdev open mode flags.
+ */
 #ifdef HAVE_BLK_MODE_T
-static blk_mode_t
+typedef blk_mode_t vdev_bdev_mode_t;
+#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
+#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
+#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
+#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
 #else
-static fmode_t
+typedef fmode_t vdev_bdev_mode_t;
+#define	VDEV_BDEV_MODE_READ	FMODE_READ
+#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
+#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
+#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
 #endif
-vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
-{
-#ifdef HAVE_BLK_MODE_T
-	blk_mode_t mode = 0;
-
-	if (spa_mode & SPA_MODE_READ)
-		mode |= BLK_OPEN_READ;
 
-	if (spa_mode & SPA_MODE_WRITE)
-		mode |= BLK_OPEN_WRITE;
+static vdev_bdev_mode_t
+vdev_bdev_mode(spa_mode_t smode)
+{
+	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
+	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));
 
-	if (exclusive)
-		mode |= BLK_OPEN_EXCL;
-#else
-	fmode_t mode = 0;
+	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;
 
-	if (spa_mode & SPA_MODE_READ)
-		mode |= FMODE_READ;
+	if (smode & SPA_MODE_READ)
+		bmode |= VDEV_BDEV_MODE_READ;
 
-	if (spa_mode & SPA_MODE_WRITE)
-		mode |= FMODE_WRITE;
+	if (smode & SPA_MODE_WRITE)
+		bmode |= VDEV_BDEV_MODE_WRITE;
 
-	if (exclusive)
-		mode |= FMODE_EXCL;
-#endif
+	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
+	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);
 
-	return (mode);
+	return (bmode);
 }
 
 /*
@@ -235,30 +238,28 @@ vdev_disk_kobj_evt_post(vdev_t *v)
 }
 
 static zfs_bdev_handle_t *
-vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder)
+vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
 {
+	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
+
 #if defined(HAVE_BDEV_OPEN_BY_PATH)
-	return (bdev_open_by_path(path,
-	    vdev_bdev_mode(mode, B_TRUE), holder, NULL));
+	return (bdev_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
-	return (blkdev_get_by_path(path,
-	    vdev_bdev_mode(mode, B_TRUE), holder, NULL));
+	return (blkdev_get_by_path(path, bmode, holder, NULL));
 #else
-	return (blkdev_get_by_path(path,
-	    vdev_bdev_mode(mode, B_TRUE), holder));
+	return (blkdev_get_by_path(path, bmode, holder));
 #endif
 }
 
 static void
-vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder)
+vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
 {
 #if defined(HAVE_BDEV_RELEASE)
 	return (bdev_release(bdh));
 #elif defined(HAVE_BLKDEV_PUT_HOLDER)
 	return (blkdev_put(BDH_BDEV(bdh), holder));
 #else
-	return (blkdev_put(BDH_BDEV(bdh),
-	    vdev_bdev_mode(mode, B_TRUE)));
+	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
 #endif
 }
 
@@ -267,11 +268,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	zfs_bdev_handle_t *bdh;
-#ifdef HAVE_BLK_MODE_T
-	blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
-#else
-	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
-#endif
+	spa_mode_t smode = spa_mode(v->vdev_spa);
 	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
 	vdev_disk_t *vd;
 
@@ -322,16 +319,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 					reread_part = B_TRUE;
 			}
 
-			vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
+			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 		}
 
 		if (reread_part) {
-			bdh = vdev_blkdev_get_by_path(disk_name, mode,
+			bdh = vdev_blkdev_get_by_path(disk_name, smode,
 			    zfs_vdev_holder);
 			if (!BDH_IS_ERR(bdh)) {
 				int error =
 				    vdev_bdev_reread_part(BDH_BDEV(bdh));
-				vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
+				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 				if (error == 0) {
 					timeout = MSEC2NSEC(
 					    zfs_vdev_open_timeout_ms * 2);
@@ -376,7 +373,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	hrtime_t start = gethrtime();
 	bdh = BDH_ERR_PTR(-ENXIO);
 	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
-		bdh = vdev_blkdev_get_by_path(v->vdev_path, mode,
+		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
 		    zfs_vdev_holder);
 		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
 			/*

From 2553f94c4299aaf31c5ceea4bfbfcc811cf76513 Mon Sep 17 00:00:00 2001
From: Robert Evans <evansr@google.com>
Date: Fri, 29 Mar 2024 17:59:23 -0400
Subject: [PATCH 032/116] Fix buffer underflow if sysfs file is empty

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Jason Lee <jasonlee@lanl.gov>
Signed-off-by: Robert Evans <evansr@google.com>
Closes #16028
Closes #16035
---
 cmd/zpool/os/linux/zpool_vdev_os.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c
index 006a3a7d8e01..80627b58211c 100644
--- a/cmd/zpool/os/linux/zpool_vdev_os.c
+++ b/cmd/zpool/os/linux/zpool_vdev_os.c
@@ -458,7 +458,7 @@ static char *zpool_sysfs_gets(char *path)
 	}
 
 	/* Remove trailing newline */
-	if (buf[count - 1] == '\n')
+	if (count > 0 && buf[count - 1] == '\n')
 		buf[count - 1] = 0;
 
 	close(fd);

From 39be46f43f96fb7420386d03751b01f5cb376d6b Mon Sep 17 00:00:00 2001
From: Robert Evans <evansr@google.com>
Date: Fri, 29 Mar 2024 20:11:52 -0400
Subject: [PATCH 033/116] Linux 5.18+ compat: Detect filemap_range_has_page

In v5.18 `filemap_range_has_page` moved to `pagemap.h`

`pagemap.h` has been around since 3.10 so just include both

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Robert Evans <evansr@google.com>
Closes #16034
---
 config/kernel-filemap.m4 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4
index 745928168f92..0b7da828d299 100644
--- a/config/kernel-filemap.m4
+++ b/config/kernel-filemap.m4
@@ -4,6 +4,7 @@ dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [
 	ZFS_LINUX_TEST_SRC([filemap_range_has_page], [
 		#include <linux/fs.h>
+		#include <linux/pagemap.h>
 	],[
 		struct address_space *mapping = NULL;
 		loff_t lstart = 0;

From b12738182cff269456e7737241415356c08b5d2e Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 3 Apr 2024 18:04:26 -0400
Subject: [PATCH 034/116] Improve dbuf_read() error reporting

Previous code reported non-ZIO errors only via return value, but
not via parent ZIO.  It could cause NULL-dereference panics due
to dmu_buf_hold_array_by_dnode() ignoring the return value,
relying solely on parent ZIO status.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reported by:	Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16042
---
 module/zfs/dbuf.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 4e190c131e1d..0ab143bd089f 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1557,17 +1557,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
  * returning.
  */
 static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
     db_lock_type_t dblt, const void *tag)
 {
-	dnode_t *dn;
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;
 	blkptr_t bp, *bpp;
 
-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@@ -1643,8 +1640,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	if (err != 0)
 		goto early_unlock;
 
-	DB_DNODE_EXIT(db);
-
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);
@@ -1669,12 +1664,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	 * parent's rwlock, which would be a lock ordering violation.
 	 */
 	dmu_buf_unlock_parent(db, dblt, tag);
-	(void) arc_read(zio, db->db_objset->os_spa, bpp,
+	return (arc_read(zio, db->db_objset->os_spa, bpp,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
-	    &aflags, &zb);
-	return (err);
+	    &aflags, &zb));
+
 early_unlock:
-	DB_DNODE_EXIT(db);
 	mutex_exit(&db->db_mtx);
 	dmu_buf_unlock_parent(db, dblt, tag);
 	return (err);
@@ -1759,7 +1753,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 }
 
 int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t prefetch;
@@ -1775,7 +1769,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	dn = DB_DNODE(db);
 
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
+	    (flags & DB_RF_NOPREFETCH) == 0;
 
 	mutex_enter(&db->db_mtx);
 	if (flags & DB_RF_PARTIAL_FIRST)
@@ -1822,13 +1816,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
-		if (zio == NULL && (db->db_state == DB_NOFILL ||
+		if (pio == NULL && (db->db_state == DB_NOFILL ||
 		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
-			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+			pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			need_wait = B_TRUE;
 		}
-		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+		err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
 		/*
 		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
 		 * for us
@@ -1849,9 +1843,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 */
 		if (need_wait) {
 			if (err == 0)
-				err = zio_wait(zio);
+				err = zio_wait(pio);
 			else
-				VERIFY0(zio_wait(zio));
+				(void) zio_wait(pio);
+			pio = NULL;
 		}
 	} else {
 		/*
@@ -1878,7 +1873,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-				    db, zio_t *, zio);
+				    db, zio_t *, pio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
@@ -1887,6 +1882,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		}
 	}
 
+	if (pio && err != 0) {
+		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		zio->io_error = err;
+		zio_nowait(zio);
+	}
+
 	return (err);
 }
 

From a9a4290173dfdfd25aabd623bc3ccd994126794a Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Thu, 4 Apr 2024 09:13:27 +1100
Subject: [PATCH 035/116] xdr: header cleanup

#16047 notes that include/os/freebsd/spl/rpc/xdr.h carried an
(apparently) incompatible license. While looking into it, it seems that
this file is actually unnecessary these days - FreeBSD's kernel XDR has
XDR_CONTROL, xdrmem_control and XDR_GET_BYTES_AVAIL, while userspace has
XDR_CONTROL and xdrmem_control, and our implementation of
XDR_GET_BYTES_AVAIL for libspl works nicely with it. So this removes
that file outright.

To keep the includes in nvpair.c tidy, I've made a few small adjustments
to the Linux headers. By definition, rpc/types.h provides bool_t and is
included before rpc/xdr.h, so I've created rpc/types.h for Linux. This
isn't necessary for userspace; both FreeBSD native and tirpc on Linux
already have these headers set up correctly.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16047
Closes #16051
---
 include/os/freebsd/Makefile.am   |  2 -
 include/os/freebsd/spl/rpc/xdr.h | 71 --------------------------------
 include/os/linux/Makefile.am     |  1 +
 include/os/linux/spl/rpc/types.h | 30 ++++++++++++++
 include/os/linux/spl/rpc/xdr.h   |  2 -
 module/nvpair/nvpair.c           |  1 +
 module/os/linux/spl/spl-xdr.c    |  1 +
 7 files changed, 33 insertions(+), 75 deletions(-)
 delete mode 100644 include/os/freebsd/spl/rpc/xdr.h
 create mode 100644 include/os/linux/spl/rpc/types.h

diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am
index 551f75f42a20..d4103c2f062a 100644
--- a/include/os/freebsd/Makefile.am
+++ b/include/os/freebsd/Makefile.am
@@ -4,8 +4,6 @@ noinst_HEADERS = \
 	\
 	%D%/spl/acl/acl_common.h \
 	\
-	%D%/spl/rpc/xdr.h \
-	\
 	%D%/spl/sys/ia32/asm_linkage.h \
 	\
 	%D%/spl/sys/acl.h \
diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h
deleted file mode 100644
index c98466e9d16a..000000000000
--- a/include/os/freebsd/spl/rpc/xdr.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Sun RPC is a product of Sun Microsystems, Inc. and is provided for
- * unrestricted use provided that this legend is included on all tape
- * media and as a part of the software program in whole or part.  Users
- * may copy or modify Sun RPC without charge, but are not authorized
- * to license or distribute it to anyone else except as part of a product or
- * program developed by the user.
- *
- * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE
- * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE.
- *
- * Sun RPC is provided with no support and without any obligation on the
- * part of Sun Microsystems, Inc. to assist in its use, correction,
- * modification or enhancement.
- *
- * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE
- * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC
- * OR ANY PART THEREOF.
- *
- * In no event will Sun Microsystems, Inc. be liable for any lost revenue
- * or profits or other special, indirect and consequential damages, even if
- * Sun has been advised of the possibility of such damages.
- *
- * Sun Microsystems, Inc.
- * 2550 Garcia Avenue
- * Mountain View, California  94043
- */
-
-#ifndef	_OPENSOLARIS_RPC_XDR_H_
-#define	_OPENSOLARIS_RPC_XDR_H_
-
-#include <rpc/types.h>
-#include_next <rpc/xdr.h>
-
-#if !defined(_KERNEL) && !defined(_STANDALONE)
-
-#include <assert.h>
-
-/*
- * Taken from sys/xdr/xdr_mem.c.
- *
- * FreeBSD's userland XDR doesn't implement control method (only the kernel),
- * but OpenSolaris nvpair still depend on it, so we have to implement it here.
- */
-static __inline bool_t
-xdrmem_control(XDR *xdrs, int request, void *info)
-{
-	xdr_bytesrec *xptr;
-
-	switch (request) {
-	case XDR_GET_BYTES_AVAIL:
-		xptr = (xdr_bytesrec *)info;
-		xptr->xc_is_last_record = TRUE;
-		xptr->xc_num_avail = xdrs->x_handy;
-		return (TRUE);
-	default:
-		assert(!"unexpected request");
-	}
-	return (FALSE);
-}
-
-#undef XDR_CONTROL
-#define	XDR_CONTROL(xdrs, req, op)					\
-	(((xdrs)->x_ops->x_control == NULL) ?				\
-	    xdrmem_control((xdrs), (req), (op)) :			\
-	    (*(xdrs)->x_ops->x_control)(xdrs, req, op))
-
-#endif	/* !_KERNEL && !_STANDALONE */
-
-#endif	/* !_OPENSOLARIS_RPC_XDR_H_ */
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
index 51c27132b4ef..332569efe361 100644
--- a/include/os/linux/Makefile.am
+++ b/include/os/linux/Makefile.am
@@ -47,6 +47,7 @@ kernel_sys_HEADERS = \
 
 kernel_spl_rpcdir = $(kerneldir)/spl/rpc
 kernel_spl_rpc_HEADERS = \
+	%D%/spl/rpc/types.h \
 	%D%/spl/rpc/xdr.h
 
 kernel_spl_sysdir = $(kerneldir)/spl/sys
diff --git a/include/os/linux/spl/rpc/types.h b/include/os/linux/spl/rpc/types.h
new file mode 100644
index 000000000000..5bbb4f2dec46
--- /dev/null
+++ b/include/os/linux/spl/rpc/types.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2008 Sun Microsystems, Inc.
+ *  Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RPC_TYPES_H
+#define	_SPL_RPC_TYPES_H
+
+#include <sys/types.h>
+
+/* Just enough to support rpc/xdr.h */
+
+typedef int bool_t;
+
+#endif /* _SPL_RPC_TYPES_H */
diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h
index b00f3542fcdf..5b621fa9c863 100644
--- a/include/os/linux/spl/rpc/xdr.h
+++ b/include/os/linux/spl/rpc/xdr.h
@@ -23,8 +23,6 @@
 
 #include <sys/types.h>
 
-typedef int bool_t;
-
 /*
  * XDR enums and types.
  */
diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c
index d9449e47e87a..887f7d32df4a 100644
--- a/module/nvpair/nvpair.c
+++ b/module/nvpair/nvpair.c
@@ -41,6 +41,7 @@
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/string.h>
+#include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <sys/mod.h>
 
diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c
index 6b77524181db..e1773da5d173 100644
--- a/module/os/linux/spl/spl-xdr.c
+++ b/module/os/linux/spl/spl-xdr.c
@@ -25,6 +25,7 @@
 #include <sys/debug.h>
 #include <sys/types.h>
 #include <sys/sysmacros.h>
+#include <rpc/types.h>
 #include <rpc/xdr.h>
 
 /*

From 917ff75e9510d19968ef3cc5c80b1cd0ef48f84d Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Thu, 4 Apr 2024 09:17:07 +1100
Subject: [PATCH 036/116] vdev_disk: don't touch vbio after its handed off to
 the kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After IO is unplugged, it may complete immediately and vbio_completion
be called on interrupt context. That may interrupt or deschedule our
task. If it's the last bio, the vbio will be freed. Then, we get
rescheduled, and try to write to freed memory through vbio->.

This patch just removes the cleanup, and the corresponding assert.
These were leftovers from a previous iteration of vbio_submit() and were
always "belt and suspenders" ops anyway, never strictly required.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc
Reported-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Laurențiu Nicola <lnicola@dend.ro>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16045
Closes #16050
Closes #16049
---
 module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index ac8fe6cb1bf9..df5fa067797a 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -755,8 +755,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
-	ASSERT(vbio->vbio_bdev);
-
 	/*
 	 * We plug so we can submit the BIOs as we go and only unplug them when
 	 * they are fully created and submitted. This is important; if we don't
@@ -774,12 +772,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 	vbio->vbio_bio->bi_end_io = vbio_completion;
 	vbio->vbio_bio->bi_private = vbio;
 
+	/*
+	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
+	 * can't touch it again. The bio may complete and vbio_completion() be
+	 * called and free the vbio before this task is run again, so we must
+	 * consider it invalid from this point.
+	 */
 	vdev_submit_bio(vbio->vbio_bio);
 
 	blk_finish_plug(&plug);
-
-	vbio->vbio_bio = NULL;
-	vbio->vbio_bdev = NULL;
 }
 
 /* IO completion callback */

From e3120f73d0481a5e0779e45da83518373d60dcff Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Wed, 27 Mar 2024 10:07:50 +1100
Subject: [PATCH 037/116] Linux 6.9 compat: bdev handles are now struct file

bdev_open_by_path() is replaced by bdev_file_open_by_path(), which
returns a plain old struct file*. Release function is gone entirely; the
regular file release function fput() will take care of the bdev
specifics.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16027
Closes #16033
---
 config/kernel-blkdev.m4         | 43 +++++++++++++++++++++++++++++++--
 module/os/linux/zfs/vdev_disk.c | 24 ++++++++++++++----
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index c5a353ca9203..7b0e830e600f 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -54,6 +54,26 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [
 	])
 ])
 
+dnl #
+dnl # 6.9.x API change
+dnl # bdev_file_open_by_path() replaced bdev_open_by_path(),
+dnl # and returns struct file*
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [
+	ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [
+		#include <linux/fs.h>
+		#include <linux/blkdev.h>
+	], [
+		struct file *file __attribute__ ((unused)) = NULL;
+		const char *path = "path";
+		fmode_t mode = 0;
+		void *holder = NULL;
+		struct blk_holder_ops h;
+
+		file = bdev_file_open_by_path(path, mode, holder, &h);
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [
 	AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args])
 	ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [
@@ -73,7 +93,16 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [
 					[bdev_open_by_path() exists])
 				AC_MSG_RESULT(yes)
 			], [
-				ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()])
+				AC_MSG_RESULT(no)
+				AC_MSG_CHECKING([whether bdev_file_open_by_path() exists])
+				ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [
+					AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1,
+						[bdev_file_open_by_path() exists])
+					AC_MSG_RESULT(yes)
+				], [
+					AC_MSG_RESULT(no)
+					ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()])
+				])
 			])
 		])
 	])
@@ -149,10 +178,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [
 	])
 ])
 
+dnl #
+dnl # 6.9.x API change
+dnl #
+dnl # bdev_release() now private, but because bdev_file_open_by_path() returns
+dnl # struct file*, we can just use fput(). So the blkdev_put test no longer
+dnl # fails if not found.
+dnl #
+
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [
 	AC_MSG_CHECKING([whether blkdev_put() exists])
 	ZFS_LINUX_TEST_RESULT([blkdev_put], [
 		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists])
 	], [
 		AC_MSG_RESULT(no)
 		AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2])
@@ -168,7 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [
 				AC_DEFINE(HAVE_BDEV_RELEASE, 1,
 					[bdev_release() exists])
 			], [
-				ZFS_LINUX_TEST_ERROR([blkdev_put()])
+				AC_MSG_RESULT(no)
 			])
 		])
 	])
@@ -645,6 +683,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH
+	ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index df5fa067797a..a710bb91004e 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -45,15 +45,25 @@
 /*
  * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
  * block_device. Since it carries the block_device inside, its convenient to
- * just use the handle as a proxy. For pre-6.8, we just emulate this with
- * a cast, since we don't need any of the other fields inside the handle.
+ * just use the handle as a proxy.
+ *
+ * Linux 6.9.x uses a file for the same purpose.
+ *
+ * For pre-6.8, we just emulate this with a cast, since we don't need any of
+ * the other fields inside the handle.
  */
-#ifdef HAVE_BDEV_OPEN_BY_PATH
+#if defined(HAVE_BDEV_OPEN_BY_PATH)
 typedef struct bdev_handle zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((bdh)->bdev)
 #define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
+#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+typedef struct file zfs_bdev_handle_t;
+#define	BDH_BDEV(bdh)		(file_bdev(bdh))
+#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
+#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
+#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #else
 typedef void zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((struct block_device *)bdh)
@@ -242,7 +252,9 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
 {
 	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
 
-#if defined(HAVE_BDEV_OPEN_BY_PATH)
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+	return (bdev_file_open_by_path(path, bmode, holder, NULL));
+#elif defined(HAVE_BDEV_OPEN_BY_PATH)
 	return (bdev_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
 	return (blkdev_get_by_path(path, bmode, holder, NULL));
@@ -258,8 +270,10 @@ vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
 	return (bdev_release(bdh));
 #elif defined(HAVE_BLKDEV_PUT_HOLDER)
 	return (blkdev_put(BDH_BDEV(bdh), holder));
-#else
+#elif defined(HAVE_BLKDEV_PUT)
 	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
+#else
+	fput(bdh);
 #endif
 }
 

From 6097a7ba8b49c9b263809d2d1092fdca86182bd3 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Wed, 27 Mar 2024 11:24:57 +1100
Subject: [PATCH 038/116] Linux 6.9 compat: blk_alloc_disk() now takes two args

There's an extra nullable arg for queue limits. Detect it, and set it to
NULL. Similar change for blk_mq_alloc_disk(), now three args, same
treatment.

Error return now has error encoded in the return, so detect with
IS_ERR() and explicitly NULL our own return.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16027
Closes #16033
---
 config/kernel-make-request-fn.m4 | 33 ++++++++++++++++++++++++++++++++
 module/os/linux/zfs/zvol_os.c    | 23 +++++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4
index 4d20dd45c4a1..9813ad2fb3f3 100644
--- a/config/kernel-make-request-fn.m4
+++ b/config/kernel-make-request-fn.m4
@@ -50,6 +50,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 		disk = blk_alloc_disk(NUMA_NO_NODE);
 	])
 
+	ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [
+		#include <linux/blkdev.h>
+	],[
+		struct queue_limits *lim = NULL;
+		struct gendisk *disk  __attribute__ ((unused));
+		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
+	])
+
 	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
 		#include <linux/blkdev.h>
 	],[
@@ -96,6 +104,31 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
 		], [
 			AC_MSG_RESULT(no)
 		])
+
+		dnl #
+		dnl # Linux 6.9 API Change:
+		dnl # blk_alloc_queue() takes a nullable queue_limits arg.
+		dnl #
+		AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args])
+		ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
+
+			dnl #
+			dnl # 5.20 API change,
+			dnl # Removed blk_cleanup_disk(), put_disk() should be used.
+			dnl #
+			AC_MSG_CHECKING([whether blk_cleanup_disk() exists])
+			ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1,
+				    [blk_cleanup_disk() exists])
+			], [
+				AC_MSG_RESULT(no)
+			])
+		], [
+			AC_MSG_RESULT(no)
+		])
 	],[
 		AC_MSG_RESULT(no)
 
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 26cc63d426eb..d815cb2ad2c4 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1053,6 +1053,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
 	if (zso->zvo_disk == NULL)
 		return (1);
 
+	zso->zvo_disk->minors = ZVOL_MINORS;
+	zso->zvo_queue = zso->zvo_disk->queue;
+#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
+	struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+	if (IS_ERR(disk)) {
+		zso->zvo_disk = NULL;
+		return (1);
+	}
+
+	zso->zvo_disk = disk;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 #else
@@ -1103,6 +1113,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
 	}
 	zso->zvo_queue = zso->zvo_disk->queue;
 	zso->zvo_disk->minors = ZVOL_MINORS;
+#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
+	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
+	if (IS_ERR(disk)) {
+		zso->zvo_disk = NULL;
+		blk_mq_free_tag_set(&zso->tag_set);
+		return (1);
+	}
+
+	zso->zvo_disk = disk;
+	zso->zvo_queue = zso->zvo_disk->queue;
+	zso->zvo_disk->minors = ZVOL_MINORS;
 #else
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
@@ -1256,7 +1277,7 @@ zvol_os_free(zvol_state_t *zv)
 
 	del_gendisk(zv->zv_zso->zvo_disk);
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
-	defined(HAVE_BLK_ALLOC_DISK)
+	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
 #if defined(HAVE_BLK_CLEANUP_DISK)
 	blk_cleanup_disk(zv->zv_zso->zvo_disk);
 #else

From ca678bc0bc8347610d34c39a5a2322be45b11093 Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Thu, 4 Apr 2024 09:49:22 +1100
Subject: [PATCH 039/116] Makefile.bsd: sort and cleanup source file list

All files now in their correct sections, and all sections match on-disk
dir layout, and all sorted.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #15943
---
 module/Makefile.bsd | 85 ++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 40 deletions(-)

diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index e9ad69fc50a2..d9d31564d090 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -82,12 +82,9 @@ CFLAGS+= -DBITS_PER_LONG=64
 
 SRCS=	vnode_if.h device_if.h bus_if.h
 
-# avl
+#avl
 SRCS+=	avl.c
 
-# icp
-SRCS+=	edonr.c
-
 #icp/algs/blake3
 SRCS+=	blake3.c \
 	blake3_generic.c \
@@ -107,9 +104,12 @@ SRCS+=	blake3_avx2.S \
 	blake3_sse2.S \
 	blake3_sse41.S
 
+#icp/algs/edonr
+SRCS+=	edonr.c
+
 #icp/algs/sha2
-SRCS+=	sha2_generic.c \
-	sha256_impl.c \
+SRCS+=	sha256_impl.c \
+	sha2_generic.c \
 	sha512_impl.c
 
 #icp/asm-arm/sha2
@@ -122,8 +122,8 @@ SRCS+=	sha256-armv8.S \
 
 #icp/asm-ppc64/sha2
 SRCS+=	sha256-p8.S \
-	sha512-p8.S \
 	sha256-ppc.S \
+	sha512-p8.S \
 	sha512-ppc.S
 
 #icp/asm-x86_64/sha2
@@ -157,10 +157,10 @@ SRCS+=	lapi.c \
 	lzio.c
 
 #nvpair
-SRCS+=	nvpair.c \
-	fnvpair.c \
-	nvpair_alloc_spl.c \
-	nvpair_alloc_fixed.c
+SRCS+=	fnvpair.c \
+	nvpair.c \
+	nvpair_alloc_fixed.c \
+	nvpair_alloc_spl.c
 
 #os/freebsd/spl
 SRCS+=	acl_common.c \
@@ -184,7 +184,6 @@ SRCS+=	acl_common.c \
 	spl_zlib.c \
 	spl_zone.c
 
-
 .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
 	${MACHINE_ARCH} == "powerpcspe" || ${MACHINE_ARCH} == "arm"
 SRCS+= spl_atomic.c
@@ -207,6 +206,7 @@ SRCS+=	abd_os.c \
 	zfs_ctldir.c \
 	zfs_debug.c \
 	zfs_dir.c \
+	zfs_file_os.c \
 	zfs_ioctl_compat.c \
 	zfs_ioctl_os.c \
 	zfs_racct.c \
@@ -217,19 +217,20 @@ SRCS+=	abd_os.c \
 	zvol_os.c
 
 #unicode
-SRCS+=	uconv.c \
-	u8_textprep.c
+SRCS+= 	u8_textprep.c \
+	uconv.c
 
 #zcommon
-SRCS+=	zfeature_common.c \
+SRCS+=	cityhash.c \
+	zfeature_common.c \
 	zfs_comutil.c \
 	zfs_deleg.c \
-	zfs_fletcher.c \
 	zfs_fletcher_avx512.c \
+	zfs_fletcher.c \
 	zfs_fletcher_intel.c \
 	zfs_fletcher_sse.c \
-	zfs_fletcher_superscalar.c \
 	zfs_fletcher_superscalar4.c \
+	zfs_fletcher_superscalar.c \
 	zfs_namecheck.c \
 	zfs_prop.c \
 	zpool_prop.c \
@@ -243,14 +244,13 @@ SRCS+=	abd.c \
 	blkptr.c \
 	bplist.c \
 	bpobj.c \
+	bptree.c \
+	bqueue.c \
 	brt.c \
 	btree.c \
-	cityhash.c \
+	dataset_kstats.c \
 	dbuf.c \
 	dbuf_stats.c \
-	bptree.c \
-	bqueue.c \
-	dataset_kstats.c \
 	ddt.c \
 	ddt_stats.c \
 	ddt_zap.c \
@@ -266,13 +266,13 @@ SRCS+=	abd.c \
 	dmu_zfetch.c \
 	dnode.c \
 	dnode_sync.c \
+	dsl_bookmark.c \
+	dsl_crypt.c \
 	dsl_dataset.c \
 	dsl_deadlist.c \
 	dsl_deleg.c \
-	dsl_bookmark.c \
-	dsl_dir.c \
-	dsl_crypt.c \
 	dsl_destroy.c \
+	dsl_dir.c \
 	dsl_pool.c \
 	dsl_prop.c \
 	dsl_scan.c \
@@ -281,9 +281,9 @@ SRCS+=	abd.c \
 	edonr_zfs.c \
 	fm.c \
 	gzip.c \
-	lzjb.c \
 	lz4.c \
 	lz4_zfs.c \
+	lzjb.c \
 	metaslab.c \
 	mmp.c \
 	multilist.c \
@@ -296,6 +296,8 @@ SRCS+=	abd.c \
 	sha2_zfs.c \
 	skein_zfs.c \
 	spa.c \
+	space_map.c \
+	space_reftree.c \
 	spa_checkpoint.c \
 	spa_config.c \
 	spa_errlog.c \
@@ -303,16 +305,14 @@ SRCS+=	abd.c \
 	spa_log_spacemap.c \
 	spa_misc.c \
 	spa_stats.c \
-	space_map.c \
-	space_reftree.c \
 	txg.c \
 	uberblock.c \
 	unique.c \
 	vdev.c \
 	vdev_draid.c \
 	vdev_draid_rand.c \
-	vdev_indirect.c \
 	vdev_indirect_births.c \
+	vdev_indirect.c \
 	vdev_indirect_mapping.c \
 	vdev_initialize.c \
 	vdev_label.c \
@@ -320,11 +320,11 @@ SRCS+=	abd.c \
 	vdev_missing.c \
 	vdev_queue.c \
 	vdev_raidz.c \
-	vdev_raidz_math.c \
-	vdev_raidz_math_scalar.c \
 	vdev_raidz_math_avx2.c \
 	vdev_raidz_math_avx512bw.c \
 	vdev_raidz_math_avx512f.c \
+	vdev_raidz_math.c \
+	vdev_raidz_math_scalar.c \
 	vdev_raidz_math_sse2.c \
 	vdev_raidz_math_ssse3.c \
 	vdev_rebuild.c \
@@ -343,7 +343,6 @@ SRCS+=	abd.c \
 	zfeature.c \
 	zfs_byteswap.c \
 	zfs_chksum.c \
-	zfs_file_os.c \
 	zfs_fm.c \
 	zfs_fuid.c \
 	zfs_impl.c \
@@ -367,30 +366,36 @@ SRCS+=	abd.c \
 	zvol.c
 
 #zstd
-SRCS+=	zfs_zstd.c \
-	entropy_common.c \
+SRCS+=	zfs_zstd.c
+
+#zstd/common
+SRCS+=	entropy_common.c \
 	error_private.c \
-	fse_compress.c \
 	fse_decompress.c \
-	hist.c \
-	huf_compress.c \
-	huf_decompress.c \
 	pool.c \
 	xxhash.c \
 	zstd_common.c \
+
+#zstd/compress
+SRCS+=	fse_compress.c \
+	hist.c \
+	huf_compress.c \
 	zstd_compress.c \
 	zstd_compress_literals.c \
 	zstd_compress_sequences.c \
 	zstd_compress_superblock.c \
-	zstd_ddict.c \
-	zstd_decompress.c \
-	zstd_decompress_block.c \
 	zstd_double_fast.c \
 	zstd_fast.c \
 	zstd_lazy.c \
 	zstd_ldm.c \
 	zstd_opt.c
 
+#zstd/decompress
+SRCS+=	huf_decompress.c \
+	zstd_ddict.c \
+	zstd_decompress_block.c \
+	zstd_decompress.c
+
 beforeinstall:
 .if ${MK_DEBUG_FILES} != "no"
 	mtree -eu \

From fa480fe5bacceae940608860bd06d94be104cbb4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 1 Mar 2024 10:38:41 +1100
Subject: [PATCH 040/116] zinject: show more device fault fields

Once there's a few different kinds injected, it's pretty hard to see
them otherwise.

So, let's show IO type, error type and frequency fields in the table too.

Since we now have to convert from error code to pretty string, refactor
the error names into a table and add lookup functions.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #15953
---
 cmd/zinject/zinject.c | 73 ++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 21 deletions(-)

diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index a11b6d0b7fac..8d0cf5d0a957 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2024, Klara Inc.
  */
 
 /*
@@ -208,6 +209,37 @@ type_to_name(uint64_t type)
 	}
 }
 
+struct errstr {
+	int		err;
+	const char	*str;
+};
+static const struct errstr errstrtable[] = {
+	{ EIO,		"io" },
+	{ ECKSUM,	"checksum" },
+	{ EINVAL,	"decompress" },
+	{ EACCES,	"decrypt" },
+	{ ENXIO,	"nxio" },
+	{ ECHILD,	"dtl" },
+	{ EILSEQ,	"corrupt" },
+	{ 0, NULL },
+};
+
+static int
+str_to_err(const char *str)
+{
+	for (int i = 0; errstrtable[i].str != NULL; i++)
+		if (strcasecmp(errstrtable[i].str, str) == 0)
+			return (errstrtable[i].err);
+	return (-1);
+}
+static const char *
+err_to_str(int err)
+{
+	for (int i = 0; errstrtable[i].str != NULL; i++)
+		if (errstrtable[i].err == err)
+			return (errstrtable[i].str);
+	return ("[unknown]");
+}
 
 /*
  * Print usage message.
@@ -392,6 +424,10 @@ static int
 print_device_handler(int id, const char *pool, zinject_record_t *record,
     void *data)
 {
+	static const char *iotypestr[] = {
+	    "null", "read", "write", "free", "claim", "ioctl", "trim", "all",
+	};
+
 	int *count = data;
 
 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
@@ -401,14 +437,21 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
 		return (0);
 
 	if (*count == 0) {
-		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
-		(void) printf("---  ---------------  ----------------\n");
+		(void) printf("%3s  %-15s  %-16s  %-5s  %-10s  %-9s\n",
+		    "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ");
+		(void) printf(
+		    "---  ---------------  ----------------  "
+		    "-----  ----------  ---------\n");
 	}
 
 	*count += 1;
 
-	(void) printf("%3d  %-15s  %llx\n", id, pool,
-	    (u_longlong_t)record->zi_guid);
+	double freq = record->zi_freq == 0 ? 100.0f :
+	    (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f;
+
+	(void) printf("%3d  %-15s  %llx  %-5s  %-10s  %8.4f%%\n", id, pool,
+	    (u_longlong_t)record->zi_guid, iotypestr[record->zi_iotype],
+	    err_to_str(record->zi_error), freq);
 
 	return (0);
 }
@@ -842,24 +885,12 @@ main(int argc, char **argv)
 			}
 			break;
 		case 'e':
-			if (strcasecmp(optarg, "io") == 0) {
-				error = EIO;
-			} else if (strcasecmp(optarg, "checksum") == 0) {
-				error = ECKSUM;
-			} else if (strcasecmp(optarg, "decompress") == 0) {
-				error = EINVAL;
-			} else if (strcasecmp(optarg, "decrypt") == 0) {
-				error = EACCES;
-			} else if (strcasecmp(optarg, "nxio") == 0) {
-				error = ENXIO;
-			} else if (strcasecmp(optarg, "dtl") == 0) {
-				error = ECHILD;
-			} else if (strcasecmp(optarg, "corrupt") == 0) {
-				error = EILSEQ;
-			} else {
+			error = str_to_err(optarg);
+			if (error < 0) {
 				(void) fprintf(stderr, "invalid error type "
-				    "'%s': must be 'io', 'checksum' or "
-				    "'nxio'\n", optarg);
+				    "'%s': must be one of: io decompress "
+				    "decrypt nxio dtl corrupt\n",
+				    optarg);
 				usage();
 				libzfs_fini(g_zfs);
 				return (1);

From 756e10b0a1f1f1fc4974e9d1736e1173d649cb8c Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 27 Mar 2024 16:15:48 +1100
Subject: [PATCH 041/116] tests: simple zinject disk fault arg check

Just making sure the valid values for disk faults are accepted.
Obviously we can do a lot more, but this will do to get us started.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #15953
---
 tests/runfiles/common.run                     |  6 ++
 tests/zfs-tests/tests/Makefile.am             |  1 +
 .../cli_root/zinject/zinject_args.ksh         | 62 +++++++++++++++++++
 3 files changed, 69 insertions(+)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index d4c5a21828a1..912344b4edde 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -153,6 +153,12 @@ tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos',
     'clean_mirror_003_pos', 'clean_mirror_004_pos']
 tags = ['functional', 'clean_mirror']
 
+[tests/functional/cli_root/zinject]
+tests = ['zinject_args']
+pre =
+post =
+tags = ['functional', 'cli_root', 'zinject']
+
 [tests/functional/cli_root/zdb]
 tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 866ea5b9e7ec..db6b4c0146a7 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -606,6 +606,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/clean_mirror/clean_mirror_004_pos.ksh \
 	functional/clean_mirror/cleanup.ksh \
 	functional/clean_mirror/setup.ksh \
+	functional/cli_root/zinject/zinject_args.ksh \
 	functional/cli_root/zdb/zdb_002_pos.ksh \
 	functional/cli_root/zdb/zdb_003_pos.ksh \
 	functional/cli_root/zdb/zdb_004_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
new file mode 100755
index 000000000000..f8a8ffbb7b0e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
@@ -0,0 +1,62 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+#
+# TODO: this only checks that the set of valid device fault types. It should
+#       check all the other options, and that they work, and everything really.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_assert "Check zinject parameters."
+
+log_onexit cleanup
+
+DISK1=${DISKS%% *}
+
+function cleanup
+{
+	zinject -c all
+	default_cleanup_noexit
+}
+
+function test_device_fault
+{
+	typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt")
+	for e in ${errno[@]}; do
+		log_must eval \
+		    "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"
+	done
+	zinject -c all
+}
+
+default_mirror_setup_noexit $DISKS
+
+test_device_fault
+
+log_pass "zinject parameters work as expected."

From b6bbaa837271698f238a9264c4070416077fb67b Mon Sep 17 00:00:00 2001
From: Paul Dagnelie <pcd@delphix.com>
Date: Wed, 3 Apr 2024 16:34:46 -0700
Subject: [PATCH 042/116] Give a better message from 'zpool get' with invalid
 pool name

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Don Brady <don.brady@klarasystems.com>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #15942
---
 cmd/zpool/zpool_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index c85a5f285154..9df5df0328b3 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -10793,11 +10793,10 @@ zpool_do_get(int argc, char **argv)
 		}
 	} else {
 		/*
-		 * The first arg isn't a pool name,
+		 * The first arg isn't the name of a valid pool.
 		 */
-		fprintf(stderr, gettext("missing pool name.\n"));
-		fprintf(stderr, "\n");
-		usage(B_FALSE);
+		fprintf(stderr, gettext("Cannot get properties of %s: "
+		    "no such pool available.\n"), argv[0]);
 		return (1);
 	}
 

From b21b967bd5095d431b71ef63f09e0d31c0ef0b08 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Thu, 4 Apr 2024 10:38:18 +1100
Subject: [PATCH 043/116] zap_leaf: make l_hash[] variable length to silence
 UBSAN

When UBSAN is active and OpenZFS is a debug build, the l_hash assert at
the bottom of zap_open_leaf() causes UBSAN to complain.

This follows the example in 786641dcf to shut it up.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #15964
---
 include/sys/zap_leaf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h
index d563edd7ba59..e54456d3472b 100644
--- a/include/sys/zap_leaf.h
+++ b/include/sys/zap_leaf.h
@@ -132,7 +132,7 @@ typedef struct zap_leaf_phys {
 	 * with the ZAP_LEAF_CHUNK() macro.
 	 */
 
-	uint16_t l_hash[1];
+	uint16_t l_hash[];
 } zap_leaf_phys_t;
 
 typedef union zap_leaf_chunk {

From ea2862cdda1009a98a6f7b390e34b705bb7d90ae Mon Sep 17 00:00:00 2001
From: Alek P <alek-p@users.noreply.github.com>
Date: Wed, 3 Apr 2024 20:56:34 -0400
Subject: [PATCH 044/116] vdev props comment and manpage should include zfsd
 and FreeBSD mentions

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Alek Pinchuk <apinchuk@axcient.com>
Closes #15968
---
 include/sys/vdev_impl.h | 2 +-
 man/man7/vdevprops.7    | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index f39ebf031cea..2a93f7c680bc 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -455,7 +455,7 @@ struct vdev {
 	zfs_ratelimit_t vdev_checksum_rl;
 
 	/*
-	 * Vdev properties for tuning ZED
+	 * Vdev properties for tuning ZED or zfsd
 	 */
 	uint64_t	vdev_checksum_n;
 	uint64_t	vdev_checksum_t;
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 3d3ebc072915..5ec37df179de 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -127,7 +127,13 @@ If the property is only set on the top-level vdev, this value will be used.
 The value of these properties do not persist across vdev replacement.
 For this reason, it is advisable to set the property on the top-level vdev -
 not on the leaf vdev itself.
-The default values are 10 errors in 600 seconds.
+The default values for
+.Sy OpenZFS on Linux
+are 10 errors in 600 seconds.
+For
+.Sy OpenZFS on FreeBSD
+defaults see
+.Xr zfsd 8 .
 .It Sy comment
 A text comment up to 8192 characters long
 .It Sy bootsize

From 66929f6829985b9d90dcc0828291e5353e6b3fc9 Mon Sep 17 00:00:00 2001
From: Shengqi Chen <harry-chen@outlook.com>
Date: Thu, 4 Apr 2024 09:04:15 +0800
Subject: [PATCH 045/116] man: move zfs_prepare_disk.8 to nodist_man_MANS

Commit b53077a added zfs_prepare_disk.8 to the wrong list,
dist_man_MANS, in which @zfsexecdir@ will not be properly substituted.
This leads to a wrong path in the manpage in generated release tarballs.

Reported-by: Benda Xu <orv@debian.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
Closes #15979
---
 man/Makefile.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/Makefile.am b/man/Makefile.am
index 45156571eec3..43bb014ddd32 100644
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -62,7 +62,6 @@ dist_man_MANS = \
 	%D%/man8/zfs-userspace.8 \
 	%D%/man8/zfs-wait.8 \
 	%D%/man8/zfs_ids_to_path.8 \
-	%D%/man8/zfs_prepare_disk.8 \
 	%D%/man8/zgenhostid.8 \
 	%D%/man8/zinject.8 \
 	%D%/man8/zpool.8 \
@@ -115,7 +114,8 @@ endif
 
 nodist_man_MANS = \
 	%D%/man8/zed.8 \
-	%D%/man8/zfs-mount-generator.8
+	%D%/man8/zfs-mount-generator.8 \
+	%D%/man8/zfs_prepare_disk.8
 
 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS)
 

From 30c4eba4eaaef92cbc1b8d2eed6689600a342386 Mon Sep 17 00:00:00 2001
From: Pavel Snajdr <snajpa@snajpa.net>
Date: Thu, 4 Apr 2024 03:09:19 +0200
Subject: [PATCH 046/116] Fix panics when truncating/deleting files

There's a union in dbuf_dirty_record_t; dr_brtwrite could evaluate
to B_TRUE if the dirty record is of a type other than dl. Add a
more explicit dr type check before trying to access dr_brtwrite.

Fixes two similar panics:

[ 1373.806119] VERIFY0(db->db_level) failed (0 == 1)
[ 1373.807232] PANIC at dbuf.c:2549:dbuf_undirty()
[ 1373.814979]  dump_stack_lvl+0x71/0x90
[ 1373.815799]  spl_panic+0xd3/0x100 [spl]
[ 1373.827709]  dbuf_undirty+0x62a/0x970 [zfs]
[ 1373.829204]  dmu_buf_will_dirty_impl+0x1e9/0x5b0 [zfs]
[ 1373.831010]  dnode_free_range+0x532/0x1220 [zfs]
[ 1373.833922]  dmu_free_long_range+0x4e0/0x930 [zfs]
[ 1373.835277]  zfs_trunc+0x75/0x1e0 [zfs]
[ 1373.837958]  zfs_freesp+0x9b/0x470 [zfs]
[ 1373.847236]  zfs_setattr+0x161a/0x3500 [zfs]
[ 1373.855267]  zpl_setattr+0x125/0x320 [zfs]
[ 1373.856725]  notify_change+0x1ee/0x4a0
[ 1373.859207]  do_truncate+0x7f/0xd0
[ 1373.859968]  do_sys_ftruncate+0x28e/0x2e0
[ 1373.860962]  do_syscall_64+0x38/0x90
[ 1373.861751]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8

[ 1822.381337] VERIFY0(db->db_level) failed (0 == 1)
[ 1822.382376] PANIC at dbuf.c:2549:dbuf_undirty()
[ 1822.389232]  dump_stack_lvl+0x71/0x90
[ 1822.389920]  spl_panic+0xd3/0x100 [spl]
[ 1822.399567]  dbuf_undirty+0x62a/0x970 [zfs]
[ 1822.400583]  dmu_buf_will_dirty_impl+0x1e9/0x5b0 [zfs]
[ 1822.401752]  dnode_free_range+0x532/0x1220 [zfs]
[ 1822.402841]  dmu_object_free+0x74/0x120 [zfs]
[ 1822.403869]  zfs_znode_delete+0x75/0x120 [zfs]
[ 1822.404906]  zfs_rmnode+0x3f6/0x7f0 [zfs]
[ 1822.405870]  zfs_inactive+0xa3/0x610 [zfs]
[ 1822.407803]  zpl_evict_inode+0x3e/0x90 [zfs]
[ 1822.408831]  evict+0xc1/0x1c0
[ 1822.409387]  do_unlinkat+0x147/0x300
[ 1822.410060]  __x64_sys_unlinkat+0x33/0x60
[ 1822.410802]  do_syscall_64+0x38/0x90
[ 1822.411458]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Pavel Snajdr <snajpa@snajpa.net>
Closes #15983
---
 module/zfs/dbuf.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 0ab143bd089f..d43f84e84725 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2633,26 +2633,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	/*
-	 * Quick check for dirtiness.  For already dirty blocks, this
-	 * reduces runtime of this function by >90%, and overall performance
-	 * by 50% for some workloads (e.g. file deletion with indirect blocks
-	 * cached).
+	 * Quick check for dirtiness to improve performance for some workloads
+	 * (e.g. file deletion with indirect blocks cached).
 	 */
 	mutex_enter(&db->db_mtx);
-
 	if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
-		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 		/*
-		 * It's possible that it is already dirty but not cached,
+		 * It's possible that the dbuf is already dirty but not cached,
 		 * because there are some calls to dbuf_dirty() that don't
 		 * go through dmu_buf_will_dirty().
 		 */
+		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 		if (dr != NULL) {
-			if (dr->dt.dl.dr_brtwrite) {
+			if (db->db_level == 0 &&
+			    dr->dt.dl.dr_brtwrite) {
 				/*
 				 * Block cloning: If we are dirtying a cloned
-				 * block, we cannot simply redirty it, because
-				 * this dr has no data associated with it.
+				 * level 0 block, we cannot simply redirty it,
+				 * because this dr has no associated data.
 				 * We will go through a full undirtying below,
 				 * before dirtying it again.
 				 */

From 99741bde59d1d1df0963009bb624ddc105f7d8dc Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Thu, 4 Apr 2024 06:21:25 +0500
Subject: [PATCH 047/116] zvol: use multiple taskq

Currently, zvol uses a single taskq, resulting in a throughput
bottleneck under heavy load due to lock contention on the single taskq.
This patch addresses the performance bottleneck under heavy load
conditions by utilizing multiple taskqs, thus mitigating lock
contention. The number of taskqs scales dynamically based on the
available CPUs in the system, as illustrated below:

                taskq   total
cpus    taskqs  threads threads
------- ------- ------- -------
1       1       32       32
2       1       32       32
4       1       32       32
8       2       16       32
16      3       11       33
32      5       7        35
64      8       8        64
128     11      12       132
256     16      16       256

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #15992
---
 man/man4/zfs.4                |   7 +++
 module/os/linux/zfs/zvol_os.c | 102 ++++++++++++++++++++++++++++++----
 2 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index cacb214d1dc1..c8e90968ab34 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2387,6 +2387,13 @@ The number of requests which can be handled concurrently is controlled by
 is ignored when running on a kernel that supports block multiqueue
 .Pq Li blk-mq .
 .
+.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint
+Number of zvol taskqs.
+If
+.Sy 0
+(the default) then scaling is done internally to prefer 6 threads per taskq.
+This only applies on Linux.
+.
 .It Sy zvol_threads Ns = Ns Sy 0 Pq uint
 The number of system wide threads to use for processing zvol block IOs.
 If
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index d815cb2ad2c4..107776932493 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -37,6 +37,7 @@
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
+#include <cityhash.h>
 
 #include <linux/blkdev_compat.h>
 #include <linux/task_io_accounting_ops.h>
@@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0;
 static unsigned int zvol_prefetch_bytes = (128 * 1024);
 static unsigned long zvol_max_discard_blocks = 16384;
 
+/*
+ * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
+ * to utilize more threads for small files but may affect prefetch hits.
+ */
+#define	ZVOL_TASKQ_OFFSET_SHIFT 29
+
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 static unsigned int zvol_open_timeout_ms = 1000;
 #endif
@@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  */
 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
+static unsigned int zvol_num_taskqs = 0;
 #endif
 
 #ifndef	BLKDEV_DEFAULT_RQ
@@ -114,7 +122,11 @@ struct zvol_state_os {
 	boolean_t use_blk_mq;
 };
 
-static taskq_t *zvol_taskq;
+typedef struct zv_taskq {
+	uint_t tqs_cnt;
+	taskq_t **tqs_taskq;
+} zv_taskq_t;
+static zv_taskq_t zvol_taskqs;
 static struct ida zvol_ida;
 
 typedef struct zv_request_stack {
@@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	}
 
 	zv_request_task_t *task;
+	zv_taskq_t *ztqs = &zvol_taskqs;
+	uint_t blk_mq_hw_queue = 0;
+	uint_t tq_idx;
+	uint_t taskq_hash;
+#ifdef HAVE_BLK_MQ
+	if (rq)
+		blk_mq_hw_queue = rq->mq_hctx->queue_num;
+#endif
+	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
+	    blk_mq_hw_queue, 0);
+	tq_idx = taskq_hash % ztqs->tqs_cnt;
 
 	if (rw == WRITE) {
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
@@ -601,7 +624,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 				zvol_discard(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
-				taskq_dispatch_ent(zvol_taskq,
+				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_discard_task, task, 0, &task->ent);
 			}
 		} else {
@@ -609,7 +632,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 				zvol_write(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
-				taskq_dispatch_ent(zvol_taskq,
+				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_write_task, task, 0, &task->ent);
 			}
 		}
@@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 			zvol_read(&zvr);
 		} else {
 			task = zv_request_task_create(zvr);
-			taskq_dispatch_ent(zvol_taskq,
+			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 			    zvol_read_task, task, 0, &task->ent);
 		}
 	}
@@ -1598,8 +1621,40 @@ zvol_init(void)
 		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
 	}
 
+	/*
+	 * Use atleast 32 zvol_threads but for many core system,
+	 * prefer 6 threads per taskq, but no more taskqs
+	 * than threads in them on large systems.
+	 *
+	 *                 taskq   total
+	 * cpus    taskqs  threads threads
+	 * ------- ------- ------- -------
+	 * 1       1       32       32
+	 * 2       1       32       32
+	 * 4       1       32       32
+	 * 8       2       16       32
+	 * 16      3       11       33
+	 * 32      5       7        35
+	 * 64      8       8        64
+	 * 128     11      12       132
+	 * 256     16      16       256
+	 */
+	zv_taskq_t *ztqs = &zvol_taskqs;
+	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
+	if (num_tqs == 0) {
+		num_tqs = 1 + num_online_cpus() / 6;
+		while (num_tqs * num_tqs > zvol_actual_threads)
+			num_tqs--;
+	}
+	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
+	if (per_tq_thread * num_tqs < zvol_actual_threads)
+		per_tq_thread++;
+	ztqs->tqs_cnt = num_tqs;
+	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
+		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
+		ztqs->tqs_taskq = NULL;
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
 		return (error);
 	}
@@ -1619,11 +1674,22 @@ zvol_init(void)
 		    1024);
 	}
 #endif
-	zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
-	    zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
-	if (zvol_taskq == NULL) {
-		unregister_blkdev(zvol_major, ZVOL_DRIVER);
-		return (-ENOMEM);
+	for (uint_t i = 0; i < num_tqs; i++) {
+		char name[32];
+		(void) snprintf(name, sizeof (name), "%s_tq-%u",
+		    ZVOL_DRIVER, i);
+		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
+		    maxclsyspri, per_tq_thread, INT_MAX,
+		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+		if (ztqs->tqs_taskq[i] == NULL) {
+			for (int j = i - 1; j >= 0; j--)
+				taskq_destroy(ztqs->tqs_taskq[j]);
+			unregister_blkdev(zvol_major, ZVOL_DRIVER);
+			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
+			    sizeof (taskq_t *));
+			ztqs->tqs_taskq = NULL;
+			return (-ENOMEM);
+		}
 	}
 
 	zvol_init_impl();
@@ -1634,9 +1700,22 @@ zvol_init(void)
 void
 zvol_fini(void)
 {
+	zv_taskq_t *ztqs = &zvol_taskqs;
 	zvol_fini_impl();
 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
-	taskq_destroy(zvol_taskq);
+
+	if (ztqs->tqs_taskq == NULL) {
+		ASSERT3U(ztqs->tqs_cnt, ==, 0);
+	} else {
+		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
+			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
+			taskq_destroy(ztqs->tqs_taskq[i]);
+		}
+		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
+		    sizeof (taskq_t *));
+		ztqs->tqs_taskq = NULL;
+	}
+
 	ida_destroy(&zvol_ida);
 }
 
@@ -1657,6 +1736,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
 
+module_param(zvol_num_taskqs, uint, 0444);
+MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
+
 module_param(zvol_prefetch_bytes, uint, 0644);
 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
 

From c13400c9a26985fe817bc777fc0bbb5459bedeaf Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 9 Apr 2024 03:13:27 +1000
Subject: [PATCH 048/116] zvol_os: fix build on Linux <3.13

99741bde5 introduced zvol_num_taskqs, but put it behind the HAVE_BLK_MQ
define, preventing builds on versions of Linux that don't have it
(<3.13, incl EL7).

Nothing about it seems dependent on blk-mq, so this just moves it out
from behind that define and so fixes the build.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16062
---
 module/os/linux/zfs/zvol_os.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 107776932493..e2a6ba3a7f32 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -81,9 +81,10 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  */
 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
-static unsigned int zvol_num_taskqs = 0;
 #endif
 
+static unsigned int zvol_num_taskqs = 0;
+
 #ifndef	BLKDEV_DEFAULT_RQ
 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
 #define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ

From 03987f71e39c47d2f16c3006d690c50a9fec2ca5 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 9 Apr 2024 04:38:49 +1000
Subject: [PATCH 049/116] zvol_os: fix compile with blk-mq on Linux 4.x

99741bde5 accesses a cached blk-mq hardware context through the mq_hctx
field of struct request. However, this field did not exist until 5.0.
Before that, the private function blk_mq_map_queue() was used to dig it
out of broader queue context. This commit detects this situation, and
handles it with a poor-man's simulation of that function.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16069
---
 config/kernel-blk-queue.m4    | 15 +++++++++++++++
 module/os/linux/zfs/zvol_os.c |  5 +++++
 2 files changed, 20 insertions(+)

diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4
index bb5903b313eb..15dbe1c7dff0 100644
--- a/config/kernel-blk-queue.m4
+++ b/config/kernel-blk-queue.m4
@@ -377,6 +377,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
 		(void) blk_mq_alloc_tag_set(&tag_set);
 		return BLK_STS_OK;
 	], [])
+	ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [
+		#include <linux/blk-mq.h>
+		#include <linux/blkdev.h>
+	], [
+		struct request rq = {0};
+		struct blk_mq_hw_ctx *hctx = NULL;
+		rq.mq_hctx = hctx;
+	], [])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
@@ -384,6 +392,13 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
 	ZFS_LINUX_TEST_RESULT([blk_mq], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
+		AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request])
+		ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request])
+		], [
+			AC_MSG_RESULT(no)
+		])
 	], [
 		AC_MSG_RESULT(no)
 	])
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index e2a6ba3a7f32..4b960daf89ee 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -551,7 +551,12 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	uint_t taskq_hash;
 #ifdef HAVE_BLK_MQ
 	if (rq)
+#ifdef HAVE_BLK_MQ_RQ_HCTX
 		blk_mq_hw_queue = rq->mq_hctx->queue_num;
+#else
+		blk_mq_hw_queue =
+		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+#endif
 #endif
 	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 	    blk_mq_hw_queue, 0);

From ba9f587a77e6893390c752491dfacb6ee5d52023 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 9 Apr 2024 04:50:24 +1000
Subject: [PATCH 050/116] vdev_disk: ensure trim errors are returned
 immediately

After 06e25f9c4, the discard issuing code was organised such that if
requesting an async discard or secure erase failed before the IO was
issued (that is, calling __blkdev_issue_discard() returned an error),
the failed zio would never be executed, resulting in txg_sync hanging
forever waiting for IO to finish.

This commit fixes that by immediately executing a failed zio on error.
To handle the successful synchronous op case, we fake an async op: when
not using an asynchronous submission method, the successful result zio
is queued as part of the discard handler.

Since it was hard to understand the differences between discard and
secure erase, and sync and async, across different kernel versions, I've
commented and reorganised the code a bit to try and make everything more
contained and linear.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16070
---
 config/kernel-blkdev.m4         | 118 ++++++++++++++++++++--------
 module/os/linux/zfs/vdev_disk.c | 131 +++++++++++++++++++++-----------
 2 files changed, 171 insertions(+), 78 deletions(-)

diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 7b0e830e600f..b6ce1e1cf083 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -561,12 +561,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [
 ])
 
 dnl #
-dnl # 5.19 API: blkdev_issue_secure_erase()
-dnl # 4.7  API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
-dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [
-	ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [
+dnl # TRIM support: discard and secure erase. We make use of asynchronous
+dnl #               functions when available.
+dnl #
+dnl # 3.10:
+dnl #   sync discard:  blkdev_issue_discard(..., 0)
+dnl #   sync erase:    blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl #   async discard: [not available]
+dnl #   async erase:   [not available]
+dnl #
+dnl # 4.7:
+dnl #   sync discard:  blkdev_issue_discard(..., 0)
+dnl #   sync erase:    blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl #   async discard: __blkdev_issue_discard(..., 0)
+dnl #   async erase:   __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE)
+dnl #
+dnl # 5.19:
+dnl #   sync discard:  blkdev_issue_discard(...)
+dnl #   sync erase:    blkdev_issue_secure_erase(...)
+dnl #   async discard: __blkdev_issue_discard(...)
+dnl #   async erase:   [not available]
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD], [
+	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_noflags], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device *bdev = NULL;
@@ -574,10 +591,33 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [
 		sector_t nr_sects = 0;
 		int error __attribute__ ((unused));
 
-		error = blkdev_issue_secure_erase(bdev,
+		error = blkdev_issue_discard(bdev,
 		    sector, nr_sects, GFP_KERNEL);
 	])
+	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [
+		#include <linux/blkdev.h>
+	],[
+		struct block_device *bdev = NULL;
+		sector_t sector = 0;
+		sector_t nr_sects = 0;
+		unsigned long flags = 0;
+		int error __attribute__ ((unused));
+
+		error = blkdev_issue_discard(bdev,
+		    sector, nr_sects, GFP_KERNEL, flags);
+	])
+	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_noflags], [
+		#include <linux/blkdev.h>
+	],[
+		struct block_device *bdev = NULL;
+		sector_t sector = 0;
+		sector_t nr_sects = 0;
+		struct bio *biop = NULL;
+		int error __attribute__ ((unused));
 
+		error = __blkdev_issue_discard(bdev,
+		    sector, nr_sects, GFP_KERNEL, &biop);
+	])
 	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [
 		#include <linux/blkdev.h>
 	],[
@@ -591,22 +631,52 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [
 		error = __blkdev_issue_discard(bdev,
 		    sector, nr_sects, GFP_KERNEL, flags, &biop);
 	])
-
-	ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [
+	ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device *bdev = NULL;
 		sector_t sector = 0;
 		sector_t nr_sects = 0;
-		unsigned long flags = 0;
 		int error __attribute__ ((unused));
 
-		error = blkdev_issue_discard(bdev,
-		    sector, nr_sects, GFP_KERNEL, flags);
+		error = blkdev_issue_secure_erase(bdev,
+		    sector, nr_sects, GFP_KERNEL);
 	])
 ])
 
-AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD], [
+	AC_MSG_CHECKING([whether blkdev_issue_discard() is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_noflags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS, 1,
+		    [blkdev_issue_discard() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	AC_MSG_CHECKING([whether blkdev_issue_discard(flags) is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS, 1,
+		    [blkdev_issue_discard(flags) is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	AC_MSG_CHECKING([whether __blkdev_issue_discard() is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_noflags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS, 1,
+		    [__blkdev_issue_discard() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	AC_MSG_CHECKING([whether __blkdev_issue_discard(flags) is available])
+	ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS, 1,
+		    [__blkdev_issue_discard(flags) is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
 	AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available])
 	ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [
 		AC_MSG_RESULT(yes)
@@ -614,24 +684,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [
 		    [blkdev_issue_secure_erase() is available])
 	],[
 		AC_MSG_RESULT(no)
-
-		AC_MSG_CHECKING([whether __blkdev_issue_discard() is available])
-		ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [
-			AC_MSG_RESULT(yes)
-			AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1,
-			    [__blkdev_issue_discard() is available])
-		],[
-			AC_MSG_RESULT(no)
-
-			AC_MSG_CHECKING([whether blkdev_issue_discard() is available])
-			ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [
-				AC_MSG_RESULT(yes)
-				AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1,
-					[blkdev_issue_discard() is available])
-			],[
-				ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()])
-			])
-		])
 	])
 ])
 
@@ -696,7 +748,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME
-	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
+	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
 	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
@@ -717,7 +769,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
 	ZFS_AC_KERNEL_BLKDEV_BDEVNAME
 	ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
-	ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
+	ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD
 	ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
 	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index a710bb91004e..a560bca918a8 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1252,8 +1252,6 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	return (0);
 }
 
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
-	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
 BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
 {
 	zio_t *zio = bio->bi_private;
@@ -1268,54 +1266,99 @@ BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
 	zio_interrupt(zio);
 }
 
+/*
+ * Wrappers for the different secure erase and discard APIs. We use async
+ * when available; in this case, *biop is set to the last bio in the chain.
+ */
 static int
-vdev_issue_discard_trim(zio_t *zio, unsigned long flags)
+vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
+    sector_t nsect, struct bio **biop)
 {
-	int ret;
-	struct bio *bio = NULL;
+	*biop = NULL;
+	int error;
 
-#if defined(BLKDEV_DISCARD_SECURE)
-	ret = - __blkdev_issue_discard(
-	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio);
+#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
+	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
+	error = __blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
+	error = blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
 #else
-	(void) flags;
-	ret = - __blkdev_issue_discard(
-	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio);
+#error "unsupported kernel"
 #endif
-	if (!ret && bio) {
-		bio->bi_private = zio;
-		bio->bi_end_io = vdev_disk_discard_end_io;
-		vdev_submit_bio(bio);
-	}
-	return (ret);
+
+	return (error);
 }
+
+static int
+vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
+    sector_t nsect, struct bio **biop)
+{
+	*biop = NULL;
+	int error;
+
+#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
+	error = __blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, 0, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
+	error = __blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
+	error = blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS, 0);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
+	error = blkdev_issue_discard(BDH_BDEV(bdh),
+	    sector, nsect, GFP_NOFS);
+#else
+#error "unsupported kernel"
 #endif
 
+	return (error);
+}
+
+/*
+ * Entry point for TRIM ops. This calls the right wrapper for secure erase or
+ * discard, and then does the appropriate finishing work for error vs success
+ * and async vs sync.
+ */
 static int
 vdev_disk_io_trim(zio_t *zio)
 {
-	unsigned long trim_flags = 0;
-	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
-		return (-blkdev_issue_secure_erase(
-		    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
-#elif defined(BLKDEV_DISCARD_SECURE)
-		trim_flags |= BLKDEV_DISCARD_SECURE;
-#endif
+	int error;
+	struct bio *bio;
+
+	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
+	sector_t sector = zio->io_offset >> 9;
+	sector_t nsects = zio->io_size >> 9;
+
+	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
+	else
+		error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);
+
+	if (error != 0)
+		return (SET_ERROR(-error));
+
+	if (bio == NULL) {
+		/*
+		 * This was a synchronous op that completed successfully, so
+		 * return it to ZFS immediately.
+		 */
+		zio_interrupt(zio);
+	} else {
+		/*
+		 * This was an asynchronous op; set up completion callback and
+		 * issue it.
+		 */
+		bio->bi_private = zio;
+		bio->bi_end_io = vdev_disk_discard_end_io;
+		vdev_submit_bio(bio);
 	}
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
-	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
-	return (vdev_issue_discard_trim(zio, trim_flags));
-#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
-	return (-blkdev_issue_discard(
-	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
-	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
-#else
-#error "Unsupported kernel"
-#endif
+
+	return (0);
 }
 
 int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
@@ -1390,14 +1433,12 @@ vdev_disk_io_start(zio_t *zio)
 		return;
 
 	case ZIO_TYPE_TRIM:
-		zio->io_error = vdev_disk_io_trim(zio);
+		error = vdev_disk_io_trim(zio);
 		rw_exit(&vd->vd_lock);
-#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
-		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
-			zio_interrupt(zio);
-#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
-		zio_interrupt(zio);
-#endif
+		if (error) {
+			zio->io_error = error;
+			zio_execute(zio);
+		}
 		return;
 
 	case ZIO_TYPE_READ:

From 76d1dde94ca9cac03fa641b4cf9259d98a706e12 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 9 Apr 2024 04:59:04 +1000
Subject: [PATCH 051/116] zinject: inject device errors into ioctls

Adds 'ioctl' as a valid IO type for device error injection, so we can
simulate a flush error (which OpenZFS currently ignores, but that's by
the by).

To support this, ZIO_STAGE_VDEV_IO_DONE is added to ZIO_IOCTL_PIPELINE,
since that's where device error injection happens. This needs a small
exclusion to avoid the vdev_queue, since flushes are not queued, and I'm
assuming that the various failure responses are still reasonable for
flush failures (probes, media change, etc). This seems reasonable to me,
as a flush failure is not unlike a write failure in this regard; however,
this may be too aggressive or subtle to assume in just this change.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16061
---
 cmd/zinject/zinject.c   | 6 ++++--
 include/sys/zio_impl.h  | 5 ++---
 man/man8/zinject.8      | 4 +++-
 man/man8/zpool-events.8 | 2 +-
 module/zfs/zio.c        | 7 +++++--
 module/zfs/zio_inject.c | 6 +++---
 6 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index 8d0cf5d0a957..07d3d8af9980 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -265,7 +265,7 @@ usage(void)
 	    "\t\tspa_vdev_exit() will trigger a panic.\n"
 	    "\n"
 	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
-	    "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n"
+	    "\t\t[-T <read|write|free|claim|ioctl|all>] [-f frequency] pool\n\n"
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
 	    "\t\t'pad1', or 'pad2'.\n"
@@ -978,12 +978,14 @@ main(int argc, char **argv)
 				io_type = ZIO_TYPE_FREE;
 			} else if (strcasecmp(optarg, "claim") == 0) {
 				io_type = ZIO_TYPE_CLAIM;
+			} else if (strcasecmp(optarg, "ioctl") == 0) {
+				io_type = ZIO_TYPE_IOCTL;
 			} else if (strcasecmp(optarg, "all") == 0) {
 				io_type = ZIO_TYPES;
 			} else {
 				(void) fprintf(stderr, "invalid I/O type "
 				    "'%s': must be 'read', 'write', 'free', "
-				    "'claim' or 'all'\n", optarg);
+				    "'claim', 'ioctl' or 'all'\n", optarg);
 				usage();
 				libzfs_fini(g_zfs);
 				return (1);
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 1c0a44059d24..4b3726d7eec4 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -153,7 +153,7 @@ enum zio_stage {
 	ZIO_STAGE_READY			= 1 << 20,	/* RWFCIT */
 
 	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--IT */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW---T */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW--IT */
 	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--IT */
 
 	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 24,	/* R----- */
@@ -261,8 +261,7 @@ enum zio_stage {
 
 #define	ZIO_IOCTL_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
-	ZIO_STAGE_VDEV_IO_START |		\
-	ZIO_STAGE_VDEV_IO_ASSESS)
+	ZIO_VDEV_IO_STAGES)
 
 #define	ZIO_TRIM_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8
index b692f12130a8..817dcb7fe32a 100644
--- a/man/man8/zinject.8
+++ b/man/man8/zinject.8
@@ -19,10 +19,11 @@
 .\" CDDL HEADER END
 .\"
 .\" Copyright 2013 Darik Horn <dajhorn@vanadac.com>. All rights reserved.
+.\" Copyright (c) 2024, Klara Inc.
 .\"
 .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS
 .\"
-.Dd May 26, 2021
+.Dd April 4, 2024
 .Dt ZINJECT 8
 .Os
 .
@@ -257,6 +258,7 @@ Run for this many seconds before reporting failure.
 .It Fl T Ar failure
 Set the failure type to one of
 .Sy all ,
+.Sy ioctl ,
 .Sy claim ,
 .Sy free ,
 .Sy read ,
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index a7a9e33442da..12331b7b2a2d 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -404,7 +404,7 @@ ZIO_STAGE_DVA_CLAIM:0x00080000:---C--
 ZIO_STAGE_READY:0x00100000:RWFCIT
 
 ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT
-ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW---T
+ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--IT
 ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT
 
 ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index e96bbda35a04..08d56eef83e9 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4086,14 +4086,17 @@ zio_vdev_io_done(zio_t *zio)
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
-	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_IOCTL ||
+	    zio->io_type == ZIO_TYPE_TRIM);
 
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
-		vdev_queue_io_done(zio);
+		if (zio->io_type != ZIO_TYPE_IOCTL)
+			vdev_queue_io_done(zio);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injections(vd, zio,
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 609182f4a2cd..0a4851ecb40d 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -364,10 +364,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
 	int ret = 0;
 
 	/*
-	 * We skip over faults in the labels unless it's during
-	 * device open (i.e. zio == NULL).
+	 * We skip over faults in the labels unless it's during device open
+	 * (i.e. zio == NULL) or a device flush (offset is meaningless)
 	 */
-	if (zio != NULL) {
+	if (zio != NULL && zio->io_type != ZIO_TYPE_IOCTL) {
 		uint64_t offset = zio->io_offset;
 
 		if (offset < VDEV_LABEL_START_SIZE ||

From eeca9a91d6866879f4d57b4d0644e5da951f3daa Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 8 Apr 2024 15:03:18 -0400
Subject: [PATCH 052/116] Fix read errors race after block cloning

While investigating the read errors that triggered the panic fixed
in #16042, I found a race in the sync process between the moment
the dirty record for a cloned block is removed and the moment the
dbuf is destroyed.  If dmu_buf_hold_array_by_dnode() takes a hold
on a cloned dbuf before it is synced/destroyed, then
dbuf_read_impl() may see it still in DB_NOFILL state, but without
the dirty record.  Such a case is not an error, but is equivalent
to DB_UNCACHED, since the dbuf block pointer has already been
updated by dbuf_write_ready().  Unfortunately, it is impossible to
safely change the dbuf state to DB_UNCACHED there, since another
cloning may already be in progress that dropped the dbuf lock
before creating a new dirty record, protected only by the range
lock.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Robert Evans <evansr@google.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16052
---
 module/zfs/dbuf.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index d43f84e84725..8c42b116d7e1 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1563,7 +1563,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;
-	blkptr_t bp, *bpp;
+	blkptr_t bp, *bpp = NULL;
 
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1577,29 +1577,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}
 
-	if (db->db_state == DB_UNCACHED) {
-		if (db->db_blkptr == NULL) {
-			bpp = NULL;
-		} else {
-			bp = *db->db_blkptr;
+	/*
+	 * If we have a pending block clone, we don't want to read the
+	 * underlying block, but the content of the block being cloned,
+	 * pointed by the dirty record, so we have the most recent data.
+	 * If there is no dirty record, then we hit a race in a sync
+	 * process when the dirty record is already removed, while the
+	 * dbuf is not yet destroyed. Such case is equivalent to uncached.
+	 */
+	if (db->db_state == DB_NOFILL) {
+		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+		if (dr != NULL) {
+			if (!dr->dt.dl.dr_brtwrite) {
+				err = EIO;
+				goto early_unlock;
+			}
+			bp = dr->dt.dl.dr_overridden_by;
 			bpp = &bp;
 		}
-	} else {
-		dbuf_dirty_record_t *dr;
-
-		ASSERT3S(db->db_state, ==, DB_NOFILL);
+	}
 
-		/*
-		 * Block cloning: If we have a pending block clone,
-		 * we don't want to read the underlying block, but the content
-		 * of the block being cloned, so we have the most recent data.
-		 */
-		dr = list_head(&db->db_dirty_records);
-		if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
-			err = EIO;
-			goto early_unlock;
-		}
-		bp = dr->dt.dl.dr_overridden_by;
+	if (bpp == NULL && db->db_blkptr != NULL) {
+		bp = *db->db_blkptr;
 		bpp = &bp;
 	}
 

From 5e5fd0a1785aa65d5c2259f2d43459437ae209eb Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 8 Apr 2024 18:13:27 -0400
Subject: [PATCH 053/116] Speculative prefetch for reordered requests

Before this change, the speculative prefetcher was able to detect a
stream only if all of its accesses were perfectly sequential.  That
was easy to implement and is perfectly fine for single-threaded
applications.  Unfortunately, multi-threaded network servers, such as
iSCSI, SMB or NFS, usually have plenty of threads and may often
reorder requests, preventing successful speculation and prefetch.

This change allows the speculative prefetcher to detect streams even
if requests are reordered, by introducing a list of 9 non-contiguous
ranges up to 16MB ahead of the current stream position and filling
the gaps as more requests arrive.  It also allows a stream to proceed
even with holes, up to a certain configurable threshold (25%).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16022
---
 cmd/arc_summary          |  11 +-
 include/sys/dmu_zfetch.h |  16 ++-
 man/man4/zfs.4           |  11 ++
 module/zfs/dmu.c         |   8 +-
 module/zfs/dmu_zfetch.c  | 289 +++++++++++++++++++++++++++++++--------
 5 files changed, 272 insertions(+), 63 deletions(-)

diff --git a/cmd/arc_summary b/cmd/arc_summary
index 9c69ec4f8ccc..100fb1987a8b 100755
--- a/cmd/arc_summary
+++ b/cmd/arc_summary
@@ -793,18 +793,27 @@ def section_dmu(kstats_dict):
 
     zfetch_stats = isolate_section('zfetchstats', kstats_dict)
 
-    zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
+    zfetch_access_total = int(zfetch_stats['hits']) +\
+        int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\
+        int(zfetch_stats['past']) + int(zfetch_stats['misses'])
 
     prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
     prt_i2('Stream hits:',
            f_perc(zfetch_stats['hits'], zfetch_access_total),
            f_hits(zfetch_stats['hits']))
+    future = int(zfetch_stats['future']) + int(zfetch_stats['stride'])
+    prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total),
+           f_hits(future))
+    prt_i2('Hits behind stream:',
+           f_perc(zfetch_stats['past'], zfetch_access_total),
+           f_hits(zfetch_stats['past']))
     prt_i2('Stream misses:',
            f_perc(zfetch_stats['misses'], zfetch_access_total),
            f_hits(zfetch_stats['misses']))
     prt_i2('Streams limit reached:',
            f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
            f_hits(zfetch_stats['max_streams']))
+    prt_i1('Stream strides:', f_hits(zfetch_stats['stride']))
     prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
     print()
 
diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h
index f00e13cf03a6..322472fb1ae2 100644
--- a/include/sys/dmu_zfetch.h
+++ b/include/sys/dmu_zfetch.h
@@ -45,18 +45,24 @@ typedef struct zfetch {
 	int		zf_numstreams;	/* number of zstream_t's */
 } zfetch_t;
 
+typedef struct zsrange {
+	uint16_t	start;
+	uint16_t	end;
+} zsrange_t;
+
+#define	ZFETCH_RANGES	9		/* Fits zstream_t into 128 bytes */
+
 typedef struct zstream {
+	list_node_t	zs_node;	/* link for zf_stream */
 	uint64_t	zs_blkid;	/* expect next access at this blkid */
+	uint_t		zs_atime;	/* time last prefetch issued */
+	zsrange_t	zs_ranges[ZFETCH_RANGES]; /* ranges from future */
 	unsigned int	zs_pf_dist;	/* data prefetch distance in bytes */
 	unsigned int	zs_ipf_dist;	/* L1 prefetch distance in bytes */
 	uint64_t	zs_pf_start;	/* first data block to prefetch */
 	uint64_t	zs_pf_end;	/* data block to prefetch up to */
 	uint64_t	zs_ipf_start;	/* first data block to prefetch L1 */
 	uint64_t	zs_ipf_end;	/* data block to prefetch L1 up to */
-
-	list_node_t	zs_node;	/* link for zf_stream */
-	hrtime_t	zs_atime;	/* time last prefetch issued */
-	zfetch_t	*zs_fetch;	/* parent fetch */
 	boolean_t	zs_missed;	/* stream saw cache misses */
 	boolean_t	zs_more;	/* need more distant prefetch */
 	zfs_refcount_t	zs_callers;	/* number of pending callers */
@@ -74,7 +80,7 @@ void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_fini(zfetch_t *);
 zstream_t	*dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
     boolean_t);
-void		dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
+void		dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
 void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
     boolean_t);
 
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index c8e90968ab34..6088ebc7ef35 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -564,6 +564,10 @@ However, this is limited by
 Maximum micro ZAP size.
 A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
 .
+.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
+Log2 fraction of holes in speculative prefetch stream allowed for it to
+proceed.
+.
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
 Prefetch distance starts from the demand access size and quickly grows to
@@ -578,6 +582,13 @@ Max bytes to prefetch per stream.
 .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
 Max bytes to prefetch indirects for per stream.
 .
+.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
+Requests within this byte distance from the current prefetch stream position
+are considered parts of the stream, reordered due to parallel processing.
+Such requests do not advance the stream position immediately unless the
+.Sy zfetch_hole_shift
+fill threshold is reached, but are saved to fill holes in the stream later.
+.
 .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
 Max number of streams per zfetch (prefetch streams per file).
 .
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 753dde6d5205..6ef149aab9a6 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
-			if (zs)
-				dmu_zfetch_run(zs, missed, B_TRUE);
+			if (zs) {
+				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
+				    B_TRUE);
+			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
@@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zfs_racct_write(length, nblks);
 
 	if (zs)
-		dmu_zfetch_run(zs, missed, B_TRUE);
+		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 2b2d72671001..915d99916d2e 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -65,9 +65,16 @@ unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
 #endif
 /* max bytes to prefetch indirects for per stream (default 64MB) */
 unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
+/* max request reorder distance within a stream (default 16MB) */
+unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
+/* Max log2 fraction of holes in a stream */
+unsigned int	zfetch_hole_shift = 2;
 
 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
+	kstat_named_t zfetchstat_future;
+	kstat_named_t zfetchstat_stride;
+	kstat_named_t zfetchstat_past;
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
 	kstat_named_t zfetchstat_io_issued;
@@ -76,6 +83,9 @@ typedef struct zfetch_stats {
 
 static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
+	{ "future",			KSTAT_DATA_UINT64 },
+	{ "stride",			KSTAT_DATA_UINT64 },
+	{ "past",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "max_streams",		KSTAT_DATA_UINT64 },
 	{ "io_issued",			KSTAT_DATA_UINT64 },
@@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = {
 
 struct {
 	wmsum_t zfetchstat_hits;
+	wmsum_t zfetchstat_future;
+	wmsum_t zfetchstat_stride;
+	wmsum_t zfetchstat_past;
 	wmsum_t zfetchstat_misses;
 	wmsum_t zfetchstat_max_streams;
 	wmsum_t zfetchstat_io_issued;
@@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
 		return (EACCES);
 	zs->zfetchstat_hits.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_hits);
+	zs->zfetchstat_future.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_future);
+	zs->zfetchstat_stride.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_stride);
+	zs->zfetchstat_past.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_past);
 	zs->zfetchstat_misses.value.ui64 =
 	    wmsum_value(&zfetch_sums.zfetchstat_misses);
 	zs->zfetchstat_max_streams.value.ui64 =
@@ -122,6 +141,9 @@ void
 zfetch_init(void)
 {
 	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
 	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
@@ -147,6 +169,9 @@ zfetch_fini(void)
 	}
 
 	wmsum_fini(&zfetch_sums.zfetchstat_hits);
+	wmsum_fini(&zfetch_sums.zfetchstat_future);
+	wmsum_fini(&zfetch_sums.zfetchstat_stride);
+	wmsum_fini(&zfetch_sums.zfetchstat_past);
 	wmsum_fini(&zfetch_sums.zfetchstat_misses);
 	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
 	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
@@ -222,22 +247,22 @@ static void
 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
 	zstream_t *zs, *zs_next, *zs_old = NULL;
-	hrtime_t now = gethrtime(), t;
+	uint_t now = gethrestime_sec(), t;
 
 	ASSERT(MUTEX_HELD(&zf->zf_lock));
 
 	/*
 	 * Delete too old streams, reusing the first found one.
 	 */
-	t = now - SEC2NSEC(zfetch_max_sec_reap);
+	t = now - zfetch_max_sec_reap;
 	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
 		/*
 		 * Skip if still active.  1 -- zf_stream reference.
 		 */
-		if (zfs_refcount_count(&zs->zs_refs) != 1)
+		if ((int)(zs->zs_atime - t) >= 0)
 			continue;
-		if (zs->zs_atime > t)
+		if (zfs_refcount_count(&zs->zs_refs) != 1)
 			continue;
 		if (zs_old)
 			dmu_zfetch_stream_remove(zf, zs);
@@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	}
 	if (zs_old) {
 		zs = zs_old;
+		list_remove(&zf->zf_stream, zs);
 		goto reuse;
 	}
 
@@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	 * for all the streams to be non-overlapping.
 	 */
 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
-	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
 	    zfetch_max_distance));
 	if (zf->zf_numstreams >= max_streams) {
-		t = now - SEC2NSEC(zfetch_min_sec_reap);
+		t = now - zfetch_min_sec_reap;
 		for (zs = list_head(&zf->zf_stream); zs != NULL;
 		    zs = list_next(&zf->zf_stream, zs)) {
-			if (zfs_refcount_count(&zs->zs_refs) != 1)
+			if ((int)(zs->zs_atime - t) >= 0)
 				continue;
-			if (zs->zs_atime > t)
+			if (zfs_refcount_count(&zs->zs_refs) != 1)
 				continue;
-			if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
+			if (zs_old == NULL ||
+			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
 				zs_old = zs;
 		}
 		if (zs_old) {
 			zs = zs_old;
+			list_remove(&zf->zf_stream, zs);
 			goto reuse;
 		}
 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
@@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	}
 
 	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
-	zs->zs_fetch = zf;
 	zfs_refcount_create(&zs->zs_callers);
 	zfs_refcount_create(&zs->zs_refs);
 	/* One reference for zf_stream. */
 	zfs_refcount_add(&zs->zs_refs, NULL);
 	zf->zf_numstreams++;
-	list_insert_head(&zf->zf_stream, zs);
 
 reuse:
+	list_insert_head(&zf->zf_stream, zs);
 	zs->zs_blkid = blkid;
+	/* Allow immediate stream reuse until first hit. */
+	zs->zs_atime = now - zfetch_min_sec_reap;
+	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
 	zs->zs_pf_dist = 0;
+	zs->zs_ipf_dist = 0;
 	zs->zs_pf_start = blkid;
 	zs->zs_pf_end = blkid;
-	zs->zs_ipf_dist = 0;
 	zs->zs_ipf_start = blkid;
 	zs->zs_ipf_end = blkid;
-	/* Allow immediate stream reuse until first hit. */
-	zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
 	zs->zs_missed = B_FALSE;
 	zs->zs_more = B_FALSE;
 }
@@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
 	aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
 }
 
+/*
+ * Process stream hit access for nblks blocks starting at zs_blkid.  Return
+ * number of blocks to proceed for after aggregation with future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+	uint_t i, j;
+
+	/* Optimize sequential accesses (no future ranges). */
+	if (zs->zs_ranges[0].start == 0)
+		goto done;
+
+	/* Look for intersections with further ranges. */
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > nblks)
+			break;
+		if (r->end >= nblks) {
+			nblks = r->end;
+			i++;
+			break;
+		}
+	}
+
+	/* Delete all found intersecting ranges, updates remaining. */
+	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+		if (zs->zs_ranges[i].start == 0)
+			break;
+		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+	}
+	if (j < ZFETCH_RANGES) {
+		zs->zs_ranges[j].start = 0;
+		zs->zs_ranges[j].end = 0;
+	}
+
+done:
+	zs->zs_blkid += nblks;
+	return (nblks);
+}
+
+/*
+ * Process future stream access for nblks blocks starting at blkid.  Return
+ * number of blocks to proceed for if future ranges reach fill threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+	ASSERT3U(blkid, >, zs->zs_blkid);
+	blkid -= zs->zs_blkid;
+	ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+	/* Search for first and last intersection or insert point. */
+	uint_t f = ZFETCH_RANGES, l = 0, i;
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > blkid + nblks)
+			break;
+		if (r->end < blkid)
+			continue;
+		if (f > i)
+			f = i;
+		if (l < i)
+			l = i;
+	}
+	if (f <= l) {
+		/* Got some intersecting range, expand it if needed. */
+		if (zs->zs_ranges[f].start > blkid)
+			zs->zs_ranges[f].start = blkid;
+		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+		if (f < l) {
+			/* Got more than one intersection, remove others. */
+			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+			}
+			zs->zs_ranges[ZFETCH_RANGES - 1].start = 0;
+			zs->zs_ranges[ZFETCH_RANGES - 1].end = 0;
+		}
+	} else if (i < ZFETCH_RANGES) {
+		/* Got no intersecting ranges, insert new one. */
+		for (l = ZFETCH_RANGES - 1; l > i; l--) {
+			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+		}
+		zs->zs_ranges[i].start = blkid;
+		zs->zs_ranges[i].end = blkid + nblks;
+	} else {
+		/* No space left to insert.  Drop the range. */
+		return (0);
+	}
+
+	/* Check if with the new access addition we reached fill threshold. */
+	if (zfetch_hole_shift >= 16)
+		return (0);
+	uint_t hole = 0;
+	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0)
+			break;
+		hole += r->start - f;
+		f = r->end;
+		if (hole <= r->end >> zfetch_hole_shift)
+			l = r->end;
+	}
+	if (l > 0)
+		return (dmu_zfetch_hit(zs, l));
+
+	return (0);
+}
+
 /*
  * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
  * associates dnode access specified with blkid and nblks arguments with
@@ -370,53 +512,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	mutex_enter(&zf->zf_lock);
 
 	/*
-	 * Find matching prefetch stream.  Depending on whether the accesses
+	 * Find perfect prefetch stream.  Depending on whether the accesses
 	 * are block-aligned, first block of the new access may either follow
 	 * the last block of the previous access, or be equal to it.
 	 */
+	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+	uint64_t end_blkid = blkid + nblks;
 	for (zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = list_next(&zf->zf_stream, zs)) {
 		if (blkid == zs->zs_blkid) {
-			break;
+			goto hit;
 		} else if (blkid + 1 == zs->zs_blkid) {
 			blkid++;
 			nblks--;
-			break;
+			goto hit;
 		}
 	}
 
 	/*
-	 * If the file is ending, remove the matching stream if found.
-	 * If not found then it is too late to create a new one now.
+	 * Find close enough prefetch stream.  Access crossing stream position
+	 * is a hit in its new part.  Access ahead of stream position considered
+	 * a hit for metadata prefetch, since we do not care about fill percent,
+	 * or stored for future otherwise.  Access behind stream position is
+	 * silently ignored, since we already skipped it reaching fill percent.
 	 */
-	uint64_t end_of_access_blkid = blkid + nblks;
-	if (end_of_access_blkid >= maxblkid) {
-		if (zs != NULL)
-			dmu_zfetch_stream_remove(zf, zs);
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
+	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
+	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid > zs->zs_blkid) {
+			if (end_blkid <= zs->zs_blkid + max_reorder) {
+				if (!fetch_data) {
+					nblks = dmu_zfetch_hit(zs,
+					    end_blkid - zs->zs_blkid);
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+					goto future;
+				}
+				nblks = dmu_zfetch_future(zs, blkid, nblks);
+				if (nblks > 0)
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+				else
+					ZFETCHSTAT_BUMP(zfetchstat_future);
+				goto future;
+			}
+		} else if (end_blkid >= zs->zs_blkid) {
+			nblks -= zs->zs_blkid - blkid;
+			blkid += zs->zs_blkid - blkid;
+			goto hit;
+		} else if (end_blkid + max_reorder > zs->zs_blkid &&
+		    (int)(zs->zs_atime - t) >= 0) {
+			ZFETCHSTAT_BUMP(zfetchstat_past);
+			zs->zs_atime = gethrestime_sec();
+			goto out;
+		}
 	}
 
-	/* Exit if we already prefetched this block before. */
-	if (nblks == 0) {
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
-	}
+	/*
+	 * This access is not part of any existing stream.  Create a new
+	 * stream for it unless we are at the end of file.
+	 */
+	if (end_blkid < maxblkid)
+		dmu_zfetch_stream_create(zf, end_blkid);
+	mutex_exit(&zf->zf_lock);
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+	ZFETCHSTAT_BUMP(zfetchstat_misses);
+	return (NULL);
 
-	if (zs == NULL) {
-		/*
-		 * This access is not part of any existing stream.  Create
-		 * a new stream for it.
-		 */
-		dmu_zfetch_stream_create(zf, end_of_access_blkid);
+hit:
+	nblks = dmu_zfetch_hit(zs, nblks);
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+future:
+	zs->zs_atime = gethrestime_sec();
+
+	/* Exit if we already prefetched for this position before. */
+	if (nblks == 0)
+		goto out;
+
+	/* If the file is ending, remove the stream. */
+	end_blkid = zs->zs_blkid;
+	if (end_blkid >= maxblkid) {
+		dmu_zfetch_stream_remove(zf, zs);
+out:
 		mutex_exit(&zf->zf_lock);
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
 		return (NULL);
 	}
 
@@ -432,7 +613,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	 * than ~6% of ARC held by active prefetches.  It should help with
 	 * getting out of RAM on some badly mispredicted read patterns.
 	 */
-	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
 	unsigned int nbytes = nblks << dbs;
 	unsigned int pf_nblks;
 	if (fetch_data) {
@@ -452,10 +632,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	} else {
 		pf_nblks = 0;
 	}
-	if (zs->zs_pf_start < end_of_access_blkid)
-		zs->zs_pf_start = end_of_access_blkid;
-	if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
-		zs->zs_pf_end = end_of_access_blkid + pf_nblks;
+	if (zs->zs_pf_start < end_blkid)
+		zs->zs_pf_start = end_blkid;
+	if (zs->zs_pf_end < end_blkid + pf_nblks)
+		zs->zs_pf_end = end_blkid + pf_nblks;
 
 	/*
 	 * Do the same for indirects, starting where we will stop reading
@@ -473,9 +653,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
 		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
 
-	zs->zs_blkid = end_of_access_blkid;
-	/* Protect the stream from reclamation. */
-	zs->zs_atime = gethrtime();
 	zfs_refcount_add(&zs->zs_refs, NULL);
 	/* Count concurrent callers. */
 	zfs_refcount_add(&zs->zs_callers, NULL);
@@ -483,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
 
 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
 	return (zs);
 }
 
 void
-dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
+    boolean_t have_lock)
 {
-	zfetch_t *zf = zs->zs_fetch;
 	int64_t pf_start, pf_end, ipf_start, ipf_end;
 	int epbs, issued;
 
@@ -567,7 +742,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 
 	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
 	if (zs)
-		dmu_zfetch_run(zs, missed, have_lock);
+		dmu_zfetch_run(zf, zs, missed, have_lock);
 }
 
 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
@@ -590,3 +765,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
 	"Max bytes to prefetch indirects for per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
+	"Max request reorder distance within a stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
+	"Max log2 fraction of holes in a stream");

From aa5445c28ba6199a15790459290bcd16cef4422d Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 8 Apr 2024 18:23:43 -0400
Subject: [PATCH 054/116] Remove db_state DB_NOFILL checks from syncing context

Syncing context should not depend on the current state of the dbuf,
which could already have changed several times in later transaction
groups, but should rely solely on the dirty record for the
transaction group being synced.  Some of the checks already seem
impossible to hit, while for the others I think it is better to
check for the absence of data in the specific dirty record rather
than for DB_NOFILL.

Reviewed-by: Robert Evans <evansr@google.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16057
---
 module/zfs/dbuf.c | 44 +++++++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 8c42b116d7e1..d9fc6cf6af34 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4596,11 +4596,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
 		dbuf_prepare_encrypted_dnode_leaf(dr);
 
-	if (db->db_state != DB_NOFILL &&
+	if (*datap != NULL && *datap == db->db_buf &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    zfs_refcount_count(&db->db_holds) > 1 &&
-	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
-	    *datap == db->db_buf) {
+	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
@@ -4889,11 +4888,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-		if (db->db_state != DB_NOFILL) {
-			if (dr->dt.dl.dr_data != NULL &&
-			    dr->dt.dl.dr_data != db->db_buf) {
-				arc_buf_destroy(dr->dt.dl.dr_data, db);
-			}
+		if (dr->dt.dl.dr_data != NULL &&
+		    dr->dt.dl.dr_data != db->db_buf) {
+			arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	} else {
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -5096,21 +5093,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
 	os = dn->dn_objset;
 
-	if (db->db_state != DB_NOFILL) {
-		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
-			/*
-			 * Private object buffers are released here rather
-			 * than in dbuf_dirty() since they are only modified
-			 * in the syncing context and we don't want the
-			 * overhead of making multiple copies of the data.
-			 */
-			if (BP_IS_HOLE(db->db_blkptr)) {
-				arc_buf_thaw(data);
-			} else {
-				dbuf_release_bp(db);
-			}
-			dbuf_remap(dn, db, tx);
-		}
+	if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+		/*
+		 * Private object buffers are released here rather than in
+		 * dbuf_dirty() since they are only modified in the syncing
+		 * context and we don't want the overhead of making multiple
+		 * copies of the data.
+		 */
+		if (BP_IS_HOLE(db->db_blkptr))
+			arc_buf_thaw(data);
+		else
+			dbuf_release_bp(db);
+		dbuf_remap(dn, db, tx);
 	}
 
 	if (parent != dn->dn_dbuf) {
@@ -5146,7 +5140,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
-	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+	wp_flag |= (data == NULL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 
@@ -5178,7 +5172,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
 		    dr->dt.dl.dr_brtwrite);
 		mutex_exit(&db->db_mtx);
-	} else if (db->db_state == DB_NOFILL) {
+	} else if (data == NULL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(pio, os->os_spa, txg,

From f07389d3ad48ba21480dedcd79b75fe0a31e27bc Mon Sep 17 00:00:00 2001
From: Maxim Filimonov <part1zano@users.noreply.github.com>
Date: Tue, 9 Apr 2024 02:37:41 +0400
Subject: [PATCH 055/116] Fix locale-specific time

In `zpool status -t`, scrub date/time is reported using the C locale,
while trim time is reported using the current one. This is inconsistent.
This patch fixes that.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Maxim Filimonov <che@bein.link>
Closes #15878
Closes #15879
---
 cmd/zpool/zpool_main.c   | 10 ++++------
 lib/libzfs/libzfs_pool.c |  6 ++++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 9df5df0328b3..d670cd1afeb1 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -2289,7 +2289,6 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
 		    !vs->vs_scan_removing) {
 			char zbuf[1024];
 			char tbuf[256];
-			struct tm zaction_ts;
 
 			time_t t = vs->vs_initialize_action_time;
 			int initialize_pct = 100;
@@ -2299,8 +2298,8 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
 				    100 / (vs->vs_initialize_bytes_est + 1));
 			}
 
-			(void) localtime_r(&t, &zaction_ts);
-			(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+			(void) ctime_r(&t, tbuf);
+			tbuf[24] = 0;
 
 			switch (vs->vs_initialize_state) {
 			case VDEV_INITIALIZE_SUSPENDED:
@@ -2340,7 +2339,6 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose)
 		    !vs->vs_scan_removing) {
 			char zbuf[1024];
 			char tbuf[256];
-			struct tm zaction_ts;
 
 			time_t t = vs->vs_trim_action_time;
 			int trim_pct = 100;
@@ -2349,8 +2347,8 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose)
 				    100 / (vs->vs_trim_bytes_est + 1));
 			}
 
-			(void) localtime_r(&t, &zaction_ts);
-			(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+			(void) ctime_r(&t, tbuf);
+			tbuf[24] = 0;
 
 			switch (vs->vs_trim_state) {
 			case VDEV_TRIM_SUSPENDED:
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index b42e93e3db5d..979bbdd3809a 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -1900,7 +1900,8 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
 	(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
 
 	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
-	    strftime(timestr, 128, "%c", &t) != 0) {
+	    ctime_r((time_t *)&rewindto, timestr) != NULL) {
+		timestr[24] = 0;
 		if (dryrun) {
 			(void) printf(dgettext(TEXT_DOMAIN,
 			    "Would be able to return %s "
@@ -1962,7 +1963,8 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
 	    "Recovery is possible, but will result in some data loss.\n"));
 
 	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
-	    strftime(timestr, 128, "%c", &t) != 0) {
+	    ctime_r((time_t *)&rewindto, timestr) != NULL) {
+		timestr[24] = 0;
 		(void) printf(dgettext(TEXT_DOMAIN,
 		    "\tReturning the pool to its state as of %s\n"
 		    "\tshould correct the problem.  "),

From 162cc80b8144698b44b5b168dc1853341277de10 Mon Sep 17 00:00:00 2001
From: Benda Xu <heroxbd@gmail.com>
Date: Tue, 9 Apr 2024 07:52:24 +0800
Subject: [PATCH 056/116] etc/init.d: decide which variant to use at build
 time.

Let Debian use the sysv-rc variant of the script, even when OpenRC is
installed. Unlike on Gentoo, OpenRC on Debian consumes both the
sysv-rc scripts and OpenRC ones. ZFS initscripts on Debian should be
the sysv-rc version to provide most compatibility and to integrate
with the rest of initscripts for dependency tracking.

Restrict the substitution in the Makefile to the dedicated list.

This construct is inspired by Mo Zhou's detection of the execution
shell and follows the strategy of Peter in 6ef28c526ba7.

As of 2024, the initscripts are mostly relevant on Debian, Gentoo and
their derivatives.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Benda Xu <orv@debian.org>
Issue #8063
Issue #8204
Issue #8359
Closes #15977
---
 config/Substfiles.am       | 1 +
 config/zfs-build.m4        | 8 +++++---
 etc/init.d/README.md       | 6 +-----
 etc/init.d/zfs-import.in   | 2 +-
 etc/init.d/zfs-load-key.in | 2 +-
 etc/init.d/zfs-mount.in    | 2 +-
 etc/init.d/zfs-share.in    | 3 ++-
 etc/init.d/zfs-zed.in      | 3 ++-
 8 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/config/Substfiles.am b/config/Substfiles.am
index 38e870b2f501..18422bf64301 100644
--- a/config/Substfiles.am
+++ b/config/Substfiles.am
@@ -18,6 +18,7 @@ subst_sed_cmd = \
 	-e 's|@ASAN_ENABLED[@]|$(ASAN_ENABLED)|g' \
 	-e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \
 	-e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \
+	-e 's|@IS_SYSV_RC[@]|$(IS_SYSV_RC)|g' \
 	-e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \
 	-e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' \
 	-e 's|@PYTHON[@]|$(PYTHON)|g' \
diff --git a/config/zfs-build.m4 b/config/zfs-build.m4
index 5f36569fe25b..bb5a85d815d1 100644
--- a/config/zfs-build.m4
+++ b/config/zfs-build.m4
@@ -578,13 +578,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [
 
 	AC_MSG_CHECKING([default shell])
 	case "$VENDOR" in
-		gentoo)     DEFAULT_INIT_SHELL="/sbin/openrc-run";;
-		alpine)     DEFAULT_INIT_SHELL="/sbin/openrc-run";;
-		*)          DEFAULT_INIT_SHELL="/bin/sh"         ;;
+		gentoo|alpine)	DEFAULT_INIT_SHELL=/sbin/openrc-run
+				IS_SYSV_RC=false	;;
+		*)		DEFAULT_INIT_SHELL=/bin/sh
+				IS_SYSV_RC=true		;;
 	esac
 
 	AC_MSG_RESULT([$DEFAULT_INIT_SHELL])
 	AC_SUBST(DEFAULT_INIT_SHELL)
+	AC_SUBST(IS_SYSV_RC)
 
 	AC_MSG_CHECKING([default nfs server init script])
 	AS_IF([test "$VENDOR" = "debian"],
diff --git a/etc/init.d/README.md b/etc/init.d/README.md
index 2de05042ce63..da780fdc1222 100644
--- a/etc/init.d/README.md
+++ b/etc/init.d/README.md
@@ -7,11 +7,7 @@ DESCRIPTION
 
   They have been tested successfully on:
 
-    * Debian GNU/Linux Wheezy
-    * Debian GNU/Linux Jessie
-    * Ubuntu Trusty
-    * CentOS 6.0
-    * CentOS 6.6
+    * Debian GNU/Linux Bookworm
     * Gentoo
 
 SUPPORT
diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in
index a9a0604f81ac..ff169eb96d86 100755
--- a/etc/init.d/zfs-import.in
+++ b/etc/init.d/zfs-import.in
@@ -307,7 +307,7 @@ do_start()
 
 # ----------------------------------------------------
 
-if [ ! -e /sbin/openrc-run ]
+if @IS_SYSV_RC@
 then
 	case "$1" in
 		start)
diff --git a/etc/init.d/zfs-load-key.in b/etc/init.d/zfs-load-key.in
index 53c7766b793a..27dfeeb0bcc5 100755
--- a/etc/init.d/zfs-load-key.in
+++ b/etc/init.d/zfs-load-key.in
@@ -104,7 +104,7 @@ do_stop()
 
 # ----------------------------------------------------
 
-if [ ! -e /sbin/openrc-run ]
+if @IS_SYSV_RC@
 then
 	case "$1" in
 		start)
diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in
index a0825f19fcdd..6a3ca5f86908 100755
--- a/etc/init.d/zfs-mount.in
+++ b/etc/init.d/zfs-mount.in
@@ -114,7 +114,7 @@ do_stop()
 
 # ----------------------------------------------------
 
-if [ ! -e /sbin/openrc-run ]
+if @IS_SYSV_RC@
 then
 	case "$1" in
 		start)
diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in
index 88978071cbf6..06c59c620b75 100755
--- a/etc/init.d/zfs-share.in
+++ b/etc/init.d/zfs-share.in
@@ -57,7 +57,8 @@ do_stop()
 
 # ----------------------------------------------------
 
-if [ ! -e /sbin/openrc-run ]; then
+if @IS_SYSV_RC@
+then
 	case "$1" in
 		start)
 			do_start
diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in
index e9cf8867403c..3d40600cea5d 100755
--- a/etc/init.d/zfs-zed.in
+++ b/etc/init.d/zfs-zed.in
@@ -93,7 +93,8 @@ do_reload()
 
 # ----------------------------------------------------
 
-if [ ! -e /sbin/openrc-run ]; then
+if @IS_SYSV_RC@
+then
 	case "$1" in
 		start)
 			do_start

From 9e63631dea553fb81fe10710e626fae26ff5c14f Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 9 Apr 2024 19:14:04 -0400
Subject: [PATCH 057/116] Small fix to prefetch ranges aggregation

When, after #16022, adding a new range aggregates more than two
existing ranges (which should be very rare, happening only if several
streams overlap), we may need to zero not the last range but an
earlier one.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16072
---
 module/zfs/dmu_zfetch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 915d99916d2e..ed50f1889b59 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -418,8 +418,8 @@ dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
 				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
 				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
 			}
-			zs->zs_ranges[ZFETCH_RANGES - 1].start = 0;
-			zs->zs_ranges[ZFETCH_RANGES - 1].end = 0;
+			zs->zs_ranges[f].start = 0;
+			zs->zs_ranges[f].end = 0;
 		}
 	} else if (i < ZFETCH_RANGES) {
 		/* Got no intersecting ranges, insert new one. */

From 997f85b4d3123286a584bbd3aaac3077a8067abb Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 9 Apr 2024 19:23:19 -0400
Subject: [PATCH 058/116] L2ARC: Relax locking during write

The previous code held the ARC state sublist lock throughout the
entire L2ARC write process, which included a number of allocations and
even ZIO issues.  If blocked in any of those places, the code could
also block ARC eviction, which could cause OOM activation or even a
deadlock if the system is low on memory or memory is too fragmented.

Fix it by dropping the lock as soon as we see a block eligible
for L2ARC writing and picking it up later using the earlier inserted
marker.  While there, also reduce the scope of the hash lock, moving
ZIO allocation and other operations not requiring header access
out of it.  All operations requiring header access move under the
hash lock, since the L2_WRITING flag does not prevent header eviction,
only transition to arc_l2c_only state with L1 header.

To be able to manipulate the sublist lock and marker as needed, add a
few more multilist functions and modify one.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16040
---
 include/sys/multilist.h |   5 +-
 module/zfs/arc.c        | 179 +++++++++++++++++++++-------------------
 module/zfs/dbuf.c       |   2 +-
 module/zfs/dmu_objset.c |  10 +--
 module/zfs/metaslab.c   |   8 +-
 module/zfs/multilist.c  |  26 +++++-
 6 files changed, 131 insertions(+), 99 deletions(-)

diff --git a/include/sys/multilist.h b/include/sys/multilist.h
index 26f37c37ab38..e7de86f2379b 100644
--- a/include/sys/multilist.h
+++ b/include/sys/multilist.h
@@ -82,12 +82,15 @@ int  multilist_is_empty(multilist_t *);
 unsigned int multilist_get_num_sublists(multilist_t *);
 unsigned int multilist_get_random_index(multilist_t *);
 
-multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_lock(multilist_sublist_t *);
+multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
 multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
 void multilist_sublist_unlock(multilist_sublist_t *);
 
 void multilist_sublist_insert_head(multilist_sublist_t *, void *);
 void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
+void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
 void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
 void multilist_sublist_remove(multilist_sublist_t *, void *);
 int  multilist_sublist_is_empty(multilist_sublist_t *);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index b1bcac6c44bc..16c95db10f47 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 
 	ASSERT3P(marker, !=, NULL);
 
-	mls = multilist_sublist_lock(ml, idx);
+	mls = multilist_sublist_lock_idx(ml, idx);
 
 	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
 	    hdr = multilist_sublist_prev(mls, marker)) {
@@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 	return (bytes_evicted);
 }
 
+static arc_buf_hdr_t *
+arc_state_alloc_marker(void)
+{
+	arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+	/*
+	 * A b_spa of 0 is used to indicate that this header is
+	 * a marker. This fact is used in arc_evict_state_impl().
+	 */
+	marker->b_spa = 0;
+
+	return (marker);
+}
+
+static void
+arc_state_free_marker(arc_buf_hdr_t *marker)
+{
+	kmem_cache_free(hdr_full_cache, marker);
+}
+
 /*
  * Allocate an array of buffer headers used as placeholders during arc state
  * eviction.
@@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count)
 	arc_buf_hdr_t **markers;
 
 	markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
-	for (int i = 0; i < count; i++) {
-		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
-		/*
-		 * A b_spa of 0 is used to indicate that this header is
-		 * a marker. This fact is used in arc_evict_state_impl().
-		 */
-		markers[i]->b_spa = 0;
-
-	}
+	for (int i = 0; i < count; i++)
+		markers[i] = arc_state_alloc_marker();
 	return (markers);
 }
 
@@ -4011,7 +4023,7 @@ static void
 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 {
 	for (int i = 0; i < count; i++)
-		kmem_cache_free(hdr_full_cache, markers[i]);
+		arc_state_free_marker(markers[i]);
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
@@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls;
 
-		mls = multilist_sublist_lock(ml, i);
+		mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_insert_tail(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
@@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	}
 
 	for (int i = 0; i < num_sublists; i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
@@ -8633,7 +8645,7 @@ l2arc_sublist_lock(int list_num)
 	 * sublists being selected.
 	 */
 	idx = multilist_get_random_index(ml);
-	return (multilist_sublist_lock(ml, idx));
+	return (multilist_sublist_lock_idx(ml, idx));
 }
 
 /*
@@ -9046,9 +9058,9 @@ l2arc_blk_fetch_done(zio_t *zio)
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
-	arc_buf_hdr_t 		*hdr, *hdr_prev, *head;
-	uint64_t 		write_asize, write_psize, write_lsize, headroom;
-	boolean_t		full;
+	arc_buf_hdr_t 		*hdr, *head, *marker;
+	uint64_t 		write_asize, write_psize, headroom;
+	boolean_t		full, from_head = !arc_warm;
 	l2arc_write_callback_t	*cb = NULL;
 	zio_t 			*pio, *wzio;
 	uint64_t 		guid = spa_load_guid(spa);
@@ -9057,10 +9069,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
 	pio = NULL;
-	write_lsize = write_asize = write_psize = 0;
+	write_asize = write_psize = 0;
 	full = B_FALSE;
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+	marker = arc_state_alloc_marker();
 
 	/*
 	 * Copy buffers for L2ARC writing.
@@ -9075,40 +9088,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				continue;
 		}
 
-		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
 		uint64_t passed_sz = 0;
-
-		VERIFY3P(mls, !=, NULL);
+		headroom = target_sz * l2arc_headroom;
+		if (zfs_compressed_arc_enabled)
+			headroom = (headroom * l2arc_headroom_boost) / 100;
 
 		/*
-		 * L2ARC fast warmup.
-		 *
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
-		if (arc_warm == B_FALSE)
+		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+		ASSERT3P(mls, !=, NULL);
+		if (from_head)
 			hdr = multilist_sublist_head(mls);
 		else
 			hdr = multilist_sublist_tail(mls);
 
-		headroom = target_sz * l2arc_headroom;
-		if (zfs_compressed_arc_enabled)
-			headroom = (headroom * l2arc_headroom_boost) / 100;
-
-		for (; hdr; hdr = hdr_prev) {
+		while (hdr != NULL) {
 			kmutex_t *hash_lock;
 			abd_t *to_write = NULL;
 
-			if (arc_warm == B_FALSE)
-				hdr_prev = multilist_sublist_next(mls, hdr);
-			else
-				hdr_prev = multilist_sublist_prev(mls, hdr);
-
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
-				/*
-				 * Skip this buffer rather than waiting.
-				 */
+skip:
+				/* Skip this buffer rather than waiting. */
+				if (from_head)
+					hdr = multilist_sublist_next(mls, hdr);
+				else
+					hdr = multilist_sublist_prev(mls, hdr);
 				continue;
 			}
 
@@ -9123,11 +9130,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 
 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
-				continue;
+				goto skip;
 			}
 
 			ASSERT(HDR_HAS_L1HDR(hdr));
-
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
@@ -9149,12 +9155,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			}
 
 			/*
-			 * We rely on the L1 portion of the header below, so
-			 * it's invalid for this header to have been evicted out
-			 * of the ghost cache, prior to being written out. The
-			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+			 * We should not sleep with sublist lock held or it
+			 * may block ARC eviction.  Insert a marker to save
+			 * the position and drop the lock.
 			 */
-			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+			if (from_head) {
+				multilist_sublist_insert_after(mls, hdr,
+				    marker);
+			} else {
+				multilist_sublist_insert_before(mls, hdr,
+				    marker);
+			}
+			multilist_sublist_unlock(mls);
 
 			/*
 			 * If this header has b_rabd, we can use this since it
@@ -9185,32 +9197,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				    &to_write);
 				if (ret != 0) {
 					arc_hdr_clear_flags(hdr,
-					    ARC_FLAG_L2_WRITING);
+					    ARC_FLAG_L2CACHE);
 					mutex_exit(hash_lock);
-					continue;
+					goto next;
 				}
 
 				l2arc_free_abd_on_write(to_write, asize, type);
 			}
 
+			hdr->b_l2hdr.b_dev = dev;
+			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+			hdr->b_l2hdr.b_hits = 0;
+			hdr->b_l2hdr.b_arcs_state =
+			    hdr->b_l1hdr.b_state->arcs_state;
+			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
-				mutex_enter(&dev->l2ad_mtx);
 				list_insert_head(&dev->l2ad_buflist, head);
-				mutex_exit(&dev->l2ad_mtx);
+			}
+			list_insert_head(&dev->l2ad_buflist, hdr);
+			mutex_exit(&dev->l2ad_mtx);
+			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+			    ARC_FLAG_L2_WRITING);
+
+			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
+			l2arc_hdr_arcstats_increment(hdr);
 
+			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
+			mutex_exit(hash_lock);
+
+			if (pio == NULL) {
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
-				/*
-				 * Create a list to save allocated abd buffers
-				 * for l2arc_log_blk_commit().
-				 */
 				list_create(&cb->l2wcb_abd_list,
 				    sizeof (l2arc_lb_abd_buf_t),
 				    offsetof(l2arc_lb_abd_buf_t, node));
@@ -9218,54 +9243,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				    ZIO_FLAG_CANFAIL);
 			}
 
-			hdr->b_l2hdr.b_dev = dev;
-			hdr->b_l2hdr.b_hits = 0;
-
-			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
-			hdr->b_l2hdr.b_arcs_state =
-			    hdr->b_l1hdr.b_state->arcs_state;
-			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
-			mutex_enter(&dev->l2ad_mtx);
-			list_insert_head(&dev->l2ad_buflist, hdr);
-			mutex_exit(&dev->l2ad_mtx);
-
-			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
-			    arc_hdr_size(hdr), hdr);
-
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
-			    hdr->b_l2hdr.b_daddr, asize, to_write,
+			    dev->l2ad_hand, asize, to_write,
 			    ZIO_CHECKSUM_OFF, NULL, hdr,
 			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
-			write_lsize += HDR_GET_LSIZE(hdr);
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
+			zio_nowait(wzio);
 
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
-			l2arc_hdr_arcstats_increment(hdr);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
-			mutex_exit(hash_lock);
-
-			/*
-			 * Append buf info to current log and commit if full.
-			 * arcstat_l2_{size,asize} kstats are updated
-			 * internally.
-			 */
-			if (l2arc_log_blk_insert(dev, hdr)) {
-				/*
-				 * l2ad_hand will be adjusted in
-				 * l2arc_log_blk_commit().
-				 */
+			if (commit) {
+				/* l2ad_hand will be adjusted inside. */
 				write_asize +=
 				    l2arc_log_blk_commit(dev, pio, cb);
 			}
 
-			zio_nowait(wzio);
+next:
+			multilist_sublist_lock(mls);
+			if (from_head)
+				hdr = multilist_sublist_next(mls, marker);
+			else
+				hdr = multilist_sublist_prev(mls, marker);
+			multilist_sublist_remove(mls, marker);
 		}
 
 		multilist_sublist_unlock(mls);
@@ -9274,9 +9279,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			break;
 	}
 
+	arc_state_free_marker(marker);
+
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
-		ASSERT0(write_lsize);
+		ASSERT0(write_psize);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);
 
@@ -10604,7 +10611,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
 	L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
 	L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
 	L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
-	L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+	L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
 
 	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
 	    HDR_GET_PSIZE(hdr));
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index d9fc6cf6af34..5f3643f573f7 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -769,7 +769,7 @@ static void
 dbuf_evict_one(void)
 {
 	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
-	multilist_sublist_t *mls = multilist_sublist_lock(
+	multilist_sublist_t *mls = multilist_sublist_lock_idx(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index f098e1daa44b..2ba26f68e398 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1665,7 +1665,7 @@ sync_dnodes_task(void *arg)
 	objset_t *os = soa->soa_os;
 
 	multilist_sublist_t *ms =
-	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 
 	dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
@@ -2076,8 +2076,8 @@ userquota_updates_task(void *arg)
 	dnode_t *dn;
 	userquota_cache_t cache = { { 0 } };
 
-	multilist_sublist_t *list =
-	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+	multilist_sublist_t *list = multilist_sublist_lock_idx(
+	    &os->os_synced_dnodes, uua->uua_sublist_idx);
 
 	ASSERT(multilist_sublist_head(list) == NULL ||
 	    dmu_objset_userused_enabled(os));
@@ -2159,8 +2159,8 @@ dnode_rele_task(void *arg)
 	userquota_updates_arg_t *uua = arg;
 	objset_t *os = uua->uua_os;
 
-	multilist_sublist_t *list =
-	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+	multilist_sublist_t *list = multilist_sublist_lock_idx(
+	    &os->os_synced_dnodes, uua->uua_sublist_idx);
 
 	dnode_t *dn;
 	while ((dn = multilist_sublist_head(list)) != NULL) {
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index c4aa98ced433..9e762357b727 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -639,7 +639,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL) {
@@ -656,7 +656,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 				i--;
 				break;
 			}
-			mls = multilist_sublist_lock(ml, i);
+			mls = multilist_sublist_lock_idx(ml, i);
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			if (txg >
@@ -2232,12 +2232,12 @@ metaslab_potentially_evict(metaslab_class_t *mc)
 		unsigned int idx = multilist_get_random_index(
 		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
-		    multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
+		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
-			VERIFY3P(mls, ==, multilist_sublist_lock(
+			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
 			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
 			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
index b1cdf1c5c5f4..3d3ef86e6839 100644
--- a/module/zfs/multilist.c
+++ b/module/zfs/multilist.c
@@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
 	return (random_in_range(ml->ml_num_sublists));
 }
 
+void
+multilist_sublist_lock(multilist_sublist_t *mls)
+{
+	mutex_enter(&mls->mls_lock);
+}
+
 /* Lock and return the sublist specified at the given index */
 multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
 {
 	multilist_sublist_t *mls;
 
@@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
 multilist_sublist_t *
 multilist_sublist_lock_obj(multilist_t *ml, void *obj)
 {
-	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+	return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
 }
 
 void
@@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
 	list_insert_tail(&mls->mls_list, obj);
 }
 
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_after(&mls->mls_list, prev, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_before(&mls->mls_list, next, obj);
+}
+
 /*
  * Move the object one element forward in the list.
  *

From d98973dbdd5a85b6c8a8556d5bd5c9903e2d2ee6 Mon Sep 17 00:00:00 2001
From: Benda Xu <heroxbd@gmail.com>
Date: Wed, 10 Apr 2024 07:34:58 +0800
Subject: [PATCH 059/116] config/Substfiles.am: restrict to the dedicated list.

We recover the scope of $(SUBSTFILES) to explicitly control what files
are being generated from the corresponding .in.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Benda Xu <orv@debian.org>
Closes #15980
---
 config/Substfiles.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/Substfiles.am b/config/Substfiles.am
index 18422bf64301..2459637abe6e 100644
--- a/config/Substfiles.am
+++ b/config/Substfiles.am
@@ -44,4 +44,4 @@ SUBSTFILES =
 CLEANFILES += $(SUBSTFILES)
 dist_noinst_DATA += $(SUBSTFILES:=.in)
 
-$(call SUBST,%,)
+$(SUBSTFILES): $(call SUBST,%,)

From e5e2a5a3b872e618af585f1a8cec4782c6f2cfe1 Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Wed, 10 Apr 2024 16:30:25 -0400
Subject: [PATCH 060/116] Add custom debug printing for your asserts

Being able to print custom debug information on assert trip
seems useful.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #15792
---
 include/os/freebsd/spl/sys/debug.h | 149 +++++++++++++++++++++++++---
 include/os/linux/spl/sys/debug.h   | 152 +++++++++++++++++++++++++----
 lib/libspl/include/assert.h        |  97 ++++++++++++++++++
 module/zfs/arc.c                   |   5 +-
 4 files changed, 372 insertions(+), 31 deletions(-)

diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h
index 785fcf62dd16..f041dde34fc8 100644
--- a/include/os/freebsd/spl/sys/debug.h
+++ b/include/os/freebsd/spl/sys/debug.h
@@ -56,11 +56,33 @@
 /*
  * Common DEBUG functionality.
  */
+#ifdef __FreeBSD__
+#include <linux/compiler.h>
+#endif
+
+#ifndef __printflike
+#define	__printflike(a, b)	__printf(a, b)
+#endif
+
+#ifndef __maybe_unused
+#define	__maybe_unused __attribute__((unused))
+#endif
+
+/*
+ * Without this, we see warnings from objtool during normal Linux builds when
+ * the kernel is built with CONFIG_STACK_VALIDATION=y:
+ *
+ * warning: objtool: tsd_create() falls through to next function __list_add()
+ * warning: objtool: .text: unexpected end of section
+ *
+ * Until the toolchain stops doing this, we must only define this attribute on
+ * spl_panic() when doing static analysis.
+ */
 #if defined(__COVERITY__) || defined(__clang_analyzer__)
 __attribute__((__noreturn__))
 #endif
 extern void spl_panic(const char *file, const char *func, int line,
-    const char *fmt, ...) __attribute__((__noreturn__));
+    const char *fmt, ...);
 extern void spl_dumpstack(void);
 
 static inline int
@@ -73,8 +95,10 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #ifndef expect
 #define	expect(expr, value) (__builtin_expect((expr), (value)))
 #endif
+#ifndef __linux__
 #define	likely(expr)   expect((expr) != 0, 1)
 #define	unlikely(expr) expect((expr) != 0, 0)
+#endif
 
 #define	PANIC(fmt, a...)						\
 	spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
@@ -84,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	    spl_assert("VERIFY(" #cond ") failed\n",			\
 	    __FILE__, __FUNCTION__, __LINE__))
 
+#define	VERIFYF(cond, str, ...)		do {				\
+		if (unlikely(!(cond)))					\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\
+	} while (0)
+
 #define	VERIFY3B(LEFT, OP, RIGHT)	do {				\
 		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
 		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
@@ -123,7 +153,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 		if (unlikely(!(_verify3_left OP _verify3_right)))	\
 		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
 		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
-		    "failed (%p " #OP " %p)\n",				\
+		    "failed (%px " #OP " %px)\n",			\
 		    (void *)_verify3_left,				\
 		    (void *)_verify3_right);				\
 	} while (0)
@@ -142,10 +172,98 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 		if (unlikely(!(0 == _verify0_right)))			\
 		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
 		    "VERIFY0P(" #RIGHT ") "				\
-		    "failed (NULL == %p)\n",				\
+		    "failed (NULL == %px)\n",				\
 		    (void *)_verify0_right);				\
 	} while (0)
 
+/*
+ * Note that you should not put any operations you want to always happen
+ * in the print section for ASSERTs unless you only want them to run on
+ * debug builds!
+ * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug
+ * builds.
+ */
+
+#define	VERIFY3BF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
+		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%d " #OP " %d) " STR "\n",			\
+		    (boolean_t)(_verify3_left),				\
+		    (boolean_t)(_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3SF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const int64_t _verify3_left = (int64_t)(LEFT);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%lld " #OP " %lld) " STR "\n",		\
+		    (long long)(_verify3_left),				\
+		    (long long)(_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3UF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uint64_t _verify3_left = (uint64_t)(LEFT);	\
+		const uint64_t _verify3_right = (uint64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%llu " #OP " %llu) " STR "\n",		\
+		    (unsigned long long)(_verify3_left),		\
+		    (unsigned long long)(_verify3_right),		\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3PF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uintptr_t _verify3_left = (uintptr_t)(LEFT);	\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%px " #OP " %px) " STR "\n",		\
+		    (void *) (_verify3_left),				\
+		    (void *) (_verify3_right),				\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0PF(RIGHT, STR, ...)	do {				\
+		const uintptr_t _verify3_left = (uintptr_t)(0);		\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %px) " STR "\n",			\
+		    (void *) (_verify3_right),				\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0F(RIGHT, STR, ...)	do {				\
+		const int64_t _verify3_left = (int64_t)(0);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %lld) " STR "\n",			\
+		    (long long) (_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY_IMPLY(A, B) \
+	((void)(likely((!(A)) || (B)) ||				\
+	    spl_assert("(" #A ") implies (" #B ")",			\
+	    __FILE__, __FUNCTION__, __LINE__)))
+
+#define	VERIFY_EQUIV(A, B) \
+	((void)(likely(!!(A) == !!(B)) || 				\
+	    spl_assert("(" #A ") is equivalent to (" #B ")",		\
+	    __FILE__, __FUNCTION__, __LINE__)))
+
 /*
  * Debugging disabled (--disable-debug)
  */
@@ -162,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z)))
 #define	ASSERT0(x)		((void) sizeof ((uintptr_t)(x)))
 #define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT3BF(x, y, z, str, ...)	ASSERT3B(x, y, z)
+#define	ASSERT3SF(x, y, z, str, ...)	ASSERT3S(x, y, z)
+#define	ASSERT3UF(x, y, z, str, ...)	ASSERT3U(x, y, z)
+#define	ASSERT3PF(x, y, z, str, ...)	ASSERT3P(x, y, z)
+#define	ASSERT0PF(x, str, ...)		ASSERT0P(x)
+#define	ASSERT0F(x, str, ...)		ASSERT0(x)
+#define	ASSERTF(x, str, ...)		ASSERT(x)
 #define	IMPLY(A, B)							\
 	((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B)))
 #define	EQUIV(A, B)		\
@@ -178,16 +303,16 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #define	ASSERT3P	VERIFY3P
 #define	ASSERT0		VERIFY0
 #define	ASSERT0P	VERIFY0P
+#define	ASSERT3BF	VERIFY3BF
+#define	ASSERT3SF	VERIFY3SF
+#define	ASSERT3UF	VERIFY3UF
+#define	ASSERT3PF	VERIFY3PF
+#define	ASSERT0PF	VERIFY0PF
+#define	ASSERT0F	VERIFY0F
+#define	ASSERTF		VERIFYF
 #define	ASSERT		VERIFY
-#define	IMPLY(A, B) \
-	((void)(likely((!(A)) || (B)) ||				\
-	    spl_assert("(" #A ") implies (" #B ")",			\
-	    __FILE__, __FUNCTION__, __LINE__)))
-#define	EQUIV(A, B) \
-	((void)(likely(!!(A) == !!(B)) || 				\
-	    spl_assert("(" #A ") is equivalent to (" #B ")",		\
-	    __FILE__, __FUNCTION__, __LINE__)))
-
+#define	IMPLY		VERIFY_IMPLY
+#define	EQUIV		VERIFY_EQUIV
 
 #endif /* NDEBUG */
 
diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h
index 288193ad21c5..f041dde34fc8 100644
--- a/include/os/linux/spl/sys/debug.h
+++ b/include/os/linux/spl/sys/debug.h
@@ -1,24 +1,29 @@
 /*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
  *
- *  This file is part of the SPL, Solaris Porting Layer.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
  *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ * $FreeBSD$
  */
 
 /*
@@ -47,10 +52,17 @@
 #ifndef _SPL_DEBUG_H
 #define	_SPL_DEBUG_H
 
+
 /*
  * Common DEBUG functionality.
  */
+#ifdef __FreeBSD__
+#include <linux/compiler.h>
+#endif
+
+#ifndef __printflike
 #define	__printflike(a, b)	__printf(a, b)
+#endif
 
 #ifndef __maybe_unused
 #define	__maybe_unused __attribute__((unused))
@@ -80,6 +92,14 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	return (0);
 }
 
+#ifndef expect
+#define	expect(expr, value) (__builtin_expect((expr), (value)))
+#endif
+#ifndef __linux__
+#define	likely(expr)   expect((expr) != 0, 1)
+#define	unlikely(expr) expect((expr) != 0, 0)
+#endif
+
 #define	PANIC(fmt, a...)						\
 	spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
 
@@ -88,6 +108,12 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	    spl_assert("VERIFY(" #cond ") failed\n",			\
 	    __FILE__, __FUNCTION__, __LINE__))
 
+#define	VERIFYF(cond, str, ...)		do {				\
+		if (unlikely(!(cond)))					\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY(" #cond ") failed " str "\n", __VA_ARGS__);\
+	} while (0)
+
 #define	VERIFY3B(LEFT, OP, RIGHT)	do {				\
 		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
 		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
@@ -150,6 +176,84 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 		    (void *)_verify0_right);				\
 	} while (0)
 
+/*
+ * Note that you should not put any operations you want to always happen
+ * in the print section for ASSERTs unless you only want them to run on
+ * debug builds!
+ * e.g. ASSERT3UF(2, <, 3, "%s", foo(x)), foo(x) won't run on non-debug
+ * builds.
+ */
+
+#define	VERIFY3BF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const boolean_t _verify3_left = (boolean_t)(LEFT);	\
+		const boolean_t _verify3_right = (boolean_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%d " #OP " %d) " STR "\n",			\
+		    (boolean_t)(_verify3_left),				\
+		    (boolean_t)(_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3SF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const int64_t _verify3_left = (int64_t)(LEFT);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%lld " #OP " %lld) " STR "\n",		\
+		    (long long)(_verify3_left),				\
+		    (long long)(_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3UF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uint64_t _verify3_left = (uint64_t)(LEFT);	\
+		const uint64_t _verify3_right = (uint64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%llu " #OP " %llu) " STR "\n",		\
+		    (unsigned long long)(_verify3_left),		\
+		    (unsigned long long)(_verify3_right),		\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY3PF(LEFT, OP, RIGHT, STR, ...)	do {			\
+		const uintptr_t _verify3_left = (uintptr_t)(LEFT);	\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left OP _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY3(" #LEFT " "  #OP " "  #RIGHT ") "		\
+		    "failed (%px " #OP " %px) " STR "\n",		\
+		    (void *) (_verify3_left),				\
+		    (void *) (_verify3_right),				\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0PF(RIGHT, STR, ...)	do {				\
+		const uintptr_t _verify3_left = (uintptr_t)(0);		\
+		const uintptr_t _verify3_right = (uintptr_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %px) " STR "\n",			\
+		    (void *) (_verify3_right),				\
+		    __VA_ARGS__);					\
+	} while (0)
+
+#define	VERIFY0F(RIGHT, STR, ...)	do {				\
+		const int64_t _verify3_left = (int64_t)(0);		\
+		const int64_t _verify3_right = (int64_t)(RIGHT);	\
+		if (unlikely(!(_verify3_left == _verify3_right)))	\
+		    spl_panic(__FILE__, __FUNCTION__, __LINE__,		\
+		    "VERIFY0(0 == " #RIGHT ") "				\
+		    "failed (0 == %lld) " STR "\n",			\
+		    (long long) (_verify3_right),			\
+		    __VA_ARGS__);					\
+	} while (0)
+
 #define	VERIFY_IMPLY(A, B) \
 	((void)(likely((!(A)) || (B)) ||				\
 	    spl_assert("(" #A ") implies (" #B ")",			\
@@ -176,6 +280,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 	((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z)))
 #define	ASSERT0(x)		((void) sizeof ((uintptr_t)(x)))
 #define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT3BF(x, y, z, str, ...)	ASSERT3B(x, y, z)
+#define	ASSERT3SF(x, y, z, str, ...)	ASSERT3S(x, y, z)
+#define	ASSERT3UF(x, y, z, str, ...)	ASSERT3U(x, y, z)
+#define	ASSERT3PF(x, y, z, str, ...)	ASSERT3P(x, y, z)
+#define	ASSERT0PF(x, str, ...)		ASSERT0P(x)
+#define	ASSERT0F(x, str, ...)		ASSERT0(x)
+#define	ASSERTF(x, str, ...)		ASSERT(x)
 #define	IMPLY(A, B)							\
 	((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B)))
 #define	EQUIV(A, B)		\
@@ -192,6 +303,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #define	ASSERT3P	VERIFY3P
 #define	ASSERT0		VERIFY0
 #define	ASSERT0P	VERIFY0P
+#define	ASSERT3BF	VERIFY3BF
+#define	ASSERT3SF	VERIFY3SF
+#define	ASSERT3UF	VERIFY3UF
+#define	ASSERT3PF	VERIFY3PF
+#define	ASSERT0PF	VERIFY0PF
+#define	ASSERT0F	VERIFY0F
+#define	ASSERTF		VERIFYF
 #define	ASSERT		VERIFY
 #define	IMPLY		VERIFY_IMPLY
 #define	EQUIV		VERIFY_EQUIV
diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h
index 57f5719c1ac1..155bbab3020a 100644
--- a/lib/libspl/include/assert.h
+++ b/lib/libspl/include/assert.h
@@ -70,6 +70,15 @@ libspl_assert(const char *buf, const char *file, const char *func, int line)
 #define	VERIFY(cond)							\
 	(void) ((!(cond)) &&						\
 	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
+
+#define	VERIFYF(cond, STR, ...)						\
+do {									\
+	if (!(cond))							\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s " STR, #cond,					\
+		    __VA_ARGS__);					\
+} while (0)
+
 #define	verify(cond)							\
 	(void) ((!(cond)) &&						\
 	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
@@ -132,6 +141,79 @@ do {									\
 		    (void *)__left);					\
 } while (0)
 
+/*
+ * This is just here because cstyle gets upset about #LEFT
+ * on a newline.
+ */
+
+/* BEGIN CSTYLED */
+#define	VERIFY3BF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const boolean_t __left = (boolean_t)(LEFT);			\
+	const boolean_t __right = (boolean_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+
+#define	VERIFY3SF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const int64_t __left = (int64_t)(LEFT);				\
+	const int64_t __right = (int64_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+
+#define	VERIFY3UF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const uint64_t __left = (uint64_t)(LEFT);			\
+	const uint64_t __right = (uint64_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+
+#define	VERIFY3PF(LEFT, OP, RIGHT, STR, ...)				\
+do {									\
+	const uintptr_t __left = (uintptr_t)(LEFT);			\
+	const uintptr_t __right = (uintptr_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx) " STR,			\
+		    #LEFT, #OP, #RIGHT,					\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right,	\
+		    __VA_ARGS__);					\
+} while (0)
+/* END CSTYLED */
+
+#define	VERIFY0F(LEFT, STR, ...)					\
+do {									\
+	const uint64_t __left = (uint64_t)(LEFT);			\
+	if (!(__left == 0))						\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s == 0 (0x%llx == 0) " STR, #LEFT,		\
+		    (u_longlong_t)__left, __VA_ARGS__);			\
+} while (0)
+
+#define	VERIFY0PF(LEFT, STR, ...)					\
+do {									\
+	const uintptr_t __left = (uintptr_t)(LEFT);			\
+	if (!(__left == 0))						\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s == 0 (%p == 0) " STR, #LEFT,			\
+		    (void *)__left, __VA_ARGS__);			\
+} while (0)
+
 #ifdef assert
 #undef assert
 #endif
@@ -147,7 +229,15 @@ do {									\
 	((void) sizeof ((uintptr_t)(x)), (void) sizeof ((uintptr_t)(z)))
 #define	ASSERT0(x)		((void) sizeof ((uintptr_t)(x)))
 #define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT3BF(x, y, z, str, ...)	ASSERT3B(x, y, z)
+#define	ASSERT3SF(x, y, z, str, ...)	ASSERT3S(x, y, z)
+#define	ASSERT3UF(x, y, z, str, ...)	ASSERT3U(x, y, z)
+#define	ASSERT3PF(x, y, z, str, ...)	ASSERT3P(x, y, z)
+#define	ASSERT0P(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERT0PF(x, str, ...)		ASSERT0P(x)
+#define	ASSERT0F(x, str, ...)		ASSERT0(x)
 #define	ASSERT(x)		((void) sizeof ((uintptr_t)(x)))
+#define	ASSERTF(x, str, ...)	ASSERT(x)
 #define	assert(x)		((void) sizeof ((uintptr_t)(x)))
 #define	IMPLY(A, B)							\
 	((void) sizeof ((uintptr_t)(A)), (void) sizeof ((uintptr_t)(B)))
@@ -160,7 +250,14 @@ do {									\
 #define	ASSERT3P	VERIFY3P
 #define	ASSERT0		VERIFY0
 #define	ASSERT0P	VERIFY0P
+#define	ASSERT3BF	VERIFY3BF
+#define	ASSERT3SF	VERIFY3SF
+#define	ASSERT3UF	VERIFY3UF
+#define	ASSERT3PF	VERIFY3PF
+#define	ASSERT0PF	VERIFY0PF
+#define	ASSERT0F	VERIFY0F
 #define	ASSERT		VERIFY
+#define	ASSERTF		VERIFYF
 #define	assert		VERIFY
 #define	IMPLY(A, B) \
 	((void)(((!(A)) || (B)) || \
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 16c95db10f47..6954051b1d19 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1960,7 +1960,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf)
 	ASSERT(HDR_ENCRYPTED(hdr));
 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
-	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+	ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf);
 
 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
 	    arc_buf_size(buf));
@@ -2083,7 +2083,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
 		 * allocate a new data buffer for the buf.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
-			ASSERT(ARC_BUF_COMPRESSED(buf));
+			ASSERTF(ARC_BUF_COMPRESSED(buf),
+			"buf %p was uncompressed", buf);
 
 			/* We need to give the buf its own b_data */
 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

From e5ddecd1a7e33bc341e7b5e8dd25d2fe478de8f2 Mon Sep 17 00:00:00 2001
From: Jason Lee <calccrypto@gmail.com>
Date: Wed, 10 Apr 2024 16:01:39 -0600
Subject: [PATCH 061/116] return NULL at end of send_progress_thread

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Jason Lee <jasonlee@lanl.gov>
Closes #16074
---
 lib/libzfs/libzfs_sendrecv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index d7b90ccb1cba..526f57ea403c 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -1053,6 +1053,7 @@ send_progress_thread(void *arg)
 		}
 	}
 	pthread_cleanup_pop(B_TRUE);
+	return (NULL);
 }
 
 static boolean_t

From 44f337be30e1502b32c8d381344044f15dd34674 Mon Sep 17 00:00:00 2001
From: Andy Fiddaman <illumos@fiddaman.net>
Date: Thu, 11 Apr 2024 22:38:22 +0100
Subject: [PATCH 062/116] Illumos#16463 zfs_ioc_recv leaks nvlist

In https://www.illumos.org/issues/16463 it was observed that
an nvlist was being leaked in zfs_ioc_recv() due to a missing
call to nvlist_free for "hidden_args".
For OpenZFS the same issue exists in zfs_ioc_recv_new() and
is addressed by this PR.

This change also properly frees nvlists in the unlikely
event that a call to get_nvlist() fails.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Signed-off-by: Andy Fiddaman <illumos@fiddaman.net>
Closes #16077
---
 module/zfs/zfs_ioctl.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index dca15f4b826d..2ac1e34dccec 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -40,6 +40,7 @@
  * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
  * Copyright (c) 2019, 2021, Klara Inc.
  * Copyright (c) 2019, Allan Jude
+ * Copyright 2024 Oxide Computer Company
  */
 
 /*
@@ -5345,8 +5346,9 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '@') == NULL ||
-	    strchr(zc->zc_value, '%'))
+	    strchr(zc->zc_value, '%') != NULL) {
 		return (SET_ERROR(EINVAL));
+	}
 
 	(void) strlcpy(tofs, zc->zc_value, sizeof (tofs));
 	tosnap = strchr(tofs, '@');
@@ -5354,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 
 	if (zc->zc_nvlist_src != 0 &&
 	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    zc->zc_iflags, &recvdprops)) != 0)
-		return (error);
+	    zc->zc_iflags, &recvdprops)) != 0) {
+		goto out;
+	}
 
 	if (zc->zc_nvlist_conf != 0 &&
 	    (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
-	    zc->zc_iflags, &localprops)) != 0)
-		return (error);
+	    zc->zc_iflags, &localprops)) != 0) {
+		goto out;
+	}
 
 	if (zc->zc_string[0])
 		origin = zc->zc_string;
@@ -5372,8 +5376,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 	error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops,
 	    NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record,
 	    &zc->zc_cookie, &zc->zc_obj, &errors);
-	nvlist_free(recvdprops);
-	nvlist_free(localprops);
 
 	/*
 	 * Now that all props, initial and delayed, are set, report the prop
@@ -5389,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 		error = SET_ERROR(EINVAL);
 	}
 
+out:
 	nvlist_free(errors);
+	nvlist_free(recvdprops);
+	nvlist_free(localprops);
 
 	return (error);
 }
@@ -5456,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 
 	if (dataset_namecheck(snapname, NULL, NULL) != 0 ||
 	    strchr(snapname, '@') == NULL ||
-	    strchr(snapname, '%'))
+	    strchr(snapname, '%') != NULL) {
 		return (SET_ERROR(EINVAL));
+	}
 
 	(void) strlcpy(tofs, snapname, sizeof (tofs));
 	tosnap = strchr(tofs, '@');
@@ -5481,15 +5487,15 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 	/* we still use "props" here for backwards compatibility */
 	error = nvlist_lookup_nvlist(innvl, "props", &recvprops);
 	if (error && error != ENOENT)
-		return (error);
+		goto out;
 
 	error = nvlist_lookup_nvlist(innvl, "localprops", &localprops);
 	if (error && error != ENOENT)
-		return (error);
+		goto out;
 
 	error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
 	if (error && error != ENOENT)
-		return (error);
+		goto out;
 
 	error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops,
 	    hidden_args, force, heal, resumable, input_fd, begin_record,
@@ -5499,9 +5505,11 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 	fnvlist_add_uint64(outnvl, "error_flags", errflags);
 	fnvlist_add_nvlist(outnvl, "errors", errors);
 
+out:
 	nvlist_free(errors);
 	nvlist_free(recvprops);
 	nvlist_free(localprops);
+	nvlist_free(hidden_args);
 
 	return (error);
 }

From bc27c494049e5282f90b103ee45d0fe12310aac4 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 10 Apr 2024 11:19:50 +1000
Subject: [PATCH 063/116] tests: add test for vdev_disk page alignment check

This provides a test driver and a set of test vectors for the page
alignment check callback function vdev_disk_check_pages_cb().

Because there's no good facility for exposing this function to a
userspace test right now, for now I'm just duplicating the function and
adding commentary to remind people to keep them in sync.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16076
---
 module/os/linux/zfs/vdev_disk.c               |   6 +
 tests/runfiles/common.run                     |   6 +
 tests/runfiles/sanity.run                     |   6 +
 tests/zfs-tests/Makefile.am                   |   3 +
 .../tests/functional/vdev_disk/.gitignore     |   1 +
 .../functional/vdev_disk/page_alignment.c     | 413 ++++++++++++++++++
 6 files changed, 435 insertions(+)
 create mode 100644 tests/zfs-tests/tests/functional/vdev_disk/.gitignore
 create mode 100644 tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index a560bca918a8..77773c4f2bf2 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -853,6 +853,11 @@ BIO_END_IO_PROTO(vbio_completion, bio, error)
  * pages) but we still have to ensure the data portion is correctly sized and
  * aligned to the logical block size, to ensure that if the kernel wants to
  * split the BIO, the two halves will still be properly aligned.
+ *
+ * NOTE: if you change this function, change the copy in
+ * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
+ * data there to validate the change you're making.
+ *
  */
 typedef struct {
 	uint_t  bmask;
@@ -863,6 +868,7 @@ typedef struct {
 static int
 vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 {
+	(void) page;
 	vdev_disk_check_pages_t *s = priv;
 
 	/*
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 912344b4edde..4295ca1b6f31 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -971,6 +971,12 @@ tests = [
     'userspace_send_encrypted', 'userspace_encrypted_13709']
 tags = ['functional', 'userquota']
 
+[tests/functional/vdev_disk:Linux]
+pre =
+post =
+tests = ['page_alignment']
+tags = ['functional', 'vdev_disk']
+
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
     'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index ab41c05b8473..598123bcd277 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -599,6 +599,12 @@ tags = ['functional', 'truncate']
 tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool']
 tags = ['functional', 'upgrade']
 
+[tests/functional/vdev_disk:Linux]
+pre =
+post =
+tests = ['page_alignment']
+tags = ['functional', 'vdev_disk']
+
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos',
     'vdev_zaps_005_pos', 'vdev_zaps_006_pos']
diff --git a/tests/zfs-tests/Makefile.am b/tests/zfs-tests/Makefile.am
index 3dd1a6452728..40a361d582a2 100644
--- a/tests/zfs-tests/Makefile.am
+++ b/tests/zfs-tests/Makefile.am
@@ -13,6 +13,9 @@ scripts_zfs_tests_functional_hkdf_PROGRAMS = %D%/tests/functional/hkdf/hkdf_test
 %C%_tests_functional_hkdf_hkdf_test_LDADD = \
 	libzpool.la
 
+scripts_zfs_tests_functional_vdev_diskdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/vdev_disk
+scripts_zfs_tests_functional_vdev_disk_PROGRAMS = %D%/tests/functional/vdev_disk/page_alignment
+
 scripts_zfs_tests_functional_cp_filesdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/cp_files
 scripts_zfs_tests_functional_cp_files_PROGRAMS = %D%/tests/functional/cp_files/seekflood
 
diff --git a/tests/zfs-tests/tests/functional/vdev_disk/.gitignore b/tests/zfs-tests/tests/functional/vdev_disk/.gitignore
new file mode 100644
index 000000000000..27653e5924fc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/vdev_disk/.gitignore
@@ -0,0 +1 @@
+page_alignment
diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
new file mode 100644
index 000000000000..98d19a1280ea
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
@@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2023, 2024, Klara Inc.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/param.h>
+#include <stdlib.h>
+
+/*
+ * This tests the vdev_disk page alignment check callback
+ * vdev_disk_check_pages_cb(). For now, this test includes a copy of that
+ * function from module/os/linux/zfs/vdev_disk.c. If you change it here,
+ * remember to change it there too, and add test data here to validate the
+ * change you're making.
+ */
+
+struct page;
+
+typedef struct {
+	uint32_t  bmask;	/* block size mask, e.g. 0x1ff for 512B */
+	uint32_t  npages;	/* pages accepted by the callback so far */
+	uint32_t  end;		/* partial-block tail of the previous page */
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+	(void) page;	/* unused: only off and len are examined */
+	vdev_disk_check_pages_t *s = priv;
+
+	/*
+	 * If we didn't finish on a block size boundary last time, then there
+	 * would be a gap if we tried to use this ABD as-is, so abort.
+	 */
+	if (s->end != 0)
+		return (1);
+
+	/*
+	 * Note if we're taking less than a full block, so we can check it
+	 * above on the next call.
+	 */
+	s->end = len & s->bmask;
+
+	/* All blocks after the first must start on a block size boundary. */
+	if (s->npages != 0 && (off & s->bmask) != 0)
+		return (1);
+
+	s->npages++;
+	return (0);	/* this page is acceptable */
+}
+
+typedef struct {
+	/* test name; a NULL name terminates a test table */
+	const char	*name;
+
+	/* block size mask (block size - 1), e.g. 0x1ff for 512B blocks */
+	uint32_t	mask;
+
+	/* amount of data to take, in bytes */
+	size_t		size;
+
+	/* [start offset in page, len to end of page or size]; len 0 ends */
+	size_t		pages[16][2];
+} page_test_t;
+
+static const page_test_t valid_tests[] = {
+	/* 512B block tests */
+	{
+		"512B blocks, 4K single page",
+		0x1ff, 0x1000, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 1K at start of page",
+		0x1ff, 0x400, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 1K at end of page",
+		0x1ff, 0x400, {
+			{ 0x0c00, 0x0400 },
+		},
+	}, {
+		"512B blocks, 1K within page, 512B start offset",
+		0x1ff, 0x400, {
+			{ 0x0200, 0x0e00 },
+		},
+	}, {
+		"512B blocks, 8K across 2x4K pages",
+		0x1ff, 0x2000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 4K across two pages, 2K start offset",
+		0x1ff, 0x1000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x0800 },
+		},
+	}, {
+		"512B blocks, 16K across 5x4K pages, 512B start offset",
+		0x1ff, 0x4000, {
+			{ 0x0200, 0x0e00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"512B blocks, 64K data, 8x8K compound pages",
+		0x1ff, 0x10000, {
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"512B blocks, 64K data, 9x8K compound pages, 512B start offset",
+		0x1ff, 0x10000, {
+			{ 0x0200, 0x1e00 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"512B blocks, 64K data, 2x16K compound pages, 8x4K pages",
+		0x1ff, 0x10000, {
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"512B blocks, 64K data, mixed 4K/8K/16K pages",
+		0x1ff, 0x10000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
+		0x1ff, 0x10000, {
+			{ 0x0400, 0x0c00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x8000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0400 },
+		},
+	},
+
+	/* 4K block tests */
+	{
+		"4K blocks, 4K single page",
+		0xfff, 0x1000, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 1K at start of page",
+		0xfff, 0x400, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 1K at end of page",
+		0xfff, 0x400, {
+			{ 0x0c00, 0x0400 },
+		},
+	}, {
+		"4K blocks, 1K within page, 512B start offset",
+		0xfff, 0x400, {
+			{ 0x0200, 0x0e00 },
+		},
+	}, {
+		"4K blocks, 8K across 2x4K pages",
+		0xfff, 0x2000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 4K across two pages, 2K start offset",
+		0xfff, 0x1000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x0800 },
+		},
+	}, {
+		"4K blocks, 16K across 5x4K pages, 512B start offset",
+		0xfff, 0x4000, {
+			{ 0x0200, 0x0e00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"4K blocks, 64K data, 8x8K compound pages",
+		0xfff, 0x10000, {
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"4K blocks, 64K data, 9x8K compound pages, 512B start offset",
+		0xfff, 0x10000, {
+			{ 0x0200, 0x1e00 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"4K blocks, 64K data, 2x16K compound pages, 8x4K pages",
+		0xfff, 0x10000, {
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 64K data, mixed 4K/8K/16K pages",
+		0xfff, 0x10000, {
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x8000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x2000 },
+		},
+	}, {
+		"4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
+		0xfff, 0x10000, {
+			{ 0x0400, 0x0c00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x8000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0400 },
+		},
+	},
+
+	{ 0 },
+};
+
+static const page_test_t invalid_tests[] = {
+	{
+		"512B blocks, 16K data, 512 leader (gang block simulation)",
+		0x1ff, 0x8000, { /* NOTE(review): 32K here, 16K in name */
+			{ 0x0, 0x0200 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x1000 },
+			{ 0x0, 0x0c00 }, /* NOTE(review): sums to 0x3e00 */
+		},
+	}, {
+		"4K blocks, 32K data, 2 incompatible spans "
+		"(gang abd simulation)",
+		0xfff, 0x8000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0800 },
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0800 },
+		},
+	},
+	{ 0 },
+};
+
+/*
+ * Run one test's pages through vdev_disk_check_pages_cb(). Returns true
+ * only if no page is rejected and the pages cover all test->size bytes.
+ */
+static bool
+run_test(const page_test_t *test, bool verbose)
+{
+	size_t rem = test->size;	/* bytes of data still to place */
+
+	vdev_disk_check_pages_t s = {
+	    .bmask = test->mask,	/* per-test, not a fixed 4K mask */
+	    .npages = 0,
+	    .end = 0,
+	};
+
+	for (int i = 0; test->pages[i][1] > 0; i++) {
+		size_t off = test->pages[i][0];
+		size_t len = test->pages[i][1];
+		size_t take = MIN(rem, len);	/* last page may be partial */
+
+		if (verbose)
+			printf("  page %d [off %zx len %zx], "
+			    "rem %zx, take %zx\n",
+			    i, off, len, rem, take);
+		if (vdev_disk_check_pages_cb(NULL, off, take, &s)) {
+			if (verbose)
+				printf("  ABORT: misalignment detected, "
+				    "rem %zx\n", rem);
+			return (false);
+		}
+		rem -= take;
+		if (rem == 0)
+			break;
+	}
+	if (rem > 0) {
+		if (verbose)
+			printf("  ABORT: ran out of pages, rem %zx\n", rem);
+		return (false);
+	}
+
+	return (true);
+}
+
+static void
+run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed)
+{
+	for (const page_test_t *test = &tests[0]; test->name; test++) {
+		bool pass = (run_test(test, false) == want); /* quiet run */
+		if (pass) {
+			printf("%s: PASS\n", test->name);
+			(*npassed)++;
+		} else {
+			printf("%s: FAIL [expected %s, got %s]\n", test->name,
+			    want ? "VALID" : "INVALID",
+			    want ? "INVALID" : "VALID");
+			run_test(test, true);	/* replay verbosely */
+		}
+		(*ntests)++;	/* every test counts toward the total */
+	}
+}
+
+int main(void) {
+	int ntests = 0, npassed = 0;
+	/* valid_tests must be accepted, invalid_tests must be rejected. */
+	run_test_set(valid_tests, true, &ntests, &npassed);
+	run_test_set(invalid_tests, false, &ntests, &npassed);
+
+	printf("\n%d/%d tests passed\n", npassed, ntests);
+	/* Exit status is 0 only when every test passed. */
+	return (ntests == npassed ? 0 : 1);
+}

From 1bf649cb0a1cc6e48dce848611ba327eb283000e Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 10 Apr 2024 13:14:13 +1000
Subject: [PATCH 064/116] vdev_disk: fix alignment check when buffer has
 non-zero starting offset

If a linear buffer spans multiple pages, and the first page has a
non-zero starting offset, the checker would not include the offset, and
so would think there was an alignment gap at the end of the first page,
rather than at the start.

That is, for a 16K buffer spread across five pages with an initial 512B
offset:

    [.XXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXXX][XXXXXXX.]

It would be interpreted as:

    [XXXXXXX.][XXXXXXXX]...

And be rejected as misaligned.

Since it's already a linear ABD, the "linearising" copy would just reuse
the buffer as-is, and the second check would fail, tripping the
VERIFY in vdev_disk_io_rw().

This commit fixes all this by including the offset in the check for
end-of-page alignment.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16076
---
 module/os/linux/zfs/vdev_disk.c                             | 2 +-
 tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 77773c4f2bf2..f3f0c0875210 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -882,7 +882,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 	 * Note if we're taking less than a full block, so we can check it
 	 * above on the next call.
 	 */
-	s->end = len & s->bmask;
+	s->end = (off+len) & s->bmask;
 
 	/* All blocks after the first must start on a block size boundary. */
 	if (s->npages != 0 && (off & s->bmask) != 0)
diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
index 98d19a1280ea..5c6d28eb2c44 100644
--- a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
+++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
@@ -61,7 +61,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 	 * Note if we're taking less than a full block, so we can check it
 	 * above on the next call.
 	 */
-	s->end = len & s->bmask;
+	s->end = (off+len) & s->bmask;
 
 	/* All blocks after the first must start on a block size boundary. */
 	if (s->npages != 0 && (off & s->bmask) != 0)

From e2035cdbf70e2d4e6f819ce6d5f6a286a152d264 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Fri, 12 Apr 2024 07:49:57 +1000
Subject: [PATCH 065/116] AUTHORS: refresh with recent new contributors

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16079
---
 .mailmap | 18 ++++++++++++++++++
 AUTHORS  | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/.mailmap b/.mailmap
index 46ef016b93f8..32bdb5209613 100644
--- a/.mailmap
+++ b/.mailmap
@@ -30,6 +30,7 @@ Andreas Dilger <adilger@dilger.ca>
 Andrew Walker <awalker@ixsystems.com>
 Benedikt Neuffer <github@itfriend.de>
 Chengfei Zhu <chengfeix.zhu@intel.com>
+ChenHao Lu <18302010006@fudan.edu.cn>
 Chris Lindee <chris.lindee+github@gmail.com>
 Colm Buckley <colm@tuatha.org>
 Crag Wang <crag0715@gmail.com>
@@ -43,6 +44,7 @@ Glenn Washburn <development@efficientek.com>
 Gordan Bobic <gordan.bobic@gmail.com>
 Gregory Bartholomew <gregory.lee.bartholomew@gmail.com>
 hedong zhang <h_d_zhang@163.com>
+Ilkka Sovanto <github@ilkka.kapsi.fi>
 InsanePrawn <Insane.Prawny@gmail.com>
 Jason Cohen <jwittlincohen@gmail.com>
 Jason Harmening <jason.harmening@gmail.com>
@@ -57,6 +59,7 @@ KernelOfTruth <kerneloftruth@gmail.com>
 Liu Hua <liu.hua130@zte.com.cn>
 Liu Qing <winglq@gmail.com>
 loli10K <ezomori.nozomu@gmail.com>
+Mart Frauenlob <allkind@fastest.cc>
 Matthias Blankertz <matthias@blankertz.org>
 Michael Gmelin <grembo@FreeBSD.org>
 Olivier Mazouffre <olivier.mazouffre@ims-bordeaux.fr>
@@ -73,6 +76,9 @@ WHR <msl0000023508@gmail.com>
 Yanping Gao <yanping.gao@xtaotech.com>
 Youzhong Yang <youzhong@gmail.com>
 
+# Signed-off-by: overriding Author:
+Yuxin Wang <yuxinwang9999@gmail.com> <Bi11gates9999@gmail.com>
+
 # Commits from strange places, long ago
 Brian Behlendorf <behlendorf1@llnl.gov> <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
 Brian Behlendorf <behlendorf1@llnl.gov> <behlendo@fedora-17-amd64.(none)>
@@ -102,12 +108,15 @@ Brandon Thetford <brandon@dodecatec.com> <dodexahedron@users.noreply.github.com>
 buzzingwires <buzzingwires@outlook.com> <131118055+buzzingwires@users.noreply.github.com>
 Cedric Maunoury <cedric.maunoury@gmail.com> <38213715+cedricmaunoury@users.noreply.github.com>
 Charles Suh <charles.suh@gmail.com> <charlessuh@users.noreply.github.com>
+Chris Peredun <chris.peredun@ixsystems.com> <126915832+chrisperedun@users.noreply.github.com>
 Dacian Reece-Stremtan <dacianstremtan@gmail.com> <35844628+dacianstremtan@users.noreply.github.com>
 Damian Szuberski <szuberskidamian@gmail.com> <30863496+szubersk@users.noreply.github.com>
 Daniel Hiepler <d-git@coderdu.de> <32984777+heeplr@users.noreply.github.com>
 Daniel Kobras <d.kobras@science-computing.de> <sckobras@users.noreply.github.com>
 Daniel Reichelt <hacking@nachtgeist.net> <nachtgeist@users.noreply.github.com>
 David Quigley <david.quigley@intel.com> <dpquigl@users.noreply.github.com>
+Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com> <31087738+dennisfriedrichsen@users.noreply.github.com>
+Dex Wood <slash2314@gmail.com> <slash2314@users.noreply.github.com>
 DHE <git@dehacked.net> <DeHackEd@users.noreply.github.com>
 Dmitri John Ledkov <dimitri.ledkov@canonical.com> <19779+xnox@users.noreply.github.com>
 Dries Michiels <driesm.michiels@gmail.com> <32487486+driesmp@users.noreply.github.com>
@@ -128,6 +137,7 @@ Harry Mallon <hjmallon@gmail.com> <1816667+hjmallon@users.noreply.github.com>
 Hiếu Lê <leorize+oss@disroot.org> <alaviss@users.noreply.github.com>
 Jake Howard <git@theorangeone.net> <RealOrangeOne@users.noreply.github.com>
 James Cowgill <james.cowgill@mips.com> <jcowgill@users.noreply.github.com>
+Jaron Kent-Dobias <jaron@kent-dobias.com> <kentdobias@users.noreply.github.com>
 Jason King <jason.king@joyent.com> <jasonbking@users.noreply.github.com>
 Jeff Dike <jdike@akamai.com> <52420226+jdike@users.noreply.github.com>
 Jitendra Patidar <jitendra.patidar@nutanix.com> <53164267+jsai20@users.noreply.github.com>
@@ -137,7 +147,9 @@ John L. Hammond <john.hammond@intel.com> <35266395+jhammond-intel@users.noreply.
 John-Mark Gurney <jmg@funkthat.com> <jmgurney@users.noreply.github.com>
 John Ramsden <johnramsden@riseup.net> <johnramsden@users.noreply.github.com>
 Jonathon Fernyhough <jonathon@m2x.dev> <559369+jonathonf@users.noreply.github.com>
+Jose Luis Duran <jlduran@gmail.com> <jlduran@users.noreply.github.com>
 Justin Hibbits <chmeeedalf@gmail.com> <chmeeedalf@users.noreply.github.com>
+Kevin Greene <kevin.greene@delphix.com> <104801862+kxgreene@users.noreply.github.com>
 Kevin Jin <lostking2008@hotmail.com> <33590050+jxdking@users.noreply.github.com>
 Kevin P. Fleming <kevin@km6g.us> <kpfleming@users.noreply.github.com>
 Krzysztof Piecuch <piecuch@kpiecuch.pl> <3964215+pikrzysztof@users.noreply.github.com>
@@ -148,9 +160,11 @@ Lorenz Hüdepohl <dev@stellardeath.org> <lhuedepohl@users.noreply.github.com>
 Luís Henriques <henrix@camandro.org> <73643340+lumigch@users.noreply.github.com>
 Marcin Skarbek <git@skarbek.name> <mskarbek@users.noreply.github.com>
 Matt Fiddaman <github@m.fiddaman.uk> <81489167+matt-fidd@users.noreply.github.com>
+Maxim Filimonov <che@bein.link> <part1zano@users.noreply.github.com>
 Max Zettlmeißl <max@zettlmeissl.de> <6818198+maxz@users.noreply.github.com>
 Michael Niewöhner <foss@mniewoehner.de> <c0d3z3r0@users.noreply.github.com>
 Michael Zhivich <mzhivich@akamai.com> <33133421+mzhivich@users.noreply.github.com>
+MigeljanImeri <ImeriMigel@gmail.com> <78048439+MigeljanImeri@users.noreply.github.com>
 Mo Zhou <cdluminate@gmail.com> <5723047+cdluminate@users.noreply.github.com>
 Nick Mattis <nickm970@gmail.com> <nmattis@users.noreply.github.com>
 omni <omni+vagant@hack.org> <79493359+omnivagant@users.noreply.github.com>
@@ -164,6 +178,7 @@ Ping Huang <huangping@smartx.com> <101400146+hpingfs@users.noreply.github.com>
 Piotr P. Stefaniak <pstef@freebsd.org> <pstef@users.noreply.github.com>
 Richard Allen <belperite@gmail.com> <33836503+belperite@users.noreply.github.com>
 Rich Ercolani <rincebrain@gmail.com> <214141+rincebrain@users.noreply.github.com>
+Rick Macklem <rmacklem@uoguelph.ca> <64620010+rmacklem@users.noreply.github.com>
 Rob Wing <rob.wing@klarasystems.com> <98866084+rob-wing@users.noreply.github.com>
 Roman Strashkin <roman.strashkin@nexenta.com> <Ramzec@users.noreply.github.com>
 Ryan Hirasaki <ryanhirasaki@gmail.com> <4690732+RyanHir@users.noreply.github.com>
@@ -174,6 +189,8 @@ Scott Colby <scott@scolby.com> <scolby33@users.noreply.github.com>
 Sean Eric Fagan <kithrup@mac.com> <kithrup@users.noreply.github.com>
 Spencer Kinny <spencerkinny1995@gmail.com> <30333052+Spencer-Kinny@users.noreply.github.com>
 Srikanth N S <srikanth.nagasubbaraoseetharaman@hpe.com> <75025422+nssrikanth@users.noreply.github.com>
+Stefan Lendl <s.lendl@proxmox.com> <1321542+stfl@users.noreply.github.com>
+Thomas Bertschinger <bertschinger@lanl.gov> <101425190+bertschinger@users.noreply.github.com>
 Thomas Geppert <geppi@digitx.de> <geppi@users.noreply.github.com>
 Tim Crawford <tcrawford@datto.com> <crawfxrd@users.noreply.github.com>
 Tom Matthews <tom@axiom-partners.com> <tomtastic@users.noreply.github.com>
@@ -181,6 +198,7 @@ Tony Perkins <tperkins@datto.com> <62951051+tony-zfs@users.noreply.github.com>
 Torsten Wörtwein <twoertwein@gmail.com> <twoertwein@users.noreply.github.com>
 Tulsi Jain <tulsi.jain@delphix.com> <TulsiJain@users.noreply.github.com>
 Václav Skála <skala@vshosting.cz> <33496485+vaclavskala@users.noreply.github.com>
+Vaibhav Bhanawat <vaibhav.bhanawat@delphix.com> <88050553+vaibhav-delphix@users.noreply.github.com>
 Violet Purcell <vimproved@inventati.org> <66446404+vimproved@users.noreply.github.com>
 Vipin Kumar Verma <vipin.verma@hpe.com> <75025470+vermavipinkumar@users.noreply.github.com>
 Wolfgang Bumiller <w.bumiller@proxmox.com> <Blub@users.noreply.github.com>
diff --git a/AUTHORS b/AUTHORS
index be1efb87b34c..d7d55f42d2e7 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -88,9 +88,11 @@ CONTRIBUTORS:
     Bassu <bassu@phi9.com>
     Ben Allen <bsallen@alcf.anl.gov>
     Ben Cordero <bencord0@condi.me>
+    Benda Xu <orv@debian.org>
     Benedikt Neuffer <github@itfriend.de>
     Benjamin Albrecht <git@albrecht.io>
     Benjamin Gentil <benjgentil.pro@gmail.com>
+    Benjamin Sherman <benjamin@holyarmy.org>
     Ben McGough <bmcgough@fredhutch.org>
     Ben Rubson <ben.rubson@gmail.com>
     Ben Wolsieffer <benwolsieffer@gmail.com>
@@ -111,6 +113,7 @@ CONTRIBUTORS:
     bzzz77 <bzzz.tomas@gmail.com>
     cable2999 <cable2999@users.noreply.github.com>
     Caleb James DeLisle <calebdelisle@lavabit.com>
+    Cameron Harr <harr1@llnl.gov>
     Cao Xuewen <cao.xuewen@zte.com.cn>
     Carlo Landmeter <clandmeter@gmail.com>
     Carlos Alberto Lopez Perez <clopez@igalia.com>
@@ -120,12 +123,15 @@ CONTRIBUTORS:
     Chen Can <chen.can2@zte.com.cn>
     Chengfei Zhu <chengfeix.zhu@intel.com>
     Chen Haiquan <oc@yunify.com>
+    ChenHao Lu <18302010006@fudan.edu.cn>
     Chip Parker <aparker@enthought.com>
     Chris Burroughs <chris.burroughs@gmail.com>
+    Chris Davidson <christopher.davidson@gmail.com>
     Chris Dunlap <cdunlap@llnl.gov>
     Chris Dunlop <chris@onthe.net.au>
     Chris Lindee <chris.lindee+github@gmail.com>
     Chris McDonough <chrism@plope.com>
+    Chris Peredun <chris.peredun@ixsystems.com>
     Chris Siden <chris.siden@delphix.com>
     Chris Siebenmann <cks.github@cs.toronto.edu>
     Christer Ekholm <che@chrekh.se>
@@ -144,6 +150,7 @@ CONTRIBUTORS:
     Clint Armstrong <clint@clintarmstrong.net>
     Coleman Kane <ckane@colemankane.org>
     Colin Ian King <colin.king@canonical.com>
+    Colin Percival <cperciva@tarsnap.com>
     Colm Buckley <colm@tuatha.org>
     Crag Wang <crag0715@gmail.com>
     Craig Loomis <cloomis@astro.princeton.edu>
@@ -156,6 +163,7 @@ CONTRIBUTORS:
     Damiano Albani <damiano.albani@gmail.com>
     Damian Szuberski <szuberskidamian@gmail.com>
     Damian Wojsław <damian@wojslaw.pl>
+    Daniel Berlin <dberlin@dberlin.org>
     Daniel Hiepler <d-git@coderdu.de>
     Daniel Hoffman <dj.hoffman@delphix.com>
     Daniel Kobras <d.kobras@science-computing.de>
@@ -176,8 +184,10 @@ CONTRIBUTORS:
     David Quigley <david.quigley@intel.com>
     Debabrata Banerjee <dbanerje@akamai.com>
     D. Ebdrup <debdrup@freebsd.org>
+    Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com>
     Denys Rtveliashvili <denys@rtveliashvili.name>
     Derek Dai <daiderek@gmail.com>
+    Dex Wood <slash2314@gmail.com>
     DHE <git@dehacked.net>
     Didier Roche <didrocks@ubuntu.com>
     Dimitri John Ledkov <xnox@ubuntu.com>
@@ -235,9 +245,11 @@ CONTRIBUTORS:
     Gionatan Danti <g.danti@assyoma.it>
     Giuseppe Di Natale <guss80@gmail.com>
     Glenn Washburn <development@efficientek.com>
+    gofaster <felix.gofaster@gmail.com>
     Gordan Bobic <gordan@redsleeve.org>
     Gordon Bergling <gbergling@googlemail.com>
     Gordon Ross <gwr@nexenta.com>
+    Gordon Tetlow <gordon@freebsd.org>
     Graham Christensen <graham@grahamc.com>
     Graham Perrin <grahamperrin@gmail.com>
     Gregor Kopka <gregor@kopka.net>
@@ -265,6 +277,7 @@ CONTRIBUTORS:
     Igor Kozhukhov <ikozhukhov@gmail.com>
     Igor Lvovsky <ilvovsky@gmail.com>
     ilbsmart <wgqimut@gmail.com>
+    Ilkka Sovanto <github@ilkka.kapsi.fi>
     illiliti <illiliti@protonmail.com>
     ilovezfs <ilovezfs@icloud.com>
     InsanePrawn <Insane.Prawny@gmail.com>
@@ -280,9 +293,11 @@ CONTRIBUTORS:
     Jan Engelhardt <jengelh@inai.de>
     Jan Kryl <jan.kryl@nexenta.com>
     Jan Sanislo <oystr@cs.washington.edu>
+    Jaron Kent-Dobias <jaron@kent-dobias.com>
     Jason Cohen <jwittlincohen@gmail.com>
     Jason Harmening <jason.harmening@gmail.com>
     Jason King <jason.brian.king@gmail.com>
+    Jason Lee <jasonlee@lanl.gov>
     Jason Zaman <jasonzaman@gmail.com>
     Javen Wu <wu.javen@gmail.com>
     Jean-Baptiste Lallement <jean-baptiste@ubuntu.com>
@@ -313,6 +328,7 @@ CONTRIBUTORS:
     Jonathon Fernyhough <jonathon@m2x.dev>
     Jorgen Lundman <lundman@lundman.net>
     Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
+    Jose Luis Duran <jlduran@gmail.com>
     Josh Soref <jsoref@users.noreply.github.com>
     Joshua M. Clulow <josh@sysmgr.org>
     José Luis Salvador Rufo <salvador.joseluis@gmail.com>
@@ -336,8 +352,10 @@ CONTRIBUTORS:
     Kash Pande <kash@tripleback.net>
     Kay Pedersen <christianpe96@gmail.com>
     Keith M Wesolowski <wesolows@foobazco.org>
+    Kent Ross <k@mad.cash>
     KernelOfTruth <kerneloftruth@gmail.com>
     Kevin Bowling <kevin.bowling@kev009.com>
+    Kevin Greene <kevin.greene@delphix.com>
     Kevin Jin <lostking2008@hotmail.com>
     Kevin P. Fleming <kevin@km6g.us>
     Kevin Tanguy <kevin.tanguy@ovh.net>
@@ -389,6 +407,7 @@ CONTRIBUTORS:
     Mark Shellenbaum <Mark.Shellenbaum@Oracle.COM>
     marku89 <mar42@kola.li>
     Mark Wright <markwright@internode.on.net>
+    Mart Frauenlob <allkind@fastest.cc>
     Martin Matuska <mm@FreeBSD.org>
     Martin Rüegg <martin.rueegg@metaworx.ch>
     Massimo Maggi <me@massimo-maggi.eu>
@@ -405,6 +424,7 @@ CONTRIBUTORS:
     Matus Kral <matuskral@me.com>
     Mauricio Faria de Oliveira <mfo@canonical.com>
     Max Grossman <max.grossman@delphix.com>
+    Maxim Filimonov <che@bein.link>
     Maximilian Mehnert <maximilian.mehnert@gmx.de>
     Max Zettlmeißl <max@zettlmeissl.de>
     Md Islam <mdnahian@outlook.com>
@@ -417,6 +437,7 @@ CONTRIBUTORS:
     Michael Niewöhner <foss@mniewoehner.de>
     Michael Zhivich <mzhivich@akamai.com>
     Michal Vasilek <michal@vasilek.cz>
+    MigeljanImeri <ImeriMigel@gmail.com>
     Mike Gerdts <mike.gerdts@joyent.com>
     Mike Harsch <mike@harschsystems.com>
     Mike Leddy <mike.leddy@gmail.com>
@@ -448,6 +469,7 @@ CONTRIBUTORS:
     Olaf Faaland <faaland1@llnl.gov>
     Oleg Drokin <green@linuxhacker.ru>
     Oleg Stepura <oleg@stepura.com>
+    Olivier Certner <olce.freebsd@certner.fr>
     Olivier Mazouffre <olivier.mazouffre@ims-bordeaux.fr>
     omni <omni+vagant@hack.org>
     Orivej Desh <orivej@gmx.fr>
@@ -479,6 +501,7 @@ CONTRIBUTORS:
     Prasad Joshi <prasadjoshi124@gmail.com>
     privb0x23 <privb0x23@users.noreply.github.com>
     P.SCH <p88@yahoo.com>
+    Quartz <yyhran@163.com>
     Quentin Zdanis <zdanisq@gmail.com>
     Rafael Kitover <rkitover@gmail.com>
     RageLtMan <sempervictus@users.noreply.github.com>
@@ -491,11 +514,15 @@ CONTRIBUTORS:
     Riccardo Schirone <rschirone91@gmail.com>
     Richard Allen <belperite@gmail.com>
     Richard Elling <Richard.Elling@RichardElling.com>
+    Richard Kojedzinszky <richard@kojedz.in>
     Richard Laager <rlaager@wiktel.com>
     Richard Lowe <richlowe@richlowe.net>
     Richard Sharpe <rsharpe@samba.org>
     Richard Yao <ryao@gentoo.org>
     Rich Ercolani <rincebrain@gmail.com>
+    Rick Macklem <rmacklem@uoguelph.ca>
+    rilysh <nightquick@proton.me>
+    Robert Evans <evansr@google.com>
     Robert Novak <sailnfool@gmail.com>
     Roberto Ricci <ricci@disroot.org>
     Rob Norris <robn@despairlabs.com>
@@ -509,7 +536,9 @@ CONTRIBUTORS:
     Ryan Lahfa <masterancpp@gmail.com>
     Ryan Libby <rlibby@FreeBSD.org>
     Ryan Moeller <freqlabs@FreeBSD.org>
+    Sam Atkinson <samatk@amazon.com>
     Sam Hathaway <github.com@munkynet.org>
+    Sam James <sam@gentoo.org>
     Sam Lunt <samuel.j.lunt@gmail.com>
     Samuel VERSCHELDE <stormi-github@ylix.fr>
     Samuel Wycliffe <samuelwycliffe@gmail.com>
@@ -530,6 +559,8 @@ CONTRIBUTORS:
     Shaan Nobee <sniper111@gmail.com>
     Shampavman <sham.pavman@nexenta.com>
     Shaun Tancheff <shaun@aeonazure.com>
+    Shawn Bayern <sbayern@law.fsu.edu>
+    Shengqi Chen <harry-chen@outlook.com>
     Shen Yan <shenyanxxxy@qq.com>
     Simon Guest <simon.guest@tesujimath.org>
     Simon Klinkert <simon.klinkert@gmail.com>
@@ -537,6 +568,7 @@ CONTRIBUTORS:
     Spencer Kinny <spencerkinny1995@gmail.com>
     Srikanth N S <srikanth.nagasubbaraoseetharaman@hpe.com>
     Stanislav Seletskiy <s.seletskiy@gmail.com>
+    Stefan Lendl <s.lendl@proxmox.com>
     Steffen Müthing <steffen.muething@iwr.uni-heidelberg.de>
     Stephen Blinick <stephen.blinick@delphix.com>
     sterlingjensen <sterlingjensen@users.noreply.github.com>
@@ -557,6 +589,7 @@ CONTRIBUTORS:
     Teodor Spæren <teodor_spaeren@riseup.net>
     TerraTech <TerraTech@users.noreply.github.com>
     Thijs Cramer <thijs.cramer@gmail.com>
+    Thomas Bertschinger <bertschinger@lanl.gov>
     Thomas Geppert <geppi@digitx.de>
     Thomas Lamprecht <guggentom@hotmail.de>
     Till Maas <opensource@till.name>
@@ -586,6 +619,7 @@ CONTRIBUTORS:
     Turbo Fredriksson <turbo@bayour.com>
     Tyler J. Stachecki <stachecki.tyler@gmail.com>
     Umer Saleem <usaleem@ixsystems.com>
+    Vaibhav Bhanawat <vaibhav.bhanawat@delphix.com>
     Valmiky Arquissandas <kayvlim@gmail.com>
     Val Packett <val@packett.cool>
     Vince van Oosten <techhazard@codeforyouand.me>
@@ -614,6 +648,7 @@ CONTRIBUTORS:
     yuina822 <ayuichi@club.kyutech.ac.jp>
     YunQiang Su <syq@debian.org>
     Yuri Pankov <yuri.pankov@gmail.com>
+    Yuxin Wang <yuxinwang9999@gmail.com>
     Yuxuan Shui <yshuiv7@gmail.com>
     Zachary Bedell <zac@thebedells.org>
     Zach Dykstra <dykstra.zachary@gmail.com>

From a100a195fa490e4a816492be2efa216a6880909f Mon Sep 17 00:00:00 2001
From: Umer Saleem <usaleem@ixsystems.com>
Date: Fri, 12 Apr 2024 03:10:24 +0500
Subject: [PATCH 066/116] Add support for zfs mount -R <filesystem>

This commit adds support for mounting a dataset along with all of
its children with the '-R' flag for zfs mount. There can be scenarios
where we want to mount all datasets under one hierarchy instead of
mounting all datasets present on the system with the '-a' flag.

The '-R' flag should work on all root and non-root datasets. Usage
information and the man page have been updated for zfs mount. A test
for verifying the behavior of the '-R' flag is also added.

Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #16015
---
 cmd/zfs/zfs_main.c                            |  75 +++++++--
 man/man8/zfs-mount.8                          |   6 +-
 tests/runfiles/common.run                     |   2 +-
 tests/runfiles/sanity.run                     |   3 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../cli_root/zfs_mount/zfs_mount.cfg          |   1 +
 .../zfs_mount/zfs_mount_recursive.ksh         | 146 ++++++++++++++++++
 7 files changed, 216 insertions(+), 18 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index c2147c8f4acd..ec52c563b447 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -309,7 +309,8 @@ get_usage(zfs_help_t idx)
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
 		return (gettext("\tmount\n"
-		    "\tmount [-flvO] [-o opts] <-a | filesystem>\n"));
+		    "\tmount [-flvO] [-o opts] <-a|-R filesystem|"
+		    "filesystem>\n"));
 	case HELP_PROMOTE:
 		return (gettext("\tpromote <clone-filesystem>\n"));
 	case HELP_RECEIVE:
@@ -6754,6 +6755,8 @@ zfs_do_holds(int argc, char **argv)
 #define	MOUNT_TIME 1		/* seconds */
 
 typedef struct get_all_state {
+	char		**ga_datasets;
+	int		ga_count;
 	boolean_t	ga_verbose;
 	get_all_cb_t	*ga_cbp;
 } get_all_state_t;
@@ -6800,19 +6803,35 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
 	return (0);
 }
 
-static void
-get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
+static int
+get_recursive_datasets(zfs_handle_t *zhp, void *data)
 {
-	get_all_state_t state = {
-	    .ga_verbose = verbose,
-	    .ga_cbp = cbp
-	};
+	get_all_state_t *state = data;
+	int len = strlen(zfs_get_name(zhp));
+	for (int i = 0; i < state->ga_count; ++i) {
+		if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0)
+			return (get_one_dataset(zhp, data));
+		else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp),
+		    len) == 0) && state->ga_datasets[i][len] == '/') {
+			(void) zfs_iter_filesystems_v2(zhp, 0,
+			    get_recursive_datasets, data);
+		}
+	}
+	zfs_close(zhp);
+	return (0);
+}
 
-	if (verbose)
+static void
+get_all_datasets(get_all_state_t *state)
+{
+	if (state->ga_verbose)
 		set_progress_header(gettext("Reading ZFS config"));
-	(void) zfs_iter_root(g_zfs, get_one_dataset, &state);
+	if (state->ga_datasets == NULL)
+		(void) zfs_iter_root(g_zfs, get_one_dataset, state);
+	else
+		(void) zfs_iter_root(g_zfs, get_recursive_datasets, state);
 
-	if (verbose)
+	if (state->ga_verbose)
 		finish_progress(gettext("done."));
 }
 
@@ -7158,18 +7177,22 @@ static int
 share_mount(int op, int argc, char **argv)
 {
 	int do_all = 0;
+	int recursive = 0;
 	boolean_t verbose = B_FALSE;
 	int c, ret = 0;
 	char *options = NULL;
 	int flags = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al"))
+	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al"))
 	    != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
 			break;
+		case 'R':
+			recursive = 1;
+			break;
 		case 'v':
 			verbose = B_TRUE;
 			break;
@@ -7211,7 +7234,7 @@ share_mount(int op, int argc, char **argv)
 	argv += optind;
 
 	/* check number of arguments */
-	if (do_all) {
+	if (do_all || recursive) {
 		enum sa_protocol protocol = SA_NO_PROTOCOL;
 
 		if (op == OP_SHARE && argc > 0) {
@@ -7220,14 +7243,38 @@ share_mount(int op, int argc, char **argv)
 			argv++;
 		}
 
-		if (argc != 0) {
+		if (argc != 0 && do_all) {
 			(void) fprintf(stderr, gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
+		if (argc == 0 && recursive) {
+			(void) fprintf(stderr,
+			    gettext("no dataset provided\n"));
+			usage(B_FALSE);
+		}
+
 		start_progress_timer();
 		get_all_cb_t cb = { 0 };
-		get_all_datasets(&cb, verbose);
+		get_all_state_t state = { 0 };
+		if (argc == 0) {
+			state.ga_datasets = NULL;
+			state.ga_count = -1;
+		} else {
+			zfs_handle_t *zhp;
+			for (int i = 0; i < argc; i++) {
+				zhp = zfs_open(g_zfs, argv[i],
+				    ZFS_TYPE_FILESYSTEM);
+				if (zhp == NULL)
+					usage(B_FALSE);
+				zfs_close(zhp);
+			}
+			state.ga_datasets = argv;
+			state.ga_count = argc;
+		}
+		state.ga_verbose = verbose;
+		state.ga_cbp = &cb;
+		get_all_datasets(&state);
 
 		if (cb.cb_used == 0) {
 			free(options);
diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8
index 35aa187cf063..20dbe4d0e648 100644
--- a/man/man8/zfs-mount.8
+++ b/man/man8/zfs-mount.8
@@ -43,7 +43,7 @@
 .Cm mount
 .Op Fl Oflv
 .Op Fl o Ar options
-.Fl a Ns | Ns Ar filesystem
+.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem
 .Nm zfs
 .Cm unmount
 .Op Fl fu
@@ -61,7 +61,7 @@ Displays all ZFS file systems currently mounted.
 .Cm mount
 .Op Fl Oflv
 .Op Fl o Ar options
-.Fl a Ns | Ns Ar filesystem
+.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem
 .Xc
 Mount ZFS filesystem on a path described by its
 .Sy mountpoint
@@ -83,6 +83,8 @@ for more information.
 .It Fl a
 Mount all available ZFS file systems.
 Invoked automatically as part of the boot process if configured.
+.It Fl R
+Mount the specified filesystems along with all their children.
 .It Ar filesystem
 Mount the specified filesystem.
 .It Fl o Ar options
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 4295ca1b6f31..558cd425afd8 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -252,7 +252,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted',
     'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
-    'zfs_mount_test_race']
+    'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index 598123bcd277..d6a791e3375d 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -155,7 +155,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
-    'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race']
+    'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
+    'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index db6b4c0146a7..f182a2825cd6 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -770,6 +770,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \
+	functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_remount.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \
 	functional/cli_root/zfs_mount/zfs_multi_mount.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
index 06d25faf0356..739baf16086a 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
@@ -31,6 +31,7 @@
 export mountcmd=mount
 export mountforce="$mountcmd -f"
 export mountall="$mountcmd -a"
+export mountrecursive="$mountcmd -R"
 
 export unmountcmd=unmount
 export unmountforce="$unmountcmd -f"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh
new file mode 100755
index 000000000000..0e5cc5d6955e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh
@@ -0,0 +1,146 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, iXsystems Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
+
+#
+# DESCRIPTION:
+# Verify zfs mount -R <filesystem(s)> functionality.
+#
+# STRATEGY:
+# 1. Create nested datasets
+# 2. Unmount all datasets
+# 3. Recursively mount root datasets, this should mount all datasets
+#    present in a pool
+# 4. Unmount all datasets
+# 5. Recursively mount child datasets with children. This should mount
+#    child datasets, but not the root dataset or parent datasets
+# 6. Unmount all datasets
+# 7. Mount root dataset recursively again and confirm all child
+#    datasets are mounted.
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must datasetexists $TESTPOOL/$TESTFS1 && \
+		destroy_dataset $TESTPOOL/$TESTFS1 -R
+	log_must datasetexists $TESTPOOL/$TESTFS2 && \
+		destroy_dataset $TESTPOOL/$TESTFS2 -R
+	log_must datasetexists $TESTPOOL/$TESTFS3 && \
+		destroy_dataset $TESTPOOL/$TESTFS3 -R
+}
+
+function setup_all
+{
+	log_must datasetexists $TESTPOOL/$TESTFS || zfs create $TESTPOOL/$TESTFS
+	log_must zfs create $TESTPOOL/$TESTFS1
+	log_must zfs create $TESTPOOL/$TESTFS2
+	log_must zfs create $TESTPOOL/$TESTFS3
+	log_must zfs create $TESTPOOL/$TESTFS2/child1
+	log_must zfs create $TESTPOOL/$TESTFS2/child2
+	log_must zfs create $TESTPOOL/$TESTFS2/child3
+	log_must zfs create $TESTPOOL/$TESTFS2/child2/subchild
+	log_must zfs create $TESTPOOL/$TESTFS3/child
+}
+
+log_assert "Verify that 'zfs $mountrecursive' successfully, " \
+	"mounts the dataset along with all its children."
+
+log_onexit cleanup
+
+log_must setup_all
+
+log_must zfs $unmountall
+
+log_must zfs $mountrecursive $TESTPOOL
+
+log_must mounted $TESTPOOL
+log_must mounted $TESTPOOL/$TESTFS
+log_must mounted $TESTPOOL/$TESTFS1
+log_must mounted $TESTPOOL/$TESTFS2
+log_must mounted $TESTPOOL/$TESTFS3
+log_must mounted $TESTPOOL/$TESTFS2/child1
+log_must mounted $TESTPOOL/$TESTFS2/child2
+log_must mounted $TESTPOOL/$TESTFS2/child3
+log_must mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_must mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $unmountall
+
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_mustnot mounted $TESTPOOL/$TESTFS2
+log_mustnot mounted $TESTPOOL/$TESTFS3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child1
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2
+log_mustnot mounted $TESTPOOL/$TESTFS2/child3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_mustnot mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $mountrecursive $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3
+
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_must mounted $TESTPOOL/$TESTFS2
+log_must mounted $TESTPOOL/$TESTFS3
+log_must mounted $TESTPOOL/$TESTFS2/child1
+log_must mounted $TESTPOOL/$TESTFS2/child2
+log_must mounted $TESTPOOL/$TESTFS2/child3
+log_must mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_must mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $unmountall
+
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_mustnot mounted $TESTPOOL/$TESTFS2
+log_mustnot mounted $TESTPOOL/$TESTFS3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child1
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2
+log_mustnot mounted $TESTPOOL/$TESTFS2/child3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_mustnot mounted $TESTPOOL/$TESTFS3/child
+
+log_must zfs $mountrecursive $TESTPOOL/$TESTFS2/child2
+
+log_must mounted $TESTPOOL/$TESTFS2/child2
+log_must mounted $TESTPOOL/$TESTFS2/child2/subchild
+log_mustnot mounted $TESTPOOL
+log_mustnot mounted $TESTPOOL/$TESTFS
+log_mustnot mounted $TESTPOOL/$TESTFS1
+log_mustnot mounted $TESTPOOL/$TESTFS2
+log_mustnot mounted $TESTPOOL/$TESTFS3
+log_mustnot mounted $TESTPOOL/$TESTFS2/child1
+log_mustnot mounted $TESTPOOL/$TESTFS2/child3
+log_mustnot mounted $TESTPOOL/$TESTFS3/child
+
+log_pass "'zfs $mountrecursive' behaves as expected."

From cac416f1062fdbd2ff84ff2b40835d4853cbf190 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 4 Apr 2024 22:34:42 +1100
Subject: [PATCH 067/116] zio: remove zio_ioctl()

It only had one user, zio_flush(), and there are no other vdev ioctls
anyway.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16064
---
 include/sys/zio.h |  5 +----
 module/zfs/zio.c  | 31 ++++++++++++++++---------------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/include/sys/zio.h b/include/sys/zio.h
index 25a4b221f05e..5dcd7fe073a0 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -27,7 +27,7 @@
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2019, Allan Jude
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2023, 2024, Klara Inc.
  * Copyright (c) 2019-2020, Michael Niewöhner
  */
 
@@ -579,9 +579,6 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
-extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *priv, zio_flag_t flags);
-
 extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 08d56eef83e9..4aa08f3b30f5 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2023, 2024, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, Datto, Inc.
  */
@@ -1449,17 +1449,6 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	return (zio);
 }
 
-zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, zio_flag_t flags)
-{
-	zio_t *zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
-	    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
-	    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-	zio->io_cmd = cmd;
-	return (zio);
-}
-
 zio_t *
 zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *private, zio_priority_t priority,
@@ -1626,15 +1615,27 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
 	return (zio);
 }
 
+
+/*
+ * Send a flush command to the given vdev. Unlike most zio creation functions,
+ * the flush zios are issued immediately. You can wait on pio to pause until
+ * the flushes complete.
+ */
 void
 zio_flush(zio_t *pio, vdev_t *vd)
 {
+	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+	    ZIO_FLAG_DONT_RETRY;
+
 	if (vd->vdev_nowritecache)
 		return;
+
 	if (vd->vdev_children == 0) {
-		zio_nowait(zio_ioctl(pio, vd->vdev_spa, vd,
-		    DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL |
-		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+		zio_t *zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
+		    NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0,
+		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+		zio->io_cmd = DKIOCFLUSHWRITECACHE;
+		zio_nowait(zio);
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);

From c9c838aa1fca9aef84d74db1d99872c5efa9a25d Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 4 Apr 2024 22:34:54 +1100
Subject: [PATCH 068/116] zio: remove io_cmd and DKIOCFLUSHWRITECACHE

There are no other options, so we can just always assume it's a flush.

Includes some light refactoring where a switch statement was doing
control flow that no longer works.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16064
---
 include/os/linux/zfs/sys/trace_common.h |  6 +-
 include/sys/zio.h                       |  1 -
 module/os/freebsd/zfs/vdev_file.c       |  9 +--
 module/os/freebsd/zfs/vdev_geom.c       | 43 +++++-------
 module/os/linux/zfs/vdev_disk.c         | 39 ++++-------
 module/os/linux/zfs/vdev_file.c         | 44 +++++-------
 module/zfs/vdev_draid.c                 | 14 ++--
 module/zfs/zfs_fm.c                     |  5 +-
 module/zfs/zil.c                        | 93 ++++++++++++-------------
 module/zfs/zio.c                        |  9 +--
 10 files changed, 106 insertions(+), 157 deletions(-)

diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h
index 3d4b1920d598..6ffa57c86418 100644
--- a/include/os/linux/zfs/sys/trace_common.h
+++ b/include/os/linux/zfs/sys/trace_common.h
@@ -31,7 +31,6 @@
 /* ZIO macros */
 #define	ZIO_TP_STRUCT_ENTRY						\
 		__field(zio_type_t,		zio_type)		\
-		__field(int,			zio_cmd)		\
 		__field(zio_priority_t,		zio_priority)		\
 		__field(uint64_t,		zio_size)		\
 		__field(uint64_t,		zio_orig_size)		\
@@ -61,7 +60,6 @@
 
 #define	ZIO_TP_FAST_ASSIGN						    \
 		__entry->zio_type		= zio->io_type;		    \
-		__entry->zio_cmd		= zio->io_cmd;		    \
 		__entry->zio_priority		= zio->io_priority;	    \
 		__entry->zio_size		= zio->io_size;		    \
 		__entry->zio_orig_size		= zio->io_orig_size;	    \
@@ -90,7 +88,7 @@
 		__entry->zp_dedup_verify	= zio->io_prop.zp_dedup_verify;
 
 #define	ZIO_TP_PRINTK_FMT						\
-	"zio { type %u cmd %i prio %u size %llu orig_size %llu "	\
+	"zio { type %u prio %u size %llu orig_size %llu "		\
 	"offset %llu timestamp %llu delta %llu delay %llu "		\
 	"flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx "	\
 	"orig_stage 0x%x orig_pipeline 0x%x reexecute %u "		\
@@ -98,7 +96,7 @@
 	"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
 
 #define	ZIO_TP_PRINTK_ARGS						\
-	__entry->zio_type, __entry->zio_cmd, __entry->zio_priority,	\
+	__entry->zio_type, __entry->zio_priority,			\
 	__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset,	\
 	__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay,	\
 	__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline,	\
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 5dcd7fe073a0..545b9cf0c3c5 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -451,7 +451,6 @@ struct zio {
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
-	int		io_cmd;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c
index a65dfec86caf..888c8e7f8863 100644
--- a/module/os/freebsd/zfs/vdev_file.c
+++ b/module/os/freebsd/zfs/vdev_file.c
@@ -255,14 +255,7 @@ vdev_file_io_start(zio_t *zio)
 			return;
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC|O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
-		}
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC);
 
 		zio_execute(zio);
 		return;
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 196d67b4b595..264dfa5c9237 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -1153,42 +1153,31 @@ vdev_geom_io_start(zio_t *zio)
 
 	vd = zio->io_vd;
 
-	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
-		} else {
-			switch (zio->io_cmd) {
-			case DKIOCFLUSHWRITECACHE:
-				if (zfs_nocacheflush ||
-				    vdev_geom_bio_flush_disable)
-					break;
-				if (vd->vdev_nowritecache) {
-					zio->io_error = SET_ERROR(ENOTSUP);
-					break;
-				}
-				goto sendreq;
-			default:
-				zio->io_error = SET_ERROR(ENOTSUP);
-			}
 		}
 
-		zio_execute(zio);
-		return;
-	case ZIO_TYPE_TRIM:
-		if (!vdev_geom_bio_delete_disable) {
-			goto sendreq;
+		if (zfs_nocacheflush || vdev_geom_bio_flush_disable) {
+			zio_execute(zio);
+			return;
+		}
+
+		if (vd->vdev_nowritecache) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+			zio_execute(zio);
+			return;
+		}
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+		if (vdev_geom_bio_delete_disable) {
+			zio_execute(zio);
+			return;
 		}
-		zio_execute(zio);
-		return;
-	default:
-			;
-		/* PASSTHROUGH --- placate compiler */
 	}
-sendreq:
+
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM ||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index f3f0c0875210..554ed22b9df8 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1403,38 +1403,29 @@ vdev_disk_io_start(zio_t *zio)
 	case ZIO_TYPE_IOCTL:
 
 		if (!vdev_readable(v)) {
-			rw_exit(&vd->vd_lock);
-			zio->io_error = SET_ERROR(ENXIO);
-			zio_interrupt(zio);
-			return;
-		}
-
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			if (v->vdev_nowritecache) {
-				zio->io_error = SET_ERROR(ENOTSUP);
-				break;
-			}
-
+			/* Drive not there, can't flush */
+			error = SET_ERROR(ENXIO);
+		} else if (zfs_nocacheflush) {
+			/* Flushing disabled by operator, declare success */
+			error = 0;
+		} else if (v->vdev_nowritecache) {
+			/* This vdev not capable of flushing */
+			error = SET_ERROR(ENOTSUP);
+		} else {
+			/*
+			 * Issue the flush. If successful, the response will
+			 * be handled in the completion callback, so we're done.
+			 */
 			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
 			if (error == 0) {
 				rw_exit(&vd->vd_lock);
 				return;
 			}
-
-			zio->io_error = error;
-
-			break;
-
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
+		/* Couldn't issue the flush, so set the error and return it */
 		rw_exit(&vd->vd_lock);
+		zio->io_error = error;
 		zio_execute(zio);
 		return;
 
diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
index 5abc0426d1a7..2b483c9a9fa4 100644
--- a/module/os/linux/zfs/vdev_file.c
+++ b/module/os/linux/zfs/vdev_file.c
@@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio)
 			return;
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			/*
-			 * We cannot safely call vfs_fsync() when PF_FSTRANS
-			 * is set in the current context.  Filesystems like
-			 * XFS include sanity checks to verify it is not
-			 * already set, see xfs_vm_writepage().  Therefore
-			 * the sync must be dispatched to a different context.
-			 */
-			if (__spl_pf_fstrans_check()) {
-				VERIFY3U(taskq_dispatch(vdev_file_taskq,
-				    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
-				    TASKQID_INVALID);
-				return;
-			}
-
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC | O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
+		if (zfs_nocacheflush) {
+			zio_execute(zio);
+			return;
 		}
 
+		/*
+		 * We cannot safely call vfs_fsync() when PF_FSTRANS
+		 * is set in the current context.  Filesystems like
+		 * XFS include sanity checks to verify it is not
+		 * already set, see xfs_vm_writepage().  Therefore
+		 * the sync must be dispatched to a different context.
+		 */
+		if (__spl_pf_fstrans_check()) {
+			VERIFY3U(taskq_dispatch(vdev_file_taskq,
+			    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+			    TASKQID_INVALID);
+			return;
+		}
+
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
 		zio_execute(zio);
 		return;
 	} else if (zio->io_type == ZIO_TYPE_TRIM) {
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index ec961255fd64..7769ed6a377a 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -2557,15 +2557,11 @@ vdev_draid_spare_ioctl(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	int error = 0;
 
-	if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
-		for (int c = 0; c < vd->vdev_children; c++) {
-			zio_nowait(zio_vdev_child_io(zio, NULL,
-			    vd->vdev_child[c], zio->io_offset, zio->io_abd,
-			    zio->io_size, zio->io_type, zio->io_priority, 0,
-			    vdev_draid_spare_child_done, zio));
-		}
-	} else {
-		error = SET_ERROR(ENOTSUP);
+	for (int c = 0; c < vd->vdev_children; c++) {
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
+		    zio->io_size, zio->io_type, zio->io_priority, 0,
+		    vdev_draid_spare_child_done, zio));
 	}
 
 	return (error);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 481af2ba826b..2f43c4aa41b8 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -1096,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 		return (B_FALSE);
 
 	if (zio != NULL) {
-		/*
-		 * If this is not a read or write zio, ignore the error.  This
-		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
-		 */
+		/* If this is not a read or write zio, ignore the error */
 		if (zio->io_type != ZIO_TYPE_READ &&
 		    zio->io_type != ZIO_TYPE_WRITE)
 			return (B_FALSE);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 1af357c58006..34be54b337fd 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -125,10 +125,9 @@ static kstat_t *zil_kstats_global;
 int zil_replay_disable = 0;
 
 /*
- * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
- * the disk(s) by the ZIL after an LWB write has completed. Setting this
- * will cause ZIL corruption on power loss if a volatile out-of-order
- * write cache is enabled.
+ * Disable the flush commands that are normally sent to the disk(s) by the ZIL
+ * after an LWB write has completed. Setting this will cause ZIL corruption on
+ * power loss if a volatile out-of-order write cache is enabled.
  */
 static int zil_nocacheflush = 0;
 
@@ -1406,19 +1405,17 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 }
 
 /*
- * This function is a called after all vdevs associated with a given lwb
- * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set. Further,
- * all "previous" lwb's will have completed before this function is
- * called; i.e. this function is called for all previous lwbs before
- * it's called for "this" lwb (enforced via zio the dependencies
- * configured in zil_lwb_set_zio_dependency()).
+ * This function is called after all vdevs associated with a given lwb write
+ * have completed their flush command; or as soon as the lwb write completes,
+ * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have
+ * completed before this function is called; i.e. this function is called for
+ * all previous lwbs before it's called for "this" lwb (enforced via the zio
+ * dependencies configured in zil_lwb_set_zio_dependency()).
  *
- * The intention is for this function to be called as soon as the
- * contents of an lwb are considered "stable" on disk, and will survive
- * any sudden loss of power. At this point, any threads waiting for the
- * lwb to reach this state are signalled, and the "waiter" structures
- * are marked "done".
+ * The intention is for this function to be called as soon as the contents of
+ * an lwb are considered "stable" on disk, and will survive any sudden loss of
+ * power. At this point, any threads waiting for the lwb to reach this state
+ * are signalled, and the "waiter" structures are marked "done".
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
@@ -1532,17 +1529,16 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 }
 
 /*
- * This is called when an lwb's write zio completes. The callback's
- * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
- * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
- * in writing out this specific lwb's data, and in the case that cache
- * flushes have been deferred, vdevs involved in writing the data for
- * previous lwbs. The writes corresponding to all the vdevs in the
- * lwb_vdev_tree will have completed by the time this is called, due to
- * the zio dependencies configured in zil_lwb_set_zio_dependency(),
- * which takes deferred flushes into account. The lwb will be "done"
- * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
- * completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's purpose is
+ * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The
+ * tree will contain the vdevs involved in writing out this specific lwb's
+ * data, and in the case that cache flushes have been deferred, vdevs involved
+ * in writing the data for previous lwbs. The writes corresponding to all the
+ * vdevs in the lwb_vdev_tree will have completed by the time this is called,
+ * due to the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done" once
+ * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion
+ * callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
@@ -1601,19 +1597,18 @@ zil_lwb_write_done(zio_t *zio)
 	}
 
 	/*
-	 * If this lwb does not have any threads waiting for it to
-	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
-	 * command to the vdevs written to by "this" lwb, and instead
-	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
-	 * command for those vdevs. Thus, we merge the vdev tree of
-	 * "this" lwb with the vdev tree of the "next" lwb in the list,
-	 * and assume the "next" lwb will handle flushing the vdevs (or
-	 * deferring the flush(s) again).
+	 * If this lwb does not have any threads waiting for it to complete, we
+	 * want to defer issuing the flush command to the vdevs written to by
+	 * "this" lwb, and instead rely on the "next" lwb to handle the flush
+	 * command for those vdevs. Thus, we merge the vdev tree of "this" lwb
+	 * with the vdev tree of the "next" lwb in the list, and assume the
+	 * "next" lwb will handle flushing the vdevs (or deferring the flush(s)
+	 * again).
 	 *
-	 * This is a useful performance optimization, especially for
-	 * workloads with lots of async write activity and few sync
-	 * write and/or fsync activity, as it has the potential to
-	 * coalesce multiple flush commands to a vdev into one.
+	 * This is a useful performance optimization, especially for workloads
+	 * with lots of async write activity and few sync write and/or fsync
+	 * activity, as it has the potential to coalesce multiple flush
+	 * commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
@@ -1663,16 +1658,16 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
-	 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+	 * how we can defer the flush commands for each lwb.
 	 *
-	 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
-	 * lwb will rely on this lwb to flush the vdevs written to by that
-	 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
-	 * flush until after the previous lwb's write completes. We ensure
-	 * this ordering by setting the zio parent/child relationship here.
+	 * When the flush commands are deferred, the previous lwb will rely on
+	 * this lwb to flush the vdevs written to by that previous lwb. Thus,
+	 * we need to ensure this lwb doesn't issue the flush until after the
+	 * previous lwb's write completes. We ensure this ordering by setting
+	 * the zio parent/child relationship here.
 	 *
-	 * Without this relationship on the lwb's write zio, it's possible
-	 * for this lwb's write to complete prior to the previous lwb's write
+	 * Without this relationship on the lwb's write zio, it's possible for
+	 * this lwb's write to complete prior to the previous lwb's write
 	 * completing; and thus, the vdevs for the previous lwb would be
 	 * flushed prior to that lwb's data being written to those vdevs (the
 	 * vdevs are flushed in the lwb write zio's completion handler,
@@ -3499,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
  *      callback of the lwb's zio[*].
  *
  *      * Actually, the waiters are signaled in the zio completion
- *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
- *        that are sent to the vdevs upon completion of the lwb zio.
+ *        callback of the root zio for the flush commands that are sent to
+ *        the vdevs upon completion of the lwb zio.
  *
  *   2. When the itxs are inserted into the ZIL's queue of uncommitted
  *      itxs, the order in which they are inserted is preserved[*]; as
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 4aa08f3b30f5..031fc3d5135d 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1631,11 +1631,9 @@ zio_flush(zio_t *pio, vdev_t *vd)
 		return;
 
 	if (vd->vdev_children == 0) {
-		zio_t *zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
+		zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
 		    NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0,
-		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-		zio->io_cmd = DKIOCFLUSHWRITECACHE;
-		zio_nowait(zio);
+		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
@@ -4241,8 +4239,7 @@ zio_vdev_io_assess(zio_t *zio)
 	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
-	    zio->io_type == ZIO_TYPE_IOCTL &&
-	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+	    zio->io_type == ZIO_TYPE_IOCTL && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;
 
 	if (zio->io_error)

From b613709c46bcc0d190c0d67c739ef3f8722d76b2 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Wed, 10 Apr 2024 16:07:24 +1000
Subject: [PATCH 069/116] dkio: remove kernel dkio.h compatibility header

Without DKIOCFLUSHWRITECACHE, we no longer need the compat header. Note
that we're keeping the userspace SPL compat header, which is used by
libefi.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16064
---
 include/os/freebsd/Makefile.am    |  1 -
 include/os/freebsd/spl/sys/dkio.h | 34 ---------------------------
 include/os/linux/Makefile.am      |  1 -
 include/os/linux/spl/sys/dkio.h   | 39 -------------------------------
 include/sys/vdev_impl.h           |  1 -
 5 files changed, 76 deletions(-)
 delete mode 100644 include/os/freebsd/spl/sys/dkio.h
 delete mode 100644 include/os/linux/spl/sys/dkio.h

diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am
index d4103c2f062a..292f79b8ce72 100644
--- a/include/os/freebsd/Makefile.am
+++ b/include/os/freebsd/Makefile.am
@@ -20,7 +20,6 @@ noinst_HEADERS = \
 	%D%/spl/sys/debug.h \
 	%D%/spl/sys/dirent.h \
 	%D%/spl/sys/disp.h \
-	%D%/spl/sys/dkio.h \
 	%D%/spl/sys/fcntl.h \
 	%D%/spl/sys/file.h \
 	%D%/spl/sys/freebsd_rwlock.h \
diff --git a/include/os/freebsd/spl/sys/dkio.h b/include/os/freebsd/spl/sys/dkio.h
deleted file mode 100644
index cd747089d422..000000000000
--- a/include/os/freebsd/spl/sys/dkio.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or https://opensource.org/licenses/CDDL-1.0.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD$
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _OPENSOLARIS_SYS_DKIO_H_
-#define	_OPENSOLARIS_SYS_DKIO_H_
-
-#define	DKIOC		(0x04 << 8)
-#define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
-
-#endif /* _OPENSOLARIS_SYS_DKIO_H_ */
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
index 332569efe361..f31ae50b96af 100644
--- a/include/os/linux/Makefile.am
+++ b/include/os/linux/Makefile.am
@@ -63,7 +63,6 @@ kernel_spl_sys_HEADERS = \
 	%D%/spl/sys/ctype.h \
 	%D%/spl/sys/debug.h \
 	%D%/spl/sys/disp.h \
-	%D%/spl/sys/dkio.h \
 	%D%/spl/sys/errno.h \
 	%D%/spl/sys/fcntl.h \
 	%D%/spl/sys/file.h \
diff --git a/include/os/linux/spl/sys/dkio.h b/include/os/linux/spl/sys/dkio.h
deleted file mode 100644
index a90b67d36702..000000000000
--- a/include/os/linux/spl/sys/dkio.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- *  Copyright (C) 2007 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- *  UCRL-CODE-235197
- *
- *  This file is part of the SPL, Solaris Porting Layer.
- *
- *  The SPL is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  The SPL is distributed in the hope that it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- *  for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _SPL_DKIO_H
-#define	_SPL_DKIO_H
-
-#define	DFL_SZ(num_exts) \
-	(sizeof (dkioc_free_list_t) + (num_exts - 1) * 16)
-
-#define	DKIOC		(0x04 << 8)
-#define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
-
-/*
- * ioctl to free space (e.g. SCSI UNMAP) off a disk.
- * Pass a dkioc_free_list_t containing a list of extents to be freed.
- */
-#define	DKIOCFREE	(DKIOC|50)
-
-#endif /* _SPL_DKIO_H */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 2a93f7c680bc..95164c4546bb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -35,7 +35,6 @@
 #include <sys/nvpair.h>
 #include <sys/space_map.h>
 #include <sys/vdev.h>
-#include <sys/dkio.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>

From d7605ae77b7ad176e8dbd5649fe4d14f5f4e8b9f Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 4 Apr 2024 22:35:00 +1100
Subject: [PATCH 070/116] zio: rename ZIO_TYPE_IOCTL to ZIO_TYPE_FLUSH

The only possible ioctl is a flush, and any other kind of meta-operation
introduced in the future is likely to have different semantics (much
like trim did). So, let's just call it what it is.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16064
---
 cmd/zinject/zinject.c             | 10 +++++-----
 include/sys/fs/zfs.h              |  8 +++++++-
 include/sys/zio_impl.h            | 18 +++++++++---------
 man/man8/zpool-events.8           | 12 ++++++------
 module/os/freebsd/zfs/vdev_file.c |  2 +-
 module/os/freebsd/zfs/vdev_geom.c |  8 ++++----
 module/os/linux/zfs/vdev_disk.c   |  2 +-
 module/os/linux/zfs/vdev_file.c   |  2 +-
 module/zfs/spa.c                  |  2 +-
 module/zfs/vdev.c                 | 12 ++++++------
 module/zfs/vdev_draid.c           | 10 +++++-----
 module/zfs/zio.c                  | 12 ++++++------
 module/zfs/zio_inject.c           |  2 +-
 13 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index 07d3d8af9980..a1afa4a63feb 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -265,7 +265,7 @@ usage(void)
 	    "\t\tspa_vdev_exit() will trigger a panic.\n"
 	    "\n"
 	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
-	    "\t\t[-T <read|write|free|claim|ioctl|all>] [-f frequency] pool\n\n"
+	    "\t\t[-T <read|write|free|claim|flush|all>] [-f frequency] pool\n\n"
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
 	    "\t\t'pad1', or 'pad2'.\n"
@@ -425,7 +425,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
     void *data)
 {
 	static const char *iotypestr[] = {
-	    "null", "read", "write", "free", "claim", "ioctl", "trim", "all",
+	    "null", "read", "write", "free", "claim", "flush", "trim", "all",
 	};
 
 	int *count = data;
@@ -978,14 +978,14 @@ main(int argc, char **argv)
 				io_type = ZIO_TYPE_FREE;
 			} else if (strcasecmp(optarg, "claim") == 0) {
 				io_type = ZIO_TYPE_CLAIM;
-			} else if (strcasecmp(optarg, "ioctl") == 0) {
-				io_type = ZIO_TYPE_IOCTL;
+			} else if (strcasecmp(optarg, "flush") == 0) {
+				io_type = ZIO_TYPE_FLUSH;
 			} else if (strcasecmp(optarg, "all") == 0) {
 				io_type = ZIO_TYPES;
 			} else {
 				(void) fprintf(stderr, "invalid I/O type "
 				    "'%s': must be 'read', 'write', 'free', "
-				    "'claim', 'ioctl' or 'all'\n", optarg);
+				    "'claim', 'flush' or 'all'\n", optarg);
 				usage();
 				libzfs_fini(g_zfs);
 				return (1);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 21f99bacccf3..e191420f2d2d 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1094,11 +1094,17 @@ typedef enum zio_type {
 	ZIO_TYPE_WRITE,
 	ZIO_TYPE_FREE,
 	ZIO_TYPE_CLAIM,
-	ZIO_TYPE_IOCTL,
+	ZIO_TYPE_FLUSH,
 	ZIO_TYPE_TRIM,
 	ZIO_TYPES
 } zio_type_t;
 
+/*
+ * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to
+ * user programs.
+ */
+#define	ZIO_TYPE_IOCTL	ZIO_TYPE_FLUSH
+
 /*
  * Pool statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
index 4b3726d7eec4..2b026d48675a 100644
--- a/include/sys/zio_impl.h
+++ b/include/sys/zio_impl.h
@@ -40,7 +40,7 @@ extern "C" {
  *
  * The ZFS I/O pipeline is comprised of various stages which are defined
  * in the zio_stage enum below. The individual stages are used to construct
- * these basic I/O operations: Read, Write, Free, Claim, Ioctl and Trim.
+ * these basic I/O operations: Read, Write, Free, Claim, Flush and Trim.
  *
  * I/O operations: (XXX - provide detail for each of the operations)
  *
@@ -48,7 +48,7 @@ extern "C" {
  * Write:
  * Free:
  * Claim:
- * Ioctl:
+ * Flush:
  * Trim:
  *
  * Although the most common pipeline are used by the basic I/O operations
@@ -122,7 +122,7 @@ extern "C" {
  * zio pipeline stage definitions
  */
 enum zio_stage {
-	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCIT */
+	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCXT */
 
 	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R----- */
 	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W---- */
@@ -150,15 +150,15 @@ enum zio_stage {
 	ZIO_STAGE_DVA_FREE		= 1 << 18,	/* --F--- */
 	ZIO_STAGE_DVA_CLAIM		= 1 << 19,	/* ---C-- */
 
-	ZIO_STAGE_READY			= 1 << 20,	/* RWFCIT */
+	ZIO_STAGE_READY			= 1 << 20,	/* RWFCXT */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--IT */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW--IT */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--IT */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 21,	/* RW--XT */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 22,	/* RW--XT */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 23,	/* RW--XT */
 
 	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 24,	/* R----- */
 
-	ZIO_STAGE_DONE			= 1 << 25	/* RWFCIT */
+	ZIO_STAGE_DONE			= 1 << 25	/* RWFCXT */
 };
 
 #define	ZIO_ROOT_PIPELINE			\
@@ -259,7 +259,7 @@ enum zio_stage {
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_DVA_CLAIM)
 
-#define	ZIO_IOCTL_PIPELINE			\
+#define	ZIO_FLUSH_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_VDEV_IO_STAGES)
 
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
index 12331b7b2a2d..ef20ef4e003c 100644
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@@ -364,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data.
 .Sh I/O STAGES
 The ZFS I/O pipeline is comprised of various stages which are defined below.
 The individual stages are used to construct these basic I/O
-operations: Read, Write, Free, Claim, Ioctl and Trim.
+operations: Read, Write, Free, Claim, Flush and Trim.
 These stages may be
 set on an event to describe the life cycle of a given I/O request.
 .Pp
@@ -373,7 +373,7 @@ tab(:);
 l l l .
 Stage:Bit Mask:Operations
 _:_:_
-ZIO_STAGE_OPEN:0x00000001:RWFCIT
+ZIO_STAGE_OPEN:0x00000001:RWFCXT
 
 ZIO_STAGE_READ_BP_INIT:0x00000002:R-----
 ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W----
@@ -403,13 +403,13 @@ ZIO_STAGE_DVA_CLAIM:0x00080000:---C--
 
-ZIO_STAGE_READY:0x00100000:RWFCIT
+ZIO_STAGE_READY:0x00100000:RWFCXT
 
-ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT
-ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--IT
-ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT
+ZIO_STAGE_VDEV_IO_START:0x00200000:RW--XT
+ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT
+ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT
 
 ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----
 
-ZIO_STAGE_DONE:0x02000000:RWFCIT
+ZIO_STAGE_DONE:0x02000000:RWFCXT
 .TE
 .
 .Sh I/O FLAGS
diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c
index 888c8e7f8863..869093afa3ed 100644
--- a/module/os/freebsd/zfs/vdev_file.c
+++ b/module/os/freebsd/zfs/vdev_file.c
@@ -247,7 +247,7 @@ vdev_file_io_start(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 
-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 264dfa5c9237..9d88971919db 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -1053,7 +1053,7 @@ vdev_geom_io_intr(struct bio *bp)
 	/*
 	 * We have to split bio freeing into two parts, because the ABD code
 	 * cannot be called in this context and vdev_op_io_done is not called
-	 * for ZIO_TYPE_IOCTL zio-s.
+	 * for ZIO_TYPE_FLUSH zio-s.
 	 */
 	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
 		g_destroy_bio(bp);
@@ -1153,7 +1153,7 @@ vdev_geom_io_start(zio_t *zio)
 
 	vd = zio->io_vd;
 
-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
@@ -1181,7 +1181,7 @@ vdev_geom_io_start(zio_t *zio)
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM ||
-	    zio->io_type == ZIO_TYPE_IOCTL);
+	    zio->io_type == ZIO_TYPE_FLUSH);
 
 	cp = vd->vdev_tsd;
 	if (cp == NULL) {
@@ -1233,7 +1233,7 @@ vdev_geom_io_start(zio_t *zio)
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		break;
-	case ZIO_TYPE_IOCTL:
+	case ZIO_TYPE_FLUSH:
 		bp->bio_cmd = BIO_FLUSH;
 		bp->bio_data = NULL;
 		bp->bio_offset = cp->provider->mediasize;
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 554ed22b9df8..2cea61a6294c 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1400,7 +1400,7 @@ vdev_disk_io_start(zio_t *zio)
 	}
 
 	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
+	case ZIO_TYPE_FLUSH:
 
 		if (!vdev_readable(v)) {
 			/* Drive not there, can't flush */
diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
index 2b483c9a9fa4..ac41a2615f16 100644
--- a/module/os/linux/zfs/vdev_file.c
+++ b/module/os/linux/zfs/vdev_file.c
@@ -242,7 +242,7 @@ vdev_file_io_start(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 
-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 3704ffd08820..f67d980ae4c6 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -180,7 +180,7 @@ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
 };
 
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index ebba453e2b14..d97d0a8100c2 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4924,11 +4924,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
 			/*
 			 * TRIM ops and bytes are reported to user space as
-			 * ZIO_TYPE_IOCTL.  This is done to preserve the
+			 * ZIO_TYPE_FLUSH.  This is done to preserve the
 			 * vdev_stat_t structure layout for user space.
 			 */
 			if (type == ZIO_TYPE_TRIM)
-				vs_type = ZIO_TYPE_IOCTL;
+				vs_type = ZIO_TYPE_FLUSH;
 
 			/*
 			 * Solely for the purposes of 'zpool iostat -lqrw'
@@ -6239,12 +6239,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			case VDEV_PROP_OPS_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
-				 * space as ZIO_TYPE_IOCTL.  This is done to
+				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
-				    vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL],
+				    vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_NULL:
@@ -6275,12 +6275,12 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			case VDEV_PROP_BYTES_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
-				 * space as ZIO_TYPE_IOCTL.  This is done to
+				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
-				    vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL],
+				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_REMOVING:
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 7769ed6a377a..13bb33cc6871 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -2548,11 +2548,11 @@ vdev_draid_read_config_spare(vdev_t *vd)
 }
 
 /*
- * Handle any ioctl requested of the distributed spare.  Only flushes
- * are supported in which case all children must be flushed.
+ * Handle any flush requested of the distributed spare. All children must be
+ * flushed.
  */
 static int
-vdev_draid_spare_ioctl(zio_t *zio)
+vdev_draid_spare_flush(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	int error = 0;
@@ -2592,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio)
 	}
 
 	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
-		zio->io_error = vdev_draid_spare_ioctl(zio);
+	case ZIO_TYPE_FLUSH:
+		zio->io_error = vdev_draid_spare_flush(zio);
 		break;
 
 	case ZIO_TYPE_WRITE:
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 031fc3d5135d..8d8523038e60 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -63,7 +63,7 @@ const char *const zio_type_name[ZIO_TYPES] = {
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
-	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim"
 };
 
 int zio_dva_throttle_enabled = B_TRUE;
@@ -1632,8 +1632,8 @@ zio_flush(zio_t *pio, vdev_t *vd)
 
 	if (vd->vdev_children == 0) {
 		zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
-		    NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0,
-		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE));
+		    NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0,
+		    NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
@@ -4086,7 +4086,7 @@ zio_vdev_io_done(zio_t *zio)
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
-	    zio->io_type == ZIO_TYPE_IOCTL ||
+	    zio->io_type == ZIO_TYPE_FLUSH ||
 	    zio->io_type == ZIO_TYPE_TRIM);
 
 	if (zio->io_delay)
@@ -4094,7 +4094,7 @@ zio_vdev_io_done(zio_t *zio)
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
-		if (zio->io_type != ZIO_TYPE_IOCTL)
+		if (zio->io_type != ZIO_TYPE_FLUSH)
 			vdev_queue_io_done(zio);
 
 		if (zio_injection_enabled && zio->io_error == 0)
@@ -4239,7 +4239,7 @@ zio_vdev_io_assess(zio_t *zio)
 	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
-	    zio->io_type == ZIO_TYPE_IOCTL && vd != NULL)
+	    zio->io_type == ZIO_TYPE_FLUSH && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;
 
 	if (zio->io_error)
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 0a4851ecb40d..1af2c26f8a43 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -367,7 +367,7 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
 	 * We skip over faults in the labels unless it's during device open
 	 * (i.e. zio == NULL) or a device flush (offset is meaningless)
 	 */
-	if (zio != NULL && zio->io_type != ZIO_TYPE_IOCTL) {
+	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
 		uint64_t offset = zio->io_offset;
 
 		if (offset < VDEV_LABEL_START_SIZE ||

From b181b2e604de3f36feab1092c702cdec5e78c693 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Sat, 13 Apr 2024 02:00:20 +1000
Subject: [PATCH 071/116] bdev_discard_supported: understand
 discard_granularity=0

Kernel documentation for the discard_granularity property says:

    A discard_granularity of 0 means that the device does not support
    discard functionality.

Some older kernels had drivers (notably loop, but also some USB-SATA
adapters) that would set the QUEUE_FLAG_DISCARD capability flag, but
have discard_granularity=0. Since 5.10 (torvalds/linux@b35fd7422c2f) the
discard entry point blkdev_issue_discard() has had a check for this,
which would immediately reject the call with EOPNOTSUPP, and throw a
scary diagnostic message into the log. See #16068.

Since 6.8, the block layer sets a non-zero default for
discard_granularity (torvalds/linux@3c407dc723bb), and a future kernel
will remove the check entirely[1].

As such, there's no good reason for us to enable discard when
discard_granularity=0. The kernel will never let the request go in
anyway; better that we just disable it so we can report it properly to
the user.

1. https://patchwork.kernel.org/project/linux-block/patch/20240312144826.1045212-2-hch@lst.de/

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16068
Closes #16082
---
 include/os/linux/kernel/linux/blkdev_compat.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index f111e648ccf7..b0f398354e4f 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -563,9 +563,11 @@ static inline boolean_t
 bdev_discard_supported(struct block_device *bdev)
 {
 #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS)
-	return (!!bdev_max_discard_sectors(bdev));
+	return (bdev_max_discard_sectors(bdev) > 0 &&
+	    bdev_discard_granularity(bdev) > 0);
 #elif defined(HAVE_BLK_QUEUE_DISCARD)
-	return (!!blk_queue_discard(bdev_get_queue(bdev)));
+	return (blk_queue_discard(bdev_get_queue(bdev)) > 0 &&
+	    bdev_get_queue(bdev)->limits.discard_granularity > 0);
 #else
 #error "Unsupported kernel"
 #endif

From f22b110f60d83f62b75d20fabb0968ab74324778 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 16 Apr 2024 06:44:12 +1000
Subject: [PATCH 072/116] zts: allow running a single test by name only

Specifying a single test is kind of a hassle, because the full relative
path under the test suite dir has to be included, but it's not always
clear what that path even is.

This change allows `-t` to take the name of a single test instead of a
full path. If the value has no `/` characters, we search for a file of
that name under the test root, and if found, use that as the full test
path instead.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16088
---
 scripts/zfs-tests.sh | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh
index 179e24d7a0ef..b5b3e4ab351f 100755
--- a/scripts/zfs-tests.sh
+++ b/scripts/zfs-tests.sh
@@ -326,7 +326,8 @@ OPTIONS:
 	-d DIR      Use world-writable DIR for files and loopback devices
 	-s SIZE     Use vdevs of SIZE (default: 4G)
 	-r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES})
-	-t PATH     Run single test at PATH relative to test suite
+	-t PATH|NAME  Run single test at PATH relative to test suite,
+	                or search for test by NAME
 	-T TAGS     Comma separated list of tags (default: 'functional')
 	-u USER     Run single test as USER (default: root)
 
@@ -340,6 +341,9 @@ $0 -r linux-fast
 # Run a single test
 $0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh
 
+# Run a single test by name
+$0 -t zfs_bookmark_cliargs
+
 # Cleanup a previous run of the test suite prior to testing, run the
 # default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit.
 $0 -x
@@ -450,8 +454,15 @@ post_user = root
 post =
 outputdir = /var/tmp/test_results
 EOF
-	SINGLETESTDIR="${SINGLETEST%/*}"
+	if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then
+		NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit)
+		if [ -z "$NEWSINGLETEST" ] ; then
+			fail "couldn't find test matching '$SINGLETEST'"
+		fi
+		SINGLETEST=$NEWSINGLETEST
+	fi
 
+	SINGLETESTDIR="${SINGLETEST%/*}"
 	SETUPDIR="$SINGLETESTDIR"
 	[ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR"
 	[ -x "$SETUPDIR/setup.ksh"   ] && SETUPSCRIPT="setup"     || SETUPSCRIPT=

From 4725e543be32f74d3a0a46ce3bb5c8e89280b471 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 16 Apr 2024 06:52:20 +1000
Subject: [PATCH 073/116] zinject: "no-op" error injection

When injected, this causes the matching IO to appear to succeed, but the
actual work is never submitted to the physical device. This can be used
to simulate a write-back cache servicing a write, but the backing device
has failed and the cache cannot complete the operation in the
background.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16085
---
 cmd/zinject/zinject.c                                  |  7 ++++---
 man/man8/zinject.8                                     |  6 ++++--
 module/zfs/zio.c                                       | 10 ++++++++++
 .../tests/functional/cli_root/zinject/zinject_args.ksh |  2 +-
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index a1afa4a63feb..e9141fb4ba55 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -221,6 +221,7 @@ static const struct errstr errstrtable[] = {
 	{ ENXIO,	"nxio" },
 	{ ECHILD,	"dtl" },
 	{ EILSEQ,	"corrupt" },
+	{ ENOSYS,	"noop" },
 	{ 0, NULL },
 };
 
@@ -269,8 +270,8 @@ usage(void)
 	    "\t\tInject a fault into a particular device or the device's\n"
 	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
 	    "\t\t'pad1', or 'pad2'.\n"
-	    "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n"
-	    "\t\t'corrupt' (bit flip).\n"
+	    "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl',\n"
+	    "\t\t'corrupt' (bit flip), or 'noop' (successfully do nothing).\n"
 	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
 	    "\t\tdevice error injection to a percentage of the IOs.\n"
 	    "\n"
@@ -889,7 +890,7 @@ main(int argc, char **argv)
 			if (error < 0) {
 				(void) fprintf(stderr, "invalid error type "
 				    "'%s': must be one of: io decompress "
-				    "decrypt nxio dtl corrupt\n",
+				    "decrypt nxio dtl corrupt noop\n",
 				    optarg);
 				usage();
 				libzfs_fini(g_zfs);
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8
index 817dcb7fe32a..f67b5e378dc3 100644
--- a/man/man8/zinject.8
+++ b/man/man8/zinject.8
@@ -211,9 +211,11 @@ to flip a bit in the data after a read,
 .It Sy dtl
 for an ECHILD error,
 .It Sy io
-for an EIO error where reopening the device will succeed, or
+for an EIO error where reopening the device will succeed,
 .It Sy nxio
-for an ENXIO error where reopening the device will fail.
+for an ENXIO error where reopening the device will fail, or
+.It Sy noop
+to drop the IO without executing it, and return success.
 .El
 .Pp
 For EIO and ENXIO, the "failed" reads or writes still occur.
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 8d8523038e60..414e3d4e93bd 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4058,6 +4058,16 @@ zio_vdev_io_start(zio_t *zio)
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {
 
+		if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
+			/*
+			 * "no-op" injections return success, but do no actual
+			 * work. Just skip the remaining vdev stages.
+			 */
+			zio_vdev_io_bypass(zio);
+			zio_interrupt(zio);
+			return (NULL);
+		}
+
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);
 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
index f8a8ffbb7b0e..dd9ef9ddd229 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_args.ksh
@@ -47,7 +47,7 @@ function cleanup
 
 function test_device_fault
 {
-	typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt")
+	typeset -a errno=("io" "decompress" "decrypt" "nxio" "dtl" "corrupt" "noop")
 	for e in ${errno[@]}; do
 		log_must eval \
 		    "zinject -d $DISK1 -e $e -T read -f 0.001 $TESTPOOL"

From c6da985e28d7071b187bd928e7fd41ba9e9f6aa7 Mon Sep 17 00:00:00 2001
From: Andrew Turner <andrew.turner4@arm.com>
Date: Mon, 15 Apr 2024 21:53:39 +0100
Subject: [PATCH 074/116] Add the BTI elf note to the AArch64 SHA2 assembly

On ELF platforms there is a note to specify when an application or
library supports BTI. When linking one of these the linker needs
all input object files to have the note. If not, it will not include
it in the output file.

Normally the compiler would generate it, but for assembly files we
need to do it ourselves.

Add the note to the aarch64 sha256 and sha512 assembly files.

Tested by building with BTI enabled and using the -zbti-report=error
flag to lld that makes it an error if the note is missing.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Andrew Turner <andrew.turner4@arm.com>
Closes #16086
---
 module/icp/asm-aarch64/sha2/sha256-armv8.S | 10 ++++++++++
 module/icp/asm-aarch64/sha2/sha512-armv8.S | 10 ++++++++++
 2 files changed, 20 insertions(+)

diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S
index 7ae486e4e229..4dcdd3b65d0b 100644
--- a/module/icp/asm-aarch64/sha2/sha256-armv8.S
+++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S
@@ -21,6 +21,16 @@
 
 #if defined(__aarch64__)
 
+	.section	.note.gnu.property,"a",@note
+	.p2align	3
+	.word	4
+	.word	16
+	.word	5
+	.asciz	"GNU"
+	.word	3221225472
+	.word	4
+	.word	3
+	.word	0
 .text
 
 .align	6
diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S
index 9c61eeee4d7b..f6c8f7742912 100644
--- a/module/icp/asm-aarch64/sha2/sha512-armv8.S
+++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S
@@ -21,6 +21,16 @@
 
 #if defined(__aarch64__)
 
+	.section	.note.gnu.property,"a",@note
+	.p2align	3
+	.word	4
+	.word	16
+	.word	5
+	.asciz	"GNU"
+	.word	3221225472
+	.word	4
+	.word	3
+	.word	0
 .text
 
 .align	6

From 90ba19eb7b81f0225e63bedfb902000d23383921 Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Mon, 15 Apr 2024 22:56:10 +0200
Subject: [PATCH 075/116] Do not use .cfi_negate_ra_state within the assembly on
 Arm64

Compiling openzfs on aarch64 with gcc-8 and gcc-9 is failing currently.
See issue #14965 for deeper context.

On platforms without pointer authentication, .cfi_negate_ra_state can be
defined to a no-op:
https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/aarch64-tdep.c#l1413

I have tested this on Arm64 FreeBSD 13.2 and AlmaLinux-8.

Reviewed-by: Andrew Turner <andrew.turner4@arm.com>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #14965
Closes #15784
---
 module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S  | 14 +++++++++++---
 module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 12 ++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
index dc2719d142db..e66bb4bc7f26 100644
--- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -32,6 +32,14 @@
  */
 
 #if defined(__aarch64__)
+
+/* make gcc <= 9 happy */
+#if LD_VERSION >= 233010000
+#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state
+#else
+#define CFI_NEGATE_RA_STATE
+#endif
+
 	.text
 	.section	.note.gnu.property,"a",@note
 	.p2align	3
@@ -51,7 +59,7 @@
 zfs_blake3_compress_in_place_sse2:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
@@ -555,7 +563,7 @@ compress_pre:
 zfs_blake3_compress_xof_sse2:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
@@ -608,7 +616,7 @@ zfs_blake3_compress_xof_sse2:
 zfs_blake3_hash_many_sse2:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	stp	d15, d14, [sp, #-160]!
 	stp	d13, d12, [sp, #16]
 	stp	d11, d10, [sp, #32]
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
index c4c2dfc5bcde..b9fb28dfcf03 100644
--- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -32,6 +32,14 @@
  */
 
 #if defined(__aarch64__)
+
+/* make gcc <= 9 happy */
+#if LD_VERSION >= 233010000
+#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state
+#else
+#define CFI_NEGATE_RA_STATE
+#endif
+
 	.text
 	.section	.note.gnu.property,"a",@note
 	.p2align	3
@@ -51,7 +59,7 @@
 zfs_blake3_compress_in_place_sse41:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64
@@ -565,7 +573,7 @@ compress_pre:
 zfs_blake3_compress_xof_sse41:
 	.cfi_startproc
 	hint	#25
-	.cfi_negate_ra_state
+	CFI_NEGATE_RA_STATE
 	sub	sp, sp, #96
 	stp	x29, x30, [sp, #64]
 	add	x29, sp, #64

From cf60db6ebe516d8470d9935c380f7ecc27071a25 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Wed, 17 Apr 2024 02:13:01 +1000
Subject: [PATCH 076/116] zts: add a debug option to get full test output

The test runner accumulates output from individual tests, then writes it
to the log at the end. If a test hangs or crashes the system half way
through, we get no insight into how it got to where it did.

This adds a -D option for "debug". When set, all test output is written
to stdout.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16096
---
 scripts/zfs-tests.sh                    | 10 +++++++++-
 tests/test-runner/bin/test-runner.py.in | 23 ++++++++++++++++-------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh
index b5b3e4ab351f..c25903ea1bee 100755
--- a/scripts/zfs-tests.sh
+++ b/scripts/zfs-tests.sh
@@ -32,6 +32,7 @@ SCRIPT_COMMON=${SCRIPT_COMMON:-${0%/*}/common.sh}
 PROG=zfs-tests.sh
 VERBOSE="no"
 QUIET=""
+DEBUG=""
 CLEANUP="yes"
 CLEANUPALL="no"
 KMSG=""
@@ -313,6 +314,7 @@ OPTIONS:
 	-h          Show this message
 	-v          Verbose zfs-tests.sh output
 	-q          Quiet test-runner output
+	-D          Debug; show all test output immediately (noisy)
 	-x          Remove all testpools, dm, lo, and files (unsafe)
 	-k          Disable cleanup after test failure
 	-K          Log test names to /dev/kmsg
@@ -351,7 +353,7 @@ $0 -x
 EOF
 }
 
-while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do
+while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
 	case $OPTION in
 	h)
 		usage
@@ -397,6 +399,9 @@ while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do
 	d)
 		FILEDIR="$OPTARG"
 		;;
+	D)
+		DEBUG="yes"
+		;;
 	I)
 		ITERATIONS="$OPTARG"
 		if [ "$ITERATIONS" -le 0 ]; then
@@ -691,6 +696,7 @@ REPORT_FILE=$(mktemp_file zts-report)
 #
 msg "${TEST_RUNNER}" \
     "${QUIET:+-q}" \
+    "${DEBUG:+-D}" \
     "${KMEMLEAK:+-m}" \
     "${KMSG:+-K}" \
     "-c \"${RUNFILES}\"" \
@@ -700,6 +706,7 @@ msg "${TEST_RUNNER}" \
 { PATH=$STF_PATH \
     ${TEST_RUNNER} \
     ${QUIET:+-q} \
+    ${DEBUG:+-D} \
     ${KMEMLEAK:+-m} \
     ${KMSG:+-K} \
     -c "${RUNFILES}" \
@@ -726,6 +733,7 @@ if [ "$RESULT" -eq "2" ] && [ -n "$RERUN" ]; then
 	{ PATH=$STF_PATH \
 	    ${TEST_RUNNER} \
 	        ${QUIET:+-q} \
+	        ${DEBUG:+-D} \
 	        ${KMEMLEAK:+-m} \
 	    -c "${RUNFILES}" \
 	    -T "${TAGS}" \
diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in
index 422ebd7bc8bf..65247f4f06fc 100755
--- a/tests/test-runner/bin/test-runner.py.in
+++ b/tests/test-runner/bin/test-runner.py.in
@@ -113,8 +113,9 @@ class Output(object):
     This class is a slightly modified version of the 'Stream' class found
     here: http://goo.gl/aSGfv
     """
-    def __init__(self, stream):
+    def __init__(self, stream, debug=False):
         self.stream = stream
+        self.debug = debug
         self._buf = b''
         self.lines = []
 
@@ -140,6 +141,8 @@ class Output(object):
         buf = os.read(fd, 4096)
         if not buf:
             return None
+        if self.debug:
+            os.write(sys.stderr.fileno(), buf)
         if b'\n' not in buf:
             self._buf += buf
             return []
@@ -238,14 +241,14 @@ User: %s
         ret = '%s -E -u %s %s' % (SUDO, user, cmd)
         return ret.split(' ')
 
-    def collect_output(self, proc):
+    def collect_output(self, proc, debug=False):
         """
         Read from stdout/stderr as data becomes available, until the
         process is no longer running. Return the lines from the stdout and
         stderr Output objects.
         """
-        out = Output(proc.stdout)
-        err = Output(proc.stderr)
+        out = Output(proc.stdout, debug)
+        err = Output(proc.stderr, debug)
         res = []
         while proc.returncode is None:
             proc.poll()
@@ -308,7 +311,10 @@ User: %s
 
         try:
             t.start()
-            self.result.stdout, self.result.stderr = self.collect_output(proc)
+
+            out, err = self.collect_output(proc, options.debug)
+            self.result.stdout = out
+            self.result.stderr = err
 
             if kmemleak:
                 cmd = f'{SUDO} sh -c "echo scan > {KMEMLEAK_FILE}"'
@@ -624,7 +630,7 @@ Tags: %s
 
 
 class TestRun(object):
-    props = ['quiet', 'outputdir']
+    props = ['quiet', 'outputdir', 'debug']
 
     def __init__(self, options):
         self.tests = {}
@@ -644,7 +650,8 @@ class TestRun(object):
             ('post_user', ''),
             ('failsafe', ''),
             ('failsafe_user', ''),
-            ('tags', [])
+            ('tags', []),
+            ('debug', False)
         ]
 
     def __str__(self):
@@ -1067,6 +1074,8 @@ def parse_args():
                       help='Specify tests to run via config files.')
     parser.add_option('-d', action='store_true', default=False, dest='dryrun',
                       help='Dry run. Print tests, but take no other action.')
+    parser.add_option('-D', action='store_true', default=False, dest='debug',
+                      help='Write all test output to stdout as it arrives.')
     parser.add_option('-l', action='callback', callback=options_cb,
                       default=None, dest='logfile', metavar='logfile',
                       type='string',

From 454c0b0e46eca93a9d6af262c41b56987b15928e Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Wed, 17 Apr 2024 09:29:21 -0700
Subject: [PATCH 077/116] Linux 6.8 compat: META (#16099)

Update the META file to reflect compatibility with the 6.8 kernel.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index 8a257f0feb17..19a796050f5b 100644
--- a/META
+++ b/META
@@ -6,5 +6,5 @@ Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.7
+Linux-Maximum: 6.8
 Linux-Minimum: 3.10

From 35bf2584852d47a666a0ae3d1c6903c367e8f169 Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Fri, 19 Apr 2024 19:15:38 +0200
Subject: [PATCH 078/116] Fix: FreeBSD Arm64 does not build currently

The define LD_VERSION isn't defined on FreeBSD Arm64 when OpenZFS is
built with the default compiler: clang.
I used only gcc for testing - my fault.

Fast fix as suggested by @mmatuska

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Martin Matuska <mm@FreeBSD.org>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #16103
---
 module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S  | 2 +-
 module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
index e66bb4bc7f26..fefebf08116e 100644
--- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -34,7 +34,7 @@
 #if defined(__aarch64__)
 
 /* make gcc <= 9 happy */
-#if LD_VERSION >= 233010000
+#if !defined(LD_VERSION) || LD_VERSION >= 233010000
 #define CFI_NEGATE_RA_STATE .cfi_negate_ra_state
 #else
 #define CFI_NEGATE_RA_STATE
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
index b9fb28dfcf03..1ad6cefc6d06 100644
--- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -34,7 +34,7 @@
 #if defined(__aarch64__)
 
 /* make gcc <= 9 happy */
-#if LD_VERSION >= 233010000
+#if !defined(LD_VERSION) || LD_VERSION >= 233010000
 #define CFI_NEGATE_RA_STATE .cfi_negate_ra_state
 #else
 #define CFI_NEGATE_RA_STATE

From cd3e6b4f4c5e0b514f3e76e194b2a5753264d44f Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Fri, 19 Apr 2024 22:19:12 +0500
Subject: [PATCH 079/116] Add zfetch stats in arcstats

arc_summary also reports zfetch stats but it's inconvenient to monitor
continuously incrementing numbers. Adding them in arcstats allows us to
observe streams more conveniently.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16094
---
 cmd/arcstat.in | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/cmd/arcstat.in b/cmd/arcstat.in
index 8df1c62f7e86..220f343b5b62 100755
--- a/cmd/arcstat.in
+++ b/cmd/arcstat.in
@@ -157,6 +157,16 @@ cols = {
     "free":       [5, 1024, "ARC free memory"],
     "avail":      [5, 1024, "ARC available memory"],
     "waste":      [5, 1024, "Wasted memory due to round up to pagesize"],
+    "ztotal":     [6, 1000, "zfetch total prefetcher calls per second"],
+    "zhits":      [5, 1000, "zfetch stream hits per second"],
+    "zahead":     [6, 1000, "zfetch hits ahead of streams per second"],
+    "zpast":      [5, 1000, "zfetch hits behind streams per second"],
+    "zmisses":    [7, 1000, "zfetch stream misses per second"],
+    "zmax":       [4, 1000, "zfetch limit reached per second"],
+    "zfuture":    [7, 1000, "zfetch stream future per second"],
+    "zstride":    [7, 1000, "zfetch stream strides per second"],
+    "zissued":    [7, 1000, "zfetch prefetches issued per second"],
+    "zactive":    [7, 1000, "zfetch prefetches active per second"],
 }
 
 v = {}
@@ -164,6 +174,8 @@ hdr = ["time", "read", "ddread", "ddh%", "dmread", "dmh%", "pread", "ph%",
        "size", "c", "avail"]
 xhdr = ["time", "mfu", "mru", "mfug", "mrug", "unc", "eskip", "mtxmis",
         "dread", "pread", "read"]
+zhdr = ["time", "ztotal", "zhits", "zahead", "zpast", "zmisses", "zmax",
+        "zfuture", "zstride", "zissued", "zactive"]
 sint = 1               # Default interval is 1 second
 count = 1              # Default count is 1
 hdr_intr = 20          # Print header every 20 lines of output
@@ -206,12 +218,17 @@ elif sys.platform.startswith('linux'):
     def kstat_update():
         global kstat
 
-        k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')]
+        k1 = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')]
 
-        if not k:
+        k2 = ["zfetch_" + line.strip() for line in
+             open('/proc/spl/kstat/zfs/zfetchstats')]
+
+        if k1 is None or k2 is None:
             sys.exit(1)
 
-        del k[0:2]
+        del k1[0:2]
+        del k2[0:2]
+        k = k1 + k2
         kstat = {}
 
         for s in k:
@@ -239,6 +256,7 @@ def usage():
     sys.stderr.write("\t -v : List all possible field headers and definitions"
                      "\n")
     sys.stderr.write("\t -x : Print extended stats\n")
+    sys.stderr.write("\t -z : Print zfetch stats\n")
     sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
     sys.stderr.write("\t -o : Redirect output to the specified file\n")
     sys.stderr.write("\t -s : Override default field separator with custom "
@@ -357,6 +375,7 @@ def init():
     global count
     global hdr
     global xhdr
+    global zhdr
     global opfile
     global sep
     global out
@@ -368,15 +387,17 @@ def init():
     xflag = False
     hflag = False
     vflag = False
+    zflag = False
     i = 1
 
     try:
         opts, args = getopt.getopt(
             sys.argv[1:],
-            "axo:hvs:f:p",
+            "axzo:hvs:f:p",
             [
                 "all",
                 "extended",
+                "zfetch",
                 "outfile",
                 "help",
                 "verbose",
@@ -410,13 +431,15 @@ def init():
             i += 1
         if opt in ('-p', '--parsable'):
             pretty_print = False
+        if opt in ('-z', '--zfetch'):
+            zflag = True
         i += 1
 
     argv = sys.argv[i:]
     sint = int(argv[0]) if argv else sint
     count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1)
 
-    if hflag or (xflag and desired_cols):
+    if hflag or (xflag and zflag) or ((zflag or xflag) and desired_cols):
         usage()
 
     if vflag:
@@ -425,6 +448,9 @@ def init():
     if xflag:
         hdr = xhdr
 
+    if zflag:
+        hdr = zhdr
+
     update_hdr_intr()
 
     # check if L2ARC exists
@@ -569,6 +595,17 @@ def calculate():
     v["el2mru"] = d["evict_l2_eligible_mru"] // sint
     v["el2inel"] = d["evict_l2_ineligible"] // sint
     v["mtxmis"] = d["mutex_miss"] // sint
+    v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] +
+                   d["zfetch_past"] + d["zfetch_misses"]) // sint
+    v["zhits"] = d["zfetch_hits"] // sint
+    v["zahead"] = (d["zfetch_future"] + d["zfetch_stride"]) // sint
+    v["zpast"] = d["zfetch_past"] // sint
+    v["zmisses"] = d["zfetch_misses"] // sint
+    v["zmax"] = d["zfetch_max_streams"] // sint
+    v["zfuture"] = d["zfetch_future"] // sint
+    v["zstride"] = d["zfetch_stride"] // sint
+    v["zissued"] = d["zfetch_io_issued"] // sint
+    v["zactive"] = d["zfetch_io_active"] // sint
 
     if l2exist:
         v["l2hits"] = d["l2_hits"] // sint

From f75574cbaaa1ade5bf24ab11751cbd5bc62ef7f1 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 16 Apr 2024 15:03:33 +1000
Subject: [PATCH 080/116] tests/quota_005_pos: use a long int for doubling the
 quota size

When run in isolation, quota_005_pos would see an empty ~300G dataset.
Doubling its space overflows an int32, which meant it was trying to then
set the quota to a negative value, and would fail.

When run as part of the quota tests, the filesystem appears to have
stuff in it, and so a lower available space, which doesn't overflow, and
so succeeds.

The bare minimum fix seems to be to use an int64 for the available space,
so it can be comfortably doubled. Here it is.

(Also a typo fix and a tiny bit of cleanup).

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16097
---
 tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh
index 98ee4edae650..fb3d97f486de 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh
@@ -55,15 +55,14 @@ function cleanup
 
 log_onexit cleanup
 
-log_assert "Verify that quota doesnot inherit its value from parent."
-log_onexit cleanup
+log_assert "Verify that quota does not inherit its value from parent."
 
 fs=$TESTPOOL/$TESTFS
 fs_child=$TESTPOOL/$TESTFS/$TESTFS
 
 space_avail=$(get_prop available $fs)
 quota_val=$(get_prop quota $fs)
-typeset -i quotasize=$space_avail
+typeset -li quotasize=$space_avail
 ((quotasize = quotasize * 2 ))
 log_must zfs set quota=$quotasize $fs
 
@@ -72,4 +71,4 @@ quota_space=$(get_prop quota $fs_child)
 [[ $quota_space == $quotasize ]] && \
 	log_fail "The quota of child dataset inherits its value from parent."
 
-log_pass "quota doesnot inherit its value from parent as expected."
+log_pass "quota does not inherit its value from parent as expected."

From 26d49fec5f862818a0410fedbba1efded0543374 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Tue, 16 Apr 2024 14:56:35 +1000
Subject: [PATCH 081/116] tests/quota: consistently clear quota property
 between tests

When run in isolation, quota_005_pos would fail in cleanup because it
would attempt to restore the previous quota, which was 0, and so get an
error (because you can't set quota to '0', you have to use 'none').

It worked as part of the quota tag set because the previous tests did
not clean up their quota, so there was always a non-zero quota to return
to.

This adds a simple quota reset function, and has all quota tests run it
at cleanup. For the ones that weren't cleaning up, they now do, and for
quota_005_pos, which was trying to do the right thing, it now just
resets it.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16097
---
 tests/zfs-tests/tests/functional/quota/quota.kshlib      | 7 +++++++
 tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh | 2 ++
 tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh | 2 ++
 tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh | 2 ++
 tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh | 2 ++
 tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh | 2 +-
 tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh | 2 +-
 7 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/quota/quota.kshlib b/tests/zfs-tests/tests/functional/quota/quota.kshlib
index 5083415c8956..b4cfde020f3f 100644
--- a/tests/zfs-tests/tests/functional/quota/quota.kshlib
+++ b/tests/zfs-tests/tests/functional/quota/quota.kshlib
@@ -95,3 +95,10 @@ function exceed_quota
 	    log_fail "Returned error code: $zret. Expected: $EDQUOT."
 	return 0
 }
+
+function reset_quota
+{
+	typeset FILESYSTEM="$1"
+
+	log_must zfs set quota=none $FILESYSTEM
+}
diff --git a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh
index d124cb26ae98..f01008a46bb1 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh
@@ -64,6 +64,8 @@ function cleanup
 	#
 	wait_freeing $TESTPOOL
 	sync_pool $TESTPOOL
+
+	reset_quota $TESTPOOL/$TESTFS
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh
index 3af005e874e9..bea2a5a68691 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh
@@ -64,6 +64,8 @@ function cleanup
 
 	wait_freeing $TESTPOOL
 	sync_pool $TESTPOOL
+
+	reset_quota $TESTPOOL/$TESTFS
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh
index de265813d55b..33f6421131fc 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh
@@ -67,6 +67,8 @@ function cleanup
 	#
 	wait_freeing $TESTPOOL
 	sync_pool $TESTPOOL
+
+	reset_quota $TESTPOOL/$TESTCTR/$TESTFS1
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh
index 8f20b533da68..682d09f080a4 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh
@@ -65,6 +65,8 @@ function cleanup
 
 	wait_freeing $TESTPOOL
 	sync_pool $TESTPOOL
+
+	reset_quota $TESTPOOL/$TESTCTR/$TESTFS1
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh
index fb3d97f486de..9c4db81ca270 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh
@@ -50,7 +50,7 @@ function cleanup
 {
 	datasetexists $fs_child && destroy_dataset $fs_child
 
-	log_must zfs set quota=$quota_val $fs
+	reset_quota $fs
 }
 
 log_onexit cleanup
diff --git a/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh b/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh
index 12105162c5b5..111d771188e3 100755
--- a/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh
+++ b/tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh
@@ -50,7 +50,7 @@ log_assert "Verify cannot set quota lower than the space currently in use"
 
 function cleanup
 {
-	log_must zfs set quota=none $TESTPOOL/$TESTFS
+	reset_quota $TESTPOOL/$TESTFS
 }
 
 log_onexit cleanup

From 9f83eec03904b18e052fbe2c66542bd47254cf57 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 19 Apr 2024 19:18:54 -0400
Subject: [PATCH 082/116] Handle FLUSH errors as "expected"

Before #16061 zio_vdev_io_done() was not used for FLUSH requests.
Addition of it triggers reprobe each TXG for vdevs not supporting
them.  Since those errors are often expected, they are normally
handled by individual vdev drivers and should be ignored here.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16110
---
 module/zfs/zio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 414e3d4e93bd..1ba99f4d4624 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4114,7 +4114,8 @@ zio_vdev_io_done(zio_t *zio)
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
-		if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
+		if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH &&
+		    zio->io_type != ZIO_TYPE_TRIM) {
 			if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {

From f4f156157de3f61e55db0429b10c63d02226e115 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Sat, 20 Apr 2024 09:41:31 +1000
Subject: [PATCH 083/116] abd_iter_page: rework to handle multipage
 scatterlists

Previously, abd_iter_page() would assume that every scatterlist would
contain a single page (compound or no), because that's all we ever
create in abd_alloc_chunks(). However, scatterlists can contain multiple
pages of arbitrary provenance, and if we get one of those, we'd get all
the math wrong.

This reworks things to handle multiple pages in a scatterlist, by
properly finding the right page within it for the given offset, and
understanding better where the end of the page is and not crossing it.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reported-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16108
---
 module/os/linux/zfs/abd_os.c | 120 +++++++++++++++++++++--------------
 1 file changed, 74 insertions(+), 46 deletions(-)

diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index d3255dcbc0f7..cee7410c8833 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -1015,10 +1015,50 @@ abd_cache_reap_now(void)
 }
 
 #if defined(_KERNEL)
+
 /*
- * Yield the next page struct and data offset and size within it, without
+ * This is abd_iter_page(), the function underneath abd_iterate_page_func().
+ * It yields the next page struct and data offset and size within it, without
  * mapping it into the address space.
  */
+
+/*
+ * "Compound pages" are a group of pages that can be referenced from a single
+ * struct page *. Its organised as a "head" page, followed by a series of
+ * "tail" pages.
+ *
+ * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
+ * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
+ * great many of the IO buffers we get are going to be of this type.
+ *
+ * The tail pages are just regular PAGESIZE pages, and can be safely used
+ * as-is. However, the head page has length covering itself and all the tail
+ * pages. If the ABD chunk spans multiple pages, then we can use the head page
+ * and a >PAGESIZE length, which is far more efficient.
+ *
+ * Before kernel 4.5 however, compound page heads were refcounted separately
+ * from tail pages, such that moving back to the head page would require us to
+ * take a reference to it and releasing it once we're completely finished with
+ * it. In practice, that means when our caller is done with the ABD, which we
+ * have no insight into from here. Rather than contort this API to track head
+ * page references on such ancient kernels, we disable this special compound
+ * page handling on 4.5, instead just using treating each page within it as a
+ * regular PAGESIZE page (which it is). This is slightly less efficient, but
+ * makes everything far simpler.
+ *
+ * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
+ * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
+ * understand compound pages, or not, as required.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#define	ABD_ITER_COMPOUND_PAGES		1
+#define	ABD_ITER_PAGE_SIZE(page)	\
+	(PageCompound(page) ? page_size(page) : PAGESIZE)
+#else
+#undef ABD_ITER_COMPOUND_PAGES
+#define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
+#endif
+
 void
 abd_iter_page(struct abd_iter *aiter)
 {
@@ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter)
 	struct page *page;
 	size_t doff, dsize;
 
+	/*
+	 * Find the page, and the start of the data within it. This is computed
+	 * differently for linear and scatter ABDs; linear is referenced by
+	 * virtual memory location, while scatter is referenced by page
+	 * pointer.
+	 */
 	if (abd_is_linear(aiter->iter_abd)) {
 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
 
@@ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter)
 
 		/* offset of address within the page */
 		doff = offset_in_page(paddr);
-
-		/* total data remaining in abd from this position */
-		dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
 	} else {
 		ASSERT(!abd_is_gang(aiter->iter_abd));
 
 		/* current scatter page */
-		page = sg_page(aiter->iter_sg);
+		page = nth_page(sg_page(aiter->iter_sg),
+		    aiter->iter_offset >> PAGE_SHIFT);
 
 		/* position within page */
-		doff = aiter->iter_offset;
-
-		/* remaining data in scatterlist */
-		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
-		    aiter->iter_abd->abd_size - aiter->iter_pos);
+		doff = aiter->iter_offset & (PAGESIZE - 1);
 	}
-	ASSERT(page);
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#ifdef ABD_ITER_COMPOUND_PAGES
 	if (PageTail(page)) {
 		/*
-		 * This page is part of a "compound page", which is a group of
-		 * pages that can be referenced from a single struct page *.
-		 * Its organised as a "head" page, followed by a series of
-		 * "tail" pages.
-		 *
-		 * In OpenZFS, compound pages are allocated using the
-		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
-		 * vmalloc slabs (ie >16K allocations). So a great many of the
-		 * IO buffers we get are going to be of this type.
-		 *
-		 * The tail pages are just regular PAGE_SIZE pages, and can be
-		 * safely used as-is. However, the head page has length
-		 * covering itself and all the tail pages. If this ABD chunk
-		 * spans multiple pages, then we can use the head page and a
-		 * >PAGE_SIZE length, which is far more efficient.
-		 *
-		 * To do this, we need to adjust the offset to be counted from
-		 * the head page. struct page for compound pages are stored
-		 * contiguously, so we can just adjust by a simple offset.
-		 *
-		 * Before kernel 4.5, compound page heads were refcounted
-		 * separately, such that moving back to the head page would
-		 * require us to take a reference to it and releasing it once
-		 * we're completely finished with it. In practice, that means
-		 * when our caller is done with the ABD, which we have no
-		 * insight into from here. Rather than contort this API to
-		 * track head page references on such ancient kernels, we just
-		 * compile this block out and use the tail pages directly. This
-		 * is slightly less efficient, but makes everything far
-		 * simpler.
+		 * If this is a compound tail page, move back to the head, and
+		 * adjust the offset to match. This may let us yield a much
+		 * larger amount of data from a single logical page, and so
+		 * leave our caller with fewer pages to process.
 		 */
 		struct page *head = compound_head(page);
 		doff += ((page - head) * PAGESIZE);
@@ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter)
 	}
 #endif
 
-	/* final page and position within it */
+	ASSERT(page);
+
+	/*
+	 * Compute the maximum amount of data we can take from this page. This
+	 * is the smaller of:
+	 * - the remaining space in the page
+	 * - the remaining space in this scatterlist entry (which may not cover
+	 *   the entire page)
+	 * - the remaining space in the abd (which may not cover the entire
+	 *   scatterlist entry)
+	 */
+	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
+	    aiter->iter_abd->abd_size - aiter->iter_pos);
+	if (!abd_is_linear(aiter->iter_abd))
+		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
+	ASSERT3U(dsize, >, 0);
+
+	/* final iterator outputs */
 	aiter->iter_page = page;
 	aiter->iter_page_doff = doff;
-
-	/* amount of data in the chunk, up to the end of the page */
-	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
+	aiter->iter_page_dsize = dsize;
 }
 
 /*

From c183d164aa11e61dfe1f34907c1a029d75162f1d Mon Sep 17 00:00:00 2001
From: George Wilson <george.wilson@delphix.com>
Date: Mon, 22 Apr 2024 12:42:38 -0400
Subject: [PATCH 084/116] Parallel pool import

This commit allows spa_load() to drop the spa_namespace_lock so
that imports can happen concurrently. Prior to dropping the
spa_namespace_lock, the import logic will set the spa_load_thread
value to track the thread which is doing the import.

Consumers of spa_lookup() retain the same behavior by blocking
when either a thread is holding the spa_namespace_lock or the
spa_load_thread value is set. This will ensure that critical
concurrent operations cannot take place while a pool is being
imported.

The zpool command is also enhanced to provide multi-threaded support
when invoking zpool import -a.

Lastly, zinject provides a mechanism to insert artificial delays
when importing a pool and new zfs tests are added to verify parallel
import functionality.

Contributions-by: Don Brady <don.brady@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Wilson <gwilson@delphix.com>
Closes #16093
---
 cmd/zinject/zinject.c                         | 115 +++++++++++-
 cmd/zpool/zpool_main.c                        |  72 ++++++--
 include/libzutil.h                            |   4 +-
 include/sys/spa.h                             |   2 +
 include/sys/spa_impl.h                        |   3 +-
 include/sys/zfs_ioctl.h                       |   4 +-
 include/sys/zio.h                             |   4 +-
 man/man8/zinject.8                            |   8 +
 module/zfs/spa.c                              |  58 ++++--
 module/zfs/spa_misc.c                         |  26 ++-
 module/zfs/vdev_initialize.c                  |   5 +-
 module/zfs/vdev_rebuild.c                     |   4 +-
 module/zfs/vdev_trim.c                        |   9 +-
 module/zfs/zio_inject.c                       | 138 ++++++++++++++-
 tests/runfiles/common.run                     |   3 +-
 tests/zfs-tests/tests/Makefile.am             |   3 +
 .../zpool_import_parallel_admin.ksh           | 165 ++++++++++++++++++
 .../zpool_import_parallel_neg.ksh             | 130 ++++++++++++++
 .../zpool_import_parallel_pos.ksh             | 137 +++++++++++++++
 19 files changed, 818 insertions(+), 72 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh

diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index e9141fb4ba55..ed60cce3dd16 100644
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@@ -22,7 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2024, Klara Inc.
+ * Copyright (c) 2023-2024, Klara Inc.
  */
 
 /*
@@ -310,6 +310,11 @@ usage(void)
 	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
 	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
 	    "\n"
+	    "\tzinject -P import|export -s <seconds> pool\n"
+	    "\t\tAdd an artificial delay to a future pool import or export,\n"
+	    "\t\tsuch that the operation takes a minimum of supplied seconds\n"
+	    "\t\tto complete.\n"
+	    "\n"
 	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
 	    "\t\tCause the pool to stop writing blocks yet not\n"
 	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
@@ -392,8 +397,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
 {
 	int *count = data;
 
-	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
+	if (record->zi_guid != 0 || record->zi_func[0] != '\0' ||
+	    record->zi_duration != 0) {
 		return (0);
+	}
 
 	if (*count == 0) {
 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  "
@@ -507,6 +514,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record,
 	return (0);
 }
 
+static int
+print_pool_delay_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *count = data;
+
+	if (record->zi_cmd != ZINJECT_DELAY_IMPORT &&
+	    record->zi_cmd != ZINJECT_DELAY_EXPORT) {
+		return (0);
+	}
+
+	if (*count == 0) {
+		(void) printf("%3s  %-19s  %-11s  %s\n",
+		    "ID", "POOL", "DELAY (sec)", "COMMAND");
+		(void) printf("---  -------------------  -----------"
+		    "  -------\n");
+	}
+
+	*count += 1;
+
+	(void) printf("%3d  %-19s  %-11llu  %s\n",
+	    id, pool, (u_longlong_t)record->zi_duration,
+	    record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export");
+
+	return (0);
+}
+
 /*
  * Print all registered error handlers.  Returns the number of handlers
  * registered.
@@ -537,6 +571,13 @@ print_all_handlers(void)
 		count = 0;
 	}
 
+	(void) iter_handlers(print_pool_delay_handler, &count);
+	if (count > 0) {
+		total += count;
+		(void) printf("\n");
+		count = 0;
+	}
+
 	(void) iter_handlers(print_panic_handler, &count);
 
 	return (count + total);
@@ -609,9 +650,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
 	zc.zc_guid = flags;
 
 	if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
-		(void) fprintf(stderr, "failed to add handler: %s\n",
-		    errno == EDOM ? "block level exceeds max level of object" :
-		    strerror(errno));
+		const char *errmsg = strerror(errno);
+
+		switch (errno) {
+		case EDOM:
+			errmsg = "block level exceeds max level of object";
+			break;
+		case EEXIST:
+			if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
+				errmsg = "pool already imported";
+			if (record->zi_cmd == ZINJECT_DELAY_EXPORT)
+				errmsg = "a handler already exists";
+			break;
+		case ENOENT:
+			/* import delay injector running on older zfs module */
+			if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
+				errmsg = "import delay injector not supported";
+			break;
+		default:
+			break;
+		}
+		(void) fprintf(stderr, "failed to add handler: %s\n", errmsg);
 		return (1);
 	}
 
@@ -636,6 +695,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
 		} else if (record->zi_duration < 0) {
 			(void) printf(" txgs: %lld \n",
 			    (u_longlong_t)-record->zi_duration);
+		} else if (record->zi_timer > 0) {
+			(void) printf(" timer: %lld ms\n",
+			    (u_longlong_t)NSEC2MSEC(record->zi_timer));
 		} else {
 			(void) printf("objset: %llu\n",
 			    (u_longlong_t)record->zi_objset);
@@ -834,7 +896,7 @@ main(int argc, char **argv)
 	}
 
 	while ((c = getopt(argc, argv,
-	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
@@ -952,6 +1014,19 @@ main(int argc, char **argv)
 			    sizeof (record.zi_func));
 			record.zi_cmd = ZINJECT_PANIC;
 			break;
+		case 'P':
+			if (strcasecmp(optarg, "import") == 0) {
+				record.zi_cmd = ZINJECT_DELAY_IMPORT;
+			} else if (strcasecmp(optarg, "export") == 0) {
+				record.zi_cmd = ZINJECT_DELAY_EXPORT;
+			} else {
+				(void) fprintf(stderr, "invalid command '%s': "
+				    "must be 'import' or 'export'\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
 		case 'q':
 			quiet = 1;
 			break;
@@ -1033,7 +1108,7 @@ main(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	if (record.zi_duration != 0)
+	if (record.zi_duration != 0 && record.zi_cmd == 0)
 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
 
 	if (cancel != NULL) {
@@ -1179,8 +1254,8 @@ main(int argc, char **argv)
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
 		    level != 0 || device != NULL || record.zi_freq > 0 ||
 		    dvas != 0) {
-			(void) fprintf(stderr, "panic (-p) incompatible with "
-			    "other options\n");
+			(void) fprintf(stderr, "%s incompatible with other "
+			    "options\n", "panic (-p)");
 			usage();
 			libzfs_fini(g_zfs);
 			return (2);
@@ -1198,6 +1273,28 @@ main(int argc, char **argv)
 		if (argv[1] != NULL)
 			record.zi_type = atoi(argv[1]);
 		dataset[0] = '\0';
+	} else if (record.zi_cmd == ZINJECT_DELAY_IMPORT ||
+	    record.zi_cmd == ZINJECT_DELAY_EXPORT) {
+		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+		    level != 0 || device != NULL || record.zi_freq > 0 ||
+		    dvas != 0) {
+			(void) fprintf(stderr, "%s incompatible with other "
+			    "options\n", "import|export delay (-P)");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (argc != 1 || record.zi_duration <= 0) {
+			(void) fprintf(stderr, "import|export delay (-P) "
+			    "injection requires a duration (-s) and a single "
+			    "pool name\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		(void) strlcpy(pool, argv[0], sizeof (pool));
 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index d670cd1afeb1..e6664b918be4 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -50,6 +50,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <thread_pool.h>
 #include <time.h>
 #include <unistd.h>
 #include <pwd.h>
@@ -3455,15 +3456,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
 	return (ret);
 }
 
+typedef struct import_parameters {
+	nvlist_t *ip_config;
+	const char *ip_mntopts;
+	nvlist_t *ip_props;
+	int ip_flags;
+	int *ip_err;
+} import_parameters_t;
+
+static void
+do_import_task(void *arg)
+{
+	import_parameters_t *ip = arg;
+	*ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts,
+	    ip->ip_props, ip->ip_flags);
+	free(ip);
+}
+
+
 static int
 import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
-    char *orig_name, char *new_name,
-    boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all,
-    importargs_t *import)
+    char *orig_name, char *new_name, importargs_t *import)
 {
 	nvlist_t *config = NULL;
 	nvlist_t *found_config = NULL;
 	uint64_t pool_state;
+	boolean_t pool_specified = (import->poolname != NULL ||
+	    import->guid != 0);
+
+
+	tpool_t *tp = NULL;
+	if (import->do_all) {
+		tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
+		    0, NULL);
+	}
 
 	/*
 	 * At this point we have a list of import candidate configs. Even if
@@ -3480,9 +3506,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
 
 		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 		    &pool_state) == 0);
-		if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
+		if (!import->do_destroyed &&
+		    pool_state == POOL_STATE_DESTROYED)
 			continue;
-		if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
+		if (import->do_destroyed &&
+		    pool_state != POOL_STATE_DESTROYED)
 			continue;
 
 		verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
@@ -3491,12 +3519,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
 		if (!pool_specified) {
 			if (first)
 				first = B_FALSE;
-			else if (!do_all)
+			else if (!import->do_all)
 				(void) fputc('\n', stdout);
 
-			if (do_all) {
-				err |= do_import(config, NULL, mntopts,
-				    props, flags);
+			if (import->do_all) {
+				import_parameters_t *ip = safe_malloc(
+				    sizeof (import_parameters_t));
+
+				ip->ip_config = config;
+				ip->ip_mntopts = mntopts;
+				ip->ip_props = props;
+				ip->ip_flags = flags;
+				ip->ip_err = &err;
+
+				(void) tpool_dispatch(tp, do_import_task,
+				    (void *)ip);
 			} else {
 				/*
 				 * If we're importing from cachefile, then
@@ -3544,6 +3581,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
 				found_config = config;
 		}
 	}
+	if (import->do_all) {
+		tpool_wait(tp);
+		tpool_destroy(tp);
+	}
 
 	/*
 	 * If we were searching for a specific pool, verify that we found a
@@ -3773,7 +3814,6 @@ zpool_do_import(int argc, char **argv)
 	boolean_t xtreme_rewind = B_FALSE;
 	boolean_t do_scan = B_FALSE;
 	boolean_t pool_exists = B_FALSE;
-	boolean_t pool_specified = B_FALSE;
 	uint64_t txg = -1ULL;
 	char *cachefile = NULL;
 	importargs_t idata = { 0 };
@@ -3972,7 +4012,6 @@ zpool_do_import(int argc, char **argv)
 			searchname = argv[0];
 			searchguid = 0;
 		}
-		pool_specified = B_TRUE;
 
 		/*
 		 * User specified a name or guid.  Ensure it's unique.
@@ -4005,6 +4044,8 @@ zpool_do_import(int argc, char **argv)
 	idata.cachefile = cachefile;
 	idata.scan = do_scan;
 	idata.policy = policy;
+	idata.do_destroyed = do_destroyed;
+	idata.do_all = do_all;
 
 	libpc_handle_t lpch = {
 		.lpc_lib_handle = g_zfs,
@@ -4047,9 +4088,7 @@ zpool_do_import(int argc, char **argv)
 	}
 
 	err = import_pools(pools, props, mntopts, flags,
-	    argc >= 1 ? argv[0] : NULL,
-	    argc >= 2 ? argv[1] : NULL,
-	    do_destroyed, pool_specified, do_all, &idata);
+	    argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata);
 
 	/*
 	 * If we're using the cachefile and we failed to import, then
@@ -4070,9 +4109,8 @@ zpool_do_import(int argc, char **argv)
 		pools = zpool_search_import(&lpch, &idata);
 
 		err = import_pools(pools, props, mntopts, flags,
-		    argc >= 1 ? argv[0] : NULL,
-		    argc >= 2 ? argv[1] : NULL,
-		    do_destroyed, pool_specified, do_all, &idata);
+		    argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL,
+		    &idata);
 	}
 
 error:
diff --git a/include/libzutil.h b/include/libzutil.h
index d9a9a65753dd..e2108ceeaa44 100644
--- a/include/libzutil.h
+++ b/include/libzutil.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018, 2024 by Delphix. All rights reserved.
  */
 
 #ifndef	_LIBZUTIL_H
@@ -79,6 +79,8 @@ typedef struct importargs {
 	boolean_t can_be_active; /* can the pool be active?		*/
 	boolean_t scan;		/* prefer scanning to libblkid cache    */
 	nvlist_t *policy;	/* load policy (max txg, rewind, etc.)	*/
+	boolean_t do_destroyed;
+	boolean_t do_all;
 } importargs_t;
 
 typedef struct libpc_handle {
diff --git a/include/sys/spa.h b/include/sys/spa.h
index b969f05afe48..ca15025ba33c 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -833,6 +833,8 @@ void spa_select_allocator(zio_t *zio);
 
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
+extern avl_tree_t spa_namespace_avl;
+extern kcondvar_t spa_namespace_cv;
 
 /*
  * SPA configuration functions in spa_config.c
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 0cd0c4720fbe..d7da085ab313 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -237,6 +237,7 @@ struct spa {
 	dsl_pool_t	*spa_dsl_pool;
 	boolean_t	spa_is_initializing;	/* true while opening pool */
 	boolean_t	spa_is_exporting;	/* true while exporting pool */
+	kthread_t	*spa_load_thread;	/* loading, no namespace lock */
 	metaslab_class_t *spa_normal_class;	/* normal data class */
 	metaslab_class_t *spa_log_class;	/* intent log data class */
 	metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */
diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h
index 26dfe97604de..525d40759fdd 100644
--- a/include/sys/zfs_ioctl.h
+++ b/include/sys/zfs_ioctl.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2024 by Delphix. All rights reserved.
  * Copyright 2016 RackTop Systems.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -454,6 +454,8 @@ typedef enum zinject_type {
 	ZINJECT_PANIC,
 	ZINJECT_DELAY_IO,
 	ZINJECT_DECRYPT_FAULT,
+	ZINJECT_DELAY_IMPORT,
+	ZINJECT_DELAY_EXPORT,
 } zinject_type_t;
 
 typedef struct zfs_share {
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 545b9cf0c3c5..4037b429982b 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
@@ -686,6 +686,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
 extern hrtime_t zio_handle_io_delay(zio_t *zio);
+extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed);
+extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed);
 
 /*
  * Checksum ereport functions
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8
index f67b5e378dc3..ad9e7a42bfac 100644
--- a/man/man8/zinject.8
+++ b/man/man8/zinject.8
@@ -129,6 +129,14 @@ Force a vdev error.
 .
 .It Xo
 .Nm zinject
+.Fl P Cm import Ns | Ns Cm export Fl s Ar seconds
+.Ar pool
+.Xc
+Add an artificial delay to a future import or export of a pool.
+This injector is automatically cleared after the operation is finished.
+.
+.It Xo
+.Nm zinject
 .Fl I
 .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs
 .Ar pool
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index f67d980ae4c6..96daf51b696a 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -3273,8 +3273,6 @@ spa_spawn_aux_threads(spa_t *spa)
 {
 	ASSERT(spa_writeable(spa));
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
 	spa_start_raidz_expansion_thread(spa);
 	spa_start_indirect_condensing_thread(spa);
 	spa_start_livelist_destroy_thread(spa);
@@ -4981,7 +4979,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa)
 	int error = 0;
 
 	ASSERT0(spa->spa_checkpoint_txg);
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    spa->spa_load_thread == curthread);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
@@ -5228,6 +5227,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	boolean_t checkpoint_rewind =
 	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 	boolean_t update_config_cache = B_FALSE;
+	hrtime_t load_start = gethrtime();
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@@ -5272,13 +5272,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 			return (error);
 	}
 
+	/*
+	 * Drop the namespace lock for the rest of the function.
+	 */
+	spa->spa_load_thread = curthread;
+	mutex_exit(&spa_namespace_lock);
+
 	/*
 	 * Retrieve the checkpoint txg if the pool has a checkpoint.
 	 */
 	spa_import_progress_set_notes(spa, "Loading checkpoint txg");
 	error = spa_ld_read_checkpoint_txg(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
@@ -5291,7 +5297,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
 	error = spa_ld_open_indirect_vdev_metadata(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Retrieve the full list of active features from the MOS and check if
@@ -5300,7 +5306,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Checking feature flags");
 	error = spa_ld_check_features(spa, &missing_feat_write);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Load several special directories from the MOS needed by the dsl_pool
@@ -5309,7 +5315,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Loading special MOS directories");
 	error = spa_ld_load_special_directories(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Retrieve pool properties from the MOS.
@@ -5317,7 +5323,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Loading properties");
 	error = spa_ld_get_props(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Retrieve the list of auxiliary devices - cache devices and spares -
@@ -5326,7 +5332,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Loading AUX vdevs");
 	error = spa_ld_open_aux_vdevs(spa, type);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Load the metadata for all vdevs. Also check if unopenable devices
@@ -5335,17 +5341,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Loading vdev metadata");
 	error = spa_ld_load_vdev_metadata(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	spa_import_progress_set_notes(spa, "Loading dedup tables");
 	error = spa_ld_load_dedup_tables(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	spa_import_progress_set_notes(spa, "Loading BRT");
 	error = spa_ld_load_brt(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Verify the logs now to make sure we don't have any unexpected errors
@@ -5354,7 +5360,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Verifying Log Devices");
 	error = spa_ld_verify_logs(spa, type, ereport);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	if (missing_feat_write) {
 		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
@@ -5364,8 +5370,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
-		return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
-		    ENOTSUP));
+		error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
+		    ENOTSUP);
+		goto fail;
 	}
 
 	/*
@@ -5376,7 +5383,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 	spa_import_progress_set_notes(spa, "Verifying pool data");
 	error = spa_ld_verify_pool_data(spa);
 	if (error != 0)
-		return (error);
+		goto fail;
 
 	/*
 	 * Calculate the deflated space for the pool. This must be done before
@@ -5501,13 +5508,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_import_progress_set_notes(spa, "Finished importing");
 	}
+	zio_handle_import_delay(spa, gethrtime() - load_start);
 
 	spa_import_progress_remove(spa_guid(spa));
 	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 
 	spa_load_note(spa, "LOADED");
+fail:
+	mutex_enter(&spa_namespace_lock);
+	spa->spa_load_thread = NULL;
+	cv_broadcast(&spa_namespace_cv);
+
+	return (error);
 
-	return (0);
 }
 
 static int
@@ -6757,9 +6770,14 @@ spa_tryimport(nvlist_t *tryconfig)
 	/*
 	 * Create and initialize the spa structure.
 	 */
+	char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
+	    TRYIMPORT_NAME, (u_longlong_t)curthread, poolname);
+
 	mutex_enter(&spa_namespace_lock);
-	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
+	spa = spa_add(name, tryconfig, NULL);
 	spa_activate(spa, SPA_MODE_READ);
+	kmem_free(name, MAXPATHLEN);
 
 	/*
 	 * Rewind pool if a max txg was provided.
@@ -6874,6 +6892,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
 {
 	int error;
 	spa_t *spa;
+	hrtime_t export_start = gethrtime();
 
 	if (oldconfig)
 		*oldconfig = NULL;
@@ -7018,6 +7037,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
 		spa->spa_is_exporting = B_FALSE;
 	}
 
+	if (new_state == POOL_STATE_EXPORTED)
+		zio_handle_export_delay(spa, gethrtime() - export_start);
+
 	mutex_exit(&spa_namespace_lock);
 	return (0);
 
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 68b907614196..5fb7847b5d8b 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -82,7 +82,8 @@
  *		- Check if spa_refcount is zero
  *		- Rename a spa_t
  *		- add/remove/attach/detach devices
- *		- Held for the duration of create/destroy/import/export
+ *		- Held for the duration of create/destroy/export
+ *		- Held at the start and end of import
  *
  *	It does not need to handle recursion.  A create or destroy may
  *	reference objects (files or zvols) in other pools, but by
@@ -235,9 +236,9 @@
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  */
 
-static avl_tree_t spa_namespace_avl;
+avl_tree_t spa_namespace_avl;
 kmutex_t spa_namespace_lock;
-static kcondvar_t spa_namespace_cv;
+kcondvar_t spa_namespace_cv;
 static const int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 static kmutex_t spa_spare_lock;
@@ -619,6 +620,7 @@ spa_lookup(const char *name)
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+retry:
 	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
 
 	/*
@@ -630,6 +632,14 @@ spa_lookup(const char *name)
 		*cp = '\0';
 
 	spa = avl_find(&spa_namespace_avl, &search, &where);
+	if (spa == NULL)
+		return (NULL);
+
+	if (spa->spa_load_thread != NULL &&
+	    spa->spa_load_thread != curthread) {
+		cv_wait(&spa_namespace_cv, &spa_namespace_lock);
+		goto retry;
+	}
 
 	return (spa);
 }
@@ -728,6 +738,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa_config_lock_init(spa);
 	spa_stats_init(spa);
 
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	avl_add(&spa_namespace_avl, spa);
 
 	/*
@@ -826,7 +837,6 @@ spa_remove(spa_t *spa)
 	nvlist_free(spa->spa_config_splitting);
 
 	avl_remove(&spa_namespace_avl, spa);
-	cv_broadcast(&spa_namespace_cv);
 
 	if (spa->spa_root)
 		spa_strfree(spa->spa_root);
@@ -920,7 +930,8 @@ void
 spa_open_ref(spa_t *spa, const void *tag)
 {
 	ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
-	    MUTEX_HELD(&spa_namespace_lock));
+	    MUTEX_HELD(&spa_namespace_lock) ||
+	    spa->spa_load_thread == curthread);
 	(void) zfs_refcount_add(&spa->spa_refcount, tag);
 }
 
@@ -932,7 +943,8 @@ void
 spa_close(spa_t *spa, const void *tag)
 {
 	ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
-	    MUTEX_HELD(&spa_namespace_lock));
+	    MUTEX_HELD(&spa_namespace_lock) ||
+	    spa->spa_load_thread == curthread);
 	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
 }
 
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 5aaef1a69986..c5e16af16692 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -775,7 +775,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
 void
 vdev_initialize_restart(vdev_t *vd)
 {
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    vd->vdev_spa->spa_load_thread == curthread);
 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_leaf_zap != 0) {
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index 6503390f7973..00ebd4c9fca4 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2018, Intel Corporation.
  * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2024 by Delphix. All rights reserved.
  */
 
 #include <sys/vdev_impl.h>
@@ -1071,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd)
 void
 vdev_rebuild_restart(spa_t *spa)
 {
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    spa->spa_load_thread == curthread);
 
 	vdev_rebuild_restart_impl(spa->spa_root_vdev);
 }
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 7e3c5f684703..9753d5a1ea04 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
  * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
  * Copyright 2023 RackTop Systems, Inc.
@@ -1148,7 +1148,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
 void
 vdev_trim_restart(vdev_t *vd)
 {
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    vd->vdev_spa->spa_load_thread == curthread);
 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_leaf_zap != 0) {
@@ -1568,8 +1569,8 @@ vdev_autotrim_stop_all(spa_t *spa)
 void
 vdev_autotrim_restart(spa_t *spa)
 {
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    spa->spa_load_thread == curthread);
 	if (spa->spa_autotrim)
 		vdev_autotrim(spa);
 }
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 1af2c26f8a43..3773e400d799 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2024, Klara Inc.
  */
 
 /*
@@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0;
 typedef struct inject_handler {
 	int			zi_id;
 	spa_t			*zi_spa;
+	char			*zi_spa_name; /* ZINJECT_DELAY_IMPORT only */
 	zinject_record_t	zi_record;
 	uint64_t		*zi_lanes;
 	int			zi_next_lane;
@@ -703,6 +705,63 @@ zio_handle_io_delay(zio_t *zio)
 	return (min_target);
 }
 
+/*
+ * Sleep out what remains of this pool's 'command' delay, then clear it.
+ */
+static void
+zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
+{
+	hrtime_t delay = 0;
+	int id = 0;
+
+	rw_enter(&inject_lock, RW_READER);
+	for (inject_handler_t *handler = list_head(&inject_handlers);
+	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
+		if (handler->zi_record.zi_cmd != command)
+			continue;	/* skip, don't stop at, other types */
+		ASSERT3P(handler->zi_spa_name, !=, NULL);
+		if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
+			uint64_t pause =
+			    SEC2NSEC(handler->zi_record.zi_duration);
+			if (pause > elapsed)
+				delay = pause - elapsed;
+			id = handler->zi_id;
+			break;
+		}
+	}
+	rw_exit(&inject_lock);
+
+	if (delay) {
+		if (command == ZINJECT_DELAY_IMPORT) {
+			spa_import_progress_set_notes(spa, "injecting %llu "
+			    "sec delay", (u_longlong_t)NSEC2SEC(delay));
+		}
+		zfs_sleep_until(gethrtime() + delay);
+	}
+	if (id) {
+		/* all done with this one-shot handler */
+		zio_clear_fault(id);
+	}
+}
+
+/*
+ * For testing, inject a delay during an import
+ */
+void
+zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
+{
+	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
+}
+
+/*
+ * For testing, inject a delay during an export
+ */
+void
+zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
+{
+	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
+}
+
 static int
 zio_calculate_range(const char *pool, zinject_record_t *record)
 {
@@ -760,6 +819,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record)
 	return (0);
 }
 
+static boolean_t
+zio_pool_handler_exists(const char *name, zinject_type_t command)
+{
+	boolean_t exists = B_FALSE;
+
+	rw_enter(&inject_lock, RW_READER);
+	for (inject_handler_t *handler = list_head(&inject_handlers);
+	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
+		if (command != handler->zi_record.zi_cmd)
+			continue;
+
+		const char *pool = (handler->zi_spa_name != NULL) ?
+		    handler->zi_spa_name : spa_name(handler->zi_spa);
+		if (strcmp(name, pool) == 0) {
+			exists = B_TRUE;
+			break;
+		}
+	}
+	rw_exit(&inject_lock);
+
+	return (exists);
+}
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
@@ -810,16 +891,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
 
 	if (!(flags & ZINJECT_NULL)) {
 		/*
-		 * spa_inject_ref() will add an injection reference, which will
-		 * prevent the pool from being removed from the namespace while
-		 * still allowing it to be unloaded.
+		 * Pool delays for import or export don't take an
+		 * injection reference on the spa. Instead they
+		 * rely on matching by name.
 		 */
-		if ((spa = spa_inject_addref(name)) == NULL)
-			return (SET_ERROR(ENOENT));
+		if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
+		    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
+			if (record->zi_duration <= 0)
+				return (SET_ERROR(EINVAL));
+			/*
+			 * Only one import | export delay handler per pool.
+			 */
+			if (zio_pool_handler_exists(name, record->zi_cmd))
+				return (SET_ERROR(EEXIST));
+
+			mutex_enter(&spa_namespace_lock);
+			boolean_t has_spa = spa_lookup(name) != NULL;
+			mutex_exit(&spa_namespace_lock);
+
+			if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
+				return (SET_ERROR(EEXIST));
+			if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
+				return (SET_ERROR(ENOENT));
+			spa = NULL;
+		} else {
+			/*
+			 * spa_inject_ref() will add an injection reference,
+			 * which will prevent the pool from being removed
+			 * from the namespace while still allowing it to be
+			 * unloaded.
+			 */
+			if ((spa = spa_inject_addref(name)) == NULL)
+				return (SET_ERROR(ENOENT));
+		}
 
 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
-
-		handler->zi_spa = spa;
+		handler->zi_spa = spa;	/* note: can be NULL */
 		handler->zi_record = *record;
 
 		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
@@ -832,6 +939,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
 			handler->zi_next_lane = 0;
 		}
 
+		if (handler->zi_spa == NULL)
+			handler->zi_spa_name = spa_strdup(name);
+		else
+			handler->zi_spa_name = NULL;
+
 		rw_enter(&inject_lock, RW_WRITER);
 
 		/*
@@ -891,7 +1003,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen,
 	if (handler) {
 		*record = handler->zi_record;
 		*id = handler->zi_id;
-		(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
+		ASSERT(handler->zi_spa || handler->zi_spa_name);
+		if (handler->zi_spa != NULL)
+			(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
+		else
+			(void) strlcpy(name, handler->zi_spa_name, buflen);
 		ret = 0;
 	} else {
 		ret = SET_ERROR(ENOENT);
@@ -941,7 +1057,11 @@ zio_clear_fault(int id)
 		ASSERT3P(handler->zi_lanes, ==, NULL);
 	}
 
-	spa_inject_delref(handler->zi_spa);
+	if (handler->zi_spa_name != NULL)
+		spa_strfree(handler->zi_spa_name);
+
+	if (handler->zi_spa != NULL)
+		spa_inject_delref(handler->zi_spa);
 	kmem_free(handler, sizeof (inject_handler_t));
 	atomic_dec_32(&zio_injection_enabled);
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 558cd425afd8..0586d991b802 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -466,7 +466,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
     'import_paths_changed',
     'import_rewind_config_changed',
     'import_rewind_device_replaced',
-    'zpool_import_status']
+    'zpool_import_status', 'zpool_import_parallel_pos',
+    'zpool_import_parallel_neg', 'zpool_import_parallel_admin']
 tags = ['functional', 'cli_root', 'zpool_import']
 timeout = 1200
 
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index f182a2825cd6..dc447e042225 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1144,6 +1144,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_status.ksh \
+	functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \
+	functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \
+	functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \
 	functional/cli_root/zpool_initialize/cleanup.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh
new file mode 100755
index 000000000000..c681d1b7dd23
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh
@@ -0,0 +1,165 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# 	Verify that admin commands to different pool are not blocked by import
+#
+# STRATEGY:
+#	1. Create 2 pools
+#	2. Export one of the pools
+#	3. Import the pool with an injected delay
+#	4. Execute some admin commands against both pools
+#	5. Verify that the admin commands to the non-imported pool don't stall
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	zinject -c all
+	destroy_pool $TESTPOOL1
+	destroy_pool $TESTPOOL2
+}
+
+function pool_import
+{
+	typeset dir=$1
+	typeset pool=$2
+
+	SECONDS=0
+	errmsg=$(zpool import -d $dir -f $pool 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: imported in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-import
+	else
+		echo ${pool}: import failed $errmsg in $SECONDS secs
+	fi
+}
+
+function pool_add_device
+{
+	typeset pool=$1
+	typeset device=$2
+	typeset devtype=$3
+
+	SECONDS=0
+	errmsg=$(zpool add $pool $devtype $device 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: added $devtype vdev in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-add
+	else
+		echo ${pool}: add $devtype vdev failed ${errmsg}, in $SECONDS secs
+	fi
+}
+
+function pool_stats
+{
+	typeset stats=$1
+	typeset pool=$2
+
+	SECONDS=0
+	errmsg=$(zpool $stats $pool 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: $stats in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-${stats}
+	else
+		echo ${pool}: $stats failed ${errmsg}, in $SECONDS secs
+	fi
+}
+
+function pool_create
+{
+	typeset pool=$1
+	typeset device=$2
+
+	SECONDS=0
+	errmsg=$(zpool create $pool $device 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		echo ${pool}: created in $SECONDS secs
+		echo $SECONDS > ${DEVICE_DIR}/${pool}-create
+	else
+		echo ${pool}: create failed ${errmsg}, in $SECONDS secs
+	fi
+}
+
+log_assert "Simple admin commands to different pool not blocked by import"
+
+log_onexit cleanup
+
+#
+# create two pools and export one
+#
+log_must zpool create $TESTPOOL1 $VDEV0
+log_must zpool export $TESTPOOL1
+log_must zpool create $TESTPOOL2 $VDEV1
+
+#
+# import pool asynchronously with an injected 10 second delay
+#
+log_must zinject -P import -s 10 $TESTPOOL1
+pool_import $DEVICE_DIR $TESTPOOL1 &
+
+sleep 2
+
+#
+# run some admin commands on the pools while the import is in progress
+#
+
+pool_add_device $TESTPOOL1 $VDEV2 "log" &
+pool_add_device $TESTPOOL2 $VDEV3 "cache" &
+pool_stats "status" $TESTPOOL1 &
+pool_stats "status" $TESTPOOL2 &
+pool_stats "list" $TESTPOOL1 &
+pool_stats "list" $TESTPOOL2 &
+pool_create $TESTPOOL1 $VDEV4 &
+wait
+
+log_must zpool sync $TESTPOOL1 $TESTPOOL2
+
+zpool history $TESTPOOL1
+zpool history $TESTPOOL2
+
+log_must test "5" -lt $(<${DEVICE_DIR}/${TESTPOOL1}-import)
+
+#
+# verify that commands to second pool did not wait for import to finish
+#
+log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-status)
+log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-list)
+log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-add)
+[[ -e ${DEVICE_DIR}/${TESTPOOL1}-create ]] && log_fail "unexpected pool create"
+
+log_pass "Simple admin commands to different pool not blocked by import"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh
new file mode 100755
index 000000000000..339dc2575ede
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh
@@ -0,0 +1,130 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+#
+# DESCRIPTION:
+# 	Verify that pool imports by same name only have one winner
+#
+# STRATEGY:
+#	1. Create 4 single disk pools with the same name
+#	2. Generate some ZIL records (for a longer import)
+#	3. Export the pools
+#	4. Import the pools in parallel
+#	5. Repeat with using matching guids
+#
+
+verify_runnable "global"
+
+POOLNAME="import_pool"
+DEV_DIR_PREFIX="$DEVICE_DIR/$POOLNAME"
+VDEVSIZE=$((512 * 1024 * 1024))
+
+log_assert "parallel pool imports by same name only have one winner"
+
+# each pool has its own device directory
+for i in {0..3}; do
+	log_must mkdir -p ${DEV_DIR_PREFIX}$i
+	log_must truncate -s $VDEVSIZE ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i
+done
+
+function cleanup
+{
+	zinject -c all
+	log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
+	log_must set_tunable64 METASLAB_DEBUG_LOAD 0
+
+	destroy_pool $POOLNAME
+
+	log_must rm -rf $DEV_DIR_PREFIX*
+}
+
+log_onexit cleanup
+
+log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
+log_must set_tunable64 METASLAB_DEBUG_LOAD 1
+
+function import_pool
+{
+	typeset dir=$1
+	typeset pool=$2
+	typeset newname=$3
+
+	SECONDS=0
+	errmsg=$(zpool import -N -d $dir -f $pool $newname 2>&1 > /dev/null)
+	if [[ $? -eq 0 ]]; then
+		touch $dir/imported
+		echo "imported $pool in $SECONDS secs"
+	elif [[ $errmsg == *"cannot import"* ]]; then
+		echo "pool import failed: $errmsg, waited $SECONDS secs"
+		touch $dir/failed
+	fi
+}
+
+#
+# create four exported pools with the same name
+#
+for i in {0..3}; do
+	log_must zpool create $POOLNAME ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i
+	log_must zpool export $POOLNAME
+done
+log_must zinject -P import -s 10 $POOLNAME
+
+#
+# import the pools in parallel, expecting only one winner
+#
+for i in {0..3}; do
+	import_pool ${DEV_DIR_PREFIX}$i $POOLNAME &
+done
+wait
+
+# check the result of background imports
+typeset num_imports=0
+typeset num_cannot=0
+for i in {0..3}; do
+	if [[ -f ${DEV_DIR_PREFIX}$i/imported ]]; then
+		((num_imports += 1))
+	fi
+	if [[ -f ${DEV_DIR_PREFIX}$i/failed ]]; then
+		((num_cannot += 1))
+		loser=$i
+	fi
+done
+[[ $num_imports -eq "1" ]] || log_fail "expecting an import"
+[[ $num_cannot -eq "3" ]] || \
+    log_fail "expecting 3 pool exists errors, found $num_cannot"
+
+log_note "$num_imports imported and $num_cannot failed (expected)"
+
+log_pass "parallel pool imports by same name only have one winner"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
new file mode 100755
index 000000000000..71b2437a37ec
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
@@ -0,0 +1,137 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2023 Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
+
+# test uses 8 vdevs
+export MAX_NUM=8
+
+#
+# DESCRIPTION:
+# 	Verify that pool imports can occur in parallel
+#
+# STRATEGY:
+#	1. Create 8 pools
+#	2. Generate some ZIL records
+#	3. Export the pools
+#	4. Import half of the pools synchronously to baseline sequential cost
+#	5. Import the other half asynchronously to demonstrate parallel savings
+#	6. Export 4 pools
+#	7. Test zpool import -a
+#
+
+verify_runnable "global"
+
+#
+# override the minimum sized vdevs
+#
+VDEVSIZE=$((512 * 1024 * 1024))
+increase_device_sizes $VDEVSIZE
+
+POOLNAME="import_pool"
+
+function cleanup
+{
+	zinject -c all
+	log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
+	log_must set_tunable64 METASLAB_DEBUG_LOAD 0
+
+	for i in {0..$(($MAX_NUM - 1))}; do
+		destroy_pool $POOLNAME-$i
+	done
+	# reset the devices
+	increase_device_sizes 0
+	increase_device_sizes $FILE_SIZE
+}
+
+log_assert "Pool imports can occur in parallel"
+
+log_onexit cleanup
+
+log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
+log_must set_tunable64 METASLAB_DEBUG_LOAD 1
+
+
+#
+# create some exported pools with import delay injectors
+#
+for i in {0..$(($MAX_NUM - 1))}; do
+	log_must zpool create $POOLNAME-$i $DEVICE_DIR/${DEVICE_FILE}$i
+	log_must zpool export $POOLNAME-$i
+	log_must zinject -P import -s 12 $POOLNAME-$i
+done
+wait
+
+#
+# import half of the pools synchronously
+#
+SECONDS=0
+for i in {0..3}; do
+	log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i
+done
+sequential_time=$SECONDS
+log_note "sequentially imported 4 pools in $sequential_time seconds"
+
+#
+# import half of the pools in parallel
+#
+SECONDS=0
+for i in {4..7}; do
+	log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i &
+done
+wait
+parallel_time=$SECONDS
+log_note "asyncronously imported 4 pools in $parallel_time seconds"
+
+log_must test $parallel_time -lt $(($sequential_time / 3))
+
+#
+# export pools with import delay injectors
+#
+for i in {4..7}; do
+	log_must zpool export $POOLNAME-$i
+	log_must zinject -P import -s 12 $POOLNAME-$i
+done
+wait
+
+#
+# now test zpool import -a
+#
+SECONDS=0
+log_must zpool import -a -d $DEVICE_DIR -f
+parallel_time=$SECONDS
+log_note "asyncronously imported 4 pools in $parallel_time seconds"
+
+log_must test $parallel_time -lt $(($sequential_time / 3))
+
+log_pass "Pool imports occur in parallel"

From 9b43d7ba85059d37533d42f62cbb646203fd4a94 Mon Sep 17 00:00:00 2001
From: Seth Troisi <sethtroisi@google.com>
Date: Mon, 22 Apr 2024 10:45:39 -0700
Subject: [PATCH 085/116] Add newline to two zpool messages

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Seth Troisi <sethtroisi@google.com>
Closes #16113
---
 cmd/zpool/zpool_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index e6664b918be4..636eb2a301cd 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -3445,10 +3445,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
 		ms_status = zpool_enable_datasets(zhp, mntopts, 0);
 		if (ms_status == EZFS_SHAREFAILED) {
 			(void) fprintf(stderr, gettext("Import was "
-			    "successful, but unable to share some datasets"));
+			    "successful, but unable to share some datasets\n"));
 		} else if (ms_status == EZFS_MOUNTFAILED) {
 			(void) fprintf(stderr, gettext("Import was "
-			    "successful, but unable to mount some datasets"));
+			    "successful, but unable to mount some datasets\n"));
 		}
 	}
 

From cdae59e1530061cf4caa549a062994161c4383c6 Mon Sep 17 00:00:00 2001
From: Seth Troisi <sethtroisi@google.com>
Date: Mon, 22 Apr 2024 10:47:44 -0700
Subject: [PATCH 086/116] ZTS: user_namespace_004.ksh avoid error in cleanup if
 unsupported

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Seth Troisi <sethtroisi@google.com>
Closes #16114
---
 .../tests/functional/user_namespace/user_namespace_004.ksh    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh
index 37ef84b72377..e6ad25f23f93 100755
--- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh
+++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh
@@ -44,8 +44,6 @@ user_ns_cleanup() {
 	log_must zfs destroy -r "$TESTPOOL/userns"
 }
 
-log_onexit user_ns_cleanup
-
 log_assert "Check zfs zone command handling of non-namespace files"
 
 # Pass if user namespaces are not supported.
@@ -54,6 +52,8 @@ if [ "$?" -ne "0" ]; then
 	log_unsupported "Failed to create user namespace"
 fi
 
+log_onexit user_ns_cleanup
+
 # Create the baseline datasets.
 log_must zfs create -o zoned=on "$TESTPOOL/userns"
 

From 7e52795aad561ec39e76a3ef6fea9e5c254b2e16 Mon Sep 17 00:00:00 2001
From: Brooks Davis <brooks@one-eyed-alien.net>
Date: Mon, 22 Apr 2024 10:48:58 -0700
Subject: [PATCH 087/116] ztest: use ASSERT3P to compare pointers

With a sufficiently modern gcc (I saw this with gcc13), gcc complains
when casting pointers to an integer of a different type (even a larger
one).  ASSERT3U does this on 32-bit systems by casting a 32-bit
pointer to uint64_t, so use ASSERT3P, which uses uintptr_t.

Fixes: 5caeef02fa53 RAID-Z expansion feature

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Brooks Davis <brooks.davis@sri.com>
Closes #16115
---
 cmd/ztest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/ztest.c b/cmd/ztest.c
index 684ab586bb93..b0fea8b3cfb4 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -8045,7 +8045,7 @@ ztest_raidz_expand_run(ztest_shared_t *zs, spa_t *spa)
 	ztest_expand_io_t *thread_args;
 
 	ASSERT3U(ztest_opts.zo_raidz_expand_test, !=, RAIDZ_EXPAND_NONE);
-	ASSERT3U(rzvd->vdev_ops, ==, &vdev_raidz_ops);
+	ASSERT3P(rzvd->vdev_ops, ==, &vdev_raidz_ops);
 	ztest_opts.zo_raidz_expand_test = RAIDZ_EXPAND_STARTED;
 
 	/* Setup a 1 MiB buffer of random data */

From c346068e5efeafd5676ab1644086877173ca4226 Mon Sep 17 00:00:00 2001
From: Ryan <error.nointernet@gmail.com>
Date: Tue, 23 Apr 2024 01:59:31 +0800
Subject: [PATCH 088/116] zfs get: add '-t fs' and '-t vol' options

Make `zfs get` accept `fs` for `filesystem` and `vol` for `volume`.

Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ryan <errornointernet@envs.net>
Closes #16117
---
 cmd/zfs/zfs_main.c | 22 ++++++++++++++++------
 man/man8/zfs-set.8 | 11 ++++++++++-
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index ec52c563b447..0bbdd5b18eda 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -2146,15 +2146,25 @@ found2:;
 
 			for (char *tok; (tok = strsep(&optarg, ",")); ) {
 				static const char *const type_opts[] = {
-					"filesystem", "volume",
-					"snapshot", "snap",
+					"filesystem",
+					"fs",
+					"volume",
+					"vol",
+					"snapshot",
+					"snap",
 					"bookmark",
-					"all" };
+					"all"
+				};
 				static const int type_types[] = {
-					ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
-					ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT,
+					ZFS_TYPE_FILESYSTEM,
+					ZFS_TYPE_FILESYSTEM,
+					ZFS_TYPE_VOLUME,
+					ZFS_TYPE_VOLUME,
+					ZFS_TYPE_SNAPSHOT,
+					ZFS_TYPE_SNAPSHOT,
 					ZFS_TYPE_BOOKMARK,
-					ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK };
+					ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK
+				};
 
 				for (i = 0; i < ARRAY_SIZE(type_opts); ++i)
 					if (strcmp(tok, type_opts[i]) == 0) {
diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8
index c01bcc643e5d..8cc19caf3f00 100644
--- a/man/man8/zfs-set.8
+++ b/man/man8/zfs-set.8
@@ -29,7 +29,7 @@
 .\" Copyright 2018 Nexenta Systems, Inc.
 .\" Copyright 2019 Joyent, Inc.
 .\"
-.Dd March 16, 2022
+.Dd April 20, 2024
 .Dt ZFS-SET 8
 .Os
 .
@@ -158,6 +158,15 @@ A comma-separated list of types to display, where
 .Ar type
 is one of
 .Sy filesystem , snapshot , volume , bookmark , No or Sy all .
+.Sy fs ,
+.Sy snap ,
+or
+.Sy vol
+can be used as aliases for
+.Sy filesystem ,
+.Sy snapshot ,
+or
+.Sy volume .
 .El
 .It Xo
 .Nm zfs

From 4036b8d027fb7fe1a629b08a0d23cac975ab2eb9 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 22 Apr 2024 14:41:03 -0400
Subject: [PATCH 089/116] Refactor dbuf_read() for safer decryption

In dbuf_read_verify_dnode_crypt():
 - We don't need original dbuf locked there. Instead take a lock
on a dnode dbuf, that is actually manipulated.
 - Block decryption for a dnode dbuf if it is currently being
written.  ARC hash lock does not protect anonymous buffers, so
arc_untransform() is unsafe when used on buffers being written,
that may happen in case of encrypted dnode buffers, since they
are not copied by dbuf_dirty()/dbuf_hold_copy().

In dbuf_read():
 - If the buffer is in flight, recheck its compression/encryption
status after it is cached, since it may need arc_untransform().

Tested-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16104
---
 module/zfs/dbuf.c | 214 ++++++++++++++++++++++------------------------
 1 file changed, 104 insertions(+), 110 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 5f3643f573f7..bb913f556374 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -161,13 +161,13 @@ struct {
 } dbuf_sums;
 
 #define	DBUF_STAT_INCR(stat, val)	\
-	wmsum_add(&dbuf_sums.stat, val);
+	wmsum_add(&dbuf_sums.stat, val)
 #define	DBUF_STAT_DECR(stat, val)	\
-	DBUF_STAT_INCR(stat, -(val));
+	DBUF_STAT_INCR(stat, -(val))
 #define	DBUF_STAT_BUMP(stat)		\
-	DBUF_STAT_INCR(stat, 1);
+	DBUF_STAT_INCR(stat, 1)
 #define	DBUF_STAT_BUMPDOWN(stat)	\
-	DBUF_STAT_INCR(stat, -1);
+	DBUF_STAT_INCR(stat, -1)
 #define	DBUF_STAT_MAX(stat, v) {					\
 	uint64_t _m;							\
 	while ((v) > (_m = dbuf_stats.stat.value.ui64) &&		\
@@ -177,7 +177,6 @@ struct {
 
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
 
 /*
  * Global data structures and functions for the dbuf cache.
@@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
  * a decrypted block. Otherwise success.
  */
 static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
 {
-	int bonuslen, max_bonuslen, err;
-
-	err = dbuf_read_verify_dnode_crypt(db, flags);
-	if (err)
-		return (err);
+	int bonuslen, max_bonuslen;
 
 	bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 	max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1509,32 +1504,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  */
 static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
-	int err = 0;
 	objset_t *os = db->db_objset;
-	arc_buf_t *dnode_abuf;
-	dnode_t *dn;
+	dmu_buf_impl_t *dndb;
+	arc_buf_t *dnbuf;
 	zbookmark_phys_t zb;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
+	int err;
 
 	if ((flags & DB_RF_NO_DECRYPT) != 0 ||
-	    !os->os_encrypted || os->os_raw_receive)
+	    !os->os_encrypted || os->os_raw_receive ||
+	    (dndb = dn->dn_dbuf) == NULL)
 		return (0);
 
-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
-
-	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
-		DB_DNODE_EXIT(db);
+	dnbuf = dndb->db_buf;
+	if (!arc_is_encrypted(dnbuf))
 		return (0);
-	}
+
+	mutex_enter(&dndb->db_mtx);
+
+	/*
+	 * Since dnode buffer is modified by sync process, there can be only
+	 * one copy of it.  It means we can not modify (decrypt) it while it
+	 * is being written.  I don't see how this may happen now, since
+	 * encrypted dnode writes by receive should be completed before any
+	 * plain-text reads due to txg wait, but better be safe than sorry.
+	 */
+	while (1) {
+		if (!arc_is_encrypted(dnbuf)) {
+			mutex_exit(&dndb->db_mtx);
+			return (0);
+		}
+		dbuf_dirty_record_t *dr = dndb->db_data_pending;
+		if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+			break;
+		cv_wait(&dndb->db_changed, &dndb->db_mtx);
+	};
 
 	SET_BOOKMARK(&zb, dmu_objset_id(os),
-	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
-	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+	    DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+	err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
 
 	/*
 	 * An error code of EACCES tells us that the key is still not
@@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
 		err = 0;
 
-	DB_DNODE_EXIT(db);
+	mutex_exit(&dndb->db_mtx);
 
 	return (err);
 }
@@ -1573,7 +1582,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
-		err = dbuf_read_bonus(db, dn, flags);
+		err = dbuf_read_bonus(db, dn);
 		goto early_unlock;
 	}
 
@@ -1635,10 +1644,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}
 
-	err = dbuf_read_verify_dnode_crypt(db, flags);
-	if (err != 0)
-		goto early_unlock;
-
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);
@@ -1754,19 +1759,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
-	int err = 0;
-	boolean_t prefetch;
 	dnode_t *dn;
+	boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+	int err;
 
-	/*
-	 * We don't have to hold the mutex to check db_state because it
-	 * can't be freed while we have a hold on the buffer.
-	 */
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
+	/*
+	 * Ensure that this block's dnode has been decrypted if the caller
+	 * has requested decrypted data.
+	 */
+	err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+	if (err != 0)
+		goto done;
+
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0;
 
@@ -1775,13 +1784,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		db->db_partial_read = B_TRUE;
 	else if (!(flags & DB_RF_PARTIAL_MORE))
 		db->db_partial_read = B_FALSE;
-	if (db->db_state == DB_CACHED) {
+	miss = (db->db_state != DB_CACHED);
+
+	if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 		/*
-		 * Ensure that this block's dnode has been decrypted if
-		 * the caller has requested decrypted data.
+		 * Another reader came in while the dbuf was in flight between
+		 * UNCACHED and CACHED.  Either a writer will finish filling
+		 * the buffer, sending the dbuf to CACHED, or the first reader's
+		 * request will reach the read_done callback and send the dbuf
+		 * to CACHED.  Otherwise, a failure occurred and the dbuf will
+		 * be sent to UNCACHED.
 		 */
-		err = dbuf_read_verify_dnode_crypt(db, flags);
+		if (flags & DB_RF_NEVERWAIT) {
+			mutex_exit(&db->db_mtx);
+			DB_DNODE_EXIT(db);
+			goto done;
+		}
+		do {
+			ASSERT(db->db_state == DB_READ ||
+			    (flags & DB_RF_HAVESTRUCT) == 0);
+			DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+			    zio_t *, pio);
+			cv_wait(&db->db_changed, &db->db_mtx);
+		} while (db->db_state == DB_READ || db->db_state == DB_FILL);
+		if (db->db_state == DB_UNCACHED) {
+			err = SET_ERROR(EIO);
+			mutex_exit(&db->db_mtx);
+			DB_DNODE_EXIT(db);
+			goto done;
+		}
+	}
 
+	if (db->db_state == DB_CACHED) {
 		/*
 		 * If the arc buf is compressed or encrypted and the caller
 		 * requested uncompressed data, we need to untransform it
@@ -1789,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		 * unauthenticated blocks, which will verify their MAC if
 		 * the key is now available.
 		 */
-		if (err == 0 && db->db_buf != NULL &&
-		    (flags & DB_RF_NO_DECRYPT) == 0 &&
+		if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
 		    (arc_is_encrypted(db->db_buf) ||
 		    arc_is_unauthenticated(db->db_buf) ||
 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1804,17 +1837,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
-		if (err == 0 && prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    B_FALSE, flags & DB_RF_HAVESTRUCT);
-		}
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_hits);
-	} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
-		boolean_t need_wait = B_FALSE;
-
+	} else {
+		ASSERT(db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_NOFILL);
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
 		if (pio == NULL && (db->db_state == DB_NOFILL ||
 		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
 			spa_t *spa = dn->dn_objset->os_spa;
@@ -1822,65 +1848,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 			need_wait = B_TRUE;
 		}
 		err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
-		/*
-		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
-		 * for us
-		 */
-		if (!err && prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    db->db_state != DB_CACHED,
-			    flags & DB_RF_HAVESTRUCT);
-		}
-
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_misses);
+		/* dbuf_read_impl drops db_mtx and parent's rwlock. */
+		miss = (db->db_state != DB_CACHED);
+	}
 
-		/*
-		 * If we created a zio_root we must execute it to avoid
-		 * leaking it, even if it isn't attached to any work due
-		 * to an error in dbuf_read_impl().
-		 */
-		if (need_wait) {
-			if (err == 0)
-				err = zio_wait(pio);
-			else
-				(void) zio_wait(pio);
-			pio = NULL;
-		}
-	} else {
-		/*
-		 * Another reader came in while the dbuf was in flight
-		 * between UNCACHED and CACHED.  Either a writer will finish
-		 * writing the buffer (sending the dbuf to CACHED) or the
-		 * first reader's request will reach the read_done callback
-		 * and send the dbuf to CACHED.  Otherwise, a failure
-		 * occurred and the dbuf went to UNCACHED.
-		 */
-		mutex_exit(&db->db_mtx);
-		if (prefetch) {
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    B_TRUE, flags & DB_RF_HAVESTRUCT);
-		}
-		DB_DNODE_EXIT(db);
-		DBUF_STAT_BUMP(hash_misses);
+	if (err == 0 && prefetch) {
+		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+		    flags & DB_RF_HAVESTRUCT);
+	}
+	DB_DNODE_EXIT(db);
 
-		/* Skip the wait per the caller's request. */
-		if ((flags & DB_RF_NEVERWAIT) == 0) {
-			mutex_enter(&db->db_mtx);
-			while (db->db_state == DB_READ ||
-			    db->db_state == DB_FILL) {
-				ASSERT(db->db_state == DB_READ ||
-				    (flags & DB_RF_HAVESTRUCT) == 0);
-				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-				    db, zio_t *, pio);
-				cv_wait(&db->db_changed, &db->db_mtx);
-			}
-			if (db->db_state == DB_UNCACHED)
-				err = SET_ERROR(EIO);
-			mutex_exit(&db->db_mtx);
-		}
+	/*
+	 * If we created a zio we must execute it to avoid leaking it, even if
+	 * it isn't attached to any work due to an error in dbuf_read_impl().
+	 */
+	if (need_wait) {
+		if (err == 0)
+			err = zio_wait(pio);
+		else
+			(void) zio_wait(pio);
+		pio = NULL;
 	}
 
+done:
+	if (miss)
+		DBUF_STAT_BUMP(hash_misses);
+	else
+		DBUF_STAT_BUMP(hash_hits);
 	if (pio && err != 0) {
 		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);

From 87d81d1d13e0ef848d2d533a4f12f5de41026e73 Mon Sep 17 00:00:00 2001
From: Todd <18294602+seidelma@users.noreply.github.com>
Date: Mon, 22 Apr 2024 17:55:41 -0700
Subject: [PATCH 090/116] zfs-kmod: fix empty rpm requires/conflicts

Fix an error in zfs-kmod.spec that causes kmod-zfs packages not to
include the correct RPM requires/conflicts relationships.  With this
change applied, RPM correctly no longer allows kmod-zfs & zfs-dkms
packages to be installed together.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Todd Seidelmann <18294602+seidelma@users.noreply.github.com>
Closes #16121
---
 rpm/redhat/zfs-kmod.spec.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in
index 9c836786baea..876c198c64de 100644
--- a/rpm/redhat/zfs-kmod.spec.in
+++ b/rpm/redhat/zfs-kmod.spec.in
@@ -17,7 +17,7 @@ BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
 # by generating a preamble text file which kmodtool can append to the spec file.
 %(/bin/echo -e "\
 Requires:       @PACKAGE@ = %{version}\n\
-Conflicts:      @PACKAGE@-dkms)
+Conflicts:      @PACKAGE@-dkms" > %{_sourcedir}/kmod-preamble)
 
 # LDFLAGS are not sanitized by arch/*/Makefile for these architectures.
 %ifarch ppc ppc64 ppc64le aarch64

From 1f940de07224c2068e7c721222b1f3a519820ca9 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 23 Apr 2024 12:06:00 -0400
Subject: [PATCH 091/116] L2ARC: Cleanup buffer re-compression

When compressed ARC is disabled, we may have to re-compress when
writing into L2ARC.  If doing so we can't fit it into the original
physical size, we should just fail immediately, since even if it
may still fit into allocation size, its checksum will never match.

While there, refactor the code similar to other compression places
without using abd_return_buf_copy().

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16038
---
 module/zfs/arc.c | 59 ++++++++++++++++--------------------------------
 1 file changed, 20 insertions(+), 39 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 6954051b1d19..51039af9bcc0 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -8902,7 +8902,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
     abd_t **abd_out)
 {
 	int ret;
-	void *tmp = NULL;
 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
@@ -8923,12 +8922,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
-	if (HDR_HAS_RABD(hdr) && asize != psize) {
-		ASSERT3U(asize, >=, psize);
+	if (HDR_HAS_RABD(hdr)) {
+		ASSERT3U(asize, >, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
-		if (psize != asize)
-			abd_zero_off(to_write, psize, asize - psize);
+		abd_zero_off(to_write, psize, asize - psize);
 		goto out;
 	}
 
@@ -8937,48 +8935,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
 		ASSERT3U(size, ==, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
-		if (size != asize)
+		if (asize > size)
 			abd_zero_off(to_write, size, asize - size);
 		goto out;
 	}
 
 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
-		/*
-		 * In some cases, we can wind up with size > asize, so
-		 * we need to opt for the larger allocation option here.
-		 *
-		 * (We also need abd_return_buf_copy in all cases because
-		 * it's an ASSERT() to modify the buffer before returning it
-		 * with arc_return_buf(), and all the compressors
-		 * write things before deciding to fail compression in nearly
-		 * every case.)
-		 */
-		uint64_t bufsize = MAX(size, asize);
-		cabd = abd_alloc_for_io(bufsize, ismd);
-		tmp = abd_borrow_buf(cabd, bufsize);
-
-		psize = zio_compress_data(compress, to_write, &tmp, size,
-		    hdr->b_complevel);
-
-		if (psize >= asize) {
-			psize = HDR_GET_PSIZE(hdr);
-			abd_return_buf_copy(cabd, tmp, bufsize);
-			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
-			to_write = cabd;
-			abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
-			if (psize != asize)
-				abd_zero_off(to_write, psize, asize - psize);
-			goto encrypt;
+		size_t bufsize = MAX(size, asize);
+		void *buf = zio_buf_alloc(bufsize);
+		uint64_t csize = zio_compress_data(compress, to_write, &buf,
+		    size, hdr->b_complevel);
+		if (csize > psize) {
+			/*
+			 * We can't re-compress the block into the original
+			 * psize.  Even if it fits into asize, it does not
+			 * matter, since checksum will never match on read.
+			 */
+			zio_buf_free(buf, bufsize);
+			return (SET_ERROR(EIO));
 		}
-		ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
-		if (psize < asize)
-			memset((char *)tmp + psize, 0, bufsize - psize);
-		psize = HDR_GET_PSIZE(hdr);
-		abd_return_buf_copy(cabd, tmp, bufsize);
-		to_write = cabd;
+		if (asize > csize)
+			memset((char *)buf + csize, 0, asize - csize);
+		to_write = cabd = abd_get_from_buf(buf, bufsize);
+		abd_take_ownership_of_buf(cabd, B_TRUE);
 	}
 
-encrypt:
 	if (HDR_ENCRYPTED(hdr)) {
 		eabd = abd_alloc_for_io(asize, ismd);
 

From 67d13998b3e055232a07311c2dc609571eaf1df1 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 24 Apr 2024 17:38:48 -0400
Subject: [PATCH 092/116] Make more taskq parameters writable

There is no reason for these module parameters to be read-only.
When modified, they simply apply on the next pool import/creation,
which is useful for testing different values.

Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16118
---
 man/man4/zfs.4   | 9 +++++++--
 module/zfs/spa.c | 8 ++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 6088ebc7ef35..22e1106bbf05 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2327,8 +2327,8 @@ Prioritize requeued I/O.
 .
 .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint
 Percentage of online CPUs which will run a worker thread for I/O.
-These workers are responsible for I/O work such as compression and
-checksum calculations.
+These workers are responsible for I/O work such as compression, encryption,
+checksum and parity calculations.
 Fractional number of CPUs will be rounded down.
 .Pp
 The default value of
@@ -2336,6 +2336,7 @@ The default value of
 was chosen to avoid using all CPUs which can result in
 latency issues and inconsistent application performance,
 especially when slower compression and/or checksumming is enabled.
+Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
 Number of worker threads per taskq.
@@ -2345,6 +2346,7 @@ while higher reduces lock contention.
 If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
+Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
 Determines the number of CPUs to run write issue taskqs.
@@ -2353,16 +2355,19 @@ When 0 (the default), the value to use is computed internally
 as the number of actual CPUs in the system divided by the
 .Sy spa_num_allocators
 value.
+Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
 Set the queue and thread configuration for the IO read queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
+Set values only apply to pools imported/created after that.
 .
 .It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp
 Set the queue and thread configuration for the IO write queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
+Set values only apply to pools imported/created after that.
 .
 .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Do not create zvol device nodes.
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 96daf51b696a..879147b097d0 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -10811,10 +10811,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 	"Print vdev tree to zfs_dbgmsg during pool import");
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run an IO worker thread");
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
 	"Number of threads per IO worker taskqueue");
 
 /* BEGIN CSTYLED */
@@ -10845,10 +10845,10 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 
 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
-	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
 	"Configure IO queues for read IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
-	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
 	"Configure IO queues for write IO");
 #endif
 /* END CSTYLED */

From 5044c4e3ff0558b726b491a9267fc3db6f855a2d Mon Sep 17 00:00:00 2001
From: Allan Jude <allan@klarasystems.com>
Date: Wed, 24 Apr 2024 17:51:21 -0400
Subject: [PATCH 093/116] Fast Dedup: ZAP Shrinking

This allows ZAPs to shrink. When there are two empty sibling leafs,
one of them is collapsed and its storage space is reused.
This improves performance on directories that at one time contained
a large number of files, but where many or all of those files have
since been deleted.

This applies to all other types of ZAPs as well.

Sponsored-by: iXsystems, Inc.
Sponsored-by: Klara, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Closes #15888
---
 man/man4/zfs.4                                |   7 +-
 module/zfs/zap.c                              | 336 +++++++++++++++++-
 tests/runfiles/common.run                     |   4 +
 tests/zfs-tests/tests/Makefile.am             |   3 +
 .../tests/functional/zap_shrink/cleanup.ksh   |  34 ++
 .../tests/functional/zap_shrink/setup.ksh     |  35 ++
 .../zap_shrink/zap_shrink_001_pos.ksh         |  81 +++++
 7 files changed, 488 insertions(+), 12 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 22e1106bbf05..ef0385d42b8e 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -16,7 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd January 9, 2024
+.Dd February 14, 2024
 .Dt ZFS 4
 .Os
 .
@@ -564,9 +564,8 @@ However, this is limited by
 Maximum micro ZAP size.
 A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
 .
-.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
-Log2 fraction of holes in speculative prefetch stream allowed for it to
-proceed.
+.It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
+If set, adjacent empty ZAP blocks will be collapsed, reducing disk space.
 .
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index da86defb445c..1b6b16fc6662 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -22,6 +22,8 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
+ * Copyright (c) 2023, Klara Inc.
  */
 
 /*
@@ -41,6 +43,7 @@
 
 #include <sys/spa.h>
 #include <sys/dmu.h>
+#include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_znode.h>
 #include <sys/fs/zfs.h>
@@ -78,9 +81,16 @@
  */
 static int zap_iterate_prefetch = B_TRUE;
 
+/*
+ * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
+ * collapsed into a single block.
+ */
+int zap_shrink_enabled = B_TRUE;
+
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
 
 void
 fzap_byteswap(void *vbuf, size_t size)
@@ -586,6 +596,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 	}
 }
 
+/*
+ * Point the nptrs ptrtbl entries starting at idx to block blk.  The range is
+ * probed with lookups first, so an i/o error is returned before any entry
+ * is modified.  Returns 0 on success or the error from the failing call.
+ */
+static int
+zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
+    dmu_tx_t *tx)
+{
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	int epb = bs >> 3; /* lookup stride for the i/o check */
+	int err = 0;
+
+	ASSERT(tx != NULL);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	/* Check for i/o errors; the value read back is not used. */
+	for (uint64_t i = 0; i < nptrs; i += epb) {
+		uint64_t cur;
+		if ((err = zap_idx_to_blk(zap, idx + i, &cur)) != 0)
+			return (err);
+	}
+
+	for (uint64_t i = 0; i < nptrs; i++) {
+		err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
+		ASSERT0(err); /* we checked for i/o errors above */
+		if (err != 0)
+			break;
+	}
+
+	return (err);
+}
+
+#define	ZAP_PREFIX_HASH(pref, pref_len)	((pref) << (64 - (pref_len)))
+
+/*
+ * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl.
+ * If two leaves are siblings, their ranges are adjacent and contain the same
+ * number of entries. In order to find out if a leaf has a sibling, we need to
+ * check the range corresponding to the sibling leaf. There is no need to check
+ * all entries in the range, we only need to check the first and the last one.
+ * Returns that shared entry value, or 0 if they differ or a lookup fails.
+ */
+static uint64_t
+check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
+{
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	uint64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
+	uint64_t idx = ZAP_HASH_IDX(h, zt_shift);
+	uint64_t nptrs = 1ULL << (zt_shift - prefix_len);
+	uint64_t first, last;
+
+	ASSERT3U(idx + nptrs, <=, (1ULL << zt_shift));
+
+	if (zap_idx_to_blk(zap, idx, &first) != 0)
+		return (0);
+
+	if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
+		return (0);
+
+	if (first != last)
+		return (0);
+	return (first);
+}
+
 static int
 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 {
@@ -958,6 +1034,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
 	if (err == 0) {
 		zap_entry_remove(&zeh);
 		zap_increment_num_entries(zn->zn_zap, -1, tx);
+
+		if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
+		    zap_shrink_enabled)
+			return (zap_shrink(zn, l, tx));
 	}
 	zap_put_leaf(l);
 	return (err);
@@ -1222,13 +1302,19 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 		    ZIO_PRIORITY_ASYNC_READ);
 	}
 
-	if (zc->zc_leaf &&
-	    (ZAP_HASH_IDX(zc->zc_hash,
-	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
-	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+	if (zc->zc_leaf) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
-		zap_put_leaf(zc->zc_leaf);
-		zc->zc_leaf = NULL;
+
+		/*
+		 * The leaf was either shrunk or split.
+		 */
+		if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
+		    (ZAP_HASH_IDX(zc->zc_hash,
+		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+			zap_put_leaf(zc->zc_leaf);
+			zc->zc_leaf = NULL;
+		}
 	}
 
 again:
@@ -1237,8 +1323,6 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 		    &zc->zc_leaf);
 		if (err != 0)
 			return (err);
-	} else {
-		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 	}
 	l = zc->zc_leaf;
 
@@ -1367,6 +1451,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 	}
 }
 
+/*
+ * Find the last allocated block and update freeblk to the block after it.
+ */
+static void
+zap_trunc(zap_t *zap)
+{
+	uint64_t nentries, lastblk;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
+		/* External ptrtbl */
+		nentries = (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+		lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+	} else {
+		/* Embedded ptrtbl */
+		nentries = (1ULL << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+		lastblk = 0;
+	}
+
+	for (uint64_t idx = 0; idx < nentries; idx++) {
+		uint64_t blk;
+		/* On a lookup error, give up and leave freeblk untouched. */
+		if (zap_idx_to_blk(zap, idx, &blk) != 0)
+			return;
+		if (blk > lastblk)
+			lastblk = blk;
+	}
+
+	ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
+
+	zap_f_phys(zap)->zap_freeblk = lastblk + 1;
+}
+
+/*
+ * ZAP shrinking algorithm.
+ *
+ * We shrink ZAP recursively removing empty leaves. We can remove an empty leaf
+ * only if it has a sibling. Sibling leaves have the same prefix length and
+ * their prefixes differ only by the least significant (sibling) bit. We require
+ * both siblings to be empty. This eliminates a need to rehash the non-empty
+ * remaining leaf. When we have removed one of two empty siblings, we set ptrtbl
+ * entries of the removed leaf to point to the remaining leaf. Prefix length
+ * of the remaining leaf is decremented. As a result, it has a new prefix and it
+ * might have a new sibling. So, we repeat the process.
+ * The leaf (l) passed in is always released before this function returns.
+ *
+ * Steps:
+ * 1. Check if a sibling leaf (sl) exists and it is empty.
+ * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
+ * 3. Release the sibling (sl) to dereference it again with WRITER lock.
+ * 4. Upgrade zapdir lock to WRITER (once).
+ * 5. Dereference released leaves again.
+ * 6. If it is needed, recheck whether both leaves are still siblings and empty.
+ * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point to blkid of
+ * the remaining leaf (slbit 0).
+ * 8. Free disk block of the removed leaf (dmu_free_range).
+ * 9. Decrement prefix_len of the remaining leaf.
+ * 10. Repeat the steps.
+ */
+static int
+zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+	zap_t *zap = zn->zn_zap;
+	int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	uint64_t hash = zn->zn_hash;
+	uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+	uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+	boolean_t trunc = B_FALSE, writer = B_FALSE;
+	int err = 0;
+
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+	ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
+
+	/*
+	 * To avoid deadlock always deref leaves in the same order -
+	 * sibling 0 first, then sibling 1.
+	 */
+	while (prefix_len) {
+		zap_leaf_t *sl;
+		uint64_t sl_prefix = prefix ^ 1;
+		uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
+		int slbit = prefix & 1;
+
+		ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+
+		/* Check if there is a sibling by reading ptrtbl ptrs. */
+		if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
+			break;
+
+		/*
+		 * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
+		 */
+		if (slbit == 1) {
+			zap_put_leaf(l);
+			l = NULL;
+		}
+
+		/*
+		 * Dereference sibling leaf and check if it is empty.
+		 */
+		if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
+		    &sl)) != 0)
+			break;
+
+		ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
+
+		/*
+		 * Check if we have a sibling and it is empty.
+		 */
+		if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
+		    zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
+			zap_put_leaf(sl);
+			break;
+		}
+
+		zap_put_leaf(sl);
+
+		/*
+		 * If there are two empty siblings, we have work to do, so
+		 * we need to lock ZAP ptrtbl as WRITER.
+		 */
+		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+			/* We failed to upgrade */
+			if (l != NULL) {
+				zap_put_leaf(l);
+				l = NULL;
+			}
+
+			/*
+			 * Usually, the right way to upgrade from a READER lock
+			 * to a WRITER lock is to call zap_unlockdir() and
+			 * zap_lockdir(), but we do not have a tag. Instead,
+			 * we do it in a more sophisticated way.
+			 */
+			rw_exit(&zap->zap_rwlock);
+			rw_enter(&zap->zap_rwlock, RW_WRITER);
+			dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+			writer = B_TRUE;
+		}
+
+		/*
+		 * Here we have WRITER lock for ptrtbl.
+		 * Now, we need a WRITER lock for both sibling leaves.
+		 * Also, we have to recheck if the leaves are still siblings
+		 * and still empty.
+		 */
+		if (l == NULL) {
+			/* sibling 0 */
+			if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
+			    tx, RW_WRITER, &l)) != 0)
+				break;
+
+			/*
+			 * The leaf isn't empty anymore or
+			 * it was shrunk/split while our locks were down.
+			 */
+			if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
+			    zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
+				break;
+		}
+
+		/* sibling 1 */
+		if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
+		    RW_WRITER, &sl)) != 0)
+			break;
+
+		/*
+		 * The leaf isn't empty anymore or
+		 * it was shrunk/split while our locks were down.
+		 */
+		if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
+		    zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
+			zap_put_leaf(sl);
+			break;
+		}
+
+		/*
+		 * We have a leaf to collapse.  Compute the range only now:
+		 * zt_shift is re-read above if the ptrtbl lock was dropped.
+		 */
+		int64_t prefix_diff = zt_shift - prefix_len;
+		uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
+		uint64_t nptrs = (1ULL << prefix_diff);
+		uint64_t sl_blkid = sl->l_blkid;
+
+		/*
+		 * Set ptrtbl entries to point to the sibling 0 blkid
+		 */
+		if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
+		    tx)) != 0) {
+			zap_put_leaf(sl);
+			break;
+		}
+
+		/*
+		 * Free sibling 1 disk block.
+		 */
+		int bs = FZAP_BLOCK_SHIFT(zap);
+		if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
+			trunc = B_TRUE;
+
+		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
+		    sl_blkid << bs, 1 << bs, tx);
+		zap_put_leaf(sl);
+
+		zap_f_phys(zap)->zap_num_leafs--;
+
+		/*
+		 * Update prefix and prefix_len.
+		 */
+		zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
+		zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
+
+		prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+		prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+	}
+
+	if (trunc)
+		zap_trunc(zap);
+
+	if (l != NULL)
+		zap_put_leaf(l);
+
+	return (err);
+}
+
 /* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
 	"When iterating ZAP object, prefetch it");
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
+	"Enable ZAP shrinking");
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 0586d991b802..5e7fdf359a75 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -643,6 +643,10 @@ tags = ['functional', 'compression']
 tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
 tags = ['functional', 'cp_files']
 
+[tests/functional/zap_shrink]
+tests = ['zap_shrink_001_pos']
+tags = ['functional', 'zap_shrink']
+
 [tests/functional/crtime]
 tests = ['crtime_001_pos' ]
 tags = ['functional', 'crtime']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index dc447e042225..a6fe030d410c 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -2074,6 +2074,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/xattr/xattr_012_pos.ksh \
 	functional/xattr/xattr_013_pos.ksh \
 	functional/xattr/xattr_compat.ksh \
+	functional/zap_shrink/cleanup.ksh \
+	functional/zap_shrink/zap_shrink_001_pos.ksh \
+	functional/zap_shrink/setup.ksh \
 	functional/zpool_influxdb/cleanup.ksh \
 	functional/zpool_influxdb/setup.ksh \
 	functional/zpool_influxdb/zpool_influxdb.ksh \
diff --git a/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
new file mode 100755
index 000000000000..42fe70042d6a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zap_shrink/cleanup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
new file mode 100755
index 000000000000..b756d4e76c83
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zap_shrink/setup.ksh
@@ -0,0 +1,35 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh
new file mode 100755
index 000000000000..4dbf579b8ac7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zap_shrink/zap_shrink_001_pos.ksh
@@ -0,0 +1,81 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Create a large number of files in a directory. Then remove all files and
+# check that the directory zap was shrunk. Use zdb to check that the zap object
+# contains only one leaf block.
+#
+
+verify_runnable "global"
+
+DIR=largedir
+
+NR_FILES=100000
+BATCH=1000
+CWD=$PWD
+
+log_assert "Create a large number of files ($NR_FILES) in a directory. " \
+	"Make sure that the directory ZAP object was shrunk."
+
+log_must mkdir $TESTDIR/$DIR
+
+cd $TESTDIR/$DIR
+# To avoid overflowing the argument list, create the files BATCH at a time.
+for i in $(seq $(($NR_FILES/$BATCH))); do
+	touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)));
+done
+cd $CWD
+
+log_must test $NR_FILES -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# remove all files in $DIR directory
+cd $TESTDIR/$DIR
+for i in $(seq $(($NR_FILES/$BATCH))); do
+	rm $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH)))
+done
+cd $CWD
+sync_pool $TESTPOOL
+
+log_must test 0 -eq $(ls -U $TESTDIR/$DIR | wc -l)
+
+# check whether zap_shrink works
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+
+# check that the zap is still shrunk after the pool is exported and imported
+zapobj=$(zdb -v -O $TESTPOOL/$TESTFS $DIR)
+nleafs=$(echo "$zapobj" | grep "Leaf blocks:" | awk -F\: '{print($2);}')
+log_must test 1 -eq $nleafs
+
+log_pass

From 317b31eedb2b729985a48d5b98a3a5d34895eeb2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 25 Apr 2024 13:40:09 -0700
Subject: [PATCH 094/116] Python 3.12 deprecated python3-distutils

As of python-3.12 the distutils package has been deprecated.
The latest ax_python_devel.m4 macro from the autoconf archive
has been updated accordingly so let's pull in the new version.

We can also drop the changes made to our customized version
to continue if the development version is not installed since
this functionality has been included upstream.

Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #16126
Closes #16129
---
 config/always-pyzfs.m4    |   9 +-
 config/ax_python_devel.m4 | 341 +++++++++++++++++++++++++-------------
 contrib/debian/control    |   2 +-
 3 files changed, 235 insertions(+), 117 deletions(-)

diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4
index 9b123b1b2db1..98c1cc230205 100644
--- a/config/always-pyzfs.m4
+++ b/config/always-pyzfs.m4
@@ -80,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [
 			[AC_MSG_ERROR("Python $PYTHON_VERSION unknown")]
 		)
 
-		AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [
-			AS_IF([test "x$enable_pyzfs" = xyes], [
-				AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed")
-			], [test "x$enable_pyzfs" != xno], [
+		AS_IF([test "x$enable_pyzfs" = xyes], [
+			AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION])
+		], [
+			AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [true])
+			AS_IF([test "x$ax_python_devel_found" = xno], [
 				enable_pyzfs=no
 			])
 		])
diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4
index f6d4b01444d6..1f480db6d233 100644
--- a/config/ax_python_devel.m4
+++ b/config/ax_python_devel.m4
@@ -4,18 +4,13 @@
 #
 # SYNOPSIS
 #
-#   AX_PYTHON_DEVEL([version], [action-if-not-found])
+#   AX_PYTHON_DEVEL([version[,optional]])
 #
 # DESCRIPTION
 #
 #   Note: Defines as a precious variable "PYTHON_VERSION". Don't override it
 #   in your configure.ac.
 #
-#   Note: this is a slightly modified version of the original AX_PYTHON_DEVEL
-#   macro which accepts an additional [action-if-not-found] argument. This
-#   allow to detect if Python development is available without aborting the
-#   configure phase with an hard error in case it is not.
-#
 #   This macro checks for Python and tries to get the include path to
 #   'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output
 #   variables. It also exports $(PYTHON_EXTRA_LIBS) and
@@ -28,6 +23,11 @@
 #   version number. Don't use "PYTHON_VERSION" for this: that environment
 #   variable is declared as precious and thus reserved for the end-user.
 #
+#   By default this will fail if it does not detect a development version of
+#   python.  If you want it to continue, set optional to true, like
+#   AX_PYTHON_DEVEL([], [true]).  The ax_python_devel_found variable will be
+#   "no" if it fails.
+#
 #   This macro should work for all versions of Python >= 2.1.0. As an end
 #   user, you can disable the check for the python version by setting the
 #   PYTHON_NOVERSIONCHECK environment variable to something else than the
@@ -45,7 +45,6 @@
 #   Copyright (c) 2009 Matteo Settenvini <matteo@member.fsf.org>
 #   Copyright (c) 2009 Horst Knorr <hk_classes@knoda.org>
 #   Copyright (c) 2013 Daniel Mullner <muellner@math.stanford.edu>
-#   Copyright (c) 2018 loli10K <ezomori.nozomu@gmail.com>
 #
 #   This program is free software: you can redistribute it and/or modify it
 #   under the terms of the GNU General Public License as published by the
@@ -73,10 +72,18 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
 
-#serial 21
+#serial 36
 
 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
 AC_DEFUN([AX_PYTHON_DEVEL],[
+	# Get whether it's optional
+	if test -z "$2"; then
+	   ax_python_devel_optional=false
+	else
+	   ax_python_devel_optional=$2
+	fi
+	ax_python_devel_found=yes
+
 	#
 	# Allow the use of a (user set) custom python version
 	#
@@ -87,23 +94,26 @@ AC_DEFUN([AX_PYTHON_DEVEL],[
 
 	AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]])
 	if test -z "$PYTHON"; then
-		m4_ifvaln([$2],[$2],[
-			AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path])
-			PYTHON_VERSION=""
-		])
+	   AC_MSG_WARN([Cannot find python$PYTHON_VERSION in your system path])
+	   if ! $ax_python_devel_optional; then
+	      AC_MSG_ERROR([Giving up, python development not available])
+	   fi
+	   ax_python_devel_found=no
+	   PYTHON_VERSION=""
 	fi
 
-	#
-	# Check for a version of Python >= 2.1.0
-	#
-	AC_MSG_CHECKING([for a version of Python >= '2.1.0'])
-	ac_supports_python_ver=`$PYTHON -c "import sys; \
+	if test $ax_python_devel_found = yes; then
+	   #
+	   # Check for a version of Python >= 2.1.0
+	   #
+	   AC_MSG_CHECKING([for a version of Python >= '2.1.0'])
+	   ac_supports_python_ver=`$PYTHON -c "import sys; \
 		ver = sys.version.split ()[[0]]; \
 		print (ver >= '2.1.0')"`
-	if test "$ac_supports_python_ver" != "True"; then
+	   if test "$ac_supports_python_ver" != "True"; then
 		if test -z "$PYTHON_NOVERSIONCHECK"; then
 			AC_MSG_RESULT([no])
-			AC_MSG_FAILURE([
+			AC_MSG_WARN([
 This version of the AC@&t@_PYTHON_DEVEL macro
 doesn't work properly with versions of Python before
 2.1.0. You may need to re-run configure, setting the
@@ -112,20 +122,27 @@ PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand.
 Moreover, to disable this check, set PYTHON_NOVERSIONCHECK
 to something else than an empty string.
 ])
+			if ! $ax_python_devel_optional; then
+			   AC_MSG_FAILURE([Giving up])
+			fi
+			ax_python_devel_found=no
+			PYTHON_VERSION=""
 		else
 			AC_MSG_RESULT([skip at user request])
 		fi
-	else
+	   else
 		AC_MSG_RESULT([yes])
+	   fi
 	fi
 
-	#
-	# If the macro parameter ``version'' is set, honour it.
-	# A Python shim class, VPy, is used to implement correct version comparisons via
-	# string expressions, since e.g. a naive textual ">= 2.7.3" won't work for
-	# Python 2.7.10 (the ".1" being evaluated as less than ".3").
-	#
-	if test -n "$1"; then
+	if test $ax_python_devel_found = yes; then
+	   #
+	   # If the macro parameter ``version'' is set, honour it.
+	   # A Python shim class, VPy, is used to implement correct version comparisons via
+	   # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for
+	   # Python 2.7.10 (the ".1" being evaluated as less than ".3").
+	   #
+	   if test -n "$1"; then
 		AC_MSG_CHECKING([for a version of Python $1])
                 cat << EOF > ax_python_devel_vpy.py
 class VPy:
@@ -133,7 +150,7 @@ class VPy:
         return tuple(map(int, s.strip().replace("rc", ".").split(".")))
     def __init__(self):
         import sys
-        self.vpy = tuple(sys.version_info)
+        self.vpy = tuple(sys.version_info)[[:3]]
     def __eq__(self, s):
         return self.vpy == self.vtup(s)
     def __ne__(self, s):
@@ -155,25 +172,69 @@ EOF
 			AC_MSG_RESULT([yes])
 		else
 			AC_MSG_RESULT([no])
-			AC_MSG_ERROR([this package requires Python $1.
+			AC_MSG_WARN([this package requires Python $1.
 If you have it installed, but it isn't the default Python
 interpreter in your system path, please pass the PYTHON_VERSION
 variable to configure. See ``configure --help'' for reference.
 ])
+			if ! $ax_python_devel_optional; then
+			   AC_MSG_ERROR([Giving up])
+			fi
+			ax_python_devel_found=no
 			PYTHON_VERSION=""
 		fi
+	   fi
 	fi
 
-	#
-	# Check for Python include path
-	#
-	#
-	AC_MSG_CHECKING([for Python include path])
-	if test -z "$PYTHON_CPPFLAGS"; then
-		python_path=`$PYTHON -c "import sysconfig; \
-			print (sysconfig.get_path('include'));"`
-		plat_python_path=`$PYTHON -c "import sysconfig; \
-			print (sysconfig.get_path('platinclude'));"`
+	if test $ax_python_devel_found = yes; then
+	   #
+	   # Check for the sysconfig module, falling back to distutils
+	   #
+	   AC_MSG_CHECKING([for the sysconfig Python package])
+	   ac_sysconfig_result=`$PYTHON -c "import sysconfig" 2>&1`
+	   if test $? -eq 0; then
+		AC_MSG_RESULT([yes])
+		IMPORT_SYSCONFIG="import sysconfig"
+	   else
+		AC_MSG_RESULT([no])
+
+		AC_MSG_CHECKING([for the distutils Python package])
+		ac_sysconfig_result=`$PYTHON -c "from distutils import sysconfig" 2>&1`
+		if test $? -eq 0; then
+			AC_MSG_RESULT([yes])
+			IMPORT_SYSCONFIG="from distutils import sysconfig"
+		else
+			AC_MSG_WARN([cannot import Python module "distutils".
+Please check your Python installation. The error was:
+$ac_sysconfig_result])
+			if ! $ax_python_devel_optional; then
+			   AC_MSG_ERROR([Giving up])
+			fi
+			ax_python_devel_found=no
+			PYTHON_VERSION=""
+		fi
+	   fi
+	fi
+
+	if test $ax_python_devel_found = yes; then
+	   #
+	   # Check for Python include path
+	   #
+	   AC_MSG_CHECKING([for Python include path])
+	   if test -z "$PYTHON_CPPFLAGS"; then
+		if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then
+			# sysconfig module has different functions
+			python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+				print (sysconfig.get_path ('include'));"`
+			plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+				print (sysconfig.get_path ('platinclude'));"`
+		else
+			# old distutils way
+			python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+				print (sysconfig.get_python_inc ());"`
+			plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+				print (sysconfig.get_python_inc (plat_specific=1));"`
+		fi
 		if test -n "${python_path}"; then
 			if test "${plat_python_path}" != "${python_path}"; then
 				python_path="-I$python_path -I$plat_python_path"
@@ -182,15 +243,15 @@ variable to configure. See ``configure --help'' for reference.
 			fi
 		fi
 		PYTHON_CPPFLAGS=$python_path
-	fi
-	AC_MSG_RESULT([$PYTHON_CPPFLAGS])
-	AC_SUBST([PYTHON_CPPFLAGS])
+	   fi
+	   AC_MSG_RESULT([$PYTHON_CPPFLAGS])
+	   AC_SUBST([PYTHON_CPPFLAGS])
 
-	#
-	# Check for Python library path
-	#
-	AC_MSG_CHECKING([for Python library path])
-	if test -z "$PYTHON_LIBS"; then
+	   #
+	   # Check for Python library path
+	   #
+	   AC_MSG_CHECKING([for Python library path])
+	   if test -z "$PYTHON_LIBS"; then
 		# (makes two attempts to ensure we've got a version number
 		# from the interpreter)
 		ac_python_version=`cat<<EOD | $PYTHON -
@@ -208,7 +269,7 @@ EOD`
 				ac_python_version=$PYTHON_VERSION
 			else
 				ac_python_version=`$PYTHON -c "import sys; \
-					print ('.'.join(sys.version.split('.')[[:2]]))"`
+					print ("%d.%d" % sys.version_info[[:2]])"`
 			fi
 		fi
 
@@ -220,7 +281,7 @@ EOD`
 		ac_python_libdir=`cat<<EOD | $PYTHON -
 
 # There should be only one
-import sysconfig
+$IMPORT_SYSCONFIG
 e = sysconfig.get_config_var('LIBDIR')
 if e is not None:
 	print (e)
@@ -229,7 +290,7 @@ EOD`
 		# Now, for the library:
 		ac_python_library=`cat<<EOD | $PYTHON -
 
-import sysconfig
+$IMPORT_SYSCONFIG
 c = sysconfig.get_config_vars()
 if 'LDVERSION' in c:
 	print ('python'+c[['LDVERSION']])
@@ -249,88 +310,140 @@ EOD`
 		else
 			# old way: use libpython from python_configdir
 			ac_python_libdir=`$PYTHON -c \
-			  "import sysconfig; \
+			  "from sysconfig import get_python_lib as f; \
 			  import os; \
-			  print (os.path.join(sysconfig.get_path('platstdlib'), 'config'));"`
+			  print (os.path.join(f(plat_specific=1, standard_lib=1), 'config'));"`
 			PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version"
 		fi
 
 		if test -z "PYTHON_LIBS"; then
-			m4_ifvaln([$2],[$2],[
-				AC_MSG_ERROR([
+			AC_MSG_WARN([
   Cannot determine location of your Python DSO. Please check it was installed with
   dynamic libraries enabled, or try setting PYTHON_LIBS by hand.
-				])
 			])
+			if ! $ax_python_devel_optional; then
+			   AC_MSG_ERROR([Giving up])
+			fi
+			ax_python_devel_found=no
+			PYTHON_VERSION=""
 		fi
+	   fi
 	fi
-	AC_MSG_RESULT([$PYTHON_LIBS])
-	AC_SUBST([PYTHON_LIBS])
 
-	#
-	# Check for site packages
-	#
-	AC_MSG_CHECKING([for Python site-packages path])
-	if test -z "$PYTHON_SITE_PKG"; then
-		PYTHON_SITE_PKG=`$PYTHON -c "import distutils.sysconfig; \
-			print (distutils.sysconfig.get_python_lib(0,0));" 2>/dev/null || \
-			$PYTHON -c "import sysconfig; \
-			print (sysconfig.get_path('purelib'));"`
-	fi
-	AC_MSG_RESULT([$PYTHON_SITE_PKG])
-	AC_SUBST([PYTHON_SITE_PKG])
+	if test $ax_python_devel_found = yes; then
+	   AC_MSG_RESULT([$PYTHON_LIBS])
+	   AC_SUBST([PYTHON_LIBS])
 
-	#
-	# libraries which must be linked in when embedding
-	#
-	AC_MSG_CHECKING(python extra libraries)
-	if test -z "$PYTHON_EXTRA_LIBS"; then
-	   PYTHON_EXTRA_LIBS=`$PYTHON -c "import sysconfig; \
+	   #
+	   # Check for site packages
+	   #
+	   AC_MSG_CHECKING([for Python site-packages path])
+	   if test -z "$PYTHON_SITE_PKG"; then
+		if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then
+			PYTHON_SITE_PKG=`$PYTHON -c "
+$IMPORT_SYSCONFIG;
+if hasattr(sysconfig, 'get_default_scheme'):
+    scheme = sysconfig.get_default_scheme()
+else:
+    scheme = sysconfig._get_default_scheme()
+if scheme == 'posix_local':
+    # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/
+    scheme = 'posix_prefix'
+prefix = '$prefix'
+if prefix == 'NONE':
+    prefix = '$ac_default_prefix'
+sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix})
+print(sitedir)"`
+		else
+			# distutils.sysconfig way
+			PYTHON_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+				print (sysconfig.get_python_lib(0,0));"`
+		fi
+	   fi
+	   AC_MSG_RESULT([$PYTHON_SITE_PKG])
+	   AC_SUBST([PYTHON_SITE_PKG])
+
+	   #
+	   # Check for platform-specific site packages
+	   #
+	   AC_MSG_CHECKING([for Python platform specific site-packages path])
+	   if test -z "$PYTHON_PLATFORM_SITE_PKG"; then
+		if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then
+			PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "
+$IMPORT_SYSCONFIG;
+if hasattr(sysconfig, 'get_default_scheme'):
+    scheme = sysconfig.get_default_scheme()
+else:
+    scheme = sysconfig._get_default_scheme()
+if scheme == 'posix_local':
+    # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/
+    scheme = 'posix_prefix'
+prefix = '$prefix'
+if prefix == 'NONE':
+    prefix = '$ac_default_prefix'
+sitedir = sysconfig.get_path('platlib', scheme, vars={'platbase': prefix})
+print(sitedir)"`
+		else
+			# distutils.sysconfig way
+			PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \
+				print (sysconfig.get_python_lib(1,0));"`
+		fi
+	   fi
+	   AC_MSG_RESULT([$PYTHON_PLATFORM_SITE_PKG])
+	   AC_SUBST([PYTHON_PLATFORM_SITE_PKG])
+
+	   #
+	   # libraries which must be linked in when embedding
+	   #
+	   AC_MSG_CHECKING(python extra libraries)
+	   if test -z "$PYTHON_EXTRA_LIBS"; then
+	      PYTHON_EXTRA_LIBS=`$PYTHON -c "$IMPORT_SYSCONFIG; \
                 conf = sysconfig.get_config_var; \
                 print (conf('LIBS') + ' ' + conf('SYSLIBS'))"`
-	fi
-	AC_MSG_RESULT([$PYTHON_EXTRA_LIBS])
-	AC_SUBST(PYTHON_EXTRA_LIBS)
+	   fi
+	   AC_MSG_RESULT([$PYTHON_EXTRA_LIBS])
+	   AC_SUBST(PYTHON_EXTRA_LIBS)
 
-	#
-	# linking flags needed when embedding
-	#
-	AC_MSG_CHECKING(python extra linking flags)
-	if test -z "$PYTHON_EXTRA_LDFLAGS"; then
-		PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "import sysconfig; \
+	   #
+	   # linking flags needed when embedding
+	   #
+	   AC_MSG_CHECKING(python extra linking flags)
+	   if test -z "$PYTHON_EXTRA_LDFLAGS"; then
+		PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "$IMPORT_SYSCONFIG; \
 			conf = sysconfig.get_config_var; \
 			print (conf('LINKFORSHARED'))"`
-	fi
-	AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS])
-	AC_SUBST(PYTHON_EXTRA_LDFLAGS)
+		# Hack for macos, it sticks this in here.
+		PYTHON_EXTRA_LDFLAGS=`echo $PYTHON_EXTRA_LDFLAGS | sed 's/CoreFoundation.*$/CoreFoundation/'`
+	   fi
+	   AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS])
+	   AC_SUBST(PYTHON_EXTRA_LDFLAGS)
 
-	#
-	# final check to see if everything compiles alright
-	#
-	AC_MSG_CHECKING([consistency of all components of python development environment])
-	# save current global flags
-	ac_save_LIBS="$LIBS"
-	ac_save_LDFLAGS="$LDFLAGS"
-	ac_save_CPPFLAGS="$CPPFLAGS"
-	LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS $PYTHON_EXTRA_LIBS"
-	LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS"
-	CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS"
-	AC_LANG_PUSH([C])
-	AC_LINK_IFELSE([
+	   #
+	   # final check to see if everything compiles alright
+	   #
+	   AC_MSG_CHECKING([consistency of all components of python development environment])
+	   # save current global flags
+	   ac_save_LIBS="$LIBS"
+	   ac_save_LDFLAGS="$LDFLAGS"
+	   ac_save_CPPFLAGS="$CPPFLAGS"
+	   LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS"
+	   LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS"
+	   CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS"
+	   AC_LANG_PUSH([C])
+	   AC_LINK_IFELSE([
 		AC_LANG_PROGRAM([[#include <Python.h>]],
 				[[Py_Initialize();]])
 		],[pythonexists=yes],[pythonexists=no])
-	AC_LANG_POP([C])
-	# turn back to default flags
-	CPPFLAGS="$ac_save_CPPFLAGS"
-	LIBS="$ac_save_LIBS"
-	LDFLAGS="$ac_save_LDFLAGS"
+	   AC_LANG_POP([C])
+	   # turn back to default flags
+	   CPPFLAGS="$ac_save_CPPFLAGS"
+	   LIBS="$ac_save_LIBS"
+	   LDFLAGS="$ac_save_LDFLAGS"
 
-	AC_MSG_RESULT([$pythonexists])
+	   AC_MSG_RESULT([$pythonexists])
 
-        if test ! "x$pythonexists" = "xyes"; then
-		m4_ifvaln([$2],[$2],[
-			AC_MSG_FAILURE([
+	   if test ! "x$pythonexists" = "xyes"; then
+	      AC_MSG_WARN([
   Could not link test program to Python. Maybe the main Python library has been
   installed in some non-standard library path. If so, pass it to configure,
   via the LIBS environment variable.
@@ -340,9 +453,13 @@ EOD`
    You probably have to install the development version of the Python package
    for your distribution.  The exact name of this package varies among them.
   ============================================================================
-	   ])
-			PYTHON_VERSION=""
-		])
+	      ])
+	      if ! $ax_python_devel_optional; then
+		 AC_MSG_ERROR([Giving up])
+	      fi
+	      ax_python_devel_found=no
+	      PYTHON_VERSION=""
+	   fi
 	fi
 
 	#
diff --git a/contrib/debian/control b/contrib/debian/control
index 98beb900d0fa..e56fbf0f1c93 100644
--- a/contrib/debian/control
+++ b/contrib/debian/control
@@ -189,7 +189,7 @@ Depends: dkms (>> 2.1.1.2-5),
          file,
          libc6-dev | libc-dev,
          lsb-release,
-         python3-distutils | libpython3-stdlib (<< 3.6.4),
+         python3 (>> 3.12) | python3-distutils | libpython3-stdlib (<< 3.6.4),
          ${misc:Depends},
          ${perl:Depends}
 Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends}

From 21bc066ece7fcf0f8250ba5dfe05fd7f507dca28 Mon Sep 17 00:00:00 2001
From: Alan Somers <asomers@FreeBSD.org>
Date: Thu, 25 Apr 2024 16:24:52 -0500
Subject: [PATCH 095/116] Fix updating the zvol_htable when renaming a zvol

When renaming a zvol, insert it into zvol_htable using the new name, not
the old name.  Otherwise some operations won't work.  For example,
"zfs set volsize" while the zvol is open.

Sponsored by:	Axcient
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alek Pinchuk <apinchuk@axcient.com>
Signed-off-by:	Alan Somers <asomers@FreeBSD.org>
Closes #16127
Closes #16128
---
 module/os/freebsd/zfs/zvol_os.c | 2 +-
 module/os/linux/zfs/zvol_os.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index 6a7c2d2811b1..712ff1b837d7 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -1259,7 +1259,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	/* Move to a new hashtable entry.  */
-	zv->zv_hash = zvol_name_hash(zv->zv_name);
+	zv->zv_hash = zvol_name_hash(newname);
 	hlist_del(&zv->zv_hlink);
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 4b960daf89ee..2a036dc5136b 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1571,7 +1571,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 
 	/* move to new hashtable entry  */
-	zv->zv_hash = zvol_name_hash(zv->zv_name);
+	zv->zv_hash = zvol_name_hash(newname);
 	hlist_del(&zv->zv_hlink);
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 

From 4840f023afae7c4932c903cf3a436c02c6704e20 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Mon, 29 Apr 2024 11:31:50 -0700
Subject: [PATCH 096/116] GCC: Fixes for gcc 14 on Fedora 40

- Workaround dangling pointer in uu_list.c (#16124)
- Fix calloc() transposed arguments in zpool_vdev_os.c
- Make some temp variables unsigned to prevent triggering a
  '-Werror=alloc-size-larger-than' error.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #16124
Closes #16125
---
 cmd/zpool/os/linux/zpool_vdev_os.c |  2 +-
 lib/libuutil/uu_list.c             | 14 ++++++++++----
 module/zfs/vdev_raidz.c            |  5 +++--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c
index 80627b58211c..f194d28c55a9 100644
--- a/cmd/zpool/os/linux/zpool_vdev_os.c
+++ b/cmd/zpool/os/linux/zpool_vdev_os.c
@@ -438,7 +438,7 @@ static char *zpool_sysfs_gets(char *path)
 		return (NULL);
 	}
 
-	buf = calloc(sizeof (*buf), statbuf.st_size + 1);
+	buf = calloc(statbuf.st_size + 1, sizeof (*buf));
 	if (buf == NULL) {
 		close(fd);
 		return (NULL);
diff --git a/lib/libuutil/uu_list.c b/lib/libuutil/uu_list.c
index 0ca6f05205e9..aa8b129cc22a 100644
--- a/lib/libuutil/uu_list.c
+++ b/lib/libuutil/uu_list.c
@@ -505,14 +505,20 @@ uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags)
 	}
 
 	if (lp->ul_debug || robust) {
-		uu_list_walk_t my_walk;
+		uu_list_walk_t *my_walk;
 		void *e;
 
-		list_walk_init(&my_walk, lp, flags);
+		my_walk = uu_zalloc(sizeof (*my_walk));
+		if (my_walk == NULL)
+			return (-1);
+
+		list_walk_init(my_walk, lp, flags);
 		while (status == UU_WALK_NEXT &&
-		    (e = uu_list_walk_next(&my_walk)) != NULL)
+		    (e = uu_list_walk_next(my_walk)) != NULL)
 			status = (*func)(e, private);
-		list_walk_fini(&my_walk);
+		list_walk_fini(my_walk);
+
+		uu_free(my_walk);
 	} else {
 		if (!reverse) {
 			for (np = lp->ul_null_node.uln_next;
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index b03331ec69c6..de7d0fa79478 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1891,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
 static void
 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
 {
-	int n, i, c, t, tt;
-	int nmissing_rows;
+	int i, c, t, tt;
+	unsigned int n;
+	unsigned int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;

From db499e68f9ef8d4b12ebdab699184e3acf35567c Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:32:49 -0400
Subject: [PATCH 097/116] Overflowing refreservation is bad

Someone came to me and pointed out that you could pretty
readily cause the refreservation calculation to exceed
2**64, given the 2**17 multiplier in it, and produce
refreservations wildly less than the actual volsize in cases where
it should have failed.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #15996
---
 lib/libzfs/libzfs_dataset.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 6f8773aed425..231bbbd92dbf 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -5565,8 +5565,21 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 		/*
 		 * Scale this size down as a ratio of 128k / tsize.
 		 * See theory statement above.
+		 *
+		 * Bitshift is to avoid the case of nblocks * asize < tsize
+		 * producing a size of 0.
+		 */
+		volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT);
+		/*
+		 * If we would blow UINT64_MAX with this next multiplication,
+		 * don't.
 		 */
-		volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
+		if (volsize >
+		    (UINT64_MAX / (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT)))
+			volsize = UINT64_MAX;
+		else
+			volsize *= (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
 		if (volsize > ret) {
 			ret = volsize;
 		}

From b28461b7c6511be571ee2f7d71c0d7be12aa4630 Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Tue, 30 Apr 2024 01:28:50 +0500
Subject: [PATCH 098/116] Fix arcstats for FreeBSD after zfetch support

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16141
---
 cmd/arcstat.in | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cmd/arcstat.in b/cmd/arcstat.in
index 220f343b5b62..c4f10a1d6d3b 100755
--- a/cmd/arcstat.in
+++ b/cmd/arcstat.in
@@ -200,6 +200,8 @@ if sys.platform.startswith('freebsd'):
 
         k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats')
              if ctl.type != sysctl.CTLTYPE_NODE]
+        k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats')
+             if ctl.type != sysctl.CTLTYPE_NODE]
 
         if not k:
             sys.exit(1)
@@ -211,8 +213,12 @@ if sys.platform.startswith('freebsd'):
                 continue
 
             name, value = s.name, s.value
-            # Trims 'kstat.zfs.misc.arcstats' from the name
-            kstat[name[24:]] = int(value)
+
+            if "arcstats" in name:
+                # Trims 'kstat.zfs.misc.arcstats' from the name
+                kstat[name[24:]] = int(value)
+            else:
+                kstat["zfetch_" + name[27:]] = int(value)
 
 elif sys.platform.startswith('linux'):
     def kstat_update():

From c3f2f1aa2dccd5528336d90a6dd2f2a5c97b6352 Mon Sep 17 00:00:00 2001
From: Don Brady <don.brady@delphix.com>
Date: Mon, 29 Apr 2024 15:35:53 -0600
Subject: [PATCH 099/116] vdev probe to slow disk can stall mmp write checker

Simplify vdev probes in the zio_vdev_io_done context to
avoid holding the spa config lock for a long duration.

Also allow zpool clear if no evidence of another host
is using the pool.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15839
---
 cmd/zpool/zpool_main.c                        |   2 +-
 include/sys/spa.h                             |   4 +-
 include/sys/uberblock_impl.h                  |  16 +--
 include/sys/vdev_impl.h                       |   2 +-
 man/man8/zpool-clear.8                        |   7 +-
 module/zfs/mmp.c                              |   5 +-
 module/zfs/spa.c                              | 102 ++++++++++++++----
 module/zfs/txg.c                              |   9 ++
 module/zfs/vdev.c                             |  22 ++--
 module/zfs/vdev_label.c                       |   4 +-
 module/zfs/zfs_ioctl.c                        |   9 +-
 module/zfs/zio.c                              |   6 +-
 module/zfs/zio_inject.c                       |   6 +-
 tests/runfiles/linux.run                      |   2 +-
 tests/zfs-tests/tests/Makefile.am             |   1 +
 .../functional/mmp/mmp_write_slow_disk.ksh    |  97 +++++++++++++++++
 16 files changed, 242 insertions(+), 52 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 636eb2a301cd..300b383af4f6 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -9050,7 +9050,7 @@ status_callback(zpool_handle_t *zhp, void *data)
 		printf_color(ANSI_BOLD, gettext("action: "));
 		printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
 		    " are connected, then reboot your system and\n\timport the "
-		    "pool.\n"));
+		    "pool or run 'zpool clear' to resume the pool.\n"));
 		break;
 
 	case ZPOOL_STATUS_IO_FAILURE_WAIT:
diff --git a/include/sys/spa.h b/include/sys/spa.h
index ca15025ba33c..001c221fb46f 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -770,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 #define	SPA_ASYNC_CONFIG_UPDATE			0x01
 #define	SPA_ASYNC_REMOVE			0x02
-#define	SPA_ASYNC_PROBE				0x04
+#define	SPA_ASYNC_FAULT_VDEV			0x04
 #define	SPA_ASYNC_RESILVER_DONE			0x08
 #define	SPA_ASYNC_RESILVER			0x10
 #define	SPA_ASYNC_AUTOEXPAND			0x20
@@ -1123,6 +1123,8 @@ extern uint32_t spa_get_hostid(spa_t *spa);
 extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
 extern boolean_t spa_livelist_delete_check(spa_t *spa);
 
+extern boolean_t spa_mmp_remote_host_activity(spa_t *spa);
+
 extern spa_mode_t spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 
diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h
index 1736b32cd3c6..e480a4bac0b9 100644
--- a/include/sys/uberblock_impl.h
+++ b/include/sys/uberblock_impl.h
@@ -50,20 +50,20 @@ extern "C" {
 #define	MMP_SEQ_VALID_BIT	0x02
 #define	MMP_FAIL_INT_VALID_BIT	0x04
 
-#define	MMP_VALID(ubp)		(ubp->ub_magic == UBERBLOCK_MAGIC && \
-				    ubp->ub_mmp_magic == MMP_MAGIC)
-#define	MMP_INTERVAL_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+#define	MMP_VALID(ubp)		((ubp)->ub_magic == UBERBLOCK_MAGIC && \
+				    (ubp)->ub_mmp_magic == MMP_MAGIC)
+#define	MMP_INTERVAL_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
 				    MMP_INTERVAL_VALID_BIT))
-#define	MMP_SEQ_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+#define	MMP_SEQ_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
 				    MMP_SEQ_VALID_BIT))
-#define	MMP_FAIL_INT_VALID(ubp)	(MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+#define	MMP_FAIL_INT_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
 				    MMP_FAIL_INT_VALID_BIT))
 
-#define	MMP_INTERVAL(ubp)	((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+#define	MMP_INTERVAL(ubp)	(((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
 				    >> 8)
-#define	MMP_SEQ(ubp)		((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+#define	MMP_SEQ(ubp)		(((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
 				    >> 32)
-#define	MMP_FAIL_INT(ubp)	((ubp->ub_mmp_config & 0xFFFF000000000000) \
+#define	MMP_FAIL_INT(ubp)	(((ubp)->ub_mmp_config & 0xFFFF000000000000) \
 				    >> 48)
 
 #define	MMP_INTERVAL_SET(write) \
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 95164c4546bb..57ff31e89eb9 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -273,7 +273,7 @@ struct vdev {
 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
-	boolean_t	vdev_probe_wanted; /* async probe wanted?	*/
+	boolean_t	vdev_fault_wanted; /* async faulted wanted?	*/
 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8
index c61ecae483ac..3e448be87fc2 100644
--- a/man/man8/zpool-clear.8
+++ b/man/man8/zpool-clear.8
@@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the
 devices can be accessed.
 Pools with
 .Sy multihost
-enabled which have been suspended cannot be resumed.
-While the pool was suspended, it may have been imported on
-another host, and resuming I/O could result in pool damage.
+enabled which have been suspended cannot be resumed when there is evidence
+that the pool was imported by another host.
+The same checks performed during an import will be applied before the clear
+proceeds.
 .Bl -tag -width Ds
 .It Fl -power
 Power on the devices's slot in the storage enclosure and wait for the device
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 66bc0ae60b10..71122542758d 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -664,12 +664,13 @@ mmp_thread(void *arg)
 		    (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
 			zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
 			    "mmp_last_write %llu mmp_interval %llu "
-			    "mmp_fail_intervals %llu mmp_fail_ns %llu",
+			    "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)mmp->mmp_last_write,
 			    (u_longlong_t)mmp_interval,
 			    (u_longlong_t)mmp_fail_intervals,
-			    (u_longlong_t)mmp_fail_ns);
+			    (u_longlong_t)mmp_fail_ns,
+			    (u_longlong_t)spa->spa_uberblock.ub_txg);
 			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
 			    "succeeded in over %llu ms; suspending pool. "
 			    "Hrtime %llu",
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 879147b097d0..147165ee8570 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -3594,11 +3594,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 }
 
 /*
- * Perform the import activity check.  If the user canceled the import or
- * we detected activity then fail.
+ * Remote host activity check.
+ *
+ * error results:
+ *          0 - no activity detected
+ *  EREMOTEIO - remote activity detected
+ *      EINTR - user canceled the operation
  */
 static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
+    boolean_t importing)
 {
 	uint64_t txg = ub->ub_txg;
 	uint64_t timestamp = ub->ub_timestamp;
@@ -3643,19 +3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 
 	import_expire = gethrtime() + import_delay;
 
-	spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
-	    "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+	if (importing) {
+		spa_import_progress_set_notes(spa, "Checking MMP activity, "
+		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+	}
 
-	int interations = 0;
+	int iterations = 0;
 	while ((now = gethrtime()) < import_expire) {
-		if (interations++ % 30 == 0) {
+		if (importing && iterations++ % 30 == 0) {
 			spa_import_progress_set_notes(spa, "Checking MMP "
 			    "activity, %llu ms remaining",
 			    (u_longlong_t)NSEC2MSEC(import_expire - now));
 		}
 
-		(void) spa_import_progress_set_mmp_check(spa_guid(spa),
-		    NSEC2SEC(import_expire - gethrtime()));
+		if (importing) {
+			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
+			    NSEC2SEC(import_expire - gethrtime()));
+		}
 
 		vdev_uberblock_load(rvd, ub, &mmp_label);
 
@@ -3737,6 +3746,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 	return (error);
 }
 
+/*
+ * Called from zfs_ioc_clear for a pool that was suspended
+ * after failing mmp write checks.
+ */
+boolean_t
+spa_mmp_remote_host_activity(spa_t *spa)
+{
+	ASSERT(spa_multihost(spa) && spa_suspended(spa));
+
+	nvlist_t *best_label;
+	uberblock_t best_ub;
+
+	/*
+	 * Locate the best uberblock on disk
+	 */
+	vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
+	if (best_label) {
+		/*
+		 * confirm that the best hostid matches our hostid
+		 */
+		if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
+		    spa_get_hostid(spa) !=
+		    fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
+			nvlist_free(best_label);
+			return (B_TRUE);
+		}
+		nvlist_free(best_label);
+	} else {
+		return (B_TRUE);
+	}
+
+	if (!MMP_VALID(&best_ub) ||
+	    !MMP_FAIL_INT_VALID(&best_ub) ||
+	    MMP_FAIL_INT(&best_ub) == 0) {
+		return (B_TRUE);
+	}
+
+	if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
+	    best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
+		zfs_dbgmsg("txg mismatch detected during pool clear "
+		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
+		    (u_longlong_t)spa->spa_uberblock.ub_txg,
+		    (u_longlong_t)best_ub.ub_txg,
+		    (u_longlong_t)spa->spa_uberblock.ub_timestamp,
+		    (u_longlong_t)best_ub.ub_timestamp);
+		return (B_TRUE);
+	}
+
+	/*
+	 * Perform an activity check looking for any remote writer
+	 */
+	return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
+	    B_FALSE) != 0);
+}
+
 static int
 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
 {
@@ -4063,7 +4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 		}
 
-		int error = spa_activity_check(spa, ub, spa->spa_config);
+		int error =
+		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
 		if (error) {
 			nvlist_free(label);
 			return (error);
@@ -8771,15 +8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
 }
 
 static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
 {
-	if (vd->vdev_probe_wanted) {
-		vd->vdev_probe_wanted = B_FALSE;
-		vdev_reopen(vd);	/* vdev_open() does the actual probe */
+	if (vd->vdev_fault_wanted) {
+		vd->vdev_fault_wanted = B_FALSE;
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+		    VDEV_AUX_ERR_EXCEEDED);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
-		spa_async_probe(spa, vd->vdev_child[c]);
+		spa_async_fault_vdev(spa, vd->vdev_child[c]);
 }
 
 static void
@@ -8867,11 +8933,11 @@ spa_async_thread(void *arg)
 	}
 
 	/*
-	 * See if any devices need to be probed.
+	 * See if any devices need to be marked faulted.
 	 */
-	if (tasks & SPA_ASYNC_PROBE) {
+	if (tasks & SPA_ASYNC_FAULT_VDEV) {
 		spa_vdev_state_enter(spa, SCL_NONE);
-		spa_async_probe(spa, spa->spa_root_vdev);
+		spa_async_fault_vdev(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index a67c043446f5..5ce6be69be14 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -550,6 +550,15 @@ txg_sync_thread(void *arg)
 			timer = (delta > timeout ? 0 : timeout - delta);
 		}
 
+		/*
+		 * When we're suspended, nothing should be changing and for
+		 * MMP we don't want to bump anything that would make it
+		 * harder to detect if another host is changing it when
+		 * resuming after a MMP suspend.
+		 */
+		if (spa_suspended(spa))
+			continue;
+
 		/*
 		 * Wait until the quiesce thread hands off a txg to us,
 		 * prompting it to do so if necessary.
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index d97d0a8100c2..c5551eb6cf6e 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
 typedef struct vdev_probe_stats {
 	boolean_t	vps_readable;
 	boolean_t	vps_writeable;
+	boolean_t	vps_zio_done_probe;
 	int		vps_flags;
 } vdev_probe_stats_t;
 
@@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, NULL, 0);
 			zio->io_error = SET_ERROR(ENXIO);
+
+			/*
+			 * If this probe was initiated from zio pipeline, then
+			 * change the state in a spa_async_request. Probes that
+			 * were initiated from a vdev_open can change the state
+			 * as part of the open call.
+			 */
+			if (vps->vps_zio_done_probe) {
+				vd->vdev_fault_wanted = B_TRUE;
+				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
+			}
 		}
 
 		mutex_enter(&vd->vdev_probe_lock);
@@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
+		vps->vps_zio_done_probe = (zio != NULL);
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
@@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
-
-		/*
-		 * We can't change the vdev state in this context, so we
-		 * kick off an async task to do it on our behalf.
-		 */
-		if (zio != NULL) {
-			vd->vdev_probe_wanted = B_TRUE;
-			spa_async_request(spa, SPA_ASYNC_PROBE);
-		}
 	}
 
 	if (zio != NULL)
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index c31f48028bbc..ed592514fded 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -2027,6 +2027,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 	/*
 	 * If this isn't a resync due to I/O errors,
 	 * and nothing changed in this transaction group,
+	 * and multihost protection isn't enabled,
 	 * and the vdev configuration hasn't changed,
 	 * then there's nothing to do.
 	 */
@@ -2034,7 +2035,8 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 		boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
 		    txg, spa->spa_mmp.mmp_delay);
 
-		if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+		if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
+		    !spa_multihost(spa))
 			return (0);
 	}
 
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 2ac1e34dccec..908b9efc1813 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
 
 	/*
 	 * If multihost is enabled, resuming I/O is unsafe as another
-	 * host may have imported the pool.
+	 * host may have imported the pool. Check for remote activity.
 	 */
-	if (spa_multihost(spa) && spa_suspended(spa))
-		return (SET_ERROR(EINVAL));
+	if (spa_multihost(spa) && spa_suspended(spa) &&
+	    spa_mmp_remote_host_activity(spa)) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(EREMOTEIO));
+	}
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 1ba99f4d4624..ce967a7cdc68 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 
-	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
-	    "failure and has been suspended.\n", spa_name(spa));
+	if (reason != ZIO_SUSPEND_MMP) {
+		cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
+		    "I/O failure and has been suspended.\n", spa_name(spa));
+	}
 
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
 	    NULL, NULL, 0);
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 3773e400d799..012a0e3c6c17 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio)
 		if (vd->vdev_guid != handler->zi_record.zi_guid)
 			continue;
 
+		/* also match on I/O type (e.g., -T read) */
 		if (handler->zi_record.zi_iotype != ZIO_TYPES &&
-		    handler->zi_record.zi_iotype != zio->io_type)
-				continue;
+		    handler->zi_record.zi_iotype != zio->io_type) {
+			continue;
+		}
 
 		/*
 		 * Defensive; should never happen as the array allocation
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index a0b74ef4a8c6..92ce09ec6fcb 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -146,7 +146,7 @@ tags = ['functional', 'mmap']
 tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
     'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
     'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
-    'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
+    'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
 tags = ['functional', 'mmp']
 
 [tests/functional/mount:Linux]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index a6fe030d410c..d625c040b819 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1593,6 +1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/mmp/mmp_on_zdb.ksh \
 	functional/mmp/mmp_reset_interval.ksh \
 	functional/mmp/mmp_write_distribution.ksh \
+	functional/mmp/mmp_write_slow_disk.ksh \
 	functional/mmp/mmp_write_uberblocks.ksh \
 	functional/mmp/multihost_history.ksh \
 	functional/mmp/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
new file mode 100755
index 000000000000..8b118684aa7f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh
@@ -0,0 +1,97 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc
+#
+
+# DESCRIPTION:
+#	Verify that long VDEV probes do not cause MMP checks to suspend pool
+#	Note: without PR-15839 fix, this test will suspend the pool.
+#
+#	A device that is returning unexpected errors will trigger a vdev_probe.
+#	When the device additionally has slow response times, the probe can hold
+#	the spa config lock as a writer for a long period of time such that the
+#	mmp uberblock updates stall when trying to acquire the spa config lock.
+#
+# STRATEGY:
+#	1. Create a pool with multiple leaf vdevs
+#	2. Enable multihost and multihost_history
+#	3. Delay for MMP writes to occur
+#	4. Verify that a long VDEV probe didn't cause MMP check to suspend pool
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/mmp/mmp.cfg
+. $STF_SUITE/tests/functional/mmp/mmp.kshlib
+
+verify_runnable "both"
+
+function cleanup
+{
+	log_must zinject -c all
+
+	if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
+		log_must zpool clear $MMP_POOL
+		zpool get state $MMP_POOL $MMP_DIR/file.3
+		zpool events | grep ".fs.zfs." | grep -v "history_event"
+	fi
+
+	poolexists $MMP_POOL && destroy_pool $MMP_POOL
+	log_must rm -r $MMP_DIR
+	log_must mmp_clear_hostid
+}
+
+log_assert "A long VDEV probe doesn't cause a MMP check suspend"
+log_onexit cleanup
+
+MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
+
+# Create a multiple drive pool
+log_must zpool events -c
+log_must mkdir -p $MMP_DIR
+log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
+log_must zpool create -f $MMP_POOL \
+	mirror $MMP_DIR/file.{0,1,2} \
+	mirror $MMP_DIR/file.{3,4,5}
+
+# Enable MMP
+log_must mmp_set_hostid $HOSTID1
+log_must zpool set multihost=on $MMP_POOL
+clear_mmp_history
+
+# Inject vdev write error along with a delay
+log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
+log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
+
+log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
+sleep 10
+sync_pool $MMP_POOL
+
+# Confirm mmp writes to the non-slow disks have taken place
+for x in {0,1,2,4}; do
+	write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
+	[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
+done
+
+# Expect that the pool was not suspended
+log_must check_state $MMP_POOL "" "ONLINE"
+health=$(zpool list -H -o health $MMP_POOL)
+log_note "$MMP_POOL health is $health"
+[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
+
+log_pass "A long VDEV probe doesn't cause a MMP check suspend"

From a6edc0adb293caf4e8bca2948af71b192b26bf58 Mon Sep 17 00:00:00 2001
From: Rob N <rob.norris@klarasystems.com>
Date: Tue, 30 Apr 2024 08:57:32 +1000
Subject: [PATCH 100/116] zio: try to execute TYPE_NULL ZIOs on the current
 task

Many TYPE_NULL ZIOs are used to provide a sync point for child ZIOs, and
do not do any actual work themselves. However, they are still dispatched
to a dedicated, single-thread taskq, which leads to their execution
being entirely task switch and dequeue overhead for no actual reason.

This commit changes it so that when selecting a parent ZIO to execute,
if the parent is TYPE_NULL and has no done function (that is, no
additional work), it is executed on the same thread. This reduces task
switches and frees up CPU cores for other work.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16134
---
 module/zfs/zio.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ce967a7cdc68..0e7993d87e87 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -803,9 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
 
 		/*
 		 * If we can tell the caller to execute this parent next, do
-		 * so. We only do this if the parent's zio type matches the
-		 * child's type. Otherwise dispatch the parent zio in its
-		 * own taskq.
+		 * so. We do this if the parent's zio type matches the child's
+		 * type, or if it's a zio_null() with no done callback, and so
+		 * has no actual work to do. Otherwise dispatch the parent zio
+		 * in its own taskq.
 		 *
 		 * Having the caller execute the parent when possible reduces
 		 * locking on the zio taskq's, reduces context switch
@@ -825,7 +826,8 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
 		 * of writes for spa_sync(), and the chain of ZIL blocks.
 		 */
 		if (next_to_executep != NULL && *next_to_executep == NULL &&
-		    pio->io_type == zio->io_type) {
+		    (pio->io_type == zio->io_type ||
+		    (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) {
 			*next_to_executep = pio;
 		} else {
 			zio_taskq_dispatch(pio, type, B_FALSE);

From 7ac00d3c26652892e01956af29d087362ab29410 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Tue, 30 Apr 2024 12:35:30 +1000
Subject: [PATCH 101/116] find_system_library: fix var cleanup when library not
 found

The "not found" path is attempting to clear SOMELIB_CFLAGS and
SOMELIB_LIBS by resetting them in AC_SUBST(). However, the second arg to
AC_SUBST is expanded in autoconf with `m4_ifvaln([$2], [[$1]=$2])`,
which is defined as "if the first arg is non-empty". The m4 "empty"
construction is [], therefore, the existing AC_SUBST calls never modify
the variables at all.

The effect of this is that leftovers from the library test can leak out.
At least, if a library header is found in the first stage, but the
library itself is not, -lsomelib is added to SOMELIB_LIBS and further
tests done. If that library is not found, SOMELIB_LIBS will not be
cleared.

For most of our library tests this hasn't been a problem, as they're
either always found properly via pkg-config or set directly, or the
calling test immediately aborts configure. For an optional dependency
however, an apparent "partial" result where the header is found but no
corresponding library causes link errors later.

I think a complete fix should probably not be setting SOMELIB_xxx until
the final result is known, but for now, adjusting the AC_SUBST calls to
explicitly set the empty shell string (which is not "empty" to m4) at
least restores the intent.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16140
---
 config/find_system_library.m4 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/find_system_library.m4 b/config/find_system_library.m4
index 310b44112aea..8b98bd67d2ee 100644
--- a/config/find_system_library.m4
+++ b/config/find_system_library.m4
@@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [
 	AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]])
 	$7
     ],[dnl ELSE
-	AC_SUBST([$1]_CFLAGS, [])
-	AC_SUBST([$1]_LIBS, [])
+	AC_SUBST([$1]_CFLAGS, [""])
+	AC_SUBST([$1]_LIBS, [""])
 	AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations])
 	$8
     ])

From 4429ad9276cea193bb29463a7d6c38367d0d78ce Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sun, 28 Apr 2024 11:03:11 +1000
Subject: [PATCH 102/116] libzpool: set thread names

Arrange for the thread/task name to be set when new threads are created.
This makes them visible in the process table etc.

pthread_setname_np() is generally available in glibc, musl and FreeBSD,
so no test is required.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16140
---
 include/sys/zfs_context.h | 8 ++++----
 lib/libzpool/kernel.c     | 5 ++++-
 lib/libzpool/taskq.c      | 4 ++--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 9ec2f73b366c..8f264b50e995 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -228,9 +228,9 @@ typedef pthread_t	kthread_t;
 
 #define	thread_create_named(name, stk, stksize, func, arg, len, \
     pp, state, pri)	\
-	zk_thread_create(func, arg, stksize, state)
+	zk_thread_create(name, func, arg, stksize, state)
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
-	zk_thread_create(func, arg, stksize, state)
+	zk_thread_create(#func, func, arg, stksize, state)
 #define	thread_exit()	pthread_exit(NULL)
 #define	thread_join(t)	pthread_join((pthread_t)(t), NULL)
 
@@ -246,8 +246,8 @@ extern struct proc p0;
 
 #define	PS_NONE		-1
 
-extern kthread_t *zk_thread_create(void (*func)(void *), void *arg,
-    size_t stksize, int state);
+extern kthread_t *zk_thread_create(const char *name, void (*func)(void *),
+    void *arg, size_t stksize, int state);
 
 #define	issig(why)	(FALSE)
 #define	ISSIG(thr, why)	(FALSE)
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index ffad7fc02bc9..a3930ee07f73 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -92,7 +92,8 @@ zk_thread_wrapper(void *arg)
 }
 
 kthread_t *
-zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state)
+zk_thread_create(const char *name, void (*func)(void *), void *arg,
+    size_t stksize, int state)
 {
 	pthread_attr_t attr;
 	pthread_t tid;
@@ -140,6 +141,8 @@ zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state)
 	VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw));
 	VERIFY0(pthread_attr_destroy(&attr));
 
+	pthread_setname_np(tid, name);
+
 	return ((void *)(uintptr_t)tid);
 }
 
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c
index 99a181ec3c93..5fb2283cf0b1 100644
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@@ -295,8 +295,8 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 	}
 
 	for (t = 0; t < nthreads; t++)
-		VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0,
-		    taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL);
+		VERIFY((tq->tq_threadlist[t] = thread_create_named(tq->tq_name,
+		    NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL);
 
 	return (tq);
 }

From 394800200e033f3a21dcbbf38a1e71b9d33b3b70 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sun, 21 Apr 2024 21:43:53 +1000
Subject: [PATCH 103/116] libspl/assert: show process/task details in assert
 output

Makes it much easier to see what thing complained.

Getting thread id, program name and thread name varies wildly between
Linux and FreeBSD, so those are set up in macros. pthread_getname_np()
did not appear in musl until very recently, but the same info has always
been available via prctl(PR_GET_NAME), so we use that instead.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16140
---
 config/user.m4      |  2 +-
 lib/libspl/assert.c | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/config/user.m4 b/config/user.m4
index 87df8c7ccabd..3a69086a9d9d 100644
--- a/config/user.m4
+++ b/config/user.m4
@@ -31,7 +31,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
 	ZFS_AC_CONFIG_USER_ZFSEXEC
 
-	AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy])
+	AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid])
 
 	AC_SUBST(RM)
 ])
diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c
index 9d44740d4e3c..185ec65cb894 100644
--- a/lib/libspl/assert.c
+++ b/lib/libspl/assert.c
@@ -22,9 +22,32 @@
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ */
 
 #include <assert.h>
 
+#if defined(__linux__)
+#include <errno.h>
+#include <sys/prctl.h>
+#ifdef HAVE_GETTID
+#define	libspl_gettid()		gettid()
+#else
+#include <sys/syscall.h>
+#define	libspl_gettid()		((pid_t)syscall(__NR_gettid))
+#endif
+#define	libspl_getprogname()	(program_invocation_short_name)
+#define	libspl_getthreadname(buf, len)	\
+	prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0)
+#elif defined(__FreeBSD__)
+#include <pthread_np.h>
+#define	libspl_gettid()		pthread_getthreadid_np()
+#define	libspl_getprogname()	getprogname()
+#define	libspl_getthreadname(buf, len)	\
+	pthread_getname_np(pthread_self(), buf, len);
+#endif
+
 static boolean_t libspl_assert_ok = B_FALSE;
 
 void
@@ -39,13 +62,22 @@ libspl_assertf(const char *file, const char *func, int line,
     const char *format, ...)
 {
 	va_list args;
+	char tname[64];
+
+	libspl_getthreadname(tname, sizeof (tname));
+
+	fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func);
 
 	va_start(args, format);
 	vfprintf(stderr, format, args);
-	fprintf(stderr, "\n");
-	fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
 	va_end(args);
 
+	fprintf(stderr, "\n"
+	    "  PID: %-8u  COMM: %s\n"
+	    "  TID: %-8u  NAME: %s\n",
+	    getpid(), libspl_getprogname(),
+	    libspl_gettid(), tname);
+
 #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__)
 	if (libspl_assert_ok) {
 		return;

From dec697ad683ecfdf9833455af0568ce4ddc7c885 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sun, 28 Apr 2024 12:49:58 +1000
Subject: [PATCH 104/116] libspl/assert: add lock around assertion output

If multiple threads trip an assertion at the same moment (quite common),
they can be printing at the same time, and their output gets messy.

This adds a simple lock around the whole thing, to prevent a second task
printing assert output before the first has finished.

Additionally, if libspl_assert_ok is not set, abort() is called without
dropping the lock, so that any other asserting tasks will be killed
before starting any output, rather than only getting part-way through.
This is a tradeoff; it's assumed that multiple threads asserting at the
same moment are likely the same fault in different instances of a
thread, and so there won't be any more useful information from the other
tasks anyway.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16140
---
 lib/libspl/assert.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c
index 185ec65cb894..d402462531b6 100644
--- a/lib/libspl/assert.c
+++ b/lib/libspl/assert.c
@@ -27,6 +27,7 @@
  */
 
 #include <assert.h>
+#include <pthread.h>
 
 #if defined(__linux__)
 #include <errno.h>
@@ -56,11 +57,15 @@ libspl_set_assert_ok(boolean_t val)
 	libspl_assert_ok = val;
 }
 
+static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER;
+
 /* printf version of libspl_assert */
 void
 libspl_assertf(const char *file, const char *func, int line,
     const char *format, ...)
 {
+	pthread_mutex_lock(&assert_lock);
+
 	va_list args;
 	char tname[64];
 
@@ -80,6 +85,7 @@ libspl_assertf(const char *file, const char *func, int line,
 
 #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__)
 	if (libspl_assert_ok) {
+		pthread_mutex_unlock(&assert_lock);
 		return;
 	}
 #endif

From 2152c405ba6ab0bc9fca482e9a0a968eb35699fb Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Sat, 27 Apr 2024 21:35:05 +1000
Subject: [PATCH 105/116] libspl/assert: dump backtrace in assert

Adds a check for the backtrace() function. If available, uses it to show
a stack backtrace in the assertion output.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16140
---
 config/user-backtrace.m4 | 14 ++++++++++++++
 config/user.m4           |  1 +
 lib/libspl/Makefile.am   |  2 ++
 lib/libspl/assert.c      | 20 ++++++++++++++++++++
 4 files changed, 37 insertions(+)
 create mode 100644 config/user-backtrace.m4

diff --git a/config/user-backtrace.m4 b/config/user-backtrace.m4
new file mode 100644
index 000000000000..25706767cdc3
--- /dev/null
+++ b/config/user-backtrace.m4
@@ -0,0 +1,14 @@
+dnl
+dnl backtrace(), for userspace assertions. glibc has this directly in libc.
+dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed
+dnl that this will also get the companion function backtrace_symbols().
+dnl
+AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [
+	AX_SAVE_FLAGS
+	LIBS=""
+	AC_SEARCH_LIBS([backtrace], [execinfo], [
+		AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available])
+		AC_SUBST([BACKTRACE_LIBS], ["$LIBS"])
+	])
+	AX_RESTORE_FLAGS
+])
diff --git a/config/user.m4 b/config/user.m4
index 3a69086a9d9d..8d11e031ba2e 100644
--- a/config/user.m4
+++ b/config/user.m4
@@ -26,6 +26,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 	ZFS_AC_CONFIG_USER_AIO_H
 	ZFS_AC_CONFIG_USER_CLOCK_GETTIME
 	ZFS_AC_CONFIG_USER_PAM
+	ZFS_AC_CONFIG_USER_BACKTRACE
 	ZFS_AC_CONFIG_USER_RUNSTATEDIR
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index 822bef7e7a8d..9f413b08c16f 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -43,3 +43,5 @@ libspl_la_LIBADD = \
 	libspl_assert.la
 
 libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
+
+libspl_assert_la_LIBADD = $(BACKTRACE_LIBS)
diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c
index d402462531b6..4acf687f4b23 100644
--- a/lib/libspl/assert.c
+++ b/lib/libspl/assert.c
@@ -49,6 +49,24 @@
 	pthread_getname_np(pthread_self(), buf, len);
 #endif
 
+#if defined(HAVE_BACKTRACE)
+#include <execinfo.h>
+
+static inline void
+libspl_dump_backtrace(void)
+{
+	void *btptrs[100];
+	size_t nptrs = backtrace(btptrs, 100);
+	char **bt = backtrace_symbols(btptrs, nptrs);
+	fprintf(stderr, "Call trace:\n");
+	for (size_t i = 0; i < nptrs; i++)
+		fprintf(stderr, "  %s\n", bt[i]);
+	free(bt);
+}
+#else
+#define	libspl_dump_backtrace()
+#endif
+
 static boolean_t libspl_assert_ok = B_FALSE;
 
 void
@@ -83,6 +101,8 @@ libspl_assertf(const char *file, const char *func, int line,
 	    getpid(), libspl_getprogname(),
 	    libspl_gettid(), tname);
 
+	libspl_dump_backtrace();
+
 #if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__)
 	if (libspl_assert_ok) {
 		pthread_mutex_unlock(&assert_lock);

From 051460b8b2bb78add2b7ed5255f7656a33be903a Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Tue, 30 Apr 2024 10:37:29 +1000
Subject: [PATCH 106/116] libspl/assert: use libunwind for backtrace when
 available

libunwind seems to do a better job of resolving symbols than
backtrace(), and is also useful on platforms that don't have backtrace()
(e.g. musl). If it's available, use it.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Sponsored-by: https://despairlabs.com/sponsor/
Closes #16140
---
 config/user-libunwind.m4 | 44 ++++++++++++++++++++++++++++++++++++++++
 config/user.m4           |  1 +
 lib/libspl/Makefile.am   |  4 ++--
 lib/libspl/assert.c      | 33 +++++++++++++++++++++++++++++-
 4 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 config/user-libunwind.m4

diff --git a/config/user-libunwind.m4 b/config/user-libunwind.m4
new file mode 100644
index 000000000000..99ba3dcf452d
--- /dev/null
+++ b/config/user-libunwind.m4
@@ -0,0 +1,44 @@
+dnl
+dnl Checks for libunwind, which usually does a better job than backtrace() when
+dnl resolving symbols in the stack backtrace. Newer versions have support for
+dnl getting info about the object file the function came from, so we look for
+dnl that too and use it if found.
+dnl
+AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [
+	AC_ARG_WITH([libunwind],
+	    AS_HELP_STRING([--with-libunwind],
+		[use libunwind for backtraces in userspace assertions]),
+	    [],
+	    [with_libunwind=auto])
+
+	AS_IF([test "x$with_libunwind" != "xno"], [
+		ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [
+			dnl unw_get_elf_filename() is sometimes a macro, other
+			dnl times a proper symbol, so we can't just do a link
+			dnl check; we need to include the header properly.
+			AX_SAVE_FLAGS
+			CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS"
+			LIBS="$LIBS $LIBUNWIND_LIBS"
+			AC_MSG_CHECKING([for unw_get_elf_filename in libunwind])
+			AC_LINK_IFELSE([
+				AC_LANG_PROGRAM([
+					#define UNW_LOCAL_ONLY
+					#include <libunwind.h>
+				], [
+					unw_get_elf_filename(0, 0, 0, 0);
+				])
+			], [
+				AC_MSG_RESULT([yes])
+				AC_DEFINE(HAVE_LIBUNWIND_ELF, 1,
+				    [libunwind has unw_get_elf_filename])
+			], [
+				AC_MSG_RESULT([no])
+			])
+			AX_RESTORE_FLAGS
+		], [
+			AS_IF([test "x$with_libunwind" = "xyes"], [
+				AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel])
+			])
+		])
+	])
+])
diff --git a/config/user.m4 b/config/user.m4
index 8d11e031ba2e..badd920d2b8a 100644
--- a/config/user.m4
+++ b/config/user.m4
@@ -27,6 +27,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 	ZFS_AC_CONFIG_USER_CLOCK_GETTIME
 	ZFS_AC_CONFIG_USER_PAM
 	ZFS_AC_CONFIG_USER_BACKTRACE
+	ZFS_AC_CONFIG_USER_LIBUNWIND
 	ZFS_AC_CONFIG_USER_RUNSTATEDIR
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index 9f413b08c16f..eb2377305aca 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -1,6 +1,6 @@
 include $(srcdir)/%D%/include/Makefile.am
 
-libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS)
+libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS)
 libspl_la_CFLAGS        = $(libspl_assert_la_CFLAGS)
 
 noinst_LTLIBRARIES += libspl_assert.la libspl.la
@@ -44,4 +44,4 @@ libspl_la_LIBADD = \
 
 libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
 
-libspl_assert_la_LIBADD = $(BACKTRACE_LIBS)
+libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c
index 4acf687f4b23..e6e3008f0aa6 100644
--- a/lib/libspl/assert.c
+++ b/lib/libspl/assert.c
@@ -49,7 +49,38 @@
 	pthread_getname_np(pthread_self(), buf, len);
 #endif
 
-#if defined(HAVE_BACKTRACE)
+#if defined(HAVE_LIBUNWIND)
+#define	UNW_LOCAL_ONLY
+#include <libunwind.h>
+
+static inline void
+libspl_dump_backtrace(void)
+{
+	unw_context_t uc;
+	unw_cursor_t cp;
+	unw_word_t ip, off;
+	char funcname[128];
+#ifdef HAVE_LIBUNWIND_ELF
+	char objname[128];
+	unw_word_t objoff;
+#endif
+
+	fprintf(stderr, "Call trace:\n");
+	unw_getcontext(&uc);
+	unw_init_local(&cp, &uc);
+	while (unw_step(&cp) > 0) {
+		unw_get_reg(&cp, UNW_REG_IP, &ip);
+		unw_get_proc_name(&cp, funcname, sizeof (funcname), &off);
+#ifdef HAVE_LIBUNWIND_ELF
+		unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff);
+		fprintf(stderr, "  [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n",
+		    ip, funcname, off, objname, objoff);
+#else
+		fprintf(stderr, "  [0x%08lx] %s+0x%2lx\n", ip, funcname, off);
+#endif
+	}
+}
+#elif defined(HAVE_BACKTRACE)
 #include <execinfo.h>
 
 static inline void

From 8fd3a5d02f3f6bad9e8e65b6aded694eae222bf2 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 1 May 2024 13:59:32 -0400
Subject: [PATCH 107/116] Slightly improve dnode hash

As I understand it, the dnode hash includes 8 bits of the objset
pointer, starting at bit 6, just to be less predictable.  But since
objset_t is more than 1KB in size, its allocations are likely aligned
to 2KB, which means the 11 lower bits provide no entropy.  Just take
the 8 bits starting from bit 11.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16131
---
 module/zfs/dmu_objset.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 2ba26f68e398..5ea99f742810 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj)
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 	/*
-	 * The low 6 bits of the pointer don't have much entropy, because
-	 * the objset_t is larger than 2^6 bytes long.
+	 * The lower 11 bits of the pointer don't have much entropy, because
+	 * the objset_t is more than 1KB long and so likely aligned to 2KB.
 	 */
-	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];

From 645b83307918085ab2f0e12618809e348635b34f Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 1 May 2024 14:07:20 -0400
Subject: [PATCH 108/116] Improve write issue taskqs utilization

- Reduce the number of allocators on small systems down to one per
4 CPU cores, keeping the maximum at 4 on 16+ core systems. Small
systems should not have the lock contention that multiple allocators
are supposed to solve, while having several metaslabs open and
modified each TXG is not free.
 - Reduce the number of write issue taskqs down to one per 16 CPU
cores and an integer fraction of the number of allocators.  On mid-
sized systems, where multiple allocators already make sense, too
many write issue taskqs may reduce write speed on single-file
workloads, since a single file is handled by only one taskq to
reduce fragmentation. On large systems, which can actually benefit
from the better IOPS of many taskqs, the bottleneck is less
important, since in the worst case there will be at least 16 cores
to handle it.
 - Distribute dnodes between allocators (and taskqs) in a round-
robin fashion instead of relying on sync taskqs to be balanced.
The latter is not guaranteed and may depend on scheduling.
 - Remove io_wr_iss_tq from struct zio.  io_allocator is enough.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16130
---
 include/sys/spa.h       |  2 +
 include/sys/spa_impl.h  |  9 ++++-
 include/sys/zio.h       |  3 --
 man/man4/zfs.4          | 25 ++++++++-----
 module/zfs/dmu_objset.c |  2 +
 module/zfs/spa.c        | 81 ++++++++++++++++++++++++++---------------
 module/zfs/spa_misc.c   | 22 +++++++++--
 module/zfs/zio.c        |  1 -
 8 files changed, 98 insertions(+), 47 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 001c221fb46f..3073c4d1b937 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -829,6 +829,8 @@ extern uint_t zfs_sync_pass_deferred_free;
 /* spa sync taskqueues */
 taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
 void spa_sync_tq_destroy(spa_t *spa);
+uint_t spa_acq_allocator(spa_t *spa);
+void spa_rel_allocator(spa_t *spa, uint_t allocator);
 void spa_select_allocator(zio_t *zio);
 
 /* spa namespace global mutex */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index d7da085ab313..a40914ec5fcb 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -63,6 +63,12 @@ typedef struct spa_alloc {
 	avl_tree_t	spaa_tree;
 } ____cacheline_aligned spa_alloc_t;
 
+typedef struct spa_allocs_use {
+	kmutex_t	sau_lock;
+	uint_t		sau_rotor;
+	boolean_t	sau_inuse[];
+} spa_allocs_use_t;
+
 typedef struct spa_error_entry {
 	zbookmark_phys_t	se_bookmark;
 	char			*se_name;
@@ -192,7 +198,7 @@ typedef struct spa_taskqs {
 /* one for each thread in the spa sync taskq */
 typedef struct spa_syncthread_info {
 	kthread_t	*sti_thread;
-	taskq_t		*sti_wr_iss_tq;		/* assigned wr_iss taskq */
+	uint_t		sti_allocator;
 } spa_syncthread_info_t;
 
 typedef enum spa_all_vdev_zap_action {
@@ -270,6 +276,7 @@ struct spa {
 	 * allocation performance in write-heavy workloads.
 	 */
 	spa_alloc_t	*spa_allocs;
+	spa_allocs_use_t *spa_allocs_use;
 	int		spa_alloc_count;
 	int		spa_active_allocator;	/* selectable allocator */
 
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4037b429982b..77c70b9b481c 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -528,9 +528,6 @@ struct zio {
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
-
-	/* write issue taskq selection, based upon sync thread */
-	taskq_t		*io_wr_iss_tq;
 };
 
 enum blk_verify_flag {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index ef0385d42b8e..5edd80659e08 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -525,10 +525,17 @@ most ZPL operations (e.g. write, create) will return
 .
 .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
 Determines the number of block alloctators to use per spa instance.
-Capped by the number of actual CPUs in the system.
+Capped by the number of actual CPUs in the system via
+.Sy spa_cpus_per_allocator .
 .Pp
 Note that setting this value too high could result in performance
 degredation and/or excess fragmentation.
+Set value only applies to pools imported/created after that.
+.
+.It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int
+Determines the minimum number of CPUs in a system for block alloctator
+per spa instance.
+Set value only applies to pools imported/created after that.
 .
 .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
 Limits the number of on-disk error log entries that will be converted to the
@@ -2339,21 +2346,19 @@ Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
 Number of worker threads per taskq.
-Lower values improve I/O ordering and CPU utilization,
-while higher reduces lock contention.
+Higher values improve I/O ordering and CPU utilization,
+while lower reduce lock contention.
+Set value only applies to pools imported/created after that.
 .Pp
 If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
 Set value only applies to pools imported/created after that.
 .
-.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
-Determines the number of CPUs to run write issue taskqs.
-.Pp
-When 0 (the default), the value to use is computed internally
-as the number of actual CPUs in the system divided by the
-.Sy spa_num_allocators
-value.
+.It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint
+Determines the minumum number of threads per write issue taskq.
+Higher values improve CPU utilization on high throughput,
+while lower reduce taskq locks contention on high IOPS.
 Set value only applies to pools imported/created after that.
 .
 .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 5ea99f742810..f1818ae155bd 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1664,12 +1664,14 @@ sync_dnodes_task(void *arg)
 	sync_objset_arg_t *soa = sda->sda_soa;
 	objset_t *os = soa->soa_os;
 
+	uint_t allocator = spa_acq_allocator(os->os_spa);
 	multilist_sublist_t *ms =
 	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 
 	dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
 	multilist_sublist_unlock(ms);
+	spa_rel_allocator(os->os_spa, allocator);
 
 	kmem_free(sda, sizeof (*sda));
 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 147165ee8570..ec2b674fb7ee 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
-static uint_t	zio_taskq_wr_iss_ncpus = 0;
+static uint_t	zio_taskq_write_tpq = 16;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 	case ZTI_MODE_SYNC:
 
 		/*
-		 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
-		 * not to exceed the number of spa allocators.
+		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+		 * not to exceed the number of spa allocators, and align to it.
 		 */
-		if (zio_taskq_wr_iss_ncpus == 0) {
-			count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
-		} else {
-			count = MAX(1,
-			    boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
-		}
+		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
+		while (spa->spa_alloc_count % count != 0 &&
+		    spa->spa_alloc_count < count * 2)
+			count--;
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
-	if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
-	    (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
-		/* dispatch to assigned write issue taskq */
-		tq = zio->io_wr_iss_tq;
-		return (tq);
-	}
-
 	if (tqs->stqs_count == 1) {
 		tq = tqs->stqs_taskq[0];
+	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+	    (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
 	} else {
 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 	}
@@ -10233,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(kthreads != NULL);
 
-	spa_taskqs_t *tqs =
-	    &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
-	for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+	for (int i = 0; i < nthreads; i++, ti++) {
 		ti->sti_thread = kthreads[i];
-		if (w == tqs->stqs_count) {
-			w = 0;
-		}
-		ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+		ti->sti_allocator = i;
 	}
 
 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10261,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
 	spa->spa_sync_tq = NULL;
 }
 
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+	int i;
+
+	if (spa->spa_alloc_count == 1)
+		return (0);
+
+	mutex_enter(&spa->spa_allocs_use->sau_lock);
+	uint_t r = spa->spa_allocs_use->sau_rotor;
+	do {
+		if (++r == spa->spa_alloc_count)
+			r = 0;
+	} while (spa->spa_allocs_use->sau_inuse[r]);
+	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+	spa->spa_allocs_use->sau_rotor = r;
+	mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+	spa_syncthread_info_t *ti = spa->spa_syncthreads;
+	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+		if (ti->sti_thread == curthread) {
+			ti->sti_allocator = r;
+			break;
+		}
+	}
+	ASSERT3S(i, <, spa->spa_alloc_count);
+	return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+	if (spa->spa_alloc_count > 1)
+		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
 void
 spa_select_allocator(zio_t *zio)
 {
@@ -10288,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
 		spa_syncthread_info_t *ti = spa->spa_syncthreads;
 		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
 			if (ti->sti_thread == curthread) {
-				zio->io_allocator = i;
-				zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+				zio->io_allocator = ti->sti_allocator;
 				return;
 			}
 		}
@@ -10306,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
 	    bm->zb_blkid >> 20);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
-	zio->io_wr_iss_tq = NULL;
 }
 
 /*
@@ -10919,5 +10942,5 @@ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
 #endif
 /* END CSTYLED */
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
-	"Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+	"Number of CPUs per write issue taskq");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 5fb7847b5d8b..e6d4a9bdb29c 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -394,6 +394,7 @@ static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
  * Number of allocators to use, per spa instance
  */
 static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
 
 /*
  * Spa active allocator.
@@ -747,8 +748,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	if (altroot)
 		spa->spa_root = spa_strdup(altroot);
 
-	/* Do not allow more allocators than CPUs. */
-	spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+	/* Do not allow more allocators than fraction of CPUs. */
+	spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+	    boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
 
 	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (spa_alloc_t), KM_SLEEP);
@@ -758,6 +760,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
+	if (spa->spa_alloc_count > 1) {
+		spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+		    sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+		mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
+	}
 
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -853,6 +861,11 @@ spa_remove(spa_t *spa)
 	}
 	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
 	    sizeof (spa_alloc_t));
+	if (spa->spa_alloc_count > 1) {
+		mutex_destroy(&spa->spa_allocs_use->sau_lock);
+		kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+		    sau_inuse[spa->spa_alloc_count]));
+	}
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -3097,4 +3110,7 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
 	param_get_uint, ZMOD_RW, "Reserved free space in pool");
 
 ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
-	"Number of allocators per spa, capped by ncpus");
+	"Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+	"Minimum number of CPUs per allocators");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0e7993d87e87..870343bf4fa3 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2925,7 +2925,6 @@ static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_allocator = pio->io_allocator;
-	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
 }
 
 static void

From 8f1b7a6fa6762ea4c89198ceb11c521f80b92ddc Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Fri, 3 May 2024 08:18:35 +1000
Subject: [PATCH 109/116] vdev_disk: disable flushes if device does not support
 it

If the underlying device doesn't have a write-back cache, the kernel
will just return a successful response. This doesn't hurt anything, but
it's extra work on the IO taskqs that is unnecessary. So, detect this
when we open the device for the first time.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16148
---
 include/os/linux/kernel/linux/blkdev_compat.h | 27 +++++++++++++++++++
 module/os/linux/zfs/vdev_disk.c               |  7 +++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index b0f398354e4f..658f546213de 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -94,6 +94,33 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
 #endif
 }
 
+/*
+ * Detect if a device has a write cache. Used to set the intial value for the
+ * vdev nowritecache flag.
+ *
+ * 4.10: QUEUE_FLAG_WC added. Initialised by the driver, but can be changed
+ *       later by the operator. If not set, kernel will return flush requests
+ *       immediately without doing anything.
+ * 6.6: QUEUE_FLAG_HW_WC added. Initialised by the driver, can't be changed.
+ *      Only controls if the operator is allowed to change _WC. Initial version
+ *      buggy; aliased to QUEUE_FLAG_FUA, so unuseable.
+ * 6.6.10, 6.7: QUEUE_FLAG_HW_WC fixed.
+ *
+ * Older than 4.10 we just assume write cache, and let the normal flush fail
+ * detection apply.
+ */
+static inline boolean_t
+zfs_bdev_has_write_cache(struct block_device *bdev)
+{
+#if defined(QUEUE_FLAG_HW_WC) && QUEUE_FLAG_HW_WC != QUEUE_FLAG_FUA
+	return (test_bit(QUEUE_FLAG_HW_WC, &bdev_get_queue(bdev)->queue_flags));
+#elif defined(QUEUE_FLAG_WC)
+	return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags));
+#else
+	return (B_TRUE);
+#endif
+}
+
 static inline void
 blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
 {
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 2cea61a6294c..463c5f705102 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -429,8 +429,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	/*  Determine the logical block size */
 	int logical_block_size = bdev_logical_block_size(bdev);
 
-	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
-	v->vdev_nowritecache = B_FALSE;
+	/*
+	 * If the device has a write cache, clear the nowritecache flag,
+	 * so that we start issuing flush requests again.
+	 */
+	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);
 
 	/* Set when device reports it supports TRIM. */
 	v->vdev_has_trim = bdev_discard_supported(bdev);

From 04bae5ec95f7273105237159a882d5b72ec2b998 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 3 May 2024 12:53:34 -0400
Subject: [PATCH 110/116] Disable high priority ZIO threads on FreeBSD and
 Linux

High priority threads are handling ZIL writes.  While there is no
ZIL compression, there is encryption, checksumming and RAIDZ math.
We've found that on large systems 1 taskq with 5 threads can be
a bottleneck for throughput, IOPS or both. Instead of just bumping
the number of threads with a risk of overloading CPUs and increasing
latency, switch to using the TQ_FRONT mechanism to increase the
priority of sync write requests within standard write threads.  Do
not do it on Illumos, since its TQ_FRONT implementation is inherently
unfair.  FreeBSD and Linux don't have this problem, so we can do it
there.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #16146
---
 man/man4/zfs.4   |  2 +-
 module/zfs/spa.c | 11 ++++++++---
 module/zfs/zio.c | 12 +++++++-----
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 5edd80659e08..6895a2a6d79f 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2367,7 +2367,7 @@ This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
 Set values only apply to pools imported/created after that.
 .
-.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp
+.It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp
 Set the queue and thread configuration for the IO write queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index ec2b674fb7ee..560fd67087b6 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -170,14 +170,19 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
- * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
- * need to be handled with minimum delay.
+ * and interrupt) and then to reserve threads for high priority I/Os that
+ * need to be handled with minimum delay.  Illumos taskq has unfair TQ_FRONT
+ * implementation, so separate high priority threads are used there.
  */
 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
+#ifdef illumos
 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
+#else
+	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* WRITE */
+#endif
 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
@@ -1217,7 +1222,7 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
  *
  * Example (the defaults for READ and WRITE)
  *   zio_taskq_read='fixed,1,8 null scale null'
- *   zio_taskq_write='sync fixed,1,5 scale fixed,1,5'
+ *   zio_taskq_write='sync null scale null'
  *
  * Each sets the entire row at a time.
  *
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 870343bf4fa3..65a0afaaa21c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2041,12 +2041,14 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 
 	/*
 	 * If this is a high priority I/O, then use the high priority taskq if
-	 * available.
+	 * available or cut the line otherwise.
 	 */
-	if ((zio->io_priority == ZIO_PRIORITY_NOW ||
-	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
-	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
-		q++;
+	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
+		if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
+			q++;
+		else
+			flags |= TQ_FRONT;
+	}
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 

From 2dff7527d4a40310f589045f5ab3a07b02963516 Mon Sep 17 00:00:00 2001
From: Daniel Perry <dtperry@amazon.com>
Date: Thu, 9 May 2024 10:30:28 -0400
Subject: [PATCH 111/116] Replace usage of schedule_timeout with
 schedule_timeout_interruptible (#16150)

This commit replaces current usages of schedule_timeout() with
schedule_timeout_interruptible() in code paths that expect the running
task to sleep for a short period of time. When schedule_timeout() is
called without previously calling set_current_state(), the running
task never sleeps because the task state remains in TASK_RUNNING.

By calling schedule_timeout_interruptible() to set the task state to
TASK_INTERRUPTIBLE before calling schedule_timeout() we achieve the
intended/desired behavior of putting the task to sleep for the
specified timeout.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Daniel Perry <dtperry@amazon.com>
Closes #16150
---
 module/os/linux/spl/spl-taskq.c | 2 +-
 module/os/linux/zfs/vdev_disk.c | 2 +-
 module/os/linux/zfs/zvol_os.c   | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index c384b7b378c3..e7b812c3b5b5 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -158,7 +158,7 @@ task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
 		 * throttling the task dispatch rate.
 		 */
 		spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
-		schedule_timeout(HZ / 100);
+		schedule_timeout_interruptible(HZ / 100);
 		spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
 		    tq->tq_lock_class);
 		if (count < 100) {
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 463c5f705102..7284b922b3bf 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -397,7 +397,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 			if (v->vdev_removed)
 				break;
 
-			schedule_timeout(MSEC_TO_TICK(10));
+			schedule_timeout_interruptible(MSEC_TO_TICK(10));
 		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
 			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
 			continue;
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 2a036dc5136b..3012423e9f2a 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -798,7 +798,8 @@ zvol_open(struct block_device *bdev, fmode_t flag)
 				if ((gethrtime() - start) > timeout)
 					return (SET_ERROR(-ERESTARTSYS));
 
-				schedule_timeout(MSEC_TO_TICK(10));
+				schedule_timeout_interruptible(
+					MSEC_TO_TICK(10));
 				goto retry;
 #endif
 			} else {

From a0f3c8aaf1e8c1196282e91cca603f877d7a618b Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Thu, 9 May 2024 19:31:57 +0500
Subject: [PATCH 112/116] zdb: add missing cleanup for early return

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Don Brady <don.brady@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #16152
---
 cmd/zdb/zdb.c | 78 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 25 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 449b6bf2ccb3..ce80c0aa5906 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -120,6 +120,9 @@ static int flagbits[256];
 static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
 static int leaked_objects = 0;
 static range_tree_t *mos_refd_objs;
+static spa_t *spa;
+static objset_t *os;
+static boolean_t kernel_init_done;
 
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
     boolean_t);
@@ -131,6 +134,7 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
 
 
 static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+static void zdb_exit(int reason);
 
 typedef struct sublivelist_verify_block_refcnt {
 	/* block pointer entry in livelist being verified */
@@ -818,7 +822,7 @@ usage(void)
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
-	exit(1);
+	zdb_exit(1);
 }
 
 static void
@@ -849,7 +853,7 @@ fatal(const char *fmt, ...)
 
 	dump_debug_buffer();
 
-	exit(1);
+	zdb_exit(1);
 }
 
 static void
@@ -2276,7 +2280,7 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
 		buf = malloc(SPA_MAXBLOCKSIZE);
 		if (buf == NULL) {
 			(void) fprintf(stderr, "out of memory\n");
-			exit(1);
+			zdb_exit(1);
 		}
 		decode_embedded_bp_compressed(bp, buf);
 		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
@@ -3231,6 +3235,23 @@ fuid_table_destroy(void)
 	}
 }
 
+static void
+zdb_exit(int reason)
+{
+	if (os != NULL) {
+		close_objset(os, FTAG);
+	} else if (spa != NULL) {
+		spa_close(spa, FTAG);
+	}
+
+	fuid_table_destroy();
+
+	if (kernel_init_done)
+		kernel_fini();
+
+	exit(reason);
+}
+
 /*
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
@@ -4161,32 +4182,32 @@ dump_cachefile(const char *cachefile)
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
-		exit(1);
+		zdb_exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
-		exit(1);
+		zdb_exit(1);
 	}
 
 	if ((buf = malloc(statbuf.st_size)) == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
-		exit(1);
+		zdb_exit(1);
 	}
 
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
-		exit(1);
+		zdb_exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
-		exit(1);
+		zdb_exit(1);
 	}
 
 	free(buf);
@@ -5102,14 +5123,14 @@ dump_label(const char *dev)
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
-		exit(1);
+		zdb_exit(1);
 	}
 
 	if (fstat64_blk(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		(void) close(fd);
-		exit(1);
+		zdb_exit(1);
 	}
 
 	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
@@ -8221,7 +8242,7 @@ dump_zpool(spa_t *spa)
 
 	if (rc != 0) {
 		dump_debug_buffer();
-		exit(rc);
+		zdb_exit(rc);
 	}
 }
 
@@ -8825,18 +8846,18 @@ zdb_embedded_block(char *thing)
 	    words + 12, words + 13, words + 14, words + 15);
 	if (err != 16) {
 		(void) fprintf(stderr, "invalid input format\n");
-		exit(1);
+		zdb_exit(1);
 	}
 	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
 	buf = malloc(SPA_MAXBLOCKSIZE);
 	if (buf == NULL) {
 		(void) fprintf(stderr, "out of memory\n");
-		exit(1);
+		zdb_exit(1);
 	}
 	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
 	if (err != 0) {
 		(void) fprintf(stderr, "decode failed: %u\n", err);
-		exit(1);
+		zdb_exit(1);
 	}
 	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
 	free(buf);
@@ -8863,8 +8884,6 @@ int
 main(int argc, char **argv)
 {
 	int c;
-	spa_t *spa = NULL;
-	objset_t *os = NULL;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
@@ -9093,6 +9112,7 @@ main(int argc, char **argv)
 	spa_mode_readable_spacemaps = B_TRUE;
 
 	kernel_init(SPA_MODE_READ);
+	kernel_init_done = B_TRUE;
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
@@ -9116,19 +9136,23 @@ main(int argc, char **argv)
 		if (argc != 1)
 			usage();
 		zdb_embedded_block(argv[0]);
-		return (0);
+		error = 0;
+		goto fini;
 	}
 
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
-			return (0);
+			error = 0;
+			goto fini;
 		}
 		usage();
 	}
 
-	if (dump_opt['l'])
-		return (dump_label(argv[0]));
+	if (dump_opt['l']) {
+		error = dump_label(argv[0]);
+		goto fini;
+	}
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
@@ -9183,7 +9207,8 @@ main(int argc, char **argv)
 		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
 		    dump_opt['N']) {
 			printf("Supply a numeric objset ID with -N\n");
-			exit(1);
+			error = 1;
+			goto fini;
 		}
 	} else {
 		target_pool = target;
@@ -9240,7 +9265,8 @@ main(int argc, char **argv)
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
-		return (dump_path(argv[0], argv[1], NULL));
+		error = dump_path(argv[0], argv[1], NULL);
+		goto fini;
 	}
 
 	if (dump_opt['r']) {
@@ -9328,7 +9354,7 @@ main(int argc, char **argv)
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
-			return (error);
+			goto fini;
 		} else {
 			target_pool = strdup(target);
 			if (strpbrk(target, "/@") != NULL)
@@ -9458,9 +9484,10 @@ main(int argc, char **argv)
 			free(checkpoint_target);
 	}
 
+fini:
 	if (os != NULL) {
 		close_objset(os, FTAG);
-	} else {
+	} else if (spa != NULL) {
 		spa_close(spa, FTAG);
 	}
 
@@ -9468,7 +9495,8 @@ main(int argc, char **argv)
 
 	dump_debug_buffer();
 
-	kernel_fini();
+	if (kernel_init_done)
+		kernel_fini();
 
 	return (error);
 }

From af5dbed3193eb91e1302e1b976606b64fb9c557b Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 9 May 2024 10:32:59 -0400
Subject: [PATCH 113/116] Fix scn_queue races on very old pools

Code for pools before version 11 uses dmu_objset_find_dp() to scan
for child datasets/clones.  It calls the enqueue_clones_cb() and
enqueue_cb() callbacks in parallel from multiple taskq threads.
As a result, concurrent scan_ds_queue_insert() calls race and
corrupt the scn_queue AVL tree.  Fix it by introducing a mutex to
protect those two scan_ds_queue_insert() calls.  All other calls are
made from the sync thread and so are serialized.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16162
---
 include/sys/dsl_scan.h | 1 +
 module/zfs/dsl_scan.c  | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index 2e3452e5ebaa..f32f59a2bedf 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -173,6 +173,7 @@ typedef struct dsl_scan {
 	dsl_scan_phys_t scn_phys;	/* on disk representation of scan */
 	dsl_scan_phys_t scn_phys_cached;
 	avl_tree_t scn_queue;		/* queue of datasets to scan */
+	kmutex_t scn_queue_lock;	/* serializes scn_queue inserts */
 	uint64_t scn_queues_pending;	/* outstanding data to issue */
 	/* members needed for syncing error scrub status to disk */
 	dsl_errorscrub_phys_t errorscrub_phys;
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 55e89b89f06a..085cfd3c5691 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
+	mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
 	    sizeof (scan_prefetch_issue_ctx_t),
 	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
@@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp)
 
 		scan_ds_queue_clear(scn);
 		avl_destroy(&scn->scn_queue);
+		mutex_destroy(&scn->scn_queue_lock);
 		scan_ds_prefetch_queue_clear(scn);
 		avl_destroy(&scn->scn_prefetch_queue);
 
@@ -2723,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 			return (err);
 		ds = prev;
 	}
+	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	mutex_exit(&scn->scn_queue_lock);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
@@ -2915,8 +2919,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 		ds = prev;
 	}
 
+	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, ds->ds_object,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
+	mutex_exit(&scn->scn_queue_lock);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }

From 3400127a75fda737bc59ae52f1f8ecedd6201117 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Thu, 9 May 2024 10:39:57 -0400
Subject: [PATCH 114/116] Fix ZIL clone records for legacy holes

The previous code overengineered the cloned range calculation by
using BP_GET_LSIZE().  The problem is that legacy holes don't have a
logical size, so the result will be wrong.  But we also don't need
to look at every block's size, since they all must be identical.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16165
---
 module/zfs/zfs_log.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 433a653e5500..fa4e7093ca46 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -895,7 +895,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
 	itx_t *itx;
 	lr_clone_range_t *lr;
 	uint64_t partlen, max_log_data;
-	size_t i, partnbps;
+	size_t partnbps;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
@@ -904,10 +904,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
 
 	while (nbps > 0) {
 		partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
-		partlen = 0;
-		for (i = 0; i < partnbps; i++) {
-			partlen += BP_GET_LSIZE(&bps[i]);
-		}
+		partlen = partnbps * blksz;
+		ASSERT3U(partlen, <, len + blksz);
 		partlen = MIN(partlen, len);
 
 		itx = zil_itx_create(txtype,

From 414acbd37e0a1121e93310e88956e30554ad1dae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= <mm@FreeBSD.org>
Date: Thu, 9 May 2024 16:42:51 +0200
Subject: [PATCH 115/116] Unbreak FreeBSD cross-build on MacOS broken in
 051460b8b

MacOS uses the FreeBSD-compatible getprogname() and
pthread_getname_np(), but pthread_getthreadid_np() does not exist on
MacOS.  This implements libspl_gettid() using pthread_threadid_np()
to get the thread id of the current thread.

Tested with FreeBSD GitHub actions
freebsd-src/.github/workflows/cross-bootstrap-tools.yml

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Martin Matuska <mm@FreeBSD.org>
Closes #16167
---
 lib/libspl/assert.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c
index e6e3008f0aa6..5b12c14acd6e 100644
--- a/lib/libspl/assert.c
+++ b/lib/libspl/assert.c
@@ -41,9 +41,11 @@
 #define	libspl_getprogname()	(program_invocation_short_name)
 #define	libspl_getthreadname(buf, len)	\
 	prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0)
-#elif defined(__FreeBSD__)
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+#if !defined(__APPLE__)
 #include <pthread_np.h>
 #define	libspl_gettid()		pthread_getthreadid_np()
+#endif
 #define	libspl_getprogname()	getprogname()
 #define	libspl_getthreadname(buf, len)	\
 	pthread_getname_np(pthread_self(), buf, len);
@@ -98,6 +100,19 @@ libspl_dump_backtrace(void)
 #define	libspl_dump_backtrace()
 #endif
 
+#if defined(__APPLE__)
+static inline uint64_t
+libspl_gettid(void)
+{
+	uint64_t tid;
+
+	if (pthread_threadid_np(NULL, &tid) != 0)
+		tid = 0;
+
+	return (tid);
+}
+#endif
+
 static boolean_t libspl_assert_ok = B_FALSE;
 
 void
@@ -128,7 +143,11 @@ libspl_assertf(const char *file, const char *func, int line,
 
 	fprintf(stderr, "\n"
 	    "  PID: %-8u  COMM: %s\n"
+#if defined(__APPLE__)
+	    "  TID: %-8" PRIu64 "  NAME: %s\n",
+#else
 	    "  TID: %-8u  NAME: %s\n",
+#endif
 	    getpid(), libspl_getprogname(),
 	    libspl_gettid(), tname);
 

From 1ede0c716beeee4a720ff5c361121021555d7e3c Mon Sep 17 00:00:00 2001
From: Rob N <robn@despairlabs.com>
Date: Fri, 10 May 2024 00:43:48 +1000
Subject: [PATCH 116/116] libspl_assert: always link -lpthread on FreeBSD

The pthread_* functions are in -lpthread on FreeBSD. Some of them are
implicitly linked through libc, but on FreeBSD 13 at least
pthread_getname_np() is not. Just be explicit, since -lpthread is the
documented interface anyway.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16168
---
 lib/libspl/Makefile.am | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index eb2377305aca..94be416d46aa 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -45,3 +45,7 @@ libspl_la_LIBADD = \
 libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
 
 libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
+
+if BUILD_FREEBSD
+libspl_assert_la_LIBADD += -lpthread
+endif