From b34bf2d5f6158c8ebeb47687a3fb7a2d3cc3aea7 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 29 Sep 2023 08:21:25 -0700 Subject: [PATCH 01/78] Tweak rebuild in-flight hard limit Vendor testing shows we should be able to get a little more performance if we further relax the hard limit which we're hitting. Authored-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Tony Hutter Closes #15324 --- module/zfs/vdev_rebuild.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 75c3900cbb0c..6503390f7973 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -807,12 +807,12 @@ vdev_rebuild_thread(void *arg) /* * Calculate the max number of in-flight bytes for top-level - * vdev scanning operations (minimum 1MB, maximum 1/4 of + * vdev scanning operations (minimum 1MB, maximum 1/2 of * arc_c_max shared by all top-level vdevs). Limits for the * issuing phase are done per top-level vdev and are handled * separately. */ - uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); + uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1); vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, zfs_rebuild_vdev_limit * vd->vdev_children)); From 3079bf2e6c0733b9bccac573324871f49dd6d503 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 29 Sep 2023 11:22:46 -0400 Subject: [PATCH 02/78] Restrict short block cloning requests If we are copying only one block and it is smaller than recordsize property, do not allow destination to grow beyond one block if it is not there yet. Otherwise the destination will get stuck with that block size forever, that can be as small as 512 bytes, no matter how big the destination grow later. Reviewed-by: Kay Pedersen Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15321 --- module/zfs/zfs_vnops.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a64e1e2dc83d..40d6c87a754e 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1206,6 +1206,19 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, goto unlock; } + /* + * If we are copying only one block and it is smaller than recordsize + * property, do not allow destination to grow beyond one block if it + * is not there yet. Otherwise the destination will get stuck with + * that block size forever, that can be as small as 512 bytes, no + * matter how big the destination grow later. + */ + if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && + outzp->z_size <= inblksz && outoff + len > inblksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } + error = zn_rlimit_fsize(outoff + len); if (error != 0) { goto unlock; From 608741d062fe4b92c8018fff96b24a9629c08bcb Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 3 Oct 2023 01:57:09 +0200 Subject: [PATCH 03/78] Report ashift of L2ARC devices in zdb Commit 8af1104f does not actually store the ashift of cache devices in their label. However, in order to facilitate reporting the ashift through zdb, we enable this in the present commit. We also document how the retrieval of the ashift is done. 
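As an illustration of the effect (not part of the change itself), the newly stored ashift can be checked by dumping the cache device's label with zdb; the device path below is only a placeholder:

```
# Dump the on-disk label of an L2ARC (cache) device and look for the
# ashift entry now stored alongside the pool state and guid.
zdb -l /dev/disk/by-id/nvme-EXAMPLE-part1 | grep -w ashift
```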
Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #15331 --- cmd/zdb/zdb.c | 2 +- module/zfs/vdev_label.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4b9921d47b81..005bf3f16590 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5179,7 +5179,7 @@ dump_label(const char *dev) if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; - /* If the device is a cache device clear the header. */ + /* If the device is a cache device read the header. */ if (!read_l2arc_header) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a5c76808f2d2..a2e5524a8391 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1138,6 +1138,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + + /* + * This is merely to facilitate reporting the ashift of the + * cache device through zdb. The actual retrieval of the + * ashift (in vdev_alloc()) uses the nvlist + * spa->spa_l2cache->sav_config (populated in + * spa_ld_open_aux_vdevs()). + */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); } else { uint64_t txg = 0ULL; From 0d870a17750feaeee9071c161d3248508a126043 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Mon, 2 Oct 2023 16:58:01 -0700 Subject: [PATCH 04/78] Fix invalid pointer access in trace_dbuf.h In dnode_destroy, dn_objset is invalidated. However, it will later call into dbuf_destroy, in which DTRACE_SET_STATE will try to access spa_name via dn_objset causing illegal pointer access. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Chunwei Chen Closes #15333 --- include/os/linux/zfs/sys/trace_dbuf.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_dbuf.h b/include/os/linux/zfs/sys/trace_dbuf.h index 11d25be35bc4..0f6a98b47d60 100644 --- a/include/os/linux/zfs/sys/trace_dbuf.h +++ b/include/os/linux/zfs/sys/trace_dbuf.h @@ -60,8 +60,12 @@ #define DBUF_TP_FAST_ASSIGN \ if (db != NULL) { \ - __assign_str(os_spa, \ - spa_name(DB_DNODE(db)->dn_objset->os_spa)); \ + if (POINTER_IS_VALID(DB_DNODE(db)->dn_objset)) { \ + __assign_str(os_spa, \ + spa_name(DB_DNODE(db)->dn_objset->os_spa)); \ + } else { \ + __assign_str(os_spa, "NULL"); \ + } \ \ __entry->ds_object = db->db_objset->os_dsl_dataset ? \ db->db_objset->os_dsl_dataset->ds_object : 0; \ From db5c3b4c767f5176fe30ed19e508e36a0ba270f8 Mon Sep 17 00:00:00 2001 From: Stoiko Ivanov Date: Wed, 20 Sep 2023 19:33:14 +0200 Subject: [PATCH 05/78] contrib: bash_completion.d: make install destination vendor dependent Certain Linux distributions (Debian/Ubuntu at least) expect bash-completion snippets to be installed in /usr/share/bash-completion/completions instead of /etc/bash_completion.d. This patch sets the bashcompletiondir variable based on the vendor, inspired by similar settings for initdir and initconfdir. 
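As a point of reference (this change hard-codes the per-vendor defaults rather than querying them), the directory a distribution's bash-completion package expects can usually be read from its pkg-config metadata, assuming pkg-config and bash-completion are installed:

```
# Prints the distro's preferred completions directory, e.g.
# /usr/share/bash-completion/completions on Debian and Ubuntu.
pkg-config --variable=completionsdir bash-completion
```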
It seems that commit 612b8dff5bc3d827efb864a199a62bda1a419254 caused the file to be installed in the first-place (thus the error when building debian packages only became apparent when testing a 2.2.0-rc4 build) The change only sets the variable in Makefile context - the rpm/zfs.spec.in file has the path hardcoded as %{_sysconfdir}/bash_completion.d/zfs, but since running ``` ./configure --sysconfdir=/myetc ; make rpm ``` also results in all relevant files to be installed in /etc instead of /myetc I assume this can remain as is. Reviewed-by: Umer Saleem Signed-off-by: Stoiko Ivanov Closes #15304 --- config/zfs-build.m4 | 11 +++++++++++ contrib/bash_completion.d/Makefile.am | 2 -- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 2703e6c016c4..5ea6aa29a3de 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -617,6 +617,17 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_RESULT([no]) fi AC_SUBST(RPM_DEFINE_INITRAMFS) + + AC_MSG_CHECKING([default bash completion directory]) + case "$VENDOR" in + ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; + debian) bashcompletiondir=/usr/share/bash-completion/completions ;; + freebsd) bashcompletiondir=$sysconfdir/bash_completion.d;; + *) bashcompletiondir=/etc/bash_completion.d ;; + esac + AC_MSG_RESULT([$bashcompletiondir]) + AC_SUBST(bashcompletiondir) + ]) dnl # diff --git a/contrib/bash_completion.d/Makefile.am b/contrib/bash_completion.d/Makefile.am index dc4b610c42b8..1ec05ed73d2d 100644 --- a/contrib/bash_completion.d/Makefile.am +++ b/contrib/bash_completion.d/Makefile.am @@ -1,5 +1,3 @@ -bashcompletiondir = $(sysconfdir)/bash_completion.d - nodist_bashcompletion_DATA = %D%/zfs SUBSTFILES += $(nodist_bashcompletion_DATA) From 7b1d421adfaf7149e0a5459f37f8e042802a9cbe Mon Sep 17 00:00:00 2001 From: Stoiko Ivanov Date: Thu, 21 Sep 2023 15:01:24 +0200 Subject: [PATCH 06/78] contrib: debian: switch to dh-sequence-dkms Follows b191f9a13d3005621ead9a727b811892264505ef from Debian's packaging team at: https://salsa.debian.org/zfsonlinux-team/zfs/ The previous build-dependency is kept as option, to still be able to build on older Debian based distros (e.g. Ubuntu 20.04). Without this building on Debian 12/bookworm does not work, as `dkms` is a virtual package. Reviewed-by: Umer Saleem Signed-off-by: Stoiko Ivanov Closes #15304 --- contrib/debian/control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/debian/control b/contrib/debian/control index b9bb23b09ba0..f4e97fe16145 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -4,7 +4,7 @@ Priority: optional Maintainer: ZFS on Linux specific mailing list Build-Depends: debhelper-compat (= 12), dh-python, - dkms (>> 2.1.1.2-5), + dh-sequence-dkms | dkms (>> 2.1.1.2-5), libaio-dev, libblkid-dev, libcurl4-openssl-dev, From b04b13ae79e58fe6f73249a225b3ddf9e348fc59 Mon Sep 17 00:00:00 2001 From: Stoiko Ivanov Date: Wed, 20 Sep 2023 10:25:37 +0200 Subject: [PATCH 07/78] contrib: debian: drop bashcompletion mangling after install tested by running: ``` ./configure --with-config=user; cp -a contrib/debian . dpkg-buildpackage -b -uc -us ``` on a Debian 12 based system. and checking where the completion file got installed. 
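One way to perform that check is to list the contents of the resulting utilities package; the exact .deb filename depends on the version being built:

```
# dpkg-buildpackage leaves the packages in the parent directory; the zfs
# completion snippet should appear under
# usr/share/bash-completion/completions/ rather than etc/bash_completion.d/.
dpkg-deb -c ../openzfs-zfsutils_*.deb | grep completion
```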
Reviewed-by: Umer Saleem Signed-off-by: Stoiko Ivanov Closes #15304 --- contrib/debian/openzfs-zfsutils.install | 1 - contrib/debian/rules.in | 5 ----- 2 files changed, 6 deletions(-) diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 301d8f67b3af..fa05401bc168 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -1,7 +1,6 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ -etc/bash_completion.d/zfs lib/systemd/system-generators/ lib/systemd/system-preset/ lib/systemd/system/zfs-import-cache.service diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index f0791cfabd38..a3a05efacb50 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -71,10 +71,6 @@ override_dh_auto_install: @# Install the utilities. $(MAKE) install DESTDIR='$(CURDIR)/debian/tmp' - # Use upstream's bash completion - install -D -t '$(CURDIR)/debian/tmp/usr/share/bash-completion/completions/' \ - '$(CURDIR)/contrib/bash_completion.d/zfs' - # Move from bin_dir to /usr/sbin # Remove suffix (.py) as per policy 10.4 - Scripts # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts @@ -136,7 +132,6 @@ override_dh_auto_install: chmod a-x '$(CURDIR)/debian/tmp/etc/zfs/zfs-functions' chmod a-x '$(CURDIR)/debian/tmp/etc/default/zfs' - chmod a-x '$(CURDIR)/debian/tmp/usr/share/bash-completion/completions/zfs' override_dh_python3: dh_python3 -p openzfs-python3-pyzfs From e9dc31c74e7b28a0cb2a321bc220074f6461d231 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 5 Sep 2023 13:27:53 +0500 Subject: [PATCH 08/78] Update the behavior of mountpoint property There are some inconsistencies in the handling of mountpoint property. This commit updates the behavior and makes it consistent. If mountpoint property is set when dataset is unmounted, this would update the mountpoint property. The mountpoint could be valid or invalid in this case. Setting the mountpoint property would result in success in this case. Dataset would still be unmounted here. On the other hand, if dataset is mounted and mountpoint property is updated to something invalid where mount cannot be successful, for example, setting the mountpoint inside a readonly directory. This would unmount the dataset, set the mountpoint property to requested value and tries to mount the dataset. The mount operation returns error and this error is treated as overall failure of setting the property while the property is actually set. To make the behavior consistent in case dataset is mounted or unmounted, we should try to mount the dataset whenever mountpoint property is updated. This would result in mounting the datasets if canmount property is set to on, regardless if the dataset was previously unmounted. The failure in mount operation while setting the mountpoint property should not be treated as failure, since the property is actually set now to user requested value. 
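A minimal illustration of the resulting behavior, using a hypothetical dataset name (this mirrors the updated tests below rather than introducing anything new):

```
# With canmount=on, updating mountpoint now mounts the dataset even if it
# was unmounted beforehand.
zfs unmount tank/fs
zfs set mountpoint=/mnt/new tank/fs
zfs get -H -o value mounted tank/fs       # yes

# If the new location cannot actually be mounted (e.g. a path inside a
# read-only directory), the property is still updated and the command
# succeeds; the dataset is simply left unmounted.
zfs set mountpoint=/readonly/dir tank/fs
zfs get -H -o value mountpoint tank/fs    # /readonly/dir
zfs get -H -o value mounted tank/fs       # no
```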
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15240 --- cmd/zfs/zfs_main.c | 7 ++++--- lib/libzfs/libzfs_changelist.c | 8 ++++---- .../cli_root/zfs_mount/zfs_mount_006_pos.ksh | 2 +- .../cli_root/zfs_mount/zfs_mount_008_pos.ksh | 4 ++-- .../cli_root/zfs_mount/zfs_mount_012_pos.ksh | 15 +++++++-------- .../cli_root/zfs_set/mountpoint_002_pos.ksh | 12 +++++++++--- .../cli_root/zfs_set/zfs_set_003_neg.ksh | 10 +++++++--- 7 files changed, 34 insertions(+), 24 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 62802de23e5f..8673ad8db986 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4203,8 +4203,9 @@ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; + int ret = zfs_prop_set_list(zhp, props); - if (zfs_prop_set_list(zhp, props) != 0) { + if (ret != 0 || libzfs_errno(g_zfs) != EZFS_SUCCESS) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " @@ -4213,11 +4214,11 @@ set_callback(zfs_handle_t *zhp, void *data) case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); + ret = 1; break; } - return (1); } - return (0); + return (ret); } static int diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index dd14c570ec03..4b0f66964346 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -244,13 +244,13 @@ changelist_postfix(prop_changelist_t *clp) zfs_is_mounted(cn->cn_handle, NULL); if (!mounted && !needs_key && (cn->cn_mounted || - ((sharenfs || sharesmb || clp->cl_waslegacy) && + (((clp->cl_prop == ZFS_PROP_MOUNTPOINT && + clp->cl_prop == clp->cl_realprop) || + sharenfs || sharesmb || clp->cl_waslegacy) && (zfs_prop_get_int(cn->cn_handle, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { - if (zfs_mount(cn->cn_handle, NULL, 0) != 0) - errors++; - else + if (zfs_mount(cn->cn_handle, NULL, 0) == 0) mounted = TRUE; } diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh index 2a2466f65c02..e9ab472795eb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh @@ -94,7 +94,7 @@ while (( depth < MAXDEPTH )); do done log_must zfs set mountpoint=$mtpt $TESTPOOL/$TESTFS -log_must zfs $mountcmd $TESTPOOL/$TESTFS +log_must ismounted $TESTPOOL/$TESTFS log_must zfs set overlay=off $TESTPOOL/$TESTFS if ! 
is_illumos; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh index 2c1029d551cf..0437c61a2c40 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh @@ -71,7 +71,7 @@ log_must mkfile 1M $testfile $testfile1 log_must zfs unmount $fs1 log_must zfs set mountpoint=$mntpnt $fs1 -log_must zfs mount $fs1 +log_must ismounted $fs1 log_must zfs unmount $fs1 log_must zfs mount -O $fs1 @@ -85,7 +85,7 @@ log_must ls $mntpnt/$TESTFILE1 $mntpnt/$TESTFILE2 # Verify $TESTFILE2 was created in $fs1, rather than $fs log_must zfs unmount $fs1 log_must zfs set mountpoint=$mntpnt1 $fs1 -log_must zfs mount $fs1 +log_must ismounted $fs1 log_must ls $testfile1 $mntpnt1/$TESTFILE2 # Verify $TESTFILE2 was not created in $fs, and $fs is accessible again. diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh index 5ff094d2c479..66958f2f0884 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh @@ -25,13 +25,12 @@ # STRATEGY: # 1. Unmount the dataset # 2. Create a new empty directory -# 3. Set the dataset's mountpoint -# 4. Attempt to mount the dataset -# 5. Verify the mount succeeds -# 6. Unmount the dataset -# 7. Create a file in the directory created in step 2 -# 8. Attempt to mount the dataset -# 9. Verify the mount succeeds +# 3. Set the dataset's mountpoint, this should mount the dataset +# 4. Verify the mount succeeds +# 5. Unmount the dataset +# 6. Create a file in the directory created in step 2 +# 7. Attempt to mount the dataset +# 8. Verify the mount succeeds # verify_runnable "both" @@ -43,7 +42,7 @@ fs=$TESTPOOL/$TESTFS log_must zfs umount $fs log_must mkdir -p $TESTDIR log_must zfs set mountpoint=$TESTDIR $fs -log_must zfs mount $fs +log_must ismounted $fs log_must zfs umount $fs log_must touch $TESTDIR/testfile.$$ log_must zfs mount $fs diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh index a5785226e02e..c227a6fb8aa8 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh @@ -35,7 +35,9 @@ # # DESCRIPTION: # If ZFS is currently managing the file system but it is currently unmounted, -# and the mountpoint property is changed, the file system remains unmounted. +# and the mountpoint property is changed, the file system should be mounted +# if it is a valid mountpoint and canmount allows to mount, otherwise it +# should not be mounted. # # STRATEGY: # 1. Setup a pool and create fs, ctr within it. @@ -62,7 +64,7 @@ function cleanup } log_assert "Setting a valid mountpoint for an unmounted file system, \ - it remains unmounted." + it gets mounted." 
log_onexit cleanup old_fs_mpt=$(get_prop mountpoint $TESTPOOL/$TESTFS) @@ -83,7 +85,11 @@ while (( i < ${#dataset[@]} )); do while (( j < ${#values[@]} )); do set_n_check_prop "${values[j]}" "mountpoint" \ "${dataset[i]}" - log_mustnot ismounted ${dataset[i]} + if [ "${dataset[i]}" = "$TESTPOOL/$TESTFS" ]; then + log_must ismounted ${dataset[i]} + else + log_mustnot ismounted ${dataset[i]} + fi (( j += 1 )) done cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh index 3afb0eb7010e..5901ba7dc461 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh @@ -33,7 +33,9 @@ # # DESCRIPTION: -# 'zfs set mountpoint/sharenfs' should fail when the mountpoint is invalid +# 'zfs set mountpoint/sharenfs' should set the property when mountpoint +# is invalid. Setting the property should be successful, but dataset +# should not be mounted, as mountpoint is invalid. # # STRATEGY: # 1. Create invalid scenarios @@ -62,10 +64,12 @@ longpath=$(gen_dataset_name 1030 "abcdefg") log_must zfs create -o mountpoint=legacy $TESTPOOL/foo # Do the negative testing about "property may be set but unable to remount filesystem" -log_mustnot eval "zfs set mountpoint=$badpath $TESTPOOL/foo >/dev/null 2>&1" +set_n_check_prop "$badpath" "mountpoint" "$TESTPOOL/foo" +log_mustnot ismounted $TESTPOOL/foo # Do the negative testing about "property may be set but unable to reshare filesystem" -log_mustnot eval "zfs set sharenfs=on $TESTPOOL/foo >/dev/null 2>&1" +set_n_check_prop "on" "sharenfs" "$TESTPOOL/foo" +log_mustnot ismounted $TESTPOOL/foo # Do the negative testing about "sharenfs property can not be set to null" log_mustnot eval "zfs set sharenfs= $TESTPOOL/foo >/dev/null 2>&1" From c53bc3837cb67a36b53ee7b9ae02903dc7b86fdb Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 5 Sep 2023 13:33:58 +0500 Subject: [PATCH 09/78] Improve the handling of sharesmb,sharenfs properties For sharesmb and sharenfs properties, the status of setting the property is tied with whether we succeed to share the dataset or not. In case sharing the dataset is not successful, this is treated as overall failure of setting the property. In this case, if we check the property after the failure, it is set to on. This commit updates this behavior and the status of setting the share properties is not returned as failure, when we fail to share the dataset. For sharenfs property, if access list is provided, the syntax errors in access list/host adresses are not validated until after setting the property during postfix phase while trying to share the dataset. This is not correct, since the property has already been set when we reach there. Syntax errors in access list/host addresses are validated while validating the property list, before setting the property and failure is returned to user in this case when there are errors in access list. 
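An illustrative consequence of the earlier validation, with a hypothetical dataset name and the exportfs-style option syntax used on Linux:

```
# Host access lists are checked while the property list is validated, so a
# malformed address list now fails the `zfs set` itself instead of being
# discovered later while resharing.
zfs set sharenfs='rw=@192.168.1.0/24:[fe80::1]' tank/export   # accepted
zfs set sharenfs='rw=[fe80::1' tank/export                    # rejected: unterminated bracket
zfs set sharenfs='' tank/export                               # rejected: empty option string
```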
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15240 --- cmd/zfs/zfs_main.c | 1 - lib/libshare/os/freebsd/nfs.c | 3 ++- lib/libshare/os/linux/nfs.c | 47 +++++++++++++++++++++++++++++++--- lib/libzfs/libzfs_changelist.c | 11 ++++---- 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 8673ad8db986..c344ca8eccf8 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4214,7 +4214,6 @@ set_callback(zfs_handle_t *zhp, void *data) case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); - ret = 1; break; } } diff --git a/lib/libshare/os/freebsd/nfs.c b/lib/libshare/os/freebsd/nfs.c index 521631c51f07..d9fc66106369 100644 --- a/lib/libshare/os/freebsd/nfs.c +++ b/lib/libshare/os/freebsd/nfs.c @@ -161,7 +161,8 @@ nfs_is_shared(sa_share_impl_t impl_share) static int nfs_validate_shareopts(const char *shareopts) { - (void) shareopts; + if (strlen(shareopts) == 0) + return (SA_SYNTAX_ERR); return (SA_OK); } diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c index c27e5564c1e1..004946b0cfe4 100644 --- a/lib/libshare/os/linux/nfs.c +++ b/lib/libshare/os/linux/nfs.c @@ -319,12 +319,49 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) "wdelay" }; char **plinux_opts = (char **)cookie; + char *host, *val_dup, *literal, *next; - /* host-specific options, these are taken care of elsewhere */ - if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0 || - strcmp(key, "sec") == 0) + if (strcmp(key, "sec") == 0) return (SA_OK); + if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0) { + if (value == NULL || strlen(value) == 0) + return (SA_OK); + val_dup = strdup(value); + host = val_dup; + if (host == NULL) + return (SA_NO_MEMORY); + do { + if (*host == '[') { + host++; + literal = strchr(host, ']'); + if (literal == NULL) { + free(val_dup); + return (SA_SYNTAX_ERR); + } + if (literal[1] == '\0') + next = NULL; + else if (literal[1] == '/') { + next = strchr(literal + 2, ':'); + if (next != NULL) + ++next; + } else if (literal[1] == ':') + next = literal + 2; + else { + free(val_dup); + return (SA_SYNTAX_ERR); + } + } else { + next = strchr(host, ':'); + if (next != NULL) + ++next; + } + host = next; + } while (host != NULL); + free(val_dup); + return (SA_OK); + } + if (strcmp(key, "anon") == 0) key = "anonuid"; @@ -472,6 +509,10 @@ static int nfs_validate_shareopts(const char *shareopts) { char *linux_opts = NULL; + + if (strlen(shareopts) == 0) + return (SA_SYNTAX_ERR); + int error = get_linux_shareopts(shareopts, &linux_opts); if (error != SA_OK) return (error); diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 4b0f66964346..efe1c0c06035 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -174,7 +174,6 @@ changelist_postfix(prop_changelist_t *clp) prop_changenode_t *cn; uu_avl_walk_t *walk; char shareopts[ZFS_MAXPROPLEN]; - int errors = 0; boolean_t commit_smb_shares = B_FALSE; boolean_t commit_nfs_shares = B_FALSE; @@ -262,19 +261,19 @@ changelist_postfix(prop_changelist_t *clp) const enum sa_protocol nfs[] = {SA_PROTOCOL_NFS, SA_NO_PROTOCOL}; if (sharenfs && mounted) { - errors += zfs_share(cn->cn_handle, nfs); + zfs_share(cn->cn_handle, nfs); commit_nfs_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { - errors += zfs_unshare(cn->cn_handle, NULL, nfs); + zfs_unshare(cn->cn_handle, NULL, 
nfs); commit_nfs_shares = B_TRUE; } const enum sa_protocol smb[] = {SA_PROTOCOL_SMB, SA_NO_PROTOCOL}; if (sharesmb && mounted) { - errors += zfs_share(cn->cn_handle, smb); + zfs_share(cn->cn_handle, smb); commit_smb_shares = B_TRUE; } else if (cn->cn_shared || clp->cl_waslegacy) { - errors += zfs_unshare(cn->cn_handle, NULL, smb); + zfs_unshare(cn->cn_handle, NULL, smb); commit_smb_shares = B_TRUE; } } @@ -288,7 +287,7 @@ changelist_postfix(prop_changelist_t *clp) zfs_commit_shares(proto); uu_avl_walk_end(walk); - return (errors ? -1 : 0); + return (0); } /* From 8015e2ea66b4f6233877fef29a8a35594f33558d Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 3 Oct 2023 04:58:54 +0500 Subject: [PATCH 10/78] Add '-u' - nomount flag for zfs set This commit adds '-u' flag for zfs set operation. With this flag, mountpoint, sharenfs and sharesmb properties can be updated without actually mounting or sharing the dataset. Previously, if dataset was unmounted, and mountpoint property was updated, dataset was not mounted after the update. This behavior is changed in #15240. We mount the dataset whenever mountpoint property is updated, regardless if it's mounted or not. To provide the user with option to keep the dataset unmounted and still update the mountpoint without mounting the dataset, '-u' flag can be used. If any of mountpoint, sharenfs or sharesmb properties are updated with '-u' flag, the property is set to desired value but the operation to (re/un)mount and/or (re/un)share the dataset is not performed and dataset remains as it was before. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #15322 --- cmd/zfs/zfs_main.c | 48 ++++---- include/libzfs.h | 8 ++ lib/libzfs/libzfs.abi | 7 ++ lib/libzfs/libzfs_changelist.c | 27 +++-- lib/libzfs/libzfs_dataset.c | 18 ++- man/man7/zfsprops.7 | 26 ++++- man/man8/zfs-set.8 | 7 ++ tests/runfiles/common.run | 2 +- tests/runfiles/sanity.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zfs_set/zfs_set_nomount.ksh | 103 ++++++++++++++++++ 11 files changed, 216 insertions(+), 33 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_nomount.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index c344ca8eccf8..5644869cf336 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -339,7 +339,7 @@ get_usage(zfs_help_t idx) "\tsend [-nVvPe] -t \n" "\tsend [-PnVv] --saved filesystem\n")); case HELP_SET: - return (gettext("\tset ... " + return (gettext("\tset [-u] ... 
" " ...\n")); case HELP_SHARE: return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); @@ -4202,8 +4202,8 @@ zfs_do_rollback(int argc, char **argv) static int set_callback(zfs_handle_t *zhp, void *data) { - nvlist_t *props = data; - int ret = zfs_prop_set_list(zhp, props); + zprop_set_cbdata_t *cb = data; + int ret = zfs_prop_set_list_flags(zhp, cb->cb_proplist, cb->cb_flags); if (ret != 0 || libzfs_errno(g_zfs) != EZFS_SUCCESS) { switch (libzfs_errno(g_zfs)) { @@ -4223,25 +4223,35 @@ set_callback(zfs_handle_t *zhp, void *data) static int zfs_do_set(int argc, char **argv) { - nvlist_t *props = NULL; + zprop_set_cbdata_t cb = { 0 }; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; - int i; + int i, c; - /* check for options */ - if (argc > 1 && argv[1][0] == '-') { - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - argv[1][1]); - usage(B_FALSE); + /* check options */ + while ((c = getopt(argc, argv, "u")) != -1) { + switch (c) { + case 'u': + cb.cb_flags |= ZFS_SET_NOMOUNT; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } } + argc -= optind; + argv += optind; + /* check number of arguments */ - if (argc < 2) { + if (argc < 1) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } - if (argc < 3) { - if (strchr(argv[1], '=') == NULL) { + if (argc < 2) { + if (strchr(argv[0], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { @@ -4252,7 +4262,7 @@ zfs_do_set(int argc, char **argv) } /* validate argument order: prop=val args followed by dataset args */ - for (i = 1; i < argc; i++) { + for (i = 0; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ @@ -4270,20 +4280,20 @@ zfs_do_set(int argc, char **argv) } /* Populate a list of property settings */ - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + if (nvlist_alloc(&cb.cb_proplist, NV_UNIQUE_NAME, 0) != 0) nomem(); - for (i = 1; i < ds_start; i++) { - if (!parseprop(props, argv[i])) { + for (i = 0; i < ds_start; i++) { + if (!parseprop(cb.cb_proplist, argv[i])) { ret = -1; goto error; } } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, - ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); + ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb); error: - nvlist_free(props); + nvlist_free(cb.cb_proplist); return (ret); } diff --git a/include/libzfs.h b/include/libzfs.h index fa05b7921bb5..6c3669273786 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -523,6 +523,7 @@ _LIBZFS_H nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, _LIBZFS_H const char *zfs_prop_to_name(zfs_prop_t); _LIBZFS_H int zfs_prop_set(zfs_handle_t *, const char *, const char *); _LIBZFS_H int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); +_LIBZFS_H int zfs_prop_set_list_flags(zfs_handle_t *, nvlist_t *, int); _LIBZFS_H int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, @@ -645,6 +646,13 @@ typedef struct zprop_get_cbdata { vdev_cbdata_t cb_vdevs; } zprop_get_cbdata_t; +#define ZFS_SET_NOMOUNT 1 + +typedef struct zprop_set_cbdata { + int cb_flags; + nvlist_t *cb_proplist; +} zprop_set_cbdata_t; + _LIBZFS_H void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); diff --git a/lib/libzfs/libzfs.abi 
b/lib/libzfs/libzfs.abi index 6e53bcb41a87..8658d39e28fc 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -396,6 +396,7 @@ + @@ -4424,6 +4425,12 @@ + + + + + + diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index efe1c0c06035..4db1cbce9568 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -105,6 +105,15 @@ changelist_prefix(prop_changelist_t *clp) clp->cl_prop != ZFS_PROP_SHARESMB) return (0); + /* + * If CL_GATHER_DONT_UNMOUNT is set, don't want to unmount/unshare and + * later (re)mount/(re)share the filesystem in postfix phase, so we + * return from here. If filesystem is mounted or unmounted, leave it + * as it is. + */ + if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) + return (0); + if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) return (-1); @@ -129,8 +138,6 @@ changelist_prefix(prop_changelist_t *clp) */ switch (clp->cl_prop) { case ZFS_PROP_MOUNTPOINT: - if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) - break; if (zfs_unmount(cn->cn_handle, NULL, clp->cl_mflags) != 0) { ret = -1; @@ -164,9 +171,8 @@ changelist_prefix(prop_changelist_t *clp) * reshare the filesystems as necessary. In changelist_gather() we recorded * whether the filesystem was previously shared or mounted. The action we take * depends on the previous state, and whether the value was previously 'legacy'. - * For non-legacy properties, we only remount/reshare the filesystem if it was - * previously mounted/shared. Otherwise, we always remount/reshare the - * filesystem. + * For non-legacy properties, we always remount/reshare the filesystem, + * if CL_GATHER_DONT_UNMOUNT is not set. */ int changelist_postfix(prop_changelist_t *clp) @@ -177,6 +183,14 @@ changelist_postfix(prop_changelist_t *clp) boolean_t commit_smb_shares = B_FALSE; boolean_t commit_nfs_shares = B_FALSE; + /* + * If CL_GATHER_DONT_UNMOUNT is set, it means we don't want to (un)mount + * or (re/un)share the filesystem, so we return from here. If filesystem + * is mounted or unmounted, leave it as it is. + */ + if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) + return (0); + /* * If we're changing the mountpoint, attempt to destroy the underlying * mountpoint. All other datasets will have inherited from this dataset @@ -239,8 +253,7 @@ changelist_postfix(prop_changelist_t *clp) needs_key = (zfs_prop_get_int(cn->cn_handle, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE); - mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) || - zfs_is_mounted(cn->cn_handle, NULL); + mounted = zfs_is_mounted(cn->cn_handle, NULL); if (!mounted && !needs_key && (cn->cn_mounted || (((clp->cl_prop == ZFS_PROP_MOUNTPOINT && diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 11d3eb6a3c60..727efc5a91ad 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1771,14 +1771,24 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) return (ret); } - - /* * Given an nvlist of property names and values, set the properties for the * given dataset. */ int zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) +{ + return (zfs_prop_set_list_flags(zhp, props, 0)); +} + +/* + * Given an nvlist of property names, values and flags, set the properties + * for the given dataset. If ZFS_SET_NOMOUNT is set, it allows to update + * mountpoint, sharenfs and sharesmb properties without (un/re)mounting + * and (un/re)sharing the dataset. 
+ */ +int +zfs_prop_set_list_flags(zfs_handle_t *zhp, nvlist_t *props, int flags) { zfs_cmd_t zc = {"\0"}; int ret = -1; @@ -1848,7 +1858,9 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) if (prop != ZFS_PROP_CANMOUNT || (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && zfs_is_mounted(zhp, NULL))) { - cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); + cls[cl_idx] = changelist_gather(zhp, prop, + ((flags & ZFS_SET_NOMOUNT) ? + CL_GATHER_DONT_UNMOUNT : 0), 0); if (cls[cl_idx] == NULL) goto error; } diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 51ddd85eb79e..e3674b1f8a8d 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1248,10 +1248,18 @@ Otherwise, they are automatically remounted in the new location if the property was previously .Sy legacy or -.Sy none , -or if they were mounted before the property was changed. +.Sy none . In addition, any shared file systems are unshared and shared in the new location. +.Pp +When the +.Sy mountpoint +property is set with +.Nm zfs Cm set Fl u +, the +.Sy mountpoint +property is updated but dataset is not mounted or unmounted and remains +as it was before. .It Sy nbmand Ns = Ns Sy on Ns | Ns Sy off Controls whether the file system should be mounted with .Sy nbmand @@ -1656,6 +1664,13 @@ by default. This means that any additional access control (disallow specific user specific access etc) must be done on the underlying file system. +.Pp +When the +.Sy sharesmb +property is updated with +.Nm zfs Cm set Fl u +, the property is set to desired value, but the operation to share, reshare +or unshare the the dataset is not performed. .It Sy sharenfs Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts Controls whether the file system is shared via NFS, and what options are to be used. @@ -1699,6 +1714,13 @@ or if they were shared before the property was changed. If the new property is .Sy off , the file systems are unshared. +.Pp +When the +.Sy sharenfs +property is updated with +.Nm zfs Cm set Fl u +, the property is set to desired value, but the operation to share, reshare +or unshare the the dataset is not performed. .It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput Provide a hint to ZFS about handling of synchronous requests in this dataset. If diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8 index 4cabdcd8bd83..c01bcc643e5d 100644 --- a/man/man8/zfs-set.8 +++ b/man/man8/zfs-set.8 @@ -39,6 +39,7 @@ .Sh SYNOPSIS .Nm zfs .Cm set +.Op Fl u .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns … .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … .Nm zfs @@ -60,6 +61,7 @@ .It Xo .Nm zfs .Cm set +.Op Fl u .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns … .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … .Xc @@ -79,6 +81,11 @@ For more information, see the .Em User Properties section of .Xr zfsprops 7 . +.Bl -tag -width "-u" +.It Fl u +Update mountpoint, sharenfs, sharesmb property but do not mount or share the +dataset. 
+.El .It Xo .Nm zfs .Cm get diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 342f56d50d04..ef787c65c0f9 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -281,7 +281,7 @@ tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', - 'zfs_set_feature_activation'] + 'zfs_set_feature_activation', 'zfs_set_nomount'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 449bf1c0f56a..ab41c05b8473 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -212,7 +212,7 @@ tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_003_neg', 'property_alias_001_pos', - 'zfs_set_keylocation', 'zfs_set_feature_activation'] + 'zfs_set_keylocation', 'zfs_set_feature_activation', 'zfs_set_nomount'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_snapshot] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 1a58e6f774e9..3272a5d5816f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -870,6 +870,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_set/zfs_set_003_neg.ksh \ functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \ functional/cli_root/zfs_set/zfs_set_keylocation.ksh \ + functional/cli_root/zfs_set/zfs_set_nomount.ksh \ functional/cli_root/zfs_share/cleanup.ksh \ functional/cli_root/zfs_share/setup.ksh \ functional/cli_root/zfs_share/zfs_share_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_nomount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_nomount.ksh new file mode 100755 index 000000000000..ebf08711423c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_nomount.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by iXsystems, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib + +# +# DESCRIPTION: +# 'zfs set -u' should update the mountpoint, sharenfs and sharesmb +# properties without mounting and sharing the dataset. Validate the +# bevaior while dataset is mounted and unmounted. +# +# STRATEGY: +# 1. Confirm dataset is currently mounted +# 2. Update the mountpoint with -u flag +# 3. 
Confirm mountpoint property is updated with new value +# 4. Confirm dataset is still mounted at previous mountpoint +# 5. Unmount the dataset +# 6. Confirm dataset is unmounted +# 7. Mount the dataset +# 8. Confirm dataset is mounted at new mountpoint, that was set with -u flag. +# 9. Update and mount the dataset at previous mountpoint. +# 10. Unmount the dataset +# 11. Update mountpoint property with zfs set -u +# 12. Confirm dataset is not mounted +# 13. Update sharenfs property with zfs set -u +# 14. Confirm dataset is not mounted +# 15. Update sharesmb property with zfs set -u +# 16. Confirm dataset is not mounted +# 17. Mount the dataset and confirm dataset is mounted at new mountpoint +# + +verify_runnable "both" + +function cleanup +{ + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS + if is_linux; then + log_must zfs set sharesmb=off $TESTPOOL/$TESTFS + fi + rm -r $newmpt +} + +log_assert "'zfs set -u' sets the mountpoint and share properties without " \ + "mounting the dataset" +log_onexit cleanup + +oldmpt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +newmpt=$TEST_BASE_DIR/abc + +# Test while dataset is mounted +log_must ismounted $TESTPOOL/$TESTFS +log_must zfs set -u mountpoint=$newmpt $TESTPOOL/$TESTFS +log_must check_user_prop $TESTPOOL/$TESTFS mountpoint $newmpt +log_must eval "[[ "$(mount | grep $TESTPOOL/$TESTFS | awk '{print $3}')" == $oldmpt ]]" +log_must zfs unmount $TESTPOOL/$TESTFS +log_mustnot ismounted $TESTPOOL/$TESTFS +log_must zfs mount $TESTPOOL/$TESTFS +log_must eval "[[ "$(mount | grep $TESTPOOL/$TESTFS | awk '{print $3}')" == $newmpt ]]" + +# Test while dataset is unmounted +log_must zfs set mountpoint=$oldmpt $TESTPOOL/$TESTFS +log_must ismounted $TESTPOOL/$TESTFS +log_must zfs unmount $TESTPOOL/$TESTFS +log_must zfs set -u mountpoint=$newmpt $TESTPOOL/$TESTFS +log_mustnot ismounted $TESTPOOL/$TESTFS +log_must zfs set -u sharenfs=on $TESTPOOL/$TESTFS +log_mustnot ismounted $TESTPOOL/$TESTFS +if is_linux; then + log_must zfs set -u sharesmb=on $TESTPOOL/$TESTFS + log_mustnot ismounted $TESTPOOL/$TESTFS +fi +log_must zfs mount $TESTPOOL/$TESTFS +log_must check_user_prop $TESTPOOL/$TESTFS mountpoint $newmpt +log_must eval "[[ "$(mount | grep $TESTPOOL/$TESTFS | awk '{print $3}')" == $newmpt ]]" + +log_must zfs set mountpoint=$oldmpt $TESTPOOL/$TESTFS +log_must ismounted $TESTPOOL/$TESTFS + +log_pass "'zfs set -u' functions correctly" From 1611b8e56e555bf928b795e45ba1724ef85eb1ed Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Tue, 3 Oct 2023 23:12:36 +0100 Subject: [PATCH 11/78] Add BTI landing pads to the AArch64 SHA2 assembly The Arm Branch Target Identification (BTI) extension guards against branching to an unintended instruction. To support BTI add the landing pad instructions to the SHA2 functions. These are from the hint space so are a nop on hardware that lacks BTI support or if BTI isn't enabled. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Andrew Turner Closes #14862 Closes #15339 --- module/icp/asm-aarch64/sha2/sha256-armv8.S | 3 +++ module/icp/asm-aarch64/sha2/sha512-armv8.S | 2 ++ 2 files changed, 5 insertions(+) diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S index fa50c4e74d59..7ae486e4e229 100644 --- a/module/icp/asm-aarch64/sha2/sha256-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -49,6 +49,7 @@ .type zfs_sha256_block_armv7,%function .align 6 zfs_sha256_block_armv7: + hint #34 // bti c stp x29,x30,[sp,#-128]! 
add x29,sp,#0 @@ -1015,6 +1016,7 @@ zfs_sha256_block_armv7: .type zfs_sha256_block_armv8,%function .align 6 zfs_sha256_block_armv8: + hint #34 // bti c .Lv8_entry: stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -1155,6 +1157,7 @@ zfs_sha256_block_armv8: .type zfs_sha256_block_neon,%function .align 4 zfs_sha256_block_neon: + hint #34 // bti c .Lneon_entry: stp x29, x30, [sp, #-16]! mov x29, sp diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S index 1683fc1ca53c..9c61eeee4d7b 100644 --- a/module/icp/asm-aarch64/sha2/sha512-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -73,6 +73,7 @@ .type zfs_sha512_block_armv7,%function .align 6 zfs_sha512_block_armv7: + hint #34 // bti c stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1040,6 +1041,7 @@ zfs_sha512_block_armv7: .type zfs_sha512_block_armv8,%function .align 6 zfs_sha512_block_armv8: + hint #34 // bti c .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later stp x29,x30,[sp,#-16]! From bc77a0c85ec9a26452992421f738dee0a786322b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 4 Oct 2023 17:45:00 -0400 Subject: [PATCH 12/78] ARC: Remove b_cv from struct l1arc_buf_hdr Earlier as part of #14123 I've removed one use of b_cv. This patch reuses the same approach to remove the other one from much more rare code path. This saves 16 bytes of L1 ARC header on FreeBSD (reducing it from 200 to 184 bytes) and seems even more on Linux. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15340 --- include/sys/arc_impl.h | 2 -- module/zfs/arc.c | 34 ++++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 78774792f367..da07fd4f8fae 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -159,8 +159,6 @@ struct arc_write_callback { * these two allocation states. */ typedef struct l1arc_buf_hdr { - /* for waiting on reads to complete */ - kcondvar_t b_cv; uint8_t b_byteswap; /* protected by arc state mutex */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 22dc0ed5e3b6..919684a589d8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1151,7 +1151,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1211,7 +1210,6 @@ hdr_full_dest(void *vbuf, void *unused) arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -5586,13 +5584,6 @@ arc_read_done(zio_t *zio) buf_hash_remove(hdr); } - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). 
- */ - cv_broadcast(&hdr->b_l1hdr.b_cv); - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); @@ -5787,8 +5778,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; - if (hdr->b_l1hdr.b_acb) - hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); @@ -5928,8 +5918,28 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * and so the performance impact shouldn't * matter. */ - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + arc_callback_t *acb = kmem_zalloc( + sizeof (arc_callback_t), KM_SLEEP); + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, + NULL); + acb->acb_zio_head = + hdr->b_l1hdr.b_acb->acb_zio_head; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); goto top; } } From ba7797c8db38db98f224e1e49eaa37de7b5bf1a5 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 6 Oct 2023 11:56:17 -0400 Subject: [PATCH 13/78] ARC: Remove b_bufcnt/b_ebufcnt from ARC headers In most cases we do not care about exact number of buffers linked to the header, we just need to know if it is zero, non-zero or one. That can easily be checked just looking on b_buf pointer or in some cases derefencing it. b_ebufcnt is read only once, and in that case we already traverse the list as part of arc_buf_remove(), so second traverse should not be expensive. This reduces L1 ARC header size by 8 bytes and full crypto header by 16 bytes, down to 176 and 232 bytes on FreeBSD respectively. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15350 --- include/os/linux/zfs/sys/trace_arc.h | 12 +-- include/sys/arc_impl.h | 8 +- module/zfs/arc.c | 105 +++++++++------------------ 3 files changed, 42 insertions(+), 83 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h index c494f48bb48b..f749223daa72 100644 --- a/include/os/linux/zfs/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -51,7 +51,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) @@ -70,7 +69,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; __entry->hdr_flags = ab->b_flags; - __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; __entry->hdr_psize = ab->b_psize; __entry->hdr_lsize = ab->b_lsize; __entry->hdr_spa = ab->b_spa; @@ -84,12 +82,12 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu " + "flags 0x%x type %u psize %u lsize %u spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, + __entry->hdr_type, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, @@ -192,7 +190,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) @@ -223,7 +220,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; __entry->hdr_psize = hdr->b_psize; __entry->hdr_lsize = hdr->b_lsize; __entry->hdr_spa = hdr->b_spa; @@ -255,7 +251,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_blkid = zb->zb_blkid; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " + "flags 0x%x psize %u lsize %u spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -264,7 +260,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, + __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index da07fd4f8fae..adff42c55d05 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -159,8 +159,6 @@ struct arc_write_callback { * these two allocation states. 
*/ typedef struct l1arc_buf_hdr { - uint8_t b_byteswap; - /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; @@ -171,7 +169,7 @@ typedef struct l1arc_buf_hdr { uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; - uint32_t b_bufcnt; + uint8_t b_byteswap; arc_buf_t *b_buf; /* self protecting */ @@ -434,12 +432,12 @@ typedef struct l2arc_dev { */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ - dmu_object_type_t b_ot; /* object type */ - uint32_t b_ebufcnt; /* count of encrypted buffers */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; + dmu_object_type_t b_ot; /* object type */ + /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 919684a589d8..238eaa12709c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1993,7 +1993,6 @@ arc_buf_untransform_in_place(arc_buf_t *buf) arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* @@ -2228,7 +2227,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2268,7 +2266,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2384,7 +2381,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; + abi->abi_bufcnt = 0; + for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) + abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; @@ -2412,7 +2411,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; - uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); @@ -2426,19 +2424,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); + update_old = (hdr->b_l1hdr.b_buf != NULL || + hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - IMPLY(GHOST_STATE(old_state), bufcnt == 0); - IMPLY(GHOST_STATE(new_state), bufcnt == 0); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); - IMPLY(old_state == arc_anon, bufcnt <= 1); + IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || + ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; - bufcnt = 0; update_old = B_FALSE; } update_new = update_old; @@ -2486,14 +2481,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first - * remove all arc buffers. 
Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. + * remove all arc buffers. Thus, we'll have no arc + * buffer to use for the reference. As a result, we + * use the arc header pointer for the reference. */ (void) zfs_refcount_add_many( &new_state->arcs_size[type], @@ -2501,7 +2494,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2510,8 +2502,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2527,7 +2517,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) &new_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( @@ -2546,7 +2535,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2562,7 +2550,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2571,8 +2558,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2588,7 +2573,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) &old_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -2836,9 +2820,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - if (encrypted) - hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or @@ -3080,8 +3061,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); @@ -3120,22 +3099,20 @@ arc_buf_destroy_impl(arc_buf_t *buf) } buf->b_data = NULL; - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - - if (ARC_BUF_ENCRYPTED(buf)) { - hdr->b_crypt_hdr.b_ebufcnt -= 1; - - /* - * If we have no more encrypted buffers and we've - * already gotten a copy of the decrypted data we can - * free b_rabd to save some space. - */ - if (hdr->b_crypt_hdr.b_ebufcnt == 0 && - HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && - !HDR_IO_IN_PROGRESS(hdr)) { - arc_hdr_free_abd(hdr, B_TRUE); + /* + * If we have no more encrypted buffers and we've already + * gotten a copy of the decrypted data we can free b_rabd + * to save some space. 
+ */ + if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && + hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *b; + for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { + if (b != buf && ARC_BUF_ENCRYPTED(b)) + break; } + if (b == NULL) + arc_hdr_free_abd(hdr, B_TRUE); } } @@ -3323,7 +3300,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -3380,7 +3356,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif @@ -3496,7 +3471,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) #ifdef ZFS_DEBUG nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; #endif - nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; @@ -3539,7 +3513,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) hdr->b_l1hdr.b_freeze_cksum = NULL; #endif hdr->b_l1hdr.b_buf = NULL; - hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_byteswap = 0; hdr->b_l1hdr.b_state = NULL; hdr->b_l1hdr.b_arc_access = 0; @@ -3553,7 +3526,6 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) if (ocache == hdr_full_crypt_cache) { ASSERT(!HDR_HAS_RABD(hdr)); hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; - hdr->b_crypt_hdr.b_ebufcnt = 0; hdr->b_crypt_hdr.b_dsobj = 0; memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN); memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN); @@ -3787,8 +3759,6 @@ static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } @@ -3869,7 +3839,8 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; @@ -3879,7 +3850,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); @@ -3922,7 +3893,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); @@ -6320,7 +6290,8 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); @@ -6371,7 +6342,7 @@ arc_release(arc_buf_t *buf, const void *tag) /* * Do we have more than 
one buf? */ - if (hdr->b_l1hdr.b_bufcnt > 1) { + if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); @@ -6452,10 +6423,6 @@ arc_release(arc_buf_t *buf, const void *tag) arc_buf_size(buf), buf); } - hdr->b_l1hdr.b_bufcnt -= 1; - if (ARC_BUF_ENCRYPTED(buf)) - hdr->b_crypt_hdr.b_ebufcnt -= 1; - arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -6468,15 +6435,11 @@ arc_release(arc_buf_t *buf, const void *tag) nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - if (ARC_BUF_ENCRYPTED(buf)) - nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; @@ -6527,7 +6490,7 @@ arc_write_ready(zio_t *zio) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then @@ -6666,7 +6629,8 @@ arc_write_ready(zio_t *zio) } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } @@ -6747,7 +6711,8 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); @@ -6788,7 +6753,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) From 3158b5d718d67416ada9308dd4334aeea3869e74 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 3 Oct 2023 11:57:48 -0400 Subject: [PATCH 14/78] ARC: Drop different size headers for crypto To reduce memory usage ZFS crypto allocated bigger by 56 bytes ARC headers only when specific block was encrypted on disk. It was a nice optimization, except in some cases the code reallocated them on fly, that invalidated header pointers from the buffers. Since the buffers use different locking, it created number of races, that were originally covered (at least partially) by b_evict_lock, used also to protection evictions. But it has gone as part of #14340. As result, as was found in #15293, arc_hdr_realloc_crypt() ended up unprotected and causing use-after-free. Instead of introducing some even more elaborate locking, this patch just drops the difference between normal and protected headers. It cost us additional 56 bytes per header, but with couple patches saving 24 bytes, the net growth is only 32 bytes with total header size of 232 bytes on FreeBSD, that IMHO is acceptable price for simplicity. Additional locking would also end up consuming space, time or both. Reviewe-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15293 Closes #15347 --- module/zfs/arc.c | 183 +++-------------------------------------------- 1 file changed, 8 insertions(+), 175 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 238eaa12709c..b5946e7604c0 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -748,8 +748,7 @@ taskq_t *arc_prune_taskq; * Other sizes */ -#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* @@ -1113,7 +1112,6 @@ buf_hash_remove(arc_buf_hdr_t *hdr) */ static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; @@ -1134,7 +1132,6 @@ buf_fini(void) for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1162,19 +1159,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) return (0); } -static int -hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) -{ - (void) unused; - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_cons(vbuf, unused, kmflag); - memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr)); - arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); - - return (0); -} - static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { @@ -1218,16 +1202,6 @@ hdr_full_dest(void *vbuf, void *unused) arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } -static void -hdr_full_crypt_dest(void *vbuf, void *unused) -{ - (void) vbuf, (void) unused; - - hdr_full_dest(vbuf, unused); - arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr), - ARC_SPACE_HDRS); -} - static void hdr_l2only_dest(void *vbuf, void *unused) { @@ -1283,9 +1257,6 @@ buf_init(void) hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); - hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", - HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); @@ -3273,11 +3244,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - if (protected) { - hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); - } else { - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - } + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG @@ -3325,16 +3292,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); - /* - * if the caller wanted a new full header and the header is to be - * encrypted we will actually allocate the header from the full crypt - * cache instead. The same applies to freeing from the old cache. 
- */ - if (HDR_PROTECTED(hdr) && new == hdr_full_cache) - new = hdr_full_crypt_cache; - if (HDR_PROTECTED(hdr) && old == hdr_full_cache) - old = hdr_full_crypt_cache; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); @@ -3342,7 +3299,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache || new == hdr_full_crypt_cache) { + if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a @@ -3421,123 +3378,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) return (nhdr); } -/* - * This function allows an L1 header to be reallocated as a crypt - * header and vice versa. If we are going to a crypt header, the - * new fields will be zeroed out. - */ -static arc_buf_hdr_t * -arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) -{ - arc_buf_hdr_t *nhdr; - arc_buf_t *buf; - kmem_cache_t *ncache, *ocache; - - /* - * This function requires that hdr is in the arc_anon state. - * Therefore it won't have any L2ARC data for us to worry - * about copying. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); - ASSERT3P(hdr->b_hash_next, ==, NULL); - - if (need_crypt) { - ncache = hdr_full_crypt_cache; - ocache = hdr_full_cache; - } else { - ncache = hdr_full_cache; - ocache = hdr_full_crypt_cache; - } - - nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); - - /* - * Copy all members that aren't locks or condvars to the new header. - * No lists are pointing to us (as we asserted above), so we don't - * need to worry about the list nodes. - */ - nhdr->b_dva = hdr->b_dva; - nhdr->b_birth = hdr->b_birth; - nhdr->b_type = hdr->b_type; - nhdr->b_flags = hdr->b_flags; - nhdr->b_psize = hdr->b_psize; - nhdr->b_lsize = hdr->b_lsize; - nhdr->b_spa = hdr->b_spa; -#ifdef ZFS_DEBUG - nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; -#endif - nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; - nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; - nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; - nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; - nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; - nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; - nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; - nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; - - /* - * This zfs_refcount_add() exists only to ensure that the individual - * arc buffers always point to a header that is referenced, avoiding - * a small race condition that could trigger ASSERTs. 
- */ - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); - nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; - for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) - buf->b_hdr = nhdr; - - zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); - (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - - if (need_crypt) { - arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); - } else { - arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); - } - - /* unset all members of the original hdr */ - memset(&hdr->b_dva, 0, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_type = 0; - hdr->b_flags = 0; - hdr->b_psize = 0; - hdr->b_lsize = 0; - hdr->b_spa = 0; -#ifdef ZFS_DEBUG - hdr->b_l1hdr.b_freeze_cksum = NULL; -#endif - hdr->b_l1hdr.b_buf = NULL; - hdr->b_l1hdr.b_byteswap = 0; - hdr->b_l1hdr.b_state = NULL; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_mru_hits = 0; - hdr->b_l1hdr.b_mru_ghost_hits = 0; - hdr->b_l1hdr.b_mfu_hits = 0; - hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_acb = NULL; - hdr->b_l1hdr.b_pabd = NULL; - - if (ocache == hdr_full_crypt_cache) { - ASSERT(!HDR_HAS_RABD(hdr)); - hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; - hdr->b_crypt_hdr.b_dsobj = 0; - memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN); - memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN); - memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN); - } - - buf_discard_identity(hdr); - kmem_cache_free(ocache, hdr); - - return (nhdr); -} - /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It @@ -3557,8 +3397,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); - if (!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? @@ -3822,12 +3661,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif - - if (!HDR_PROTECTED(hdr)) { - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_full_crypt_cache, hdr); - } + kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } @@ -6525,13 +6359,9 @@ arc_write_ready(zio_t *zio) add_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ } - if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); - if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); - ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { @@ -6544,11 +6374,14 @@ arc_write_ready(zio_t *zio) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* From bf54da84fb662eac474f72ae89a81b81d409c3b0 Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 7 Oct 2023 02:39:20 +1100 Subject: [PATCH 15/78] tests/block_cloning: sync before write in fallback test We're still seeing this test fail intermittently (that is, the clone happens), which must mean the write and the clone can still be happening on different txgs. It might be that there's still activity after the pool is created. So here we force a sync before starting the write. Sponsored-By: Klara Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15359 --- .../block_cloning_copyfilerange_fallback_same_txg.ksh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 2cd2f4763a73..e52b34ec8a51 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -52,6 +52,8 @@ log_must set_tunable64 TXG_TIMEOUT 5000 log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS +log_must sync_pool $TESTPOOL true + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 From c27277daace5dff9d595a90626034a2b5db8ea28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Fri, 6 Oct 2023 17:50:26 +0200 Subject: [PATCH 16/78] CI: add FreeBSD build with Cirrus CI As a first step for automatic FreeBSD testing add a build and install for FreeBSD versions 12.4, 13.2 and 14-snapshot using Cirrus CI. 
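For anyone wanting to reproduce the CI result by hand, the steps below are a sketch that simply mirrors the prepare/configure/build stages of the .cirrus.yml added by this patch; the py39-* package names and the user-only configuration come straight from that file and may need adjusting on other FreeBSD releases.

    pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 \
        py39-packaging py39-cffi py39-sysctl
    env MAKE=gmake ./autogen.sh
    env MAKE=gmake ./configure --with-config="user" --with-python=3.9
    gmake -j `sysctl -n kern.smp.cpus`
    gmake install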
Reviewed-by: Jose Luis Duran Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #15332 --- .cirrus.yml | 21 +++++++++++++++++++++ .gitignore | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 .cirrus.yml diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000000..18b292289e20 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,21 @@ +env: + CIRRUS_CLONE_DEPTH: 1 + ARCH: amd64 + +build_task: + matrix: + freebsd_instance: + image_family: freebsd-12-4 + freebsd_instance: + image_family: freebsd-13-2 + freebsd_instance: + image_family: freebsd-14-0-snap + prepare_script: + - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py39-packaging py39-cffi py39-sysctl + configure_script: + - env MAKE=gmake ./autogen.sh + - env MAKE=gmake ./configure --with-config="user" --with-python=3.9 + build_script: + - gmake -j `sysctl -n kern.smp.cpus` + install_script: + - gmake install diff --git a/.gitignore b/.gitignore index 8d91dd9466c5..1ef47d921c28 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ !udev/** !.editorconfig +!.cirrus.yml !.gitignore !.gitmodules !AUTHORS @@ -60,7 +61,6 @@ !TEST !zfs.release.in - # # Normal rules # From bcd010d3a5db120016c764051268ea764011ca92 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 6 Oct 2023 12:04:00 -0400 Subject: [PATCH 17/78] Reduce number of metaslab preload taskq threads. Before this change ZFS created threads for 50% of CPUs for each top- level vdev. Plus it created the same number of threads for embedded log groups (that have only one metaslab and don't need any preload). As result, on system with 80 CPUs and pool of 60 vdevs this resulted in 4800 metaslab preload threads, that is absolutely insane. This patch changes the preload threads to 50% of CPUs in one taskq per pool, so on the mentioned system it will be only 40 threads. Among other things this fixes zdb on the mentioned system and pool on FreeBSD, that failed to create so many threads in one process. Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
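With the per-vdev taskqs gone, preload concurrency is controlled by two pool-wide tunables. A rough sketch of inspecting them at runtime follows; the names match the ZFS_MODULE_PARAM declarations in the diff below, while the sysfs and sysctl paths are the usual platform conventions rather than anything this patch defines.

    # Linux module parameters
    cat /sys/module/zfs/parameters/metaslab_preload_pct     # percentage of CPUs, default 50
    cat /sys/module/zfs/parameters/metaslab_preload_limit   # metaslabs per group, default 10
    # FreeBSD sysctl equivalents
    sysctl vfs.zfs.metaslab.preload_pct
    sysctl vfs.zfs.metaslab.preload_limit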
Closes #15319 --- include/sys/metaslab_impl.h | 1 - include/sys/spa_impl.h | 4 ++-- man/man4/zfs.4 | 6 ++++++ module/os/freebsd/zfs/sysctl_os.c | 22 ---------------------- module/zfs/metaslab.c | 23 ++++++++--------------- module/zfs/spa.c | 28 +++++++++++++++++++++------- 6 files changed, 37 insertions(+), 47 deletions(-) diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index d328068890cc..4f434291ddbf 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -250,7 +250,6 @@ struct metaslab_group { int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; - taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 588c72f6e4fa..cdf65c371337 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -423,7 +423,9 @@ struct spa { hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ + taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ + taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ @@ -447,8 +449,6 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ - - taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ }; extern char *spa_config_path; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 3843419731b8..66e4f6a4b578 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -402,6 +402,12 @@ Practical upper limit of total metaslabs per top-level vdev. .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group preloading. . +.It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint +Maximum number of metaslabs per group to preload +. +.It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint +Percentage of CPUs to run a metaslab preload taskq +. .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth, diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 8ae2f23c3ecf..38ef590702cb 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -596,28 +596,6 @@ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, " space map to continue allocations in a first-fit fashion"); /* END CSTYLED */ -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -extern int metaslab_load_pct; - -/* BEGIN CSTYLED */ -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, - CTLFLAG_RWTUN, &metaslab_load_pct, 0, - "Percentage of cpus that can be used by the metaslab taskq"); -/* END CSTYLED */ - -/* - * Max number of metaslabs per group to preload. 
- */ -extern uint_t metaslab_preload_limit; - -/* BEGIN CSTYLED */ -SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, - CTLFLAG_RWTUN, &metaslab_preload_limit, 0, - "Max number of metaslabs per group to preload"); -/* END CSTYLED */ - /* mmp.c */ int diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index cdf599b17924..599d7ffa0cf3 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -207,11 +207,6 @@ static const uint32_t metaslab_min_search_count = 100; */ static int metaslab_df_use_largest_segment = B_FALSE; -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -int metaslab_load_pct = 50; - /* * These tunables control how long a metaslab will remain loaded after the * last allocation from it. A metaslab can't be unloaded until at least @@ -856,9 +851,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); - return (mg); } @@ -874,7 +866,6 @@ metaslab_group_destroy(metaslab_group_t *mg) */ ASSERT(mg->mg_activation_count <= 0); - taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); @@ -965,7 +956,7 @@ metaslab_group_passivate(metaslab_group_t *mg) * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait_outstanding(mg->mg_taskq, 0); + taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { @@ -3529,10 +3520,8 @@ metaslab_group_preload(metaslab_group_t *mg) avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait_outstanding(mg->mg_taskq, 0); + if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; - } mutex_enter(&mg->mg_lock); @@ -3552,8 +3541,9 @@ metaslab_group_preload(metaslab_group_t *mg) continue; } - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != TASKQID_INVALID); + VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, + msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) + != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } @@ -6182,6 +6172,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, + "Max number of metaslabs per group to preload"); + ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 88ee4ea9f458..0dfa0c7b61ac 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -169,6 +169,11 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); +/* + * Percentage of all CPUs that can be used by the metaslab preload taskq. 
+ */ +static uint_t metaslab_preload_pct = 50; + static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ @@ -1397,6 +1402,13 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); + /* + * The taskq to preload metaslabs. + */ + spa->spa_metaslab_taskq = taskq_create("z_metaslab", + metaslab_preload_pct, maxclsyspri, 1, INT_MAX, + TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); + /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) @@ -1432,6 +1444,11 @@ spa_deactivate(spa_t *spa) spa->spa_zvol_taskq = NULL; } + if (spa->spa_metaslab_taskq) { + taskq_destroy(spa->spa_metaslab_taskq); + spa->spa_metaslab_taskq = NULL; + } + if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; @@ -1704,13 +1721,7 @@ spa_unload(spa_t *spa) * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ - if (spa->spa_root_vdev != NULL) { - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { - vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; - if (vc->vdev_mg != NULL) - taskq_wait(vc->vdev_mg->mg_taskq); - } - } + taskq_wait(spa->spa_metaslab_taskq); if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); @@ -10132,6 +10143,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, + "Percentage of CPUs to run a metaslab preload taskq"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " From 8495536f7f6051849d6e56834766cd4fe2ff9c5e Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 7 Oct 2023 03:06:29 +1100 Subject: [PATCH 18/78] zfsconcepts: add description of block cloning Here I'm trying to succinctly introduce the concept, the basics of its construction, how its different to dedup, how to use it, and where its limitations lie, in four paragraphs and with enough searchable terms to help the reader find more information both within OpenZFS and elsewhere. Phew. Sponsored-By: Klara, Inc. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15362 --- man/man7/zfsconcepts.7 | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/man/man7/zfsconcepts.7 b/man/man7/zfsconcepts.7 index 18a9e9b5cafe..1be3d961c3d7 100644 --- a/man/man7/zfsconcepts.7 +++ b/man/man7/zfsconcepts.7 @@ -28,8 +28,9 @@ .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. +.\" Copyright 2023 Klara, Inc. .\" -.Dd June 30, 2019 +.Dd October 6, 2023 .Dt ZFSCONCEPTS 7 .Os . @@ -205,3 +206,40 @@ practices, such as regular backups. Consider using the .Sy compression property as a less resource-intensive alternative. +.Ss Block cloning +Block cloning is a facility that allows a file (or parts of a file) to be +.Qq cloned , +that is, a shallow copy made where the existing data blocks are referenced +rather than copied. +Later modifications to the data will cause a copy of the data block to be taken +and that copy modified. +This facility is used to implement +.Qq reflinks +or +.Qq file-level copy-on-write . 
+.Pp +Cloned blocks are tracked in a special on-disk structure called the Block +Reference Table +.Po BRT +.Pc . +Unlike deduplication, this table has minimal overhead, so can be enabled at all +times. +.Pp +Also unlike deduplication, cloning must be requested by a user program. +Many common file copying programs, including newer versions of +.Nm /bin/cp , +will try to create clones automatically. +Look for +.Qq clone , +.Qq dedupe +or +.Qq reflink +in the documentation for more information. +.Pp +There are some limitations to block cloning. +Only whole blocks can be cloned, and blocks can not be cloned if they are not +yet written to disk, or if they are encrypted, or the source and destination +.Sy recordsize +properties differ. +The OS may add additional restrictions; +for example, most versions of Linux will not allow clones across datasets. From 2919784be2025d3f9896a823cdb3595bed30630a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 18 Sep 2023 11:07:32 +1000 Subject: [PATCH 19/78] tests: add tests for zpool import behaviour when hostid changes Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15290 --- tests/runfiles/common.run | 4 ++ tests/zfs-tests/tests/Makefile.am | 4 ++ .../cli_root/zpool_import/zpool_import.cfg | 5 ++ .../cli_root/zpool_import/zpool_import.kshlib | 1 + .../zpool_import_hostid_changed.ksh | 59 +++++++++++++++ .../zpool_import_hostid_changed_cachefile.ksh | 65 +++++++++++++++++ ...ostid_changed_cachefile_unclean_export.ksh | 71 +++++++++++++++++++ ...l_import_hostid_changed_unclean_export.ksh | 70 ++++++++++++++++++ 8 files changed, 279 insertions(+) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ef787c65c0f9..1435c55e8fc2 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -415,6 +415,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', + 'zpool_import_hostid_changed', + 'zpool_import_hostid_changed_unclean_export', + 'zpool_import_hostid_changed_cachefile', + 'zpool_import_hostid_changed_cachefile_unclean_export', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3272a5d5816f..158401e078aa 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1104,6 +1104,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ + 
functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 4a9fb5e7489a..cf9c6a8499af 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -26,6 +26,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib @@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test + +export HOSTID_FILE="/etc/hostid" +export HOSTID1=01234567 +export HOSTID2=89abcdef diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 559810ff0e30..50157fa80578 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -11,6 +11,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh new file mode 100755 index 000000000000..bc82b7cc1ee8 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable without force even if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Export the pool. +# 4. Change the hostid. +# 5. Verify that importing the pool without force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Export the pool. +log_must zpool export $TESTPOOL1 + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool without force succeeds. +log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool when hostid changes." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh new file mode 100755 index 000000000000..07c43482d68f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool with a cachefile. +# 3. Backup the cachfile. +# 4. Export the pool. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachfile. +log_must cp $CPATH $CPATHBKP + +# 4. Export the pool. +log_must zpool export $TESTPOOL1 + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. +log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool from cachefile " \ + "when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh new file mode 100755 index 000000000000..8362d915f0cf --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Backup the cachefile. +# 4. Simulate the pool being torn down without export: +# 4.1. Copy the underlying device state. +# 4.2. Export the pool. +# 4.3. 
Restore the device state from the copy. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachfile. +log_must cp $CPATH $CPATHBKP + +# 4. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. +log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import pool from cachefile if not cleanly " \ + "exported when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh new file mode 100755 index 000000000000..ad8cca642dbc --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable without force if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Simulate the pool being torn down without export: +# 3.1. Copy the underlying device state. +# 3.2. Export the pool. +# 3.3. Restore the device state from the copy. +# 4. Change the hostid. +# 5. Verify that importing the pool fails. +# 6. Verify that importing the pool with force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool fails. +log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1 + +# 6. Verify that importing the pool with force succeeds. +log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1 + +log_pass "zpool import requires force if not cleanly exported " \ + "and hostid changed." 
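The behaviour the new tests check for can also be exercised by hand. The sketch below condenses the clean-export case; the pool name and the file-backed vdev path are placeholders, and the hostid values are the arbitrary ones used by the tests.

    # A cleanly exported pool imports without force after a hostid change
    zgenhostid -f 01234567
    zpool create testpool1 /var/tmp/vdev0
    zpool export testpool1
    zgenhostid -f 89abcdef
    zpool import -d /var/tmp testpool1      # no -f needed
    # If the pool was not cleanly exported, the same import is expected
    # to fail and to require: zpool import -d /var/tmp -f testpool1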
From 33d7c2d165c2d2fa040282bf66b03305448e7eed Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Sep 2023 17:02:02 +1000 Subject: [PATCH 20/78] import: require force when cachefile hostid doesn't match on-disk Previously, if a cachefile is passed to zpool import, the cached config is mostly offered as-is to ZFS_IOC_POOL_TRYIMPORT->spa_tryimport(), and the results are taken as the canonical pool config and handed back to ZFS_IOC_POOL_IMPORT. In the course of its operation, spa_load() will inspect the pool and build a new config from what it finds on disk. However, it then regenerates a new config ready to import, and so rightly sets the hostid and hostname for the local host in the config it returns. Because of this, the "require force" checks always decide the pool is exported and last touched by the local host, even if this is not true, which is possible in a HA environment when MMP is not enabled. The pool may be imported on another head, but the import checks still pass here, so the pool ends up imported on both. (This doesn't happen when a cachefile isn't used, because the pool config is discovered in userspace in zpool_find_import(), and that does find the on-disk hostid and hostname correctly). Since the systemd zfs-import-cache.service unit uses cachefile imports, this can lead to a system returning after a crash with a "valid" cachefile on disk and automatically, quietly, importing a pool that has already been taken up by a secondary head. This commit causes the on-disk hostid and hostname to be included in the ZPOOL_CONFIG_LOAD_INFO item in the returned config, and then changes the "force" checks for zpool import to use them if present. This method should give no change in behaviour for old userspace on new kernels (they won't know to look for the new config items) and for new userspace on old kernels (the won't find the new config items). Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15290 --- cmd/zpool/zpool_main.c | 23 +++++++++++++++---- module/zfs/spa.c | 18 +++++++++++++++ ...ostid_changed_cachefile_unclean_export.ksh | 20 +++++++++------- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d64fdfa5ba4c..5507f9d3fd67 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3122,12 +3122,21 @@ zfs_force_import_required(nvlist_t *config) nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + + /* + * The hostid on LOAD_INFO comes from the MOS label via + * spa_tryimport(). If its not there then we're likely talking to an + * older kernel, so use the top one, which will be from the label + * discovered in zpool_find_import(), or if a cachefile is in use, the + * local hostid. 
+ */ + if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) + nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); @@ -3198,7 +3207,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, time_t timestamp = 0; uint64_t hostid = 0; - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_HOSTNAME); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); @@ -3206,7 +3218,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_HOSTID); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0dfa0c7b61ac..1410651c63cc 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3931,6 +3931,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); + /* + * If 'zpool import' used a cached config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. + * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh index 8362d915f0cf..dcb1ac1ab69f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -20,8 +20,8 @@ # # DESCRIPTION: -# A pool that wasn't cleanly exported should be importable from a cachefile -# without force even if the local hostid doesn't match the on-disk hostid. +# A pool that wasn't cleanly exported should not be importable from a cachefile +# without force if the local hostid doesn't match the on-disk hostid. # # STRATEGY: # 1. Set a hostid. @@ -32,8 +32,9 @@ # 4.2. Export the pool. # 4.3. Restore the device state from the copy. # 5. Change the hostid. -# 6. 
Verify that importing the pool from the cachefile succeeds -# without force. +# 6. Verify that importing the pool from the cachefile fails. +# 7. Verify that importing the pool from the cachefile with force +# succeeds. # verify_runnable "global" @@ -64,8 +65,11 @@ log_must rm -f $VDEV0.bak # 5. Change the hostid. log_must zgenhostid -f $HOSTID2 -# 6. Verify that importing the pool from the cachefile succeeds without force. -log_must zpool import -c $CPATHBKP $TESTPOOL1 +# 6. Verify that importing the pool from the cachefile fails. +log_mustnot zpool import -c $CPATHBKP $TESTPOOL1 -log_pass "zpool import can import pool from cachefile if not cleanly " \ - "exported when hostid changes." +# 7. Verify that importing the pool from the cachefile with force succeeds. +log_must zpool import -f -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import from cachefile requires force if not cleanly " \ + "exported and hostid changes." From 3755cde22a33df9f06839c9034af27d25f2bc2da Mon Sep 17 00:00:00 2001 From: siv0 Date: Fri, 6 Oct 2023 18:53:23 +0200 Subject: [PATCH 21/78] rpm: Fix `make rpm` on Debian/Ubuntu The recent patch to change the bash completion install location based on the Distribution, ignored that it should still be possible to create RPMs on Debian derived systems. Additionally `make deb` itself creates RPMs and converts them via `alien`. This patch adds the bashcompletiondir variable to the rpm defines and uses this for the location, where to get the bash completion file. It still changes the location on Debian/Ubuntu systems in the final packages from /etc/bash_completion.d to /usr/share/bash-completion/completions Fixes: e69ade32e116e72d03068c03799924c3f1a15c95 Reviewed-by: Brian Behlendorf Signed-off-by: Stoiko Ivanov Closes #15355 Closes #15365 --- config/zfs-build.m4 | 3 +++ rpm/generic/zfs.spec.in | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 5ea6aa29a3de..e4197dc1424e 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -358,6 +358,9 @@ AC_DEFUN([ZFS_AC_RPM], [ AS_IF([test -n "$udevruledir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) + AS_IF([test -n "$bashcompletiondir" ], [ + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_bashcompletiondir $(bashcompletiondir)"' + ]) RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 8c538a00d203..711e6c751dc0 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -522,7 +522,7 @@ systemctl --system daemon-reload >/dev/null || true %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* -%config(noreplace) %{_sysconfdir}/bash_completion.d/zfs +%config(noreplace) %{_bashcompletiondir}/zfs %files -n libzpool5 %{_libdir}/libzpool.so.* From 9be8ddfb3c2587b41c3d6263be110ee77be6b710 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 6 Oct 2023 13:09:27 -0400 Subject: [PATCH 22/78] ZIL: Reduce maximum size of WR_COPIED to 7.5K Benchmarks show that at certain write sizes range lock/unlock take not so much time as extra memory copy. The exact threshold is not obvious due to other overheads, but it is definitely lower than ~63KB used before. Make it configurable, defaulting at 7.5KB, that is 8KB of nearest malloc() size minus itx and lr structs. 
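Since the threshold is a module parameter, it can be examined and changed without rebuilding. This is a sketch assuming the usual parameter locations (sysfs on Linux, the vfs.zfs.zil sysctl tree on FreeBSD), which the patch itself does not spell out.

    # Linux: current WR_COPIED byte limit (default 7680)
    cat /sys/module/zfs/parameters/zil_maxcopied
    # lower it to push more records to WR_NEED_COPY
    echo 4096 > /sys/module/zfs/parameters/zil_maxcopied
    # FreeBSD equivalent
    sysctl vfs.zfs.zil.maxcopied=4096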
Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15353 --- man/man4/zfs.4 | 5 +++++ module/zfs/zil.c | 17 +++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 66e4f6a4b578..cfadd79d87f3 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2150,6 +2150,11 @@ On very fragmented pools, lowering this .Pq typically to Sy 36 KiB can improve performance. . +.It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint +This sets the maximum number of write bytes logged via WR_COPIED. +It tunes a tradeoff between additional memory copy and possibly worse log +space efficiency vs additional range lock/unlock. +. .It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 This sets the minimum delay in nanoseconds ZIL care to delay block commit, waiting for more records. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 9e9c9c22549d..18c6cbf028b3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1958,26 +1958,28 @@ zil_max_log_data(zilog_t *zilog, size_t hdrsize) /* * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { - return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8); + return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. For space efficiency, we want to fit two records into a - * max-sized log block. + * single log block. Below that it is a tradeoff of additional memory copy + * and possibly worse log space efficiency vs additional range lock/unlock. */ +static uint_t zil_maxcopied = 7680; + uint64_t zil_max_copied_data(zilog_t *zilog) { - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); + uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); + return (MIN(max_data, zil_maxcopied)); } /* @@ -4226,3 +4228,6 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, + "Limit in bytes WR_COPIED size"); From 2407f30bda96f7d61a32fc38c638b3eb5b216284 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 7 Oct 2023 09:14:21 -0700 Subject: [PATCH 23/78] Tag 2.2.0-rc5 Signed-off-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 9ffe90458dbd..4178f1b5daa4 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.2.0 -Release: rc4 +Release: rc5 Release-Tags: relext License: CDDL Author: OpenZFS From 120d1787d74d51fc65819be25dbc50f3303587e8 Mon Sep 17 00:00:00 2001 From: Sam James Date: Mon, 9 Oct 2023 20:50:06 +0100 Subject: [PATCH 24/78] config/zfs-build.m4: add Gentoo's bash-completion path Followup e69ade32e116e72d03068c03799924c3f1a15c95 by adding Gentoo's bash completion path. We should probably consider using/honouring the standard --with-bashcompletiondir autoconf option as well, but that's something to do later. 
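After a make install the vendor detection can be sanity-checked by looking for the completion file in the expected location; per the case statement below, Gentoo now shares the Debian/Ubuntu path, while vendors without an explicit entry keep the old default.

    # Gentoo, Debian, Ubuntu
    ls /usr/share/bash-completion/completions/zfs
    # other vendors (old default)
    ls /etc/bash_completion.d/zfs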
Reviewed-by: Brian Behlendorf Reviewed-by: Umer Saleem Signed-off-by: Sam James Closes #15372 --- config/zfs-build.m4 | 1 + 1 file changed, 1 insertion(+) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index e4197dc1424e..5f36569fe25b 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -626,6 +626,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; debian) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) bashcompletiondir=$sysconfdir/bash_completion.d;; + gentoo) bashcompletiondir=/usr/share/bash-completion/completions ;; *) bashcompletiondir=/etc/bash_completion.d ;; esac AC_MSG_RESULT([$bashcompletiondir]) From f6e6e77ed8a9be8cd8ef3dbc386a18908ea4f01b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 9 Oct 2023 16:27:18 -0400 Subject: [PATCH 25/78] FreeBSD: Reduce divergence from in-tree sources This includes random small tweaks, primarily a build fixes, required when ZFS is built as part of FreeBSD base. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15368 --- include/os/freebsd/spl/sys/atomic.h | 6 ++++-- include/os/freebsd/spl/sys/ccompat.h | 2 +- include/os/freebsd/spl/sys/ctype.h | 1 + include/os/freebsd/spl/sys/spl_condvar.h | 2 +- include/os/freebsd/zfs/sys/arc_os.h | 2 +- include/os/freebsd/zfs/sys/freebsd_event.h | 2 +- include/os/freebsd/zfs/sys/zfs_context_os.h | 2 +- include/sys/zfs_context.h | 2 ++ module/os/freebsd/zfs/event_os.c | 2 +- module/os/freebsd/zfs/kmod_core.c | 2 +- tests/zfs-tests/cmd/dosmode_readonly_write.c | 2 +- 11 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h index 8b9cec15c5e1..40a67704fde0 100644 --- a/include/os/freebsd/spl/sys/atomic.h +++ b/include/os/freebsd/spl/sys/atomic.h @@ -167,7 +167,7 @@ atomic_dec_64_nv(volatile uint64_t *target) return (atomic_add_64_nv(target, -1)); } -#if !defined(COMPAT_32BIT) && defined(__LP64__) +#ifdef __LP64__ static __inline void * atomic_cas_ptr(volatile void *target, void *cmp, void *newval) { @@ -181,7 +181,7 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) return ((void *)atomic_cas_32((volatile uint32_t *)target, (uint32_t)cmp, (uint32_t)newval)); } -#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ +#endif /* __LP64__ */ #else /* _STANDALONE */ /* @@ -190,6 +190,8 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) */ #undef atomic_add_64 #define atomic_add_64(ptr, val) *(ptr) += val +#undef atomic_sub_64 +#define atomic_sub_64(ptr, val) *(ptr) -= val #endif /* !_STANDALONE */ #endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */ diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h index eaee9159eabd..e34bab7e896d 100644 --- a/include/os/freebsd/spl/sys/ccompat.h +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/include/os/freebsd/spl/sys/ctype.h b/include/os/freebsd/spl/sys/ctype.h index 53afd8b8bd42..b9ca0f8ef2c4 100644 --- a/include/os/freebsd/spl/sys/ctype.h +++ b/include/os/freebsd/spl/sys/ctype.h @@ -39,5 +39,6 @@ ((C) >= 0x3A && (C) <= 0x40) || \ ((C) >= 0x5B && (C) <= 0x60) || \ ((C) >= 0x7B && (C) <= 0x7E)) +#define 
isspace(C) ((C) == 0x20 || ((C) >= 0x9 && (C) <= 0xD)) #endif diff --git a/include/os/freebsd/spl/sys/spl_condvar.h b/include/os/freebsd/spl/sys/spl_condvar.h index 7405f647d59a..2835adafd416 100644 --- a/include/os/freebsd/spl/sys/spl_condvar.h +++ b/include/os/freebsd/spl/sys/spl_condvar.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000 Jake Burkholder . * All rights reserved. diff --git a/include/os/freebsd/zfs/sys/arc_os.h b/include/os/freebsd/zfs/sys/arc_os.h index a95618b91fed..ad2aba23b901 100644 --- a/include/os/freebsd/zfs/sys/arc_os.h +++ b/include/os/freebsd/zfs/sys/arc_os.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Martin Matuska * diff --git a/include/os/freebsd/zfs/sys/freebsd_event.h b/include/os/freebsd/zfs/sys/freebsd_event.h index 544ff8b0f81f..a32596d918a1 100644 --- a/include/os/freebsd/zfs/sys/freebsd_event.h +++ b/include/os/freebsd/zfs/sys/freebsd_event.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Rob Wing * diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h index 1ce72330412c..457fa3af8142 100644 --- a/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -78,7 +78,7 @@ extern int hz; extern int tick; typedef int fstrans_cookie_t; #define spl_fstrans_mark() (0) -#define spl_fstrans_unmark(x) (x = 0) +#define spl_fstrans_unmark(x) ((void)x) #define signal_pending(x) SIGPENDING(x) #define current curthread #define thread_join(x) diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 0d31195447d1..6a337b49edf3 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -695,8 +695,10 @@ extern char *kmem_asprintf(const char *fmt, ...); #define kmem_strfree(str) kmem_free((str), strlen(str) + 1) #define kmem_strdup(s) strdup(s) +#ifndef __cplusplus extern int kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...); +#endif /* * Hostname information diff --git a/module/os/freebsd/zfs/event_os.c b/module/os/freebsd/zfs/event_os.c index 97ac151e4fa1..239d44d0cfe7 100644 --- a/module/os/freebsd/zfs/event_os.c +++ b/module/os/freebsd/zfs/event_os.c @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Rob Wing * diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index f4c87013dbf0..9a268573528c 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -141,7 +141,7 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, if (len != sizeof (zfs_iocparm_t)) return (EINVAL); - uaddr = (void *)zp->zfs_cmd; + uaddr = (void *)(uintptr_t)zp->zfs_cmd; zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); #ifdef ZFS_LEGACY_SUPPORT /* diff --git a/tests/zfs-tests/cmd/dosmode_readonly_write.c b/tests/zfs-tests/cmd/dosmode_readonly_write.c index 0441d1c7b472..b45602d80651 100644 --- a/tests/zfs-tests/cmd/dosmode_readonly_write.c +++ b/tests/zfs-tests/cmd/dosmode_readonly_write.c @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 iXsystems, Inc. 
* From 8d47d2d5799727107a5d9046b13dbc3e09a91d60 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 9 Oct 2023 17:22:44 -0700 Subject: [PATCH 26/78] ZTS: Move zpool_import_hostid_changed* tests to Linux runfile Relocate the zpool_import_hostid_changed* test cases to the Linux runfile until these tests are modified to run cleanly on FreeBSD. Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #15377 --- tests/runfiles/common.run | 4 ---- tests/runfiles/linux.run | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 1435c55e8fc2..ef787c65c0f9 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -415,10 +415,6 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', - 'zpool_import_hostid_changed', - 'zpool_import_hostid_changed_unclean_export', - 'zpool_import_hostid_changed_cachefile', - 'zpool_import_hostid_changed_cachefile_unclean_export', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2c8d5cb0ecbb..2252e46df3a8 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -82,6 +82,13 @@ tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] +[tests/functional/cli_root/zpool_import:Linux] +tests = ['zpool_import_hostid_changed', + 'zpool_import_hostid_changed_unclean_export', + 'zpool_import_hostid_changed_cachefile', + 'zpool_import_hostid_changed_cachefile_unclean_export'] +tags = ['functional', 'cli_root', 'zpool_import'] + [tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', From 9fa06c5574f1b4036ed187729070194f7f43522b Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 10 Oct 2023 05:24:24 +0500 Subject: [PATCH 27/78] ZTS: Fix verify_fs_mount in delegate_common.kshlib verify_fs_mount expects the dataset to remain unmounted after updating the mountpoint property in delegate_common.kshlib. This commit updates verify_fs_mount and uses nomount parameter for zfs set to update the mountpoint property without mounting the dataset. This fixes the zfs_allow_010_pos test case, which was failing on FreeBSD after the behavior update in setting the mountpoint property. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15376 --- .../tests/functional/delegate/delegate_common.kshlib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib index 3f2f089e8171..5ddb6ca2ddc8 100644 --- a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib +++ b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib @@ -861,7 +861,7 @@ function verify_fs_mount fi if ! 
ismounted $fs ; then - log_must zfs set mountpoint=$newmntpt $fs + log_must zfs set -u mountpoint=$newmntpt $fs log_must rm -rf $newmntpt log_must mkdir $newmntpt @@ -878,7 +878,7 @@ function verify_fs_mount fi log_must zfs umount $fs log_must rm -rf $newmntpt - log_must zfs set mountpoint=$mntpt $fs + log_must zfs set -u mountpoint=$mntpt $fs fi return 0 From da93b72c91ae187987f29fa248b28e4b8c94df93 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 6 Aug 2023 01:58:45 +1000 Subject: [PATCH 28/78] mailmap: initial, trying to tidy up a lot of the commit history This comes from the observation that a huge number of commit author fields look quite strange (to my eyes), but quite often the Signed-off-by: trailer has the correct name. For these I have updated the name where it was obvious how to do so, however, I have not created a mapping for the commit email to the Signed-off-by email, as whatever I choose for email will become the prime candidate for inclusion in the AUTHORS file, and care needs to be taken when acting without explicit consent. There's a small handful of commits that look like they were done on local machines, or CI hosts, or similar, where the git authorship config wasn't set up properly. Its obvious what this should look like, so I've just done them. The remainder is mapping Github noreply emails to either an obviously-correct Signed-off-by trailer, or to a an author from another commit. This was mostly done by hand, so there may be errors, but I think its close. I do not understand where these come from - I know that they're what commits made via Github web look like when there's no real address set on the account, but I find it hard to believe that so many of these came through the web, especially given the complexity of most of the changes. I suspect there's some kind of merge helper tool in play here. Regardless, the history is set now, and this tries to get it back on track. Obviously, all of this helps the history look tidy, but this also feeds into the AUTHORS update script. See next commit. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #15374 --- .gitignore | 1 + .mailmap | 189 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 .mailmap diff --git a/.gitignore b/.gitignore index 1ef47d921c28..47d17ae16d34 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ !.cirrus.yml !.gitignore !.gitmodules +!.mailmap !AUTHORS !autogen.sh !CODE_OF_CONDUCT.md diff --git a/.mailmap b/.mailmap new file mode 100644 index 000000000000..46ef016b93f8 --- /dev/null +++ b/.mailmap @@ -0,0 +1,189 @@ +# This file maps the name+email seen in a commit back to a canonical +# name+email. Git will replace the commit name/email with the canonical version +# wherever it sees it. +# +# If there is a commit in the history with a "wrong" name or email, list it +# here. If you regularly commit with an alternate name or email address and +# would like to ensure that you are always listed consistently in the repo, add +# mapping here. +# +# On the other hand, if you use multiple names or email addresses legitimately +# (eg you use a company email address for your paid OpenZFS work, and a +# personal address for your evening side projects), then don't map one to the +# other here. +# +# The most common formats are: +# +# Canonical Name +# Canonical Name +# Canonical Name Commit Name +# +# See https://git-scm.com/docs/gitmailmap for more info. 
+ +# These maps are making names consistent where they have varied but the email +# address has never changed. In most cases, the full name is in the +# Signed-off-by of a commit with a matching author. +Ahelenia Ziemiańska +Ahelenia Ziemiańska +Alex John +Andreas Dilger +Andrew Walker +Benedikt Neuffer +Chengfei Zhu +Chris Lindee +Colm Buckley +Crag Wang +Damian Szuberski +Daniel Kolesa +Debabrata Banerjee +Finix Yan +Gaurav Kumar +Gionatan Danti +Glenn Washburn +Gordan Bobic +Gregory Bartholomew +hedong zhang +InsanePrawn +Jason Cohen +Jason Harmening +Jeremy Faulkner +Jinshan Xiong +John Poduska +Justin Scholz +Ka Ho Ng +Kash Pande +Kay Pedersen +KernelOfTruth +Liu Hua +Liu Qing +loli10K +Matthias Blankertz +Michael Gmelin +Olivier Mazouffre +Piotr Kubaj +Quentin Zdanis +Roberto Ricci +Rob Norris +Rob Norris +Sam Lunt +Sanjeev Bagewadi +Stoiko Ivanov +Tamas TEVESZ +WHR +Yanping Gao +Youzhong Yang + +# Commits from strange places, long ago +Brian Behlendorf +Brian Behlendorf +Brian Behlendorf +Brian Behlendorf +Brian Behlendorf +Herb Wartens +Ned Bass +Tulsi Jain + +# Mappings from Github no-reply addresses +ajs124 +Alek Pinchuk +Alexander Lobakin +Alexey Smirnoff +Allen Holl <65494904+allen-4@users.noreply.github.com> +Ameer Hamza <106930537+ixhamza@users.noreply.github.com> +Andrew J. Hesford <48421688+ahesford@users.noreply.github.com>> +Andrew Sun +Aron Xu +Arun KV <65647132+arun-kv@users.noreply.github.com> +Ben Wolsieffer +bernie1995 <42413912+bernie1995@users.noreply.github.com> +Boris Protopopov +Brad Forschinger +Brandon Thetford +buzzingwires <131118055+buzzingwires@users.noreply.github.com> +Cedric Maunoury <38213715+cedricmaunoury@users.noreply.github.com> +Charles Suh +Dacian Reece-Stremtan <35844628+dacianstremtan@users.noreply.github.com> +Damian Szuberski <30863496+szubersk@users.noreply.github.com> +Daniel Hiepler <32984777+heeplr@users.noreply.github.com> +Daniel Kobras +Daniel Reichelt +David Quigley +DHE +Dmitri John Ledkov <19779+xnox@users.noreply.github.com> +Dries Michiels <32487486+driesmp@users.noreply.github.com> +Edmund Nadolski <137826107+ednadolski-ix@users.noreply.github.com> +Érico Nogueira <34201958+ericonr@users.noreply.github.com> +Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com> +Felix Dörre +Felix Neumärker <34678034+xdch47@users.noreply.github.com> +Finix Yan +Gaurav Kumar +George Gaydarov +Georgy Yakovlev <168902+gyakovlev@users.noreply.github.com> +Gerardwx +Gian-Carlo DeFazio +Giuseppe Di Natale +Hajo Möller +Harry Mallon <1816667+hjmallon@users.noreply.github.com> +Hiếu Lê +Jake Howard +James Cowgill +Jason King +Jeff Dike <52420226+jdike@users.noreply.github.com> +Jitendra Patidar <53164267+jsai20@users.noreply.github.com> +João Carlos Mendes Luís +John Eismeier <32205350+jeis2497052@users.noreply.github.com> +John L. Hammond <35266395+jhammond-intel@users.noreply.github.com> +John-Mark Gurney +John Ramsden +Jonathon Fernyhough <559369+jonathonf@users.noreply.github.com> +Justin Hibbits +Kevin Jin <33590050+jxdking@users.noreply.github.com> +Kevin P. 
Fleming +Krzysztof Piecuch <3964215+pikrzysztof@users.noreply.github.com> +Kyle Evans +Laurențiu Nicola +loli10K +Lorenz Hüdepohl +Luís Henriques <73643340+lumigch@users.noreply.github.com> +Marcin Skarbek +Matt Fiddaman <81489167+matt-fidd@users.noreply.github.com> +Max Zettlmeißl <6818198+maxz@users.noreply.github.com> +Michael Niewöhner +Michael Zhivich <33133421+mzhivich@users.noreply.github.com> +Mo Zhou <5723047+cdluminate@users.noreply.github.com> +Nick Mattis +omni <79493359+omnivagant@users.noreply.github.com> +Pablo Correa Gómez <32678034+pablofsf@users.noreply.github.com> +Paul Zuchowski <31706010+PaulZ-98@users.noreply.github.com> +Peter Ashford +Peter Dave Hello +Peter Wirdemo <4224155+pewo@users.noreply.github.com> +Petros Koutoupis +Ping Huang <101400146+hpingfs@users.noreply.github.com> +Piotr P. Stefaniak +Richard Allen <33836503+belperite@users.noreply.github.com> +Rich Ercolani <214141+rincebrain@users.noreply.github.com> +Rob Wing <98866084+rob-wing@users.noreply.github.com> +Roman Strashkin +Ryan Hirasaki <4690732+RyanHir@users.noreply.github.com> +Samuel Wycliffe J <115969550+samwyc@users.noreply.github.com> +Samuel Wycliffe <50765275+npc203@users.noreply.github.com> +Savyasachee Jha +Scott Colby +Sean Eric Fagan +Spencer Kinny <30333052+Spencer-Kinny@users.noreply.github.com> +Srikanth N S <75025422+nssrikanth@users.noreply.github.com> +Thomas Geppert +Tim Crawford +Tom Matthews +Tony Perkins <62951051+tony-zfs@users.noreply.github.com> +Torsten Wörtwein +Tulsi Jain +Václav Skála <33496485+vaclavskala@users.noreply.github.com> +Violet Purcell <66446404+vimproved@users.noreply.github.com> +Vipin Kumar Verma <75025470+vermavipinkumar@users.noreply.github.com> +Wolfgang Bumiller +xtouqh <72357159+xtouqh@users.noreply.github.com> +Yuri Pankov <113725409+yuripv@users.noreply.github.com> +Yuri Pankov <82001006+yuripv@users.noreply.github.com> From 3990273ffe79b4f05e882ce3b1cacd80b8b218a7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 6 Aug 2023 02:10:31 +1000 Subject: [PATCH 29/78] update_authors: add missing names from commits to AUTHORS Full description of what's happening in comments. 
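A short usage sketch (assumption: run from the top of a git checkout, since the script opens ./AUTHORS directly and reads history via `git log`):

    perl scripts/update_authors.pl
    git diff AUTHORS    # review the proposed additions before committing

As the in-script comments stress, the result still needs a human eye; anything that resolves badly is better fixed with a .mailmap entry and a re-run.
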
Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #15374 --- scripts/Makefile.am | 1 + scripts/update_authors.pl | 322 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100755 scripts/update_authors.pl diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 4175d27ea32a..17f24ff6a48b 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -9,6 +9,7 @@ dist_noinst_SCRIPTS = \ %D%/man-dates.sh \ %D%/mancheck.sh \ %D%/paxcheck.sh \ + %D%/update_authors.pl \ %D%/zfs-tests-color.sh scripts_scripts = \ diff --git a/scripts/update_authors.pl b/scripts/update_authors.pl new file mode 100755 index 000000000000..8dd49b5fb38d --- /dev/null +++ b/scripts/update_authors.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +# SPDX-License-Identifier: MIT +# +# Copyright (c) 2023, Rob Norris +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +# This program will update the AUTHORS file to include commit authors that are +# in the git history but are not yet credited. +# +# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of +# individual contributors to OpenZFS, with one name, address and line per +# person. This is good for readability, but does not really leave room for the +# that names and emails on commits from the same individual can be different, +# for all kinds of reasons, not limited to: +# +# - a person might change organisations, and so their email address changes +# +# - a person might be paid to work on OpenZFS for their employer, and then hack +# on personal projects in the evening, so commits legitimately come from +# different addresses +# +# - names change for all kinds of reasons +# +# To try and account for this, this program will try to find all the possible +# names and emails for a single contributor, and then select the "best" one to +# add to the AUTHORS file. +# +# The CONTRIBUTORS section of the AUTHORS file is considered the source of +# truth. Once an individual committer is listed in there, that line will not be +# removed regardless of what is discovered in the commit history. However, it +# can't just be _anything_. The name or email still has to match something seen +# in the commit history, so that we're able to undertand that its the same +# contributor. +# +# The bulk of the work is in running `git log` to fetch commit author names and +# emails. 
For each value, we generate a "slug" to use as an internal id for +# that value, which is mostly just the lowercase of the value with whitespace +# and punctuation removed. Two values with subtle differences can produce the +# same slug, so at this point we also try to keep the "best" pre-slug value as +# the display version. We use this slug to update two maps, one of email->name, +# the other of name->email. +# +# Once collected, we then walk all the emails we've seen and get all the names +# associated with every instance. Then for each of those names, we get all the +# emails associated, and so on until we've seen all the connected names and +# emails. This collection is every possible name and email for an individual +# contributor. +# +# Finaly, we consider these groups, and select the "best" name and email for +# the contributor, and add them to the author tables if they aren't there +# already. Once we've done everyone, we write out a new AUTHORS file, and +# that's the whole job. +# +# This is imperfect! Its necessary for the user to examine the diff and make +# sure its sensible. If it hasn't hooked up right, it may necessary to adjust +# the input data (via .mailmap) or improve the heuristics in this program. It +# took a long time to get into good shape when first written (355 new names +# added to AUTHORS!) but hopefully in the future we'll be running this +# regularly so it doesn't fall so far behind. + + +use 5.010; +use warnings; +use strict; + +# Storage for the "best looking" version of name or email, keyed on slug. +my %display_name; +my %display_email; + +# First, we load the existing AUTHORS file. We save everything before +# CONTRIBUTORS: line as-is so we can write it back out to the new file. Then +# we extract name,email pairs from the remainder and store them in a pair of +# hashtables, keyed on slug. +my %authors_name; +my %authors_email; + +my @authors_header; + +for my $line (do { local (@ARGV) = ('AUTHORS'); <> }) { + chomp $line; + state $in_header = 1; + if ($in_header) { + push @authors_header, $line; + $in_header = 0 if $line =~ m/^CONTRIBUTORS:/; + } else { + my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/; + next unless $name; + + my $semail = email_slug($email); + my $sname = name_slug($name); + + $authors_name{$semail} = $sname; + $authors_email{$sname} = $semail; + + # The name/email in AUTHORS is already the "best looking" + # version, by definition. + $display_name{$sname} = $name; + $display_email{$semail} = $email; + } +} + +# Next, we load all the commit authors. and form name<->email mappings, keyed +# on slug. Note that this format is getting the .mailmap-converted form. This +# lets us control the input to some extent by making changes there. +my %git_names; +my %git_emails; + +for my $line (reverse qx(git log --pretty=tformat:'%aN:::%aE')) { + chomp $line; + my ($name, $email) = $line =~ m/^(.*):::(.*)/; + next unless $name && $email; + + my $semail = email_slug($email); + my $sname = name_slug($name); + + $git_names{$semail}{$sname} = 1; + $git_emails{$sname}{$semail} = 1; + + # Update the "best looking" display value, but only if we don't already + # have something from the AUTHORS file. If we do, we must not change it. + if (!$authors_name{email_slug($email)}) { + update_display_email($email); + } + + if (!$authors_email{name_slug($name)}) { + update_display_name($name); + } +} + +# Now collect unique committers by all names+emails we've ever seen for them. 
+# We start with emails and resolve all possible names, then we resolve the +# emails for those names, and round and round until there's nothing left. +my @committers; +for my $start_email (sort keys %git_names) { + # it might have been deleted already through a cross-reference + next unless $git_names{$start_email}; + + my %emails; + my %names; + + my @check_emails = ($start_email); + my @check_names; + while (@check_emails || @check_names) { + while (my $email = shift @check_emails) { + next if $emails{$email}++; + push @check_names, + sort keys %{delete $git_names{$email}}; + } + while (my $name = shift @check_names) { + next if $names{$name}++; + push @check_emails, + sort keys %{delete $git_emails{$name}}; + } + } + + # A "committer" is the collection of connected names and emails. + push @committers, [[sort keys %emails], [sort keys %names]]; +} + +# Now we have our committers, we can work out what to add to AUTHORS. +for my $committer (@committers) { + my ($emails, $names) = @$committer; + + # If this commiter is already in AUTHORS, we must not touch. + next if grep { $authors_name{$_} } @$emails; + next if grep { $authors_email{$_} } @$names; + + # Decide on the "best" name and email to use + my $email = best_email(@$emails); + my $name = best_name(@$names); + + $authors_email{$name} = $email; + $authors_name{$email} = $name; +} + +# Now output the new AUTHORS file +open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n"; +#my $fh = \*STDOUT; +say $fh join("\n", @authors_header, ""); +for my $name (sort keys %authors_email) { + my $cname = $display_name{$name}; + my $cemail = $display_email{email_slug($authors_email{$name})}; + say $fh " $cname <$cemail>"; +} + +exit 0; + +# "Slugs" are used at the hashtable key for names and emails. They are used to +# making two variants of a value be the "same" for matching. Mostly this is +# to make upper and lower-case versions of a name or email compare the same, +# but we do a little bit of munging to handle some common cases. +# +# Note that these are only used for matching internally; for display, the +# slug will be used to look up the display form. +sub name_slug { + my ($name) = @_; + + # Remove spaces and dots, to handle differences in initials. + $name =~ s/[\s\.]//g; + + return lc $name; +} +sub email_slug { + my ($email) = @_; + + # Remove everything up to and including the first space, and the last + # space and everything after it. + $email =~ s/^(.*\s+)|(\s+.*)$//g; + + # Remove the leading userid+ on Github noreply addresses. They're + # optional and we want to treat them as the same thing. + $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/; + + return lc $email; +} + +sub update_display_name { + my ($name) = @_; + my $sname = name_slug($name); + + # For names, "more specific" means "has more non-lower-case characters" + # (in ASCII), guessing that if a person has gone to some effort to + # specialise their name in a later commit, they presumably care more + # about it. If this is wrong, its probably better to add a .mailmap + # entry. + + my $cname = $display_name{$sname}; + if (!$cname || + ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) { + $display_name{$sname} = $name; + } +} +sub update_display_email { + my ($email) = @_; + my $semail = email_slug($email); + + # Like names, we prefer uppercase when possible. We also remove any + # leading "plus address" for Github noreply addresses. 
+ $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/; + + my $cemail = $display_email{$semail}; + if (!$cemail || + ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) { + $display_email{$semail} = $email; + } +} + +sub best_name { + my @names = sort { + my $cmp; + my ($aa) = $display_name{$a}; + my ($bb) = $display_name{$b}; + + # The "best" name is very subjective, and a simple sort + # produced good-enough results, so I didn't try harder. Use of + # accented characters, punctuation and caps are probably an + # indicator of "better", but possibly we should also take into + # account the most recent name we saw, in case the committer + # has changed their name or nickname or similar. + # + # Really, .mailmap is the place to control this. + + return ($aa cmp $bb); + } @_; + + return shift @names; +} +sub best_email { + state $internal_re = qr/\.(?:internal|local|\(none\))$/; + state $noreply_re = qr/\.noreply\.github\.com$/; + state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/; + + my @emails = sort { + my $cmp; + + # prefer address with a single @ over those without + $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1); + return $cmp unless $cmp == 0; + + # prefer any address over internal/local addresses + $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re)); + return $cmp unless $cmp == 0; + + # prefer any address over github noreply aliases + $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re)); + return $cmp unless $cmp == 0; + + # prefer any address over freemail providers + $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re)); + return $cmp unless $cmp == 0; + + # alphabetical by domain + my ($alocal, $adom) = split /\@/, $a; + my ($blocal, $bdom) = split /\@/, $b; + $cmp = ($adom cmp $bdom); + return $cmp unless $cmp == 0; + + # alphabetical by local part + return ($alocal cmp $blocal); + } @_; + + return shift @emails; +} From 111ae3364cbbd990b7fa568baa7fcbbb6c9fd13d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 6 Aug 2023 02:11:19 +1000 Subject: [PATCH 30/78] AUTHORS: update with missing names This is generated by scripts/update_authors.pl. I've looked over the results fairly closely and while I don't think they're bad, they could be improved somewhat, but also, I don't know if its good form to just update this without explicit consent from those named. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #15374 --- AUTHORS | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 337 insertions(+), 24 deletions(-) diff --git a/AUTHORS b/AUTHORS index c2af58d75085..be1efb87b34c 100644 --- a/AUTHORS +++ b/AUTHORS @@ -10,228 +10,450 @@ PAST MAINTAINERS: CONTRIBUTORS: Aaron Fineman + Adam D. Moss Adam Leventhal Adam Stevko + adisbladis + Adrian Chadd + Ahelenia Ziemiańska Ahmed G + Aidan Harris + AJ Jordan + ajs124 Akash Ayare + Akash B Alan Somers Alar Aun Albert Lee Alec Salazar + Alejandro Colomar Alejandro R. Sedeño Alek Pinchuk Aleksa Sarai - Alex Braunegg - Alex McWhirter - Alex Reece - Alex Wilson - Alex Zhuravlev Alexander Eremin + Alexander Lobakin Alexander Motin Alexander Pyhalov + Alexander Richardson Alexander Stetsenko + Alex Braunegg Alexey Shvetsov Alexey Smirnoff + Alex John + Alex McWhirter + Alex Reece + Alex Wilson + Alex Zhuravlev Allan Jude + Allen Holl + alteriks + Alyssa Ross + Ameer Hamza + Anatoly Borodin AndCycle + Andrea Gelmini + Andrea Righi Andreas Buschmann Andreas Dilger + Andreas Vögele Andrew Barnes Andrew Hamilton + Andrew Innes + Andrew J. 
Hesford Andrew Reid Andrew Stormont + Andrew Sun Andrew Tselischev + Andrew Turner + Andrew Walker + Andrey Prokopenko Andrey Vesnovaty Andriy Gapon Andy Bakun + Andy Fiddaman Aniruddha Shankar + Anton Gubarkov Antonio Russo Arkadiusz Bubała + Armin Wehrfritz Arne Jansen Aron Xu + Arshad Hussain + Arun KV + Arvind Sankar + Attila Fülöp + Avatat Bart Coddens Basil Crow - Huang Liu + Bassu Ben Allen - Ben Rubson + Ben Cordero + Benedikt Neuffer Benjamin Albrecht + Benjamin Gentil + Ben McGough + Ben Rubson + Ben Wolsieffer + bernie1995 Bill McGonigle Bill Pijewski Boris Protopopov + Brad Forschinger Brad Lewis + Brandon Thetford + Brian Atkinson Brian Behlendorf Brian J. Murrell + Brooks Davis + BtbN + bunder2015 + buzzingwires + bzzz77 + cable2999 Caleb James DeLisle Cao Xuewen Carlo Landmeter Carlos Alberto Lopez Perez + Cedric Maunoury Chaoyu Zhang + Charles Suh Chen Can + Chengfei Zhu Chen Haiquan Chip Parker Chris Burroughs Chris Dunlap Chris Dunlop + Chris Lindee + Chris McDonough Chris Siden - Chris Wedgwood - Chris Williamson - Chris Zubrzycki - Christ Schlacta + Chris Siebenmann Christer Ekholm Christian Kohlschütter Christian Neukirchen Christian Schwarz Christopher Voltz + Christ Schlacta + Chris Wedgwood + Chris Williamson + Chris Zubrzycki + Chuck Tuffli Chunwei Chen Clemens Fruhwirth + Clemens Lang + Clint Armstrong Coleman Kane Colin Ian King + Colm Buckley + Crag Wang Craig Loomis Craig Sanders Cyril Plisko - DHE + Cy Schubert + Cédric Berger + Dacian Reece-Stremtan + Dag-Erling Smørgrav + Damiano Albani + Damian Szuberski Damian Wojsław + Daniel Hiepler + Daniel Hoffman + Daniel Kobras + Daniel Kolesa + Daniel Reichelt + Daniel Stevenson + Daniel Verite + Daniil Lunev Dan Kimmel Dan McDonald Dan Swartzendruber Dan Vatca - Daniel Hoffman - Daniel Verite - Daniil Lunev Darik Horn Dave Eddy + David Hedberg David Lamparter David Qian David Quigley Debabrata Banerjee + D. Ebdrup Denys Rtveliashvili Derek Dai + DHE + Didier Roche Dimitri John Ledkov + Dimitry Andric + Dirkjan Bussink Dmitry Khasanov + Dominic Pearson Dominik Hassler Dominik Honnef Don Brady + Doug Rabson Dr. András Korn + Dries Michiels + Edmund Nadolski + Eitan Adler Eli Rosenthal + Eli Schwartz Eric Desrochers Eric Dillmann Eric Schrock + Ethan Coe-Renner Etienne Dechamps + Evan Allrich + Evan Harris Evan Susarret Fabian Grünbichler + Fabio Buso + Fabio Scaccabarozzi Fajar A. Nugraha Fan Yong + fbynite + Fedor Uporov + Felix Dörre + Felix Neumärker Feng Sun + Finix Yan + Francesco Mazzoli Frederik Wessels Frédéric Vanniere + Gabriel A. 
Devenyi Garrett D'Amore + Garrett Fields Garrison Jensen Gary Mills Gaurav Kumar GeLiXin George Amanakis + George Diamantopoulos + George Gaydarov George Melikov George Wilson Georgy Yakovlev + Gerardwx + Gian-Carlo DeFazio + Gionatan Danti Giuseppe Di Natale + Glenn Washburn Gordan Bobic + Gordon Bergling Gordon Ross + Graham Christensen + Graham Perrin Gregor Kopka + Gregory Bartholomew + grembo Grischa Zengel + grodik Gunnar Beutner Gvozden Neskovic Hajo Möller + Han Gao Hans Rosenfeld + Harald van Dijk + Harry Mallon + Harry Sintonen + HC + hedong zhang + Heitor Alves de Siqueira + Henrik Riomar + Herb Wartens + Hiếu Lê + Huang Liu Håkan Johansson + Igor K Igor Kozhukhov Igor Lvovsky + ilbsmart + illiliti + ilovezfs + InsanePrawn Isaac Huang - JK Dingwall Jacek Fefliński + Jacob Adams + Jake Howard James Cowgill + James H James Lee James Pan + James Wah Jan Engelhardt Jan Kryl Jan Sanislo + Jason Cohen + Jason Harmening Jason King Jason Zaman Javen Wu + Jean-Baptiste Lallement + Jeff Dike + Jeremy Faulkner Jeremy Gill Jeremy Jones + Jeremy Visser Jerry Jelinek + Jessica Clarke Jinshan Xiong + Jitendra Patidar + JK Dingwall Joe Stein + John-Mark Gurney John Albietz John Eismeier - John L. Hammond + John Gallagher John Layman + John L. Hammond + John M. Layman + Johnny Stenback John Paul Adrian Glaubitz + John Poduska + John Ramsden John Wren Kennedy - Johnny Stenback + jokersus + Jonathon Fernyhough Jorgen Lundman Josef 'Jeff' Sipek + Josh Soref Joshua M. Clulow + José Luis Salvador Rufo + João Carlos Mendes Luís + Julian Brunner + Julian Heuking + jumbi77 Justin Bedő + Justin Gottula + Justin Hibbits + Justin Keogh Justin Lecher + Justin Scholz Justin T. Gibbs + jyxent Jörg Thalheim - KORN Andras + ka7 + Ka Ho Ng Kamil Domański Karsten Kretschmer Kash Pande + Kay Pedersen Keith M Wesolowski + KernelOfTruth + Kevin Bowling + Kevin Jin + Kevin P. 
Fleming Kevin Tanguy KireinaHoro Kjeld Schouten-Lebbing + Kleber Tarcísio + Kody A Kantor Kohsuke Kawaguchi + Konstantin Khorenko + KORN Andras + Kristof Provost + Krzysztof Piecuch Kyle Blatter + Kyle Evans Kyle Fuller - Loli + Laevos + Lalufu Lars Johannsen + Laura Hild + Laurențiu Nicola + Lauri Tirkkonen + liaoyuxiangqin Li Dongyang + Liu Hua + Liu Qing Li Wei + Loli + lorddoskias + Lorenz Brun + Lorenz Hüdepohl + louwrentius Lukas Wunner + luozhengzheng + Luís Henriques Madhav Suresh + manfromafar Manoj Joseph Manuel Amador (Rudd-O) Marcel Huber + Marcel Menzel + Marcel Schilling Marcel Telka Marcel Wysocki + Marcin Skarbek + Mariusz Zaborski + Mark Johnston + Mark Maybee + Mark Roper Mark Shellenbaum + marku89 Mark Wright Martin Matuska + Martin Rüegg Massimo Maggi - Matt Johnston - Matt Kemp + Mateusz Guzik + Mateusz Piotrowski <0mp@FreeBSD.org> + Mathieu Velten + Matt Fiddaman Matthew Ahrens Matthew Thode + Matthias Blankertz + Matt Johnston + Matt Kemp + Matt Macy Matus Kral + Mauricio Faria de Oliveira Max Grossman Maximilian Mehnert + Max Zettlmeißl + Md Islam + megari + Michael D Labriola + Michael Franzl Michael Gebetsroither Michael Kjorling Michael Martin Michael Niewöhner + Michael Zhivich + Michal Vasilek Mike Gerdts Mike Harsch Mike Leddy Mike Swanson Milan Jurik + Minsoo Choo + Mohamed Tawfik Morgan Jones Moritz Maxeiner + Mo Zhou + naivekun + nathancheek Nathaniel Clark Nathaniel Wesley Filardo + Nathan Lewis Nav Ravindranath Neal Gompa (ニール・ゴンパ) Ned Bass Neependra Khare Neil Stockbridge + Nick Black Nick Garvey + Nick Mattis + Nick Terrell + Niklas Haas Nikolay Borisov + nordaux + ofthesun9 Olaf Faaland Oleg Drokin Oleg Stepura + Olivier Mazouffre + omni + Orivej Desh + Pablo Correa Gómez + Palash Gandhi + Patrick Mooney Patrik Greco Paul B. Henson Paul Dagnelie @@ -243,69 +465,160 @@ CONTRIBUTORS: Pedro Giffuni Peng Peter Ashford + Peter Dave Hello + Peter Levine + Peter Wirdemo + Petros Koutoupis + Philip Pokorny + Philipp Riederer + Phil Kauffman + Ping Huang + Piotr Kubaj + Piotr P. Stefaniak Prakash Surya Prasad Joshi + privb0x23 + P.SCH + Quentin Zdanis + Rafael Kitover + RageLtMan Ralf Ertzinger Randall Mason Remy Blank + renelson + Reno Reckling Ricardo M. Correia - Rich Ercolani + Riccardo Schirone + Richard Allen Richard Elling Richard Laager Richard Lowe Richard Sharpe Richard Yao + Rich Ercolani + Robert Novak + Roberto Ricci + Rob Norris + Rob Wing Rohan Puri Romain Dolbeau Roman Strashkin + Ross Williams Ruben Kerkhof + Ryan Hirasaki + Ryan Lahfa + Ryan Libby + Ryan Moeller + Sam Hathaway + Sam Lunt + Samuel VERSCHELDE + Samuel Wycliffe + Samuel Wycliffe J + Sanjeev Bagewadi + Sara Hartse Saso Kiselkov + Satadru Pramanik + Savyasachee Jha + Scott Colby Scot W. 
Stevenson Sean Eric Fagan Sebastian Gottschall + Sebastien Roy Sen Haerens Serapheim Dimitropoulos Seth Forshee + Shaan Nobee Shampavman + Shaun Tancheff Shen Yan Simon Guest Simon Klinkert Sowrabha Gopal + Spencer Kinny + Srikanth N S Stanislav Seletskiy Steffen Müthing Stephen Blinick + sterlingjensen Steve Dougherty + Steve Mokris Steven Burgess Steven Hartland Steven Johnson + Steven Noonan + stf Stian Ellingsen + Stoiko Ivanov + Stéphane Lesimple Suman Chakravartula Sydney Vanda Sören Tempel + Tamas TEVESZ + Teodor Spæren + TerraTech Thijs Cramer + Thomas Geppert + Thomas Lamprecht + Till Maas Tim Chase Tim Connors Tim Crawford Tim Haley + timor + Timothy Day + Tim Schumacher Tino Reichardt Tobin Harding Tom Caputi Tom Matthews - Tom Prince Tomohiro Kusumi + Tom Prince Tony Hutter + Tony Nguyen + Tony Perkins Toomas Soome + Torsten Wörtwein + Toyam Cox + Trevor Bautista Trey Dockendorf + Troels Nørgaard + Tulsi Jain Turbo Fredriksson Tyler J. Stachecki + Umer Saleem + Valmiky Arquissandas + Val Packett + Vince van Oosten + Violet Purcell + Vipin Kumar Verma Vitaut Bajaryn + Volker Mauel + Václav Skála + Walter Huf + Warner Losh Weigang Li + WHR Will Andrews Will Rouesnel + Windel Bouwman + Wojciech Małota-Wójcik Wolfgang Bumiller Xin Li + Xinliang Liu + xtouqh + Yann Collet + Yanping Gao Ying Zhu + Youzhong Yang + yparitcher + yuina822 YunQiang Su Yuri Pankov Yuxuan Shui Zachary Bedell + Zach Dykstra + zgock + Zhu Chuang + Érico Nogueira + Đoàn Trần Công Danh + 韩朴宇 From a80e1f1c90e1fa9142618754ce18f179896f6d7d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 10 Oct 2023 08:57:48 -0700 Subject: [PATCH 31/78] zvol: Temporally disable blk-mq There was a report of zvol data loss (#15351) after enabling blk-mq on a zvol backed with 16k physical block sized disks. Out of an abundance of caution, do not allow the user to enable blk-mq until we can look into the issue. Note that blk-mq was not enabled by default on zvols. It was always opt-in via the zvol_use_blk_mq module parameter. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Nguyen Signed-off-by: Tony Hutter Addresses: #15351 Closes #15378 --- man/man4/zfs.4 | 57 ---------------------------- module/os/linux/zfs/zvol_os.c | 12 ------ tests/zfs-tests/include/tunables.cfg | 2 +- 3 files changed, 1 insertion(+), 70 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index cfadd79d87f3..71a3e67ee67e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2317,63 +2317,6 @@ If .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . -.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint -The number of threads per zvol to use for queuing IO requests. -This parameter will only appear if your kernel supports -.Li blk-mq -and is only read and assigned to a zvol at zvol load time. -If -.Sy 0 -(the default) then internally set -.Sy zvol_blk_mq_threads -to the number of CPUs present. -. -.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint -Set to -.Sy 1 -to use the -.Li blk-mq -API for zvols. -Set to -.Sy 0 -(the default) to use the legacy zvol APIs. -This setting can give better or worse zvol performance depending on -the workload. -This parameter will only appear if your kernel supports -.Li blk-mq -and is only read and assigned to a zvol at zvol load time. -. -.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint -If -.Sy zvol_use_blk_mq -is enabled, then process this number of -.Sy volblocksize Ns -sized blocks per zvol thread. 
-This tunable can be use to favor better performance for zvol reads (lower -values) or writes (higher values). -If set to -.Sy 0 , -then the zvol layer will process the maximum number of blocks -per thread that it can. -This parameter will only appear if your kernel supports -.Li blk-mq -and is only applied at each zvol's load time. -. -.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint -The queue_depth value for the zvol -.Li blk-mq -interface. -This parameter will only appear if your kernel supports -.Li blk-mq -and is only applied at each zvol's load time. -If -.Sy 0 -(the default) then use the kernel's default queue depth. -Values are clamped to the kernel's -.Dv BLKDEV_MIN_RQ -and -.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ -limits. -. .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when .Sy volmode Ns = Ns Sy default : diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7a95b54bdf0d..76521c95911e 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1620,18 +1620,6 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); -#ifdef HAVE_BLK_MQ -module_param(zvol_blk_mq_queue_depth, uint, 0644); -MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); - -module_param(zvol_use_blk_mq, uint, 0644); -MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); - -module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); -MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, - "Process volblocksize blocks per thread"); -#endif - #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 80e7bcb3bd09..8010a9451597 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -89,7 +89,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED -VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq +VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max From 75a7740574e119b40b7f4144d1d3928d3a5c9b98 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 10 Oct 2023 08:59:10 -0700 Subject: [PATCH 32/78] ZTS: Remove zfs_allow_010_pos expection for FreeBSD This issue should now be address by PR #15376 and the exception for this test case be removed. 
Reviewed-by: Alexander Motin Reviewed-by: Umer Saleem Signed-off-by: Brian Behlendorf Closes #15382 --- tests/test-runner/bin/zts-report.py.in | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 558e4b57279d..5d1360380de5 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -262,7 +262,6 @@ if sys.platform.startswith('freebsd'): 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], - 'delegate/zfs_allow_010_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', 11829], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], From 810fc49a3eb16647c905d02bd4f945e1e4332251 Mon Sep 17 00:00:00 2001 From: Daniel Berlin Date: Tue, 10 Oct 2023 14:04:32 -0400 Subject: [PATCH 33/78] Ensure we call fput when cloning fails due to different devices. Right now, zpl_ioctl_ficlone and zpl_ioctl_ficlonerange do not call put on the src fd if the source and destination are on two different devices. This leaves the source file held open in this case. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Daniel Berlin Closes #15386 --- module/os/linux/zfs/zpl_file_range.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 2abbf44df587..c47fe99dacff 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -202,8 +202,10 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) if (src_file == NULL) return (-EBADF); - if (dst_file->f_op != src_file->f_op) + if (dst_file->f_op != src_file->f_op) { + fput(src_file); return (-EXDEV); + } size_t len = i_size_read(file_inode(src_file)); @@ -237,8 +239,10 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (src_file == NULL) return (-EBADF); - if (dst_file->f_op != src_file->f_op) + if (dst_file->f_op != src_file->f_op) { + fput(src_file); return (-EXDEV); + } size_t len = fcr.fcr_src_length; if (len == 0) From 04186d33be72b8cdb539f1428682b7d3ca3f7b5e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 10 Oct 2023 13:31:15 -0700 Subject: [PATCH 34/78] CI: Move perl script to dist_noinst_DATA Everything listed in dist_noinst_SCRIPTS is assumed to be a shell script, this generates a shellcheck SC1071 error since perl is not supported. Move update_authors.pl to dist_noinst_DATA with the other perl scripts. 
Reviewed-by: Tony Hutter Reviewed-by: Rob N Signed-off-by: Brian Behlendorf Closes #15392 --- scripts/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 17f24ff6a48b..95640727ac6a 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -9,7 +9,6 @@ dist_noinst_SCRIPTS = \ %D%/man-dates.sh \ %D%/mancheck.sh \ %D%/paxcheck.sh \ - %D%/update_authors.pl \ %D%/zfs-tests-color.sh scripts_scripts = \ @@ -28,6 +27,7 @@ endif dist_noinst_DATA += \ %D%/cstyle.pl \ %D%/enum-extract.pl \ + %D%/update_authors.pl \ %D%/zfs2zol-patch.sed \ %D%/zol2zfs-patch.sed From d7b6e470ff4b84ee503085036094e0a499af9cd9 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 10 Oct 2023 13:32:33 -0700 Subject: [PATCH 35/78] ZTS: Debug zfs_share_concurrent_shares failure Update zfs_share_concurrent_shares test case to wait a few seconds and recheck that the filesystem isn't shared. The intent here is determine the nature of the error and if it may be a race. Reviewed-by: Tony Hutter Reviewed-by: Umer Saleem Signed-off-by: Brian Behlendorf Closes #15379 --- .../zfs_share/zfs_share_concurrent_shares.ksh | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh index c226f56e3dcb..d779689f83bd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh @@ -98,11 +98,26 @@ function test_share # filesystem zfs set sharenfs=on $filesystem || \ sub_fail "zfs set sharenfs=on $filesystem failed." - is_shared $mntp || \ - sub_fail "File system $filesystem is not shared (set sharenfs)." # - # Verify 'zfs share' works as well. + # Verify 'zfs share' results in a shared mount. We check this + # multiple times because of Fedora 37+ it's been observed in + # the CI that the share may not be immediately reported. + # + for retry in $(seq 1 10); do + is_shared $mntp && break + + log_note "Wait $retry / 10 for is_shared $mntp (set sharenfs)" + + if [[ $retry -eq 10 ]]; then + sub_fail "File system $filesystem is not shared (set sharenfs)." + fi + + sleep 1 + done + + # + # Verify 'zfs unshare' works as well. # zfs unshare $filesystem || \ sub_fail "zfs unshare $filesystem failed." @@ -112,9 +127,23 @@ function test_share # filesystem zfs share $filesystem || \ sub_fail "zfs share $filesystem failed." - is_shared $mntp || \ - sub_fail "file system $filesystem is not shared (zfs share)." + # + # Verify 'zfs share' results in a shared mount. We check this + # multiple times because of Fedora 37+ it's been observed in + # the CI that the share may not be immediately reported. + # + for retry in $(seq 1 10); do + is_shared $mntp && break + + log_note "Wait $retry / 10 for is_shared $mntp (zfs share)" + + if [[ $retry -eq 10 ]]; then + sub_fail "File system $filesystem is not shared (zfs share)." + fi + + sleep 1 + done #log_note "Sharing a shared file system fails." zfs share $filesystem && \ From 30ee2ee8ecabe75a5a011e2355747114df7f7bee Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 11 Oct 2023 16:56:32 -0700 Subject: [PATCH 36/78] spec: define _bashcompletiondir if undefined Always define _bashcompletiondir in the spec file to a reasonable value when it is undefined. Required for `rpmbuild --rebuild `. 
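A usage sketch of the case this fixes: rebuilding binary packages straight from a source RPM on a host that does not pre-define the macro (the srpm file name below is hypothetical):

    rpmbuild --rebuild zfs-2.2.0-1.src.rpm
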
Signed-off-by: Brian Behlendorf Closes #15396 --- rpm/generic/zfs.spec.in | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 711e6c751dc0..2e89abd0edfd 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -19,6 +19,15 @@ %endif %endif +# Set the default _bashcompletiondir directory based on distribution. +%if %{undefined _bashcompletiondir} +%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} +%global _bashcompletiondir /etc/bash_completion.d +%else +%global _bashcompletiondir /usr/share/bash-completion +%endif +%endif + # Set the default dracut directory based on distribution. %if %{undefined _dracutdir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} From 2bba9fd479f5dce01df31bceb532c5a9e9d5c5ca Mon Sep 17 00:00:00 2001 From: Jason King Date: Thu, 12 Oct 2023 13:01:54 -0500 Subject: [PATCH 37/78] Zpool can start allocating from metaslab before TRIMs have completed When doing a manual TRIM on a zpool, the metaslab being TRIMmed is potentially re-enabled before all queued TRIM zios for that metaslab have completed. Since TRIM zios have the lowest priority, it is possible to get into a situation where allocations occur from the just re-enabled metaslab and cut ahead of queued TRIMs to the same metaslab. If the ranges overlap, this will cause corruption. We were able to trigger this pretty consistently with a small single top-level vdev zpool (i.e. small number of metaslabs) with heavy parallel write activity while performing a manual TRIM against a somewhat 'slow' device (so TRIMs took a bit of time to complete). With the patch, we've not been able to recreate it since. It was on illumos, but inspection of the OpenZFS trim code looks like the relevant pieces are largely unchanged and so it appears it would be vulnerable to the same issue. Reviewed-by: Igor Kozhukhov Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Jason King Illumos-issue: https://www.illumos.org/issues/15939 Closes #15395 --- module/zfs/vdev_trim.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 0d71b9434342..03e17db024ea 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -23,6 +23,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc. */ #include @@ -591,6 +592,7 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; + int error = 0; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; @@ -610,19 +612,32 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - int error; - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { - return (error); + goto done; } } } - return (0); +done: + /* + * Make sure all TRIMs for this metaslab have completed before + * returning. TRIM zios have lower priority over regular or syncing + * zios, so all TRIM zios for this metaslab must complete before the + * metaslab is re-enabled. 
Otherwise it's possible write zios to + * this metaslab could cut ahead of still queued TRIM zios for this + * metaslab causing corruption if the ranges overlap. + */ + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + return (error); } static void @@ -941,11 +956,6 @@ vdev_trim_thread(void *arg) } spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_trim_io_lock); - while (vd->vdev_trim_inflight[0] > 0) { - cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); - } - mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); From 95785196f26e92d82cf4445654ba84e4a9671c57 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 12 Oct 2023 16:14:14 -0700 Subject: [PATCH 38/78] Tag 2.2.0 New Features - Block cloning (#13392) - Linux container support (#14070, #14097, #12263) - Scrub error log (#12812, #12355) - BLAKE3 checksums (#12918) - Corrective "zfs receive" - Vdev and zpool user properties Performance - Fully adaptive ARC (#14359) - SHA2 checksums (#13741) - Edon-R checksums (#13618) - Zstd early abort (#13244) - Prefetch improvements (#14603, #14516, #14402, #14243, #13452) - General optimization (#14121, #14123, #14039, #13680, #13613, #13606, #13576, #13553, #12789, #14925, #14948) Signed-off-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 4178f1b5daa4..0d7df10d47db 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.2.0 -Release: rc5 +Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS From 459c99ff2339a4a514abcf2255f9b3e5324ef09e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Tue, 31 Oct 2023 21:49:41 +0100 Subject: [PATCH 39/78] Fix block cloning between unencrypted and encrypted datasets Block cloning from an encrypted dataset into an unencrypted dataset and vice versa is not possible. The current code did allow cloning unencrypted files into an encrypted dataset causing a panic when these were accessed. Block cloning between encrypted and encrypted is currently supported on the same filesystem only. Reviewed-by: Alexander Motin Reviewed-by: Kay Pedersen Reviewed-by: Rob N Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #15464 Closes #15465 --- module/zfs/zfs_vnops.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 40d6c87a754e..84e6b10ef37c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1094,6 +1094,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, ASSERT(!outzfsvfs->z_replay); + /* + * Block cloning from an unencrypted dataset into an encrypted + * dataset and vice versa is not supported. + */ + if (inos->os_encrypted != outos->os_encrypted) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); From b76724ae478a7c2f73693b39d8009101efb54995 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 13 Oct 2023 13:41:11 -0400 Subject: [PATCH 40/78] FreeBSD: Improve taskq wrapper - Group tqent_task and tqent_timeout_task into a union. They are never used same time. This shrinks taskq_ent_t from 192 to 160 bytes. - Remove tqent_registered. Use tqent_id != 0 instead. - Remove tqent_cancelled. Use taskqueue pending counter instead. - Change tqent_type into uint_t. 
We don't need to pack it any more. - Change tqent_rc into uint_t, matching refcount(9). - Take shared locks in taskq_lookup(). - Call proper taskqueue_drain_timeout() for TIMEOUT_TASK in taskq_cancel_id() and taskq_wait_id(). - Switch from CK_LIST to regular LIST. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Reviewed-by: Mateusz Guzik Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15356 --- include/os/freebsd/spl/sys/taskq.h | 18 ++++---- module/os/freebsd/spl/spl_taskq.c | 74 +++++++++++++++--------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 30579b391711..b23a939b3aa7 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -30,9 +30,9 @@ #include #include +#include #include #include -#include #ifdef __cplusplus extern "C" { @@ -48,16 +48,16 @@ typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { - struct task tqent_task; - struct timeout_task tqent_timeout_task; + union { + struct task tqent_task; + struct timeout_task tqent_timeout_task; + }; task_func_t *tqent_func; void *tqent_arg; - taskqid_t tqent_id; - CK_LIST_ENTRY(taskq_ent) tqent_hash; - uint8_t tqent_type; - uint8_t tqent_registered; - uint8_t tqent_cancelled; - volatile uint32_t tqent_rc; + taskqid_t tqent_id; + LIST_ENTRY(taskq_ent) tqent_hash; + uint_t tqent_type; + volatile uint_t tqent_rc; } taskq_ent_t; /* diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index ba22c77b69c3..daefe3559538 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -30,8 +30,6 @@ __FBSDID("$FreeBSD$"); #include -#include -#include #include #include #include @@ -70,7 +68,7 @@ extern int uma_align_cache; static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); -static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; +static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; static unsigned long tqenthash; static unsigned long tqenthashlock; static struct sx *tqenthashtbl_lock; @@ -80,8 +78,8 @@ static taskqid_t tqidnext; #define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) #define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) +#define NORMAL_TASK 0 #define TIMEOUT_TASK 1 -#define NORMAL_TASK 2 static void system_taskq_init(void *arg) @@ -121,7 +119,7 @@ system_taskq_fini(void *arg) for (i = 0; i < tqenthashlock + 1; i++) sx_destroy(&tqenthashtbl_lock[i]); for (i = 0; i < tqenthash + 1; i++) - VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i])); + VERIFY(LIST_EMPTY(&tqenthashtbl[i])); free(tqenthashtbl_lock, M_TASKQ); free(tqenthashtbl, M_TASKQ); } @@ -162,27 +160,27 @@ taskq_lookup(taskqid_t tqid) { taskq_ent_t *ent = NULL; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { + if (tqid == 0) + return (NULL); + sx_slock(TQIDHASHLOCK(tqid)); + LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { if (ent->tqent_id == tqid) break; } if (ent != NULL) refcount_acquire(&ent->tqent_rc); - sx_xunlock(TQIDHASHLOCK(tqid)); + sx_sunlock(TQIDHASHLOCK(tqid)); return (ent); } static taskqid_t taskq_insert(taskq_ent_t *ent) { - taskqid_t tqid; + taskqid_t tqid = __taskq_genid(); - tqid = __taskq_genid(); ent->tqent_id = tqid; - ent->tqent_registered = B_TRUE; sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); + LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); sx_xunlock(TQIDHASHLOCK(tqid)); return 
(tqid); } @@ -192,13 +190,14 @@ taskq_remove(taskq_ent_t *ent) { taskqid_t tqid = ent->tqent_id; - if (!ent->tqent_registered) + if (tqid == 0) return; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_REMOVE(ent, tqent_hash); + if (ent->tqent_id != 0) { + LIST_REMOVE(ent, tqent_hash); + ent->tqent_id = 0; + } sx_xunlock(TQIDHASHLOCK(tqid)); - ent->tqent_registered = B_FALSE; } static void @@ -285,21 +284,22 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) int rc; taskq_ent_t *ent; - if (tid == 0) - return (0); - if ((ent = taskq_lookup(tid)) == NULL) return (0); - ent->tqent_cancelled = B_TRUE; - if (ent->tqent_type == TIMEOUT_TASK) { + if (ent->tqent_type == NORMAL_TASK) { + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + } else { rc = taskqueue_cancel_timeout(tq->tq_queue, &ent->tqent_timeout_task, &pend); - } else - rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); - if (rc == EBUSY) { - taskqueue_drain(tq->tq_queue, &ent->tqent_task); - } else if (pend) { + if (rc == EBUSY) { + taskqueue_drain_timeout(tq->tq_queue, + &ent->tqent_timeout_task); + } + } + if (pend) { /* * Tasks normally free themselves when run, but here the task * was cancelled so it did not free itself. @@ -312,12 +312,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) } static void -taskq_run(void *arg, int pending __unused) +taskq_run(void *arg, int pending) { taskq_ent_t *task = arg; - if (!task->tqent_cancelled) - task->tqent_func(task->tqent_arg); + if (pending == 0) + return; + task->tqent_func(task->tqent_arg); taskq_free(task); } @@ -345,7 +346,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, task->tqent_func = func; task->tqent_arg = arg; task->tqent_type = TIMEOUT_TASK; - task->tqent_cancelled = B_FALSE; refcount_init(&task->tqent_rc, 1); tqid = taskq_insert(task); TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, @@ -379,7 +379,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) refcount_init(&task->tqent_rc, 1); task->tqent_func = func; task->tqent_arg = arg; - task->tqent_cancelled = B_FALSE; task->tqent_type = NORMAL_TASK; tqid = taskq_insert(task); TASK_INIT(&task->tqent_task, prio, taskq_run, task); @@ -388,10 +387,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) } static void -taskq_run_ent(void *arg, int pending __unused) +taskq_run_ent(void *arg, int pending) { taskq_ent_t *task = arg; + if (pending == 0) + return; task->tqent_func(task->tqent_arg); } @@ -406,8 +407,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, * can go at the front of the queue. 
*/ prio = !!(flags & TQ_FRONT); - task->tqent_cancelled = B_FALSE; - task->tqent_registered = B_FALSE; task->tqent_id = 0; task->tqent_func = func; task->tqent_arg = arg; @@ -427,12 +426,13 @@ taskq_wait_id(taskq_t *tq, taskqid_t tid) { taskq_ent_t *ent; - if (tid == 0) - return; if ((ent = taskq_lookup(tid)) == NULL) return; - taskqueue_drain(tq->tq_queue, &ent->tqent_task); + if (ent->tqent_type == NORMAL_TASK) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + else + taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); taskq_free(ent); } From 6d693e20a20d7abd88dbae1f1075b2aca8c2faa2 Mon Sep 17 00:00:00 2001 From: John Wren Kennedy Date: Fri, 13 Oct 2023 12:15:09 -0600 Subject: [PATCH 41/78] Large sync writes perform worse with slog For synchronous write workloads with large IO sizes, a pool configured with a slog performs worse than one with an embedded zil: sequential_writes 1m sync ios, 16 threads Write IOPS: 1292 438 -66.10% Write Bandwidth: 1323570 448910 -66.08% Write Latency: 12128400 36330970 3.0x sequential_writes 1m sync ios, 32 threads Write IOPS: 1293 430 -66.74% Write Bandwidth: 1324184 441188 -66.68% Write Latency: 24486278 74028536 3.0x The reason is the `zil_slog_bulk` variable. In `zil_lwb_write_open`, if a zil block is greater than 768K, the priority of the write is downgraded from sync to async. Increasing the value allows greater throughput. To select a value for this PR, I ran an fio workload with the following values for `zil_slog_bulk`: zil_slog_bulk KiB/s 1048576 422132 2097152 478935 4194304 533645 8388608 623031 12582912 827158 16777216 1038359 25165824 1142210 33554432 1211472 50331648 1292847 67108864 1308506 100663296 1306821 134217728 1304998 At 64M, the results with a slog are now improved to parity with an embedded zil: sequential_writes 1m sync ios, 16 threads Write IOPS: 438 1288 2.9x Write Bandwidth: 448910 1319062 2.9x Write Latency: 36330970 12163408 -66.52% sequential_writes 1m sync ios, 32 threads Write IOPS: 430 1290 3.0x Write Bandwidth: 441188 1321693 3.0x Write Latency: 74028536 24519698 -66.88% None of the other tests in the performance suite (run with a zil or slog) had a significant change, including the random_write_zil tests, which use multiple datasets. Reviewed-by: Alexander Motin Reviewed-by: Tony Nguyen Signed-off-by: John Wren Kennedy Closes #14378 --- man/man4/zfs.4 | 2 +- module/zfs/zil.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 71a3e67ee67e..5f89f6adf1e3 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2172,7 +2172,7 @@ if a volatile out-of-order write cache is enabled. Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . -.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64 +.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 18c6cbf028b3..a11886136994 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -145,7 +145,7 @@ static int zil_nocacheflush = 0; * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. 
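As a rough illustration of how this tunable is exercised (assuming the usual Linux module-parameter paths; the commands below are not part of the patch), the limit can be inspected and overridden at runtime while reproducing the fio results above:

    # Read the current SLOG bulk limit (bytes), then raise it for an experiment
    cat /sys/module/zfs/parameters/zil_slog_bulk
    echo 134217728 > /sys/module/zfs/parameters/zil_slog_bulk   # 128 MiB

A persistent override would normally go into /etc/modprobe.d/zfs.conf as "options zfs zil_slog_bulk=134217728".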
*/ -static uint64_t zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; From 78fd79eacdc98f5452f69d62b55ba3f6c4d8018c Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 21 Sep 2023 08:36:26 -0700 Subject: [PATCH 42/78] Add zfs_prepare_disk script for disk firmware install Have libzfs call a special `zfs_prepare_disk` script before a disk is included into the pool. The user can edit this script to add things like a disk firmware update or a disk health check. Use of the script is totally optional. See the zfs_prepare_disk manpage for full details. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15243 --- cmd/zed/agents/zfs_mod.c | 43 +++++- cmd/zpool/zpool_iter.c | 33 +--- cmd/zpool/zpool_util.h | 4 + cmd/zpool/zpool_vdev.c | 43 +++++- config/Rules.am | 1 + contrib/debian/openzfs-zfsutils.install | 2 + include/libzfs.h | 9 ++ lib/libzfs/libzfs.abi | 4 + lib/libzfs/libzfs_util.c | 193 ++++++++++++++++++++++++ man/Makefile.am | 1 + man/man8/.gitignore | 1 + man/man8/zfs_prepare_disk.8.in | 70 +++++++++ scripts/Makefile.am | 2 + scripts/zfs_prepare_disk | 17 +++ 14 files changed, 388 insertions(+), 35 deletions(-) create mode 100644 man/man8/zfs_prepare_disk.8.in create mode 100755 scripts/zfs_prepare_disk diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 2f040ff7582c..b2c008ad1d0e 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -146,6 +146,17 @@ zfs_unavail_pool(zpool_handle_t *zhp, void *data) return (0); } +/* + * Write an array of strings to the zed log + */ +static void lines_to_zed_log_msg(char **lines, int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + zed_log_msg(LOG_INFO, "%s", lines[i]); + } +} + /* * Two stage replace on Linux * since we get disk notifications @@ -200,6 +211,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) boolean_t is_mpath_wholedisk = B_FALSE; uint_t c; vdev_stat_t *vs; + char **lines = NULL; + int lines_cnt = 0; if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; @@ -383,6 +396,22 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) if (is_mpath_wholedisk) { /* Don't label device mapper or multipath disks. */ + zed_log_msg(LOG_INFO, + " it's a multipath wholedisk, don't label"); + if (zpool_prepare_disk(zhp, vdev, "autoreplace", &lines, + &lines_cnt) != 0) { + zed_log_msg(LOG_INFO, + " zpool_prepare_disk: could not " + "prepare '%s' (%s)", fullpath, + libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); + return; + } } else if (!labeled) { /* * we're auto-replacing a raw disk, so label it first @@ -405,10 +434,18 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * If this is a request to label a whole disk, then attempt to * write out the label. 
*/ - if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { - zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, + vdev, "autoreplace", &lines, &lines_cnt) != 0) { + zed_log_msg(LOG_INFO, + " zpool_prepare_and_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); @@ -468,6 +505,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) DEV_BYID_PATH, new_devid); } + libzfs_free_str_array(lines, lines_cnt); + /* * Construct the root vdev to pass to zpool_vdev_attach(). While adding * the entire vdev structure is harmless, we construct a reduced set of diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 7c6549b0ae54..506b529dce48 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -443,37 +443,22 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) { int rc; char *argv[2] = {cmd}; - char *env[5] = {(char *)"PATH=/bin:/sbin:/usr/bin:/usr/sbin"}; + char **env; char **lines = NULL; int lines_cnt = 0; int i; - /* Setup our custom environment variables */ - rc = asprintf(&env[1], "VDEV_PATH=%s", - data->path ? data->path : ""); - if (rc == -1) { - env[1] = NULL; + env = zpool_vdev_script_alloc_env(data->pool, data->path, data->upath, + data->vdev_enc_sysfs_path, NULL, NULL); + if (env == NULL) goto out; - } - - rc = asprintf(&env[2], "VDEV_UPATH=%s", - data->upath ? data->upath : ""); - if (rc == -1) { - env[2] = NULL; - goto out; - } - - rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", - data->vdev_enc_sysfs_path ? - data->vdev_enc_sysfs_path : ""); - if (rc == -1) { - env[3] = NULL; - goto out; - } /* Run the command */ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, &lines_cnt); + + zpool_vdev_script_free_env(env); + if (rc != 0) goto out; @@ -485,10 +470,6 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) out: if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); - - /* Start with i = 1 since env[0] was statically allocated */ - for (i = 1; i < ARRAY_SIZE(env); i++) - free(env[i]); } /* diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index b35dea0cd449..db8e631dc6be 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -126,6 +126,10 @@ vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); +void free_vdev_cmd_data(vdev_cmd_data_t *data); + +int vdev_run_cmd_simple(char *path, char *cmd); + int check_device(const char *path, boolean_t force, boolean_t isspare, boolean_t iswholedisk); boolean_t check_sector_size_database(char *path, int *sector_size); diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 99a521aa2a28..3d0fc089c32f 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -936,6 +936,15 @@ zero_label(const char *path) return (0); } +static void +lines_to_stderr(char *lines[], int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + fprintf(stderr, "%s\n", lines[i]); + } +} + /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this @@ -947,7 +956,7 @@ zero_label(const char *path) * need to get the devid after we label the disk. 
*/ static int -make_disks(zpool_handle_t *zhp, nvlist_t *nv) +make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) { nvlist_t **child; uint_t c, children; @@ -1032,6 +1041,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) */ if (!is_exclusive && !is_spare(NULL, udevpath)) { char *devnode = strrchr(devpath, '/') + 1; + char **lines = NULL; + int lines_cnt = 0; ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { @@ -1043,9 +1054,27 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * When labeling a pool the raw device node name * is provided as it appears under /dev/. + * + * Note that 'zhp' will be NULL when we're creating a + * pool. */ - if (zpool_label_disk(g_zfs, zhp, devnode) == -1) + if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, + nv, zhp == NULL ? "create" : + replacing ? "replace" : "add", &lines, + &lines_cnt) != 0) { + (void) fprintf(stderr, + gettext( + "Error preparing/labeling disk.\n")); + if (lines_cnt > 0) { + (void) fprintf(stderr, + gettext("zfs_prepare_disk output:\n")); + lines_to_stderr(lines, lines_cnt); + } + + libzfs_free_str_array(lines, lines_cnt); return (-1); + } + libzfs_free_str_array(lines, lines_cnt); /* * Wait for udev to signal the device is available @@ -1082,19 +1111,19 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) } for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); return (0); @@ -1752,7 +1781,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, return (NULL); } - if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { nvlist_free(newroot); return (NULL); } @@ -1873,7 +1902,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, /* * Run through the vdev specification and label any whole disks found. 
*/ - if (!dryrun && make_disks(zhp, newroot) != 0) { + if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { nvlist_free(newroot); return (NULL); } diff --git a/config/Rules.am b/config/Rules.am index abb4ced33233..7c266964f3f3 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -33,6 +33,7 @@ AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" +AM_CPPFLAGS += -DZFSEXECDIR=\"$(zfsexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index fa05401bc168..741014398ade 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -34,6 +34,7 @@ usr/bin/zvol_wait usr/lib/modules-load.d/ lib/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb +usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/arc_summary usr/sbin/arcstat usr/sbin/dbufstat @@ -87,6 +88,7 @@ usr/share/man/man8/zfs-wait.8 usr/share/man/man8/zfs-zone.8 usr/share/man/man8/zfs.8 usr/share/man/man8/zfs_ids_to_path.8 +usr/share/man/man8/zfs_prepare_disk.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 diff --git a/include/libzfs.h b/include/libzfs.h index 6c3669273786..4adfa38e87be 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -326,6 +326,15 @@ _LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); +_LIBZFS_H int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt); +_LIBZFS_H int zpool_prepare_and_label_disk(libzfs_handle_t *hdl, + zpool_handle_t *, const char *, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], int *lines_cnt); +_LIBZFS_H char ** zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val); +_LIBZFS_H void zpool_vdev_script_free_env(char **env); _LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 8658d39e28fc..2d612a16b227 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -515,6 +515,8 @@ + + @@ -562,6 +564,8 @@ + + diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index b94abea3d581..fdd1975fa677 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -2071,3 +2071,196 @@ printf_color(const char *color, const char *format, ...) return (rc); } + +/* PATH + 5 env vars + a NULL entry = 7 */ +#define ZPOOL_VDEV_SCRIPT_ENV_COUNT 7 + +/* + * There's a few places where ZFS will call external scripts (like the script + * in zpool.d/ and `zfs_prepare_disk`). These scripts are called with a + * reduced $PATH, and some vdev specific environment vars set. This function + * will allocate an populate the environment variable array that is passed to + * these scripts. The user must free the arrays with zpool_vdev_free_env() when + * they are done. 
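A minimal calling sketch for these helpers (illustrative only; the helper path and argument values are placeholders, while the real consumers are vdev_run_cmd() above and zpool_prepare_disk() below) looks roughly like:

    /* Sketch: run an external helper with the vdev environment. */
    static int
    run_vdev_helper_example(const char *pool, const char *path, const char *upath)
    {
    	char *argv[2] = {(char *)"/usr/libexec/zfs/example_helper"}; /* hypothetical */
    	char **lines = NULL;
    	int lines_cnt = 0, rc;
    	char **env;

    	env = zpool_vdev_script_alloc_env(pool, path, upath, NULL,
    	    "VDEV_PREPARE", "add");
    	if (env == NULL)
    		return (ENOMEM);

    	rc = libzfs_run_process_get_stdout_nopath(argv[0], argv, env,
    	    &lines, &lines_cnt);
    	libzfs_free_str_array(lines, lines_cnt);
    	zpool_vdev_script_free_env(env);
    	return (rc);
    }

Note that the matching free routine introduced here is zpool_vdev_script_free_env().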
+ * + * The following env vars will be set (but value could be blank): + * + * POOL_NAME + * VDEV_PATH + * VDEV_UPATH + * VDEV_ENC_SYSFS_PATH + * + * In addition, you can set an optional environment variable named 'opt_key' + * to 'opt_val' if you want. + * + * Returns allocated env[] array on success, NULL otherwise. + */ +char ** +zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val) +{ + char **env = NULL; + int rc; + + env = calloc(ZPOOL_VDEV_SCRIPT_ENV_COUNT, sizeof (*env)); + if (!env) + return (NULL); + + env[0] = strdup("PATH=/bin:/sbin:/usr/bin:/usr/sbin"); + if (!env[0]) + goto error; + + /* Setup our custom environment variables */ + rc = asprintf(&env[1], "POOL_NAME=%s", pool_name ? pool_name : ""); + if (rc == -1) { + env[1] = NULL; + goto error; + } + + rc = asprintf(&env[2], "VDEV_PATH=%s", vdev_path ? vdev_path : ""); + if (rc == -1) { + env[2] = NULL; + goto error; + } + + rc = asprintf(&env[3], "VDEV_UPATH=%s", vdev_upath ? vdev_upath : ""); + if (rc == -1) { + env[3] = NULL; + goto error; + } + + rc = asprintf(&env[4], "VDEV_ENC_SYSFS_PATH=%s", + vdev_enc_sysfs_path ? vdev_enc_sysfs_path : ""); + if (rc == -1) { + env[4] = NULL; + goto error; + } + + if (opt_key != NULL) { + rc = asprintf(&env[5], "%s=%s", opt_key, + opt_val ? opt_val : ""); + if (rc == -1) { + env[5] = NULL; + goto error; + } + } + + return (env); + +error: + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); + + return (NULL); +} + +/* + * Free the env[] array that was allocated by zpool_vdev_script_alloc_env(). + */ +void +zpool_vdev_script_free_env(char **env) +{ + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); +} + +/* + * Prepare a disk by (optionally) running a program before labeling the disk. + * This can be useful for installing disk firmware or doing some pre-flight + * checks on the disk before it becomes part of the pool. The program run is + * located at ZFSEXECDIR/zfs_prepare_disk + * (E.x: /usr/local/libexec/zfs/zfs_prepare_disk). + * + * Return 0 on success, non-zero on failure. + */ +int +zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt) +{ + const char *script_path = ZFSEXECDIR "/zfs_prepare_disk"; + const char *pool_name; + int rc = 0; + + /* Path to script and a NULL entry */ + char *argv[2] = {(char *)script_path}; + char **env = NULL; + const char *path = NULL, *enc_sysfs_path = NULL; + char *upath; + *lines_cnt = 0; + + if (access(script_path, X_OK) != 0) { + /* No script, nothing to do */ + return (0); + } + + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH, &path); + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + + upath = zfs_get_underlying_path(path); + pool_name = zhp ? zpool_get_name(zhp) : NULL; + + env = zpool_vdev_script_alloc_env(pool_name, path, upath, + enc_sysfs_path, "VDEV_PREPARE", prepare_str); + + free(upath); + + if (env == NULL) { + return (ENOMEM); + } + + rc = libzfs_run_process_get_stdout(script_path, argv, env, lines, + lines_cnt); + + zpool_vdev_script_free_env(env); + + return (rc); +} + +/* + * Optionally run a script and then label a disk. The script can be used to + * prepare a disk for inclusion into the pool. For example, it might update + * the disk's firmware or check its health. 
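To make the intended use concrete, a site-local zfs_prepare_disk could look like the sketch below (a hypothetical policy, not shipped by this patch; it relies only on the environment variables documented below and on the exit-status contract, where non-zero keeps the disk out of the pool):

    #!/bin/sh
    # Example policy: log replacements and refuse drives that fail a health check.
    case "$VDEV_PREPARE" in
    replace|autoreplace)
    	logger "zfs_prepare_disk: preparing $VDEV_UPATH for $VDEV_PREPARE in pool $POOL_NAME"
    	;;
    esac
    if command -v smartctl >/dev/null 2>&1; then
    	# smartctl output varies by transport; "PASSED" matches ATA drives.
    	smartctl -H "$VDEV_UPATH" | grep -q PASSED || exit 1
    fi
    exit 0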
+ * + * The 'name' provided is the short name, stripped of any leading + * /dev path, and is passed to zpool_label_disk. vdev_nv is the nvlist for + * the vdev. prepare_str is a string that gets passed as the VDEV_PREPARE + * env variable to the script. + * + * The following env vars are passed to the script: + * + * POOL_NAME: The pool name (blank during zpool create) + * VDEV_PREPARE: Reason why the disk is being prepared for inclusion: + * "create", "add", "replace", or "autoreplace" + * VDEV_PATH: Path to the disk + * VDEV_UPATH: One of the 'underlying paths' to the disk. This is + * useful for DM devices. + * VDEV_ENC_SYSFS_PATH: Path to the disk's enclosure sysfs path, if available. + * + * Note, some of these values can be blank. + * + * Return 0 on success, non-zero otherwise. + */ +int +zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, + const char *name, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], int *lines_cnt) +{ + int rc; + char vdev_path[MAXPATHLEN]; + (void) snprintf(vdev_path, sizeof (vdev_path), "%s/%s", DISK_ROOT, + name); + + /* zhp will be NULL when creating a pool */ + rc = zpool_prepare_disk(zhp, vdev_nv, prepare_str, lines, lines_cnt); + if (rc != 0) + return (rc); + + rc = zpool_label_disk(hdl, zhp, name); + return (rc); +} diff --git a/man/Makefile.am b/man/Makefile.am index 36c1aede106e..45156571eec3 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -62,6 +62,7 @@ dist_man_MANS = \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ + %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ diff --git a/man/man8/.gitignore b/man/man8/.gitignore index f2fc702147e9..a468f9cbf9d3 100644 --- a/man/man8/.gitignore +++ b/man/man8/.gitignore @@ -1,2 +1,3 @@ /zed.8 /zfs-mount-generator.8 +/zfs_prepare_disk.8 diff --git a/man/man8/zfs_prepare_disk.8.in b/man/man8/zfs_prepare_disk.8.in new file mode 100644 index 000000000000..2a741531e415 --- /dev/null +++ b/man/man8/zfs_prepare_disk.8.in @@ -0,0 +1,70 @@ +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). +.\" Copyright (C) 2023 Lawrence Livermore National Security, LLC. +.\" Refer to the OpenZFS git commit log for authoritative copyright attribution. +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License Version 1.0 (CDDL-1.0). +.\" You can obtain a copy of the license from the top-level file +.\" "OPENSOLARIS.LICENSE" or at . +.\" You may not use this file except in compliance with the license. +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +.\" +.Dd August 30, 2023 +.Dt ZFS_PREPARE_DISK 8 +.Os +. +.Sh NAME +.Nm zfs_prepare_disk +.Nd special script that gets run before bringing a disk into a pool +.Sh DESCRIPTION +.Nm +is an optional script that gets called by libzfs before bringing a disk into a +pool. +It can be modified by the user to run whatever commands are necessary to prepare +a disk for inclusion into the pool. +For example, users can add lines to +.Nm zfs_prepare_disk +to do things like update the drive's firmware or check the drive's health. +.Nm zfs_prepare_disk +is optional and can be removed if not needed. +libzfs will look for the script at @zfsexecdir@/zfs_prepare_disk. +. +.Ss Properties +.Nm zfs_prepare_disk +will be passed the following environment variables: +.sp +.Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH" +. 
+.It Nm POOL_NAME +.No Name of the pool +.It Nm VDEV_PATH +.No Path to the disk (like /dev/sda) +.It Nm VDEV_PREPARE +.No Reason why the disk is being prepared for inclusion +('create', 'add', 'replace', or 'autoreplace'). +This can be useful if you only want the script to be run under certain actions. +.It Nm VDEV_UPATH +.No Path to one of the underlying devices for the +disk. +For multipath this would return one of the /dev/sd* paths to the disk. +If the device is not a device mapper device, then +.Nm VDEV_UPATH +just returns the same value as +.Nm VDEV_PATH +.It Nm VDEV_ENC_SYSFS_PATH +.No Path to the disk's enclosure sysfs path, if available +.El +.Pp +Note that some of these variables may have a blank value. +.Nm POOL_NAME +is blank at pool creation time, for example. +.Sh ENVIRONMENT +.Nm zfs_prepare_disk +runs with a limited $PATH. +.Sh EXIT STATUS +.Nm zfs_prepare_disk +should return 0 on success, non-zero otherwise. +If non-zero is returned, the disk will not be included in the pool. +. diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 95640727ac6a..b43bf97dbdf4 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -20,6 +20,8 @@ scripts_scripts = \ if CONFIG_USER dist_scripts_SCRIPTS = $(scripts_scripts) +dist_zfsexec_SCRIPTS = \ + %D%/zfs_prepare_disk else dist_noinst_SCRIPTS += $(scripts_scripts) endif diff --git a/scripts/zfs_prepare_disk b/scripts/zfs_prepare_disk new file mode 100755 index 000000000000..02aa9f8a7728 --- /dev/null +++ b/scripts/zfs_prepare_disk @@ -0,0 +1,17 @@ +#!/bin/sh +# +# This is an optional helper script that is automatically called by libzfs +# before a disk is about to be added into the pool. It can be modified by +# the user to run whatever commands are necessary to prepare a disk for +# inclusion into the pool. For example, users can add lines to this +# script to do things like update the drive's firmware or check the drive's +# health. The script is optional and can be removed if it is not needed. +# +# See the zfs_prepare_disk(8) man page for details. +# +# Example: +# +# echo "Prepare disk $VDEV_PATH ($VDEV_UPATH) for $VDEV_PREPARE in $POOL_NAME" +# + +exit 0 From 0bcd1151f0c268f826e81438ceaa5ec1761f6baf Mon Sep 17 00:00:00 2001 From: Don Brady Date: Fri, 20 Oct 2023 10:29:02 -0600 Subject: [PATCH 43/78] Fix ZED auto-replace for VDEVs using by-id paths The change is simple -- restore the original code so that the VDEV path is updated when using by-id paths. The more challenging part was to devise a second ZTS test, that would test auto-replace for 'by-id' and help prevent a future regression. With that new test, we can now do an A|B test with , and without, the fix to confirm that auto-replace for by-id paths works. The existing auto-replace test, functional/fault/auto_replace_001_pos, will confirm that we didn't break auto-replace for 'by-vdev' paths. In the original functional/fault/auto_replace_001_pos test, the disk wipe (using dd) was not effective in removing the partitioning since the kernel was never informed of the wipe. Added a call to wipefs(8) so that the kernel is informed and ZED will re-partition the device. Added a validation step that the re-partitioning occurred by confirming that the GPT partition UUID changes. Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. 
Reviewed-by: Rob Norris Reviewed-by: Tony Hutter Signed-off-by: Don Brady Closes #15363 --- cmd/zed/agents/zfs_mod.c | 55 +++-- include/libzutil.h | 2 +- lib/libzutil/os/linux/zutil_import_os.c | 5 +- tests/runfiles/linux.run | 8 +- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/include/commands.cfg | 9 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/fault/auto_replace_001_pos.ksh | 41 +++- .../functional/fault/auto_replace_002_pos.ksh | 192 ++++++++++++++++++ 9 files changed, 279 insertions(+), 35 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index b2c008ad1d0e..9636c99fc85f 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -24,6 +24,7 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, 2017, Intel Corporation. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2023, Klara Inc. */ /* @@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) uint64_t is_spare = 0; const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; char rawpath[PATH_MAX], fullpath[PATH_MAX]; - char devpath[PATH_MAX]; + char pathbuf[PATH_MAX]; int ret; int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; boolean_t is_sd = B_FALSE; @@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) char **lines = NULL; int lines_cnt = 0; + /* + * Get the persistent path, typically under the '/dev/disk/by-id' or + * '/dev/disk/by-vdev' directories. Note that this path can change + * when a vdev is replaced with a new disk. + */ if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; @@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) (void) snprintf(rawpath, sizeof (rawpath), "%s%s", is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); - if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) { + if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) { zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", rawpath, strerror(errno)); - (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, - &newstate); + int err = zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); - zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", - fullpath, libzfs_error_description(g_zfshdl)); + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) " + "err %d, new state %d", + fullpath, libzfs_error_description(g_zfshdl), err, + err ? (int)newstate : 0); return; } @@ -428,7 +436,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * to trigger a ZFS fault for the device (and any hot spare * replacement). 
*/ - leafname = strrchr(devpath, '/') + 1; + leafname = strrchr(pathbuf, '/') + 1; /* * If this is a request to label a whole disk, then attempt to @@ -436,7 +444,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) */ if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, vdev, "autoreplace", &lines, &lines_cnt) != 0) { - zed_log_msg(LOG_INFO, + zed_log_msg(LOG_WARNING, " zpool_prepare_and_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); @@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) sizeof (device->pd_physpath)); list_insert_tail(&g_device_list, device); - zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)", leafname, (u_longlong_t)guid); return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ @@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } if (!found) { /* unexpected partition slice encountered */ - zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", - fullpath); + zed_log_msg(LOG_WARNING, "labeled disk %s was " + "unexpected here", fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; @@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", physpath, (u_longlong_t)guid); - (void) snprintf(devpath, sizeof (devpath), "%s%s", - DEV_BYID_PATH, new_devid); + /* + * Paths that begin with '/dev/disk/by-id/' will change and so + * they must be updated before calling zpool_vdev_attach(). + */ + if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) { + (void) snprintf(pathbuf, sizeof (pathbuf), "%s%s", + DEV_BYID_PATH, new_devid); + zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' " + "replaced by '%s'", path, pathbuf); + path = pathbuf; + } } libzfs_free_str_array(lines, lines_cnt); @@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * Wait for udev to verify the links exist, then auto-replace * the leaf disk at same physical location. */ - if (zpool_label_disk_wait(path, 3000) != 0) { - zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " - "disk %s is missing", path); + if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling " + "replacement disk, the expected disk partition link '%s' " + "is missing after waiting %u ms", + zpool_get_name(zhp), path, DISK_LABEL_WAIT); nvlist_free(nvroot); return; } @@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) B_TRUE, B_FALSE); } - zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : libzfs_error_description(g_zfshdl)); @@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) dp->dd_prop, path); dp->dd_found = B_TRUE; - /* pass the new devid for use by replacing code */ + /* pass the new devid for use by auto-replacing code */ if (dp->dd_new_devid != NULL) { (void) nvlist_add_string(nvl, "new_devid", dp->dd_new_devid); diff --git a/include/libzutil.h b/include/libzutil.h index 237ff976ba62..053b1ed4b52a 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -34,7 +34,7 @@ extern "C" { #endif /* - * Default wait time for a device name to be created. 
+ * Default wait time in milliseconds for a device name to be created. */ #define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index 8b64369dc29f..44ed697dd490 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) * Wait up to timeout_ms for udev to set up the device node. The device is * considered ready when libudev determines it has been initialized, all of * the device links have been verified to exist, and it has been allowed to - * settle. At this point the device the device can be accessed reliably. - * Depending on the complexity of the udev rules this process could take - * several seconds. + * settle. At this point the device can be accessed reliably. Depending on + * the complexity of the udev rules this process could take several seconds. */ int zpool_label_disk_wait(const char *path, int timeout_ms) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2252e46df3a8..8bc55a1b4b47 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -122,10 +122,10 @@ tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', - 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', - 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', - 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', - 'zpool_status_-s'] + 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', + 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', + 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', + 'scrub_after_resilver', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 5d1360380de5..4608e87522a3 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_online_001_pos': ['SKIP', ci_reason], 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], + 'fault/auto_replace_002_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index fa545e06bbf3..648f2203dfba 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr chattr exportfs fallocate + flock free getfattr groupadd groupdel groupmod hostid + logger losetup lsattr lsblk @@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr md5sum mkswap modprobe + mountpoint mpstat nsenter parted perf setfattr + setpriv sha256sum udevadm unshare useradd userdel usermod - setpriv - mountpoint - flock - logger' + wipefs' export ZFS_FILES='zdb zfs diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 158401e078aa..87b50f59ca7a 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/auto_online_001_pos.ksh \ 
functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ + functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index 081e6c18430d..ae56ee9919bf 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -34,13 +34,14 @@ # 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. # This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". # 2. Create a pool and set autoreplace=on (auto-replace is opt-in) -# 3. Export a pool +# 3. Export the pool # 4. Wipe and offline the scsi_debug disk -# 5. Import pool with missing disk +# 5. Import the pool with missing disk # 6. Re-online the wiped scsi_debug disk -# 7. Verify the ZED detects the new unused disk and adds it back to the pool +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned # -# Creates a raidz1 zpool using persistent disk path names +# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names # (ie not /dev/sdc) # # Auto-replace is opt in, and matches by phys_path. @@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3 log_must zpool set autoreplace=on $TESTPOOL # Add some data to the pool -log_must mkfile $FSIZE /$TESTPOOL/data +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z log_must zpool export $TESTPOOL +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# # Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID remove_disk $SD block_device_wait @@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll # skip this test when it is not supported +# +if [ ! 
-z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + log_pass "Auto-replace test successful" diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh new file mode 100755 index 000000000000..2259e604317b --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh @@ -0,0 +1,192 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2017 by Intel Corporation. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test. +# Verifys that auto-replace works with by-id paths. +# +# STRATEGY: +# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. +# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". +# 2. Create a pool and set autoreplace=on (auto-replace is opt-in) +# 3. Export the pool +# 4. Wipe and offline the scsi_debug disk +# 5. Import the pool with missing disk +# 6. Re-online the wiped scsi_debug disk with a new serial number +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned +# +# Creates a raidz1 zpool using persistent /dev/disk/by-id path names +# +# Auto-replace is opt in, and matches by phys_path. +# + +verify_runnable "both" + +if ! is_physical_device $DISKS; then + log_unsupported "Unsupported disks for this test." +fi + +function cleanup +{ + zpool status $TESTPOOL + destroy_pool $TESTPOOL + sed -i '/alias scsidebug/d' $VDEVID_CONF + unload_scsi_debug +} + +# +# Wait until a vdev transitions to its replacement vdev +# +# Return 0 when vdev reaches expected state, 1 on timeout. 
+# +# Note: index +2 is to skip over root and raidz-0 vdevs +# +function wait_vdev_online # pool index oldguid timeout +{ + typeset pool=$1 + typeset -i index=$2+2 + typeset guid=$3 + typeset timeout=${4:-60} + typeset -i i=0 + + while [[ $i -lt $timeout ]]; do + vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) ) + + if [ "${vdev_guids[$index]}" != "${guid}" ]; then + log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}" + return 0 + fi + + i=$((i+1)) + sleep 1 + done + + return 1 +} +log_assert "automated auto-replace with by-id paths" +log_onexit cleanup + +load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b' +SD=$(get_debug_device) +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +SD_HOST=$(get_scsi_host $SD) + +# Register vdev_id alias for scsi_debug device to create a persistent path +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \ + awk -F'=' '/ID_VDEV=/ {print $2; exit}') +[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly" + +log_must zpool events -c +log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3 + +vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID) +log_note original vdev guid ${vdev_guid} + +# Auto-replace is opt-in so need to set property +log_must zpool set autoreplace=on $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z +log_must zpool export $TESTPOOL + +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# +# Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# +log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID +remove_disk $SD +block_device_wait + +# Re-import pool with drive missing +log_must zpool import $TESTPOOL +log_must check_state $TESTPOOL "" "DEGRADED" +block_device_wait + +# +# Online an empty disk in the same physical location, with a different by-id +# symlink. We use vpd_use_hostno to make sure the underlying serial number +# changes for the new disk which in turn gives us a different by-id path. 
+# +# The original names were something like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1 +# /dev/disk/by-id/wwn-0x33333330000007d0-part1 +# +# This new inserted disk, will have different links like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1 +# /dev/disk/by-id/wwn-0x0x3333333000003e80 -part1 +# +echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno + +insert_disk $SD $SD_HOST + +# make sure the physical path points to the same scsi-debug device +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +# Wait for the new disk to be online and replaced +log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45 +log_must wait_replacing $TESTPOOL 45 + +# Validate auto-replace was successful +log_must check_state $TESTPOOL "" "ONLINE" + +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll # skip this test when it is not supported +# +if [ ! -z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + +log_pass "automated auto-replace with by-id paths" From 1cc1bf4fa7219c12a5b0bbf8de009eda46e1d7ff Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Fri, 20 Oct 2023 10:30:32 -0700 Subject: [PATCH 44/78] Set spa_ccw_fail_time=0 when expanding a vdev. When a vdev is to be expanded -- either via `zpool online -e` or via the autoexpand option -- a SPA_ASYNC_CONFIG_UPDATE request is queued to be handled via an asynchronous worker thread (spa_async_thread). This normally happens almost immediately; but will be delayed up to zfs_ccw_retry_interval seconds (default 5 minutes) if an attempt to write the zpool configuration cache failed. When FreeBSD boots ZFS-root VM images generated using `makefs -t zfs`, the zpoolupgrade rc.d script runs `zpool upgrade`, which modifies the pool configuration and triggers an attempt to write to the cache file. This attempted write fails because the filesystem is still mounted read-only at this point in the boot process, triggering a 5-minute cooldown before SPA_ASYNC_CONFIG_UPDATE requests will be handled by the asynchronous worker thread. When expanding a vdev, reset the "when did a configuration cache write last fail" value so that the SPA_ASYNC_CONFIG_UPDATE request will be handled promptly. A cleaner but more intrusive option would be to use separate SPA_ASYNC_ flags for "configuration changed" and "try writing the configuration cache again", but with FreeBSD 14.0 coming very soon I'd prefer to leave such refactoring for a later date. 
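The cooldown being bypassed can be pictured with a simplified check (an illustration of the mechanism described above, not verbatim OpenZFS source): a queued SPA_ASYNC_CONFIG_UPDATE is only serviced once the retry interval measured from spa_ccw_fail_time has elapsed, so zeroing the timestamp makes the request eligible immediately.

    /*
     * Illustrative only: why clearing spa_ccw_fail_time unblocks the
     * queued config-update request.
     */
    static boolean_t
    config_update_ready_example(spa_t *spa)
    {
    	if (spa->spa_ccw_fail_time == 0)
    		return (B_TRUE);	/* no recent cache-file write failure */
    	return (gethrtime() - spa->spa_ccw_fail_time >=
    	    (hrtime_t)zfs_ccw_retry_interval * NANOSEC);
    }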
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Colin Percival Closes #15405 --- module/zfs/vdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 87c145593237..afb01c0ef7fd 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4215,6 +4215,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } From edebca5dfc3bb6c087012355ce0ee39b3e49e7a8 Mon Sep 17 00:00:00 2001 From: Olivier Certner Date: Fri, 20 Oct 2023 20:49:56 +0200 Subject: [PATCH 45/78] FreeBSD: taskq: Remove unused declaration Variable 'uma_align_cache' has not been used since commit "FreeBSD: Use a hash table for taskqid lookups" (3933305ea). Moreover, it is soon going to become private to FreeBSD's UMA in 15.0-CURRENT (main), 14.0-STABLE (stable/14) and 13.2-STABLE (stable/13). Should accessing this information become necessary again, one will have to use the new accessors for recent versions. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Olivier Certner Closes #15416 --- module/os/freebsd/spl/spl_taskq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index daefe3559538..842b80ade1fb 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -64,8 +64,6 @@ taskq_t *dynamic_taskq = NULL; proc_t *system_proc; -extern int uma_align_cache; - static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; From 8ca95d78c54fc3a8697c3dc8c9a8184e4b9cb95c Mon Sep 17 00:00:00 2001 From: dennisfriedrichsen <31087738+dennisfriedrichsen@users.noreply.github.com> Date: Fri, 20 Oct 2023 13:52:13 -0500 Subject: [PATCH 46/78] Fix typo in tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg Reviewed-by: Rob N Reviewed-by: Brian Behlendorf Signed-off-by: Dennis R. Friedrichsen Closes #15417 --- tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg index e98b5e8b2214..9c76a8780b4a 100644 --- a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg +++ b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg @@ -29,7 +29,7 @@ # if is_linux; then - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ acltype atime \ checksum compression devices \ @@ -81,7 +81,7 @@ elif is_freebsd; then hidden" else - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ aclinherit aclmode atime \ checksum compression devices \ From eaa62d995100a53bc3e88ad84ce4f57474fa4dd0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Oct 2023 14:54:05 -0400 Subject: [PATCH 47/78] Properly pad struct tx_cpu to cache line We already use ____cacheline_aligned in many places, so add one more instead of seems arbitrary char tc_pad[8]. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15402 --- include/sys/txg_impl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h index 45fde2e1f351..8ab7969b25be 100644 --- a/include/sys/txg_impl.h +++ b/include/sys/txg_impl.h @@ -73,8 +73,7 @@ struct tx_cpu { kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; +} ____cacheline_aligned; /* * The tx_state structure maintains the state information about the different From 0ef1964c79fd93c567a9c3d631db515f7fd447a3 Mon Sep 17 00:00:00 2001 From: VaibhavB <88050553+vaibhav-delphix@users.noreply.github.com> Date: Sat, 21 Oct 2023 00:27:39 +0530 Subject: [PATCH 48/78] run-zts test procfs/pool_state failed with uncorrectable I/O failure Once we trigger the zpool scrub, all zpool/zfs commands get stuck for 180 seconds. After those 180 seconds the zpool/zfs commands start executing again, but it takes a few more seconds (about 10) for the status to be updated, hence the 200 second sleep to ensure we read the correct status. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: vaibhav.bhanawat Closes #15364 --- tests/zfs-tests/tests/functional/procfs/pool_state.ksh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh index 7a02eb68abda..bae876379177 100755 --- a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh +++ b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh @@ -141,7 +141,11 @@ remove_disk $SDISK # background since the command will hang when the pool gets suspended. The # command will resume and exit after we restore the missing disk later on. zpool scrub $TESTPOOL2 & -sleep 3 # Give the scrub some time to run before we check if it fails +# Once we trigger the zpool scrub, all zpool/zfs commands get stuck for 180 seconds. +# After those 180 seconds the zpool/zfs commands start executing again, but it +# takes a few more seconds (about 10) for the status to be updated. +# Hence sleep for 200 seconds so that we get the correct status. +sleep 200 # Give the scrub some time to run before we check if it fails log_must check_all $TESTPOOL2 "SUSPENDED" From 79f7de575252b62a2aa7c92f9cbef9710f1c244a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Oct 2023 15:37:16 -0400 Subject: [PATCH 49/78] Remove lock from dsl_pool_need_dirty_delay() Torn reads/writes of dp_dirty_total are unlikely: on 64-bit systems due to register size, while on 32-bit due to memory constraints. And even if we hit some race, the code implementing the delay takes the lock anyway. Removal of the pool-wide lock acquisition saves ~1% of CPU time on an 8-thread 8KB write workload. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15390 --- module/zfs/dsl_pool.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 9120fef93c74..17b971248283 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -965,18 +965,18 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - mutex_enter(&dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - return (dirty > delay_min_bytes); + /* + * We are not taking the dp_lock here and few other places, since torn + * reads are unlikely: on 64-bit systems due to register size and on + * 32-bit due to memory constraints. Pool-wide locks in hot path may + * be too expensive, while we do not need a precise result here. + */ + return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - ASSERT(MUTEX_HELD(&dp->dp_lock)); - uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; From 6e41aca519322757f64cd737aebbcf36d931997f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 20 Oct 2023 15:38:37 -0400 Subject: [PATCH 50/78] Trust ARC_BUF_SHARED() more In my understanding ARC_BUF_SHARED() and arc_buf_is_shared() should return identical results, except the second also asserts it deeper. The first is much cheaper though, saving few pointer dereferences. Replace production arc_buf_is_shared() calls with ARC_BUF_SHARED(), and call arc_buf_is_shared() in random assertions, while making it even more strict. On my tests this in half reduces arc_buf_destroy_impl() time, that noticeably reduces hash_lock congestion under heavy dbuf eviction. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15397 --- module/zfs/arc.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b5946e7604c0..5d4a52fa0693 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1364,7 +1364,7 @@ arc_buf_is_shared(arc_buf_t *buf) abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); + EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* @@ -1998,7 +1998,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); - IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from @@ -2066,7 +2066,9 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(arc_buf_is_shared(buf)); + } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } @@ -2078,7 +2080,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. 
*/ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ @@ -2090,6 +2092,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!arc_buf_is_shared(buf)); + /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); @@ -2217,7 +2221,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2256,7 +2260,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2481,7 +2485,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( @@ -2537,7 +2541,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( @@ -3061,9 +3065,10 @@ arc_buf_destroy_impl(arc_buf_t *buf) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); @@ -3104,9 +3109,9 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); @@ -6189,7 +6194,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } @@ -6206,9 +6211,9 @@ arc_release(arc_buf_t *buf, const void *tag) * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. 
*/ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between @@ -6241,7 +6246,7 @@ arc_release(arc_buf_t *buf, const void *tag) */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); + ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -6335,9 +6340,10 @@ arc_write_ready(zio_t *zio) arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } @@ -6636,9 +6642,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); From 86c3ed40e111e98cbffd780afbac0133450e2c4b Mon Sep 17 00:00:00 2001 From: ofthesun9 Date: Mon, 23 Oct 2023 22:41:29 +0200 Subject: [PATCH 51/78] "ARC prefetch metadata accesses:" appears twice in the output. The first occurrence should be "ARC prefetch data accesses:" Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: ofthesun9 Closes #15427 --- cmd/arc_summary | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 426e0207052d..9c69ec4f8ccc 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -711,7 +711,7 @@ def section_archits(kstats_dict): pd_total = int(arc_stats['prefetch_data_hits']) +\ int(arc_stats['prefetch_data_iohits']) +\ int(arc_stats['prefetch_data_misses']) - prt_2('ARC prefetch metadata accesses:', f_perc(pd_total, all_accesses), + prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses), f_hits(pd_total)) pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']), ('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']), From e860cb0200a81b7abf2dfd4ce98ed41ad3d7cc3a Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 23 Oct 2023 14:45:06 -0700 Subject: [PATCH 52/78] zvol: Remove broken blk-mq optimization This fix removes a dubious optimization in zfs_uiomove_bvec_rq() that saved the iterator contents of a rq_for_each_segment(). This optimization allowed restoring the "saved state" from a previous rq_for_each_segment() call on the same uio so that you wouldn't need to iterate though each bvec on every zfs_uiomove_bvec_rq() call. However, if the kernel is manipulating the requests/bios/bvecs under the covers between zfs_uiomove_bvec_rq() calls, then it could result in corruption from using the "saved state". This optimization results in an unbootable system after installing an OS on a zvol with blk-mq enabled. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15351 --- include/os/linux/spl/sys/uio.h | 8 -------- module/os/linux/zfs/zfs_uio.c | 29 ----------------------------- 2 files changed, 37 deletions(-) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index cce097e16fbc..a4b600004c9f 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -73,13 +73,6 @@ typedef struct zfs_uio { size_t uio_skip; struct request *rq; - - /* - * Used for saving rq_for_each_segment() state between calls - * to zfs_uiomove_bvec_rq(). - */ - struct req_iterator iter; - struct bio_vec bv; } zfs_uio_t; @@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) } else { uio->uio_bvec = NULL; uio->uio_iovcnt = 0; - memset(&uio->iter, 0, sizeof (uio->iter)); } uio->uio_loffset = io_offset(bio, rq); diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index 3efd4ab159c6..c2ed67c438c6 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) this_seg_start = orig_loffset; rq_for_each_segment(bv, rq, iter) { - if (uio->iter.bio) { - /* - * If uio->iter.bio is present, then we know we've saved - * uio->iter from a previous call to this function, and - * we can skip ahead in this rq_for_each_segment() loop - * to where we last left off. That way, we don't need - * to iterate over tons of segments we've already - * processed - we can just restore the "saved state". - */ - iter = uio->iter; - bv = uio->bv; - this_seg_start = uio->uio_loffset; - memset(&uio->iter, 0, sizeof (uio->iter)); - continue; - } - /* * Lookup what the logical offset of the last byte of this * segment is. @@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) copied = 1; /* We copied some data */ } - if (n == 0) { - /* - * All done copying. Save our 'iter' value to the uio. - * This allows us to "save our state" and skip ahead in - * the rq_for_each_segment() loop the next time we call - * call zfs_uiomove_bvec_rq() on this uio (which we - * will be doing for any remaining data in the uio). - */ - uio->iter = iter; /* make a copy of the struct data */ - uio->bv = bv; - return (0); - } - this_seg_start = this_seg_end + 1; } From 8ba748d41411002f629ff4a36219b0c77a3534c0 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 23 Oct 2023 14:39:59 -0700 Subject: [PATCH 53/78] Revert "zvol: Temporally disable blk-mq" This reverts commit aefb6a2bd6c24597cde655e9ce69edd0a4c34357. aefb6a2bd temporally disabled blk-mq until we could fix a fix for Signed-off-by: Tony Hutter Closes #15439 --- man/man4/zfs.4 | 57 ++++++++++++++++++++++++++++ module/os/linux/zfs/zvol_os.c | 12 ++++++ tests/zfs-tests/include/tunables.cfg | 2 +- 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5f89f6adf1e3..615332bb023b 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2317,6 +2317,63 @@ If .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . +.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint +The number of threads per zvol to use for queuing IO requests. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +If +.Sy 0 +(the default) then internally set +.Sy zvol_blk_mq_threads +to the number of CPUs present. +. 
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Set to +.Sy 1 +to use the +.Li blk-mq +API for zvols. +Set to +.Sy 0 +(the default) to use the legacy zvol APIs. +This setting can give better or worse zvol performance depending on +the workload. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +. +.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint +If +.Sy zvol_use_blk_mq +is enabled, then process this number of +.Sy volblocksize Ns -sized blocks per zvol thread. +This tunable can be use to favor better performance for zvol reads (lower +values) or writes (higher values). +If set to +.Sy 0 , +then the zvol layer will process the maximum number of blocks +per thread that it can. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +. +.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint +The queue_depth value for the zvol +.Li blk-mq +interface. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +If +.Sy 0 +(the default) then use the kernel's default queue depth. +Values are clamped to the kernel's +.Dv BLKDEV_MIN_RQ +and +.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ +limits. +. .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when .Sy volmode Ns = Ns Sy default : diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 76521c95911e..7a95b54bdf0d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1620,6 +1620,18 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); +#ifdef HAVE_BLK_MQ +module_param(zvol_blk_mq_queue_depth, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); + +module_param(zvol_use_blk_mq, uint, 0644); +MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); + +module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, + "Process volblocksize blocks per thread"); +#endif + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 8010a9451597..80e7bcb3bd09 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -89,7 +89,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED -VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED +VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max From f9a9aea126f9a35555144e1ff141bd7ae6bb3d61 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger <101425190+bertschinger@users.noreply.github.com> Date: Thu, 26 Oct 2023 10:17:40 -0600 Subject: [PATCH 54/78] Add mutex_enter_interruptible() for interruptible sleeping IOCTLs Many long-running ZFS ioctls lock the spa_namespace_lock, forcing concurrent ioctls to sleep for the mutex. Previously, the only option is to call mutex_enter() which sleeps uninterruptibly. 
This is a usability issue for sysadmins, for example, if the admin runs `zpool status` while a slow `zpool import` is ongoing, the admin's shell will be locked in uninterruptible sleep for a long time. This patch resolves this admin usability issue by introducing mutex_enter_interruptible() which sleeps interruptibly while waiting to acquire a lock. It is implemented for both Linux and FreeBSD. The ZFS_IOC_POOL_CONFIGS ioctl, used by `zpool status`, is changed to use this new macro so that the command can be interrupted if it is issued during a concurrent `zpool import` (or other long-running operation). Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Thomas Bertschinger Closes #15360 --- include/os/freebsd/spl/sys/mutex.h | 1 + include/os/linux/spl/sys/mutex.h | 21 +++++++++++++-------- include/sys/spa.h | 2 +- include/sys/zfs_context.h | 2 ++ lib/libzpool/kernel.c | 9 +++++++++ module/zfs/spa_config.c | 17 +++++++++-------- module/zfs/zfs_ioctl.c | 5 +++-- 7 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index e757d12c1502..8cfe56c75309 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -64,6 +64,7 @@ typedef enum { } while (0) #define mutex_destroy(lock) sx_destroy(lock) #define mutex_enter(lock) sx_xlock(lock) +#define mutex_enter_interruptible(lock) sx_xlock_sig(lock) #define mutex_enter_nested(lock, type) sx_xlock(lock) #define mutex_tryenter(lock) sx_try_xlock(lock) #define mutex_exit(lock) sx_xunlock(lock) diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h index 6b61c59c48e2..b4eaa0266d20 100644 --- a/include/os/linux/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -128,7 +128,6 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ #define NESTED_SINGLE 1 -#ifdef CONFIG_DEBUG_LOCK_ALLOC #define mutex_enter_nested(mp, subclass) \ { \ ASSERT3P(mutex_owner(mp), !=, current); \ @@ -137,16 +136,22 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ spl_mutex_lockdep_on_maybe(mp); \ spl_mutex_set_owner(mp); \ } -#else /* CONFIG_DEBUG_LOCK_ALLOC */ -#define mutex_enter_nested(mp, subclass) \ -{ \ + +#define mutex_enter_interruptible(mp) \ +/* CSTYLED */ \ +({ \ + int _rc_; \ + \ ASSERT3P(mutex_owner(mp), !=, current); \ spl_mutex_lockdep_off_maybe(mp); \ - mutex_lock(MUTEX(mp)); \ + _rc_ = mutex_lock_interruptible(MUTEX(mp)); \ spl_mutex_lockdep_on_maybe(mp); \ - spl_mutex_set_owner(mp); \ -} -#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + if (!_rc_) { \ + spl_mutex_set_owner(mp); \ + } \ + \ + _rc_; \ +}) #define mutex_enter(mp) mutex_enter_nested((mp), 0) diff --git a/include/sys/spa.h b/include/sys/spa.h index b90855687411..87ddbd90e170 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -837,7 +837,7 @@ extern kmutex_t spa_namespace_lock; extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 6a337b49edf3..750ca612b962 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -274,11 +274,13 @@ typedef struct kmutex { extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void 
mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); +extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) +#define mutex_enter_interruptible(mp) mutex_enter_check_return(mp) /* * RW locks */ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index a9b9bf4c2ce5..ffad7fc02bc9 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -205,6 +205,15 @@ mutex_enter(kmutex_t *mp) mp->m_owner = pthread_self(); } +int +mutex_enter_check_return(kmutex_t *mp) +{ + int error = pthread_mutex_lock(&mp->m_lock); + if (error == 0) + mp->m_owner = pthread_self(); + return (error); +} + int mutex_tryenter(kmutex_t *mp) { diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 636c04d9f785..a77874ea0dd3 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -367,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ -nvlist_t * -spa_all_configs(uint64_t *generation) +int +spa_all_configs(uint64_t *generation, nvlist_t **pools) { - nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) - return (NULL); + return (SET_ERROR(EEXIST)); - pools = fnvlist_alloc(); + int error = mutex_enter_interruptible(&spa_namespace_lock); + if (error) + return (SET_ERROR(EINTR)); - mutex_enter(&spa_namespace_lock); + *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), + fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } @@ -391,7 +392,7 @@ spa_all_configs(uint64_t *generation) *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); - return (pools); + return (0); } void diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f91a2f3bbca5..2738385e260b 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1582,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) nvlist_t *configs; int error; - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); + error = spa_all_configs(&zc->zc_cookie, &configs); + if (error) + return (error); error = put_nvlist(zc, configs); From 7aef672b776a681b7006ec6b67d75b310b2a9973 Mon Sep 17 00:00:00 2001 From: shodanshok Date: Thu, 26 Oct 2023 18:40:21 +0200 Subject: [PATCH 55/78] Read prefetched buffers from L2ARC Prefetched buffers are currently read from L2ARC if, and only if, l2arc_noprefetch is set to non-default value of 0. This means that a streaming read which can be served from L2ARC will instead engage the main pool. For example, consider what happens when a file is sequentially read: - application requests contiguous data, engaging the prefetcher; - ARC buffers are initially marked as prefetched but, as the calling application consumes data, the prefetch tag is cleared; - these "normal" buffers become eligible for L2ARC and are copied to it; - re-reading the same file will *not* engage L2ARC even if it contains the required buffers; - main pool has to suffer another sequential read load, which (due to most NCQ-enabled HDDs preferring sequential loads) can dramatically increase latency for uncached random reads. 
In other words, current behavior is to write data to L2ARC (wearing it) without using the very same cache when reading back the same data. This was probably useful many years ago to preserve L2ARC read bandwidth but, with current SSD speed/size/price, it is vastly sub-optimal. Setting l2arc_noprefetch=1, while enabling L2ARC to serve these reads, means that even prefetched but unused buffers will be copied into L2ARC, further increasing wear and load for potentially not-useful data. This patch enable prefetched buffer to be read from L2ARC even when l2arc_noprefetch=1 (default), increasing sequential read speed and reducing load on the main pool without polluting L2ARC with not-useful (ie: unused) prefetched data. Moreover, it clear users confusion about L2ARC size increasing but not serving any IO when doing sequential reads. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Gionatan Danti Closes #15451 --- module/zfs/arc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5d4a52fa0693..06544925b5ca 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5868,12 +5868,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && - (*arc_flags & ARC_FLAG_PREFETCH))) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; From fe9d409e90884a6b19572f4dfa1dd80cfc50d325 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Mon, 11 Sep 2023 23:21:29 -0400 Subject: [PATCH 56/78] Linux 6.6 compat: use inode_get/set_ctime*(...) In Linux commit 13bc24457850583a2e7203ded05b7209ab4bc5ef, direct access to the i_ctime member of struct inode was removed. The new approach is to use accessor methods that exclusively handle passing the timestamp around by value. This change adds new tests for each of these functions and introduces zpl_* equivalents in include/os/linux/zfs/sys/zpl.h. In where the inode_get/set_ctime*() functions exist, these zpl_* calls will be mapped to the new functions. On older kernels, these macros just wrap direct-access calls. The code that operated on an address of ip->i_ctime to call ZFS_TIME_DECODE() now will take a local copy using zpl_inode_get_ctime(), and then pass the address of the local copy when performing the ZFS_TIME_DECODE() call, in all cases, rather than directly accessing the member. 
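A condensed before/after sketch of that local-copy pattern (illustrative; the real call
sites are in the hunks below):

    inode_timespec_t tmp_ctime;

    /* Before: decode straight into the inode member. */
    ZFS_TIME_DECODE(&ip->i_ctime, ctime);

    /* After: decode into a local copy, then store it via the accessor. */
    ZFS_TIME_DECODE(&tmp_ctime, ctime);
    zpl_inode_set_ctime_to_ts(ip, tmp_ctime);

    /* Reads likewise go through an accessor before encoding. */
    tmp_ctime = zpl_inode_get_ctime(ip);
    ZFS_TIME_ENCODE(&tmp_ctime, ctime);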
Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15263 Closes #15257 --- config/kernel-inode-times.m4 | 43 ++++++++++++++++++++++++++++++ include/os/linux/zfs/sys/zpl.h | 11 ++++++++ module/os/linux/zfs/zfs_ctldir.c | 2 +- module/os/linux/zfs/zfs_vnops_os.c | 12 ++++++--- module/os/linux/zfs/zfs_znode.c | 18 ++++++++----- module/os/linux/zfs/zpl_inode.c | 2 +- module/os/linux/zfs/zpl_xattr.c | 7 ++--- 7 files changed, 80 insertions(+), 15 deletions(-) diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4 index 9c016c790081..412e13b47df5 100644 --- a/config/kernel-inode-times.m4 +++ b/config/kernel-inode-times.m4 @@ -27,6 +27,31 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ memset(&ip, 0, sizeof(ip)); ts = ip.i_mtime; ]) + + dnl # + dnl # 6.6 API change + dnl # i_ctime no longer directly accessible, must use + dnl # inode_get_ctime(ip), inode_set_ctime*(ip) to + dnl # read/write. + dnl # + ZFS_LINUX_TEST_SRC([inode_get_ctime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_ctime(&ip); + ]) + + ZFS_LINUX_TEST_SRC([inode_set_ctime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts; + + memset(&ip, 0, sizeof(ip)); + inode_set_ctime_to_ts(&ip, ts); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ @@ -47,4 +72,22 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1, [inode->i_*time's are timespec64]) ]) + + AC_MSG_CHECKING([whether inode_get_ctime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_ctime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_CTIME, 1, + [inode_get_ctime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_ctime_to_ts() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_ctime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_CTIME_TO_TS, 1, + [inode_set_ctime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 0bd20f64897d..f4f1dcf95d4c 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -263,4 +263,15 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(dentry, ia) #endif +#ifdef HAVE_INODE_GET_CTIME +#define zpl_inode_get_ctime(ip) inode_get_ctime(ip) +#else +#define zpl_inode_get_ctime(ip) (ip->i_ctime) +#endif +#ifdef HAVE_INODE_SET_CTIME_TO_TS +#define zpl_inode_set_ctime_to_ts(ip, ts) inode_set_ctime_to_ts(ip, ts) +#else +#define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts) +#endif + #endif /* _SYS_ZPL_H */ diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 02cb379ea840..94e25fa0ae8f 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -522,7 +522,7 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ip->i_blkbits = SPA_MINBLOCKSHIFT; ip->i_atime = now; ip->i_mtime = now; - ip->i_ctime = now; + zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; #if defined(IOP_XATTR) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 33baac9db06b..03865661d315 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2442,8 +2442,8 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = 
zpl_inode_timestamp_truncate(vap->va_ctime, - ZTOI(zp)); + zpl_inode_set_ctime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } @@ -3648,6 +3648,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; + inode_timespec_t tmp_ctime; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; @@ -3812,7 +3813,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ctime = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; @@ -3862,6 +3864,7 @@ zfs_dirty_inode(struct inode *ip, int flags) zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; + inode_timespec_t tmp_ctime; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; @@ -3908,7 +3911,8 @@ zfs_dirty_inode(struct inode *ip, int flags) /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ctime = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 52c8e51df659..f71026da83cb 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -542,6 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ctime; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; @@ -615,7 +616,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&ip->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; @@ -1195,6 +1197,7 @@ zfs_rezget(znode_t *zp) uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ctime; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1289,7 +1292,8 @@ zfs_rezget(znode_t *zp) ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { @@ -1397,7 +1401,7 @@ zfs_zinactive(znode_t *zp) boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now; + inode_timespec_t now, tmp_ctime; gethrestime(&now); /* @@ -1408,7 +1412,8 @@ zfs_relatime_need_update(const struct inode *ip) if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) return (B_TRUE); - if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) + tmp_ctime = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) @@ -1434,7 +1439,7 @@ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now; + 
inode_timespec_t now, tmp_ctime; gethrestime(&now); @@ -1451,7 +1456,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 5f5ad186a61c..ef50f8687779 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -774,7 +774,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return (-EMLINK); crhold(cr); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ip), !=, NULL); diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 96d85991811e..4e4f5210f85d 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -513,7 +513,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, error = -zfs_write_simple(xzp, value, size, pos, NULL); out: if (error == 0) { - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); } @@ -1011,7 +1011,8 @@ zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type) */ if (ip->i_mode != mode) { ip->i_mode = ITOZ(ip)->z_mode = mode; - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, + current_time(ip)); zfs_mark_inode_dirty(ip); } @@ -1170,7 +1171,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir) return (PTR_ERR(acl)); if (!acl) { ITOZ(ip)->z_mode = (ip->i_mode &= ~current_umask()); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); return (0); } From 21875dd09002efad4dc0a3815a73b0debe31a067 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Fri, 15 Sep 2023 00:36:39 -0400 Subject: [PATCH 57/78] Linux 6.6 compat: generic_fillattr has a new u32 request_mask added at arg2 In commit 0d72b92883c651a11059d93335f33d65c6eb653b, a new u32 argument for the request_mask was added to generic_fillattr. This is the same request_mask for statx that's present in the most recent API implemented by zpl_getattr_impl. This commit conditionally adds it to the zpl_generic_fillattr(...) macro, as well as the zfs_getattr_fast(...) implementation, when configure determines it's present in the kernel's generic_fillattr(...). 
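At the zfs_getattr_fast() call site this reduces to a compile-time selection like the
following (sketch of the hunks below):

    #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
            /* Linux 6.6+: pass the statx request_mask through. */
            zpl_generic_fillattr(user_ns, request_mask, ip, sp);
    #else
            /* Older kernels: no request_mask argument exists. */
            zpl_generic_fillattr(user_ns, ip, sp);
    #endif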
Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15263 --- config/kernel-generic_fillattr.m4 | 39 +++++++++++++++++----- include/os/linux/kernel/linux/vfs_compat.h | 6 ++++ include/os/linux/zfs/sys/zfs_vnops_os.h | 5 +++ module/os/linux/zfs/zfs_vnops_os.c | 9 +++++ module/os/linux/zfs/zpl_ctldir.c | 11 +++++- module/os/linux/zfs/zpl_inode.c | 4 ++- 6 files changed, 63 insertions(+), 11 deletions(-) diff --git a/config/kernel-generic_fillattr.m4 b/config/kernel-generic_fillattr.m4 index 02dee4d4c000..f5323f0dcb9f 100644 --- a/config/kernel-generic_fillattr.m4 +++ b/config/kernel-generic_fillattr.m4 @@ -7,6 +7,10 @@ dnl # dnl # 6.3 API dnl # generic_fillattr() now takes struct mnt_idmap* as the first argument dnl # +dnl # 6.6 API +dnl # generic_fillattr() now takes u32 as second argument, representing a +dnl # request_mask for statx +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [ ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [ #include @@ -25,22 +29,39 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [ struct kstat *k = NULL; generic_fillattr(idmap, in, k); ]) + + ZFS_LINUX_TEST_SRC([generic_fillattr_mnt_idmap_reqmask], [ + #include + ],[ + struct mnt_idmap *idmap = NULL; + struct inode *in = NULL; + struct kstat *k = NULL; + generic_fillattr(idmap, 0, in, k); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR], [ - AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ + AC_MSG_CHECKING( + [whether generic_fillattr requires struct mnt_idmap* and request_mask]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap_reqmask], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, - [generic_fillattr requires struct mnt_idmap*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK, 1, + [generic_fillattr requires struct mnt_idmap* and u32 request_mask]) ],[ - AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, - [generic_fillattr requires struct user_namespace*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, + [generic_fillattr requires struct mnt_idmap*]) ],[ - AC_MSG_RESULT([no]) + AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, + [generic_fillattr requires struct user_namespace*]) + ],[ + AC_MSG_RESULT([no]) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h index e156ed41c28c..aea8bd5ed22c 100644 --- a/include/os/linux/kernel/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -461,10 +461,16 @@ zpl_is_32bit_api(void) * 6.3 API change * generic_fillattr() first arg is changed to struct mnt_idmap * * + * 6.6 API change + * generic_fillattr() gets new second arg request_mask, a u32 type + * */ #ifdef HAVE_GENERIC_FILLATTR_IDMAP #define zpl_generic_fillattr(idmap, ip, sp) \ generic_fillattr(idmap, ip, sp) +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) +#define zpl_generic_fillattr(idmap, rqm, ip, sp) \ + generic_fillattr(idmap, rqm, ip, sp) #elif defined(HAVE_GENERIC_FILLATTR_USERNS) #define zpl_generic_fillattr(user_ns, ip, sp) \ generic_fillattr(user_ns, ip, sp) diff --git 
a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 7a1db7deeec8..830c76e5743a 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -56,7 +56,12 @@ extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +extern int zfs_getattr_fast(zidmap_t *, u32 request_mask, struct inode *ip, + struct kstat *sp); +#else extern int zfs_getattr_fast(zidmap_t *, struct inode *ip, struct kstat *sp); +#endif extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, zidmap_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 03865661d315..d30290c69ebd 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1652,7 +1652,12 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) * RETURN: 0 (always succeeds) */ int +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, + struct kstat *sp) +#else zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) +#endif { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -1665,7 +1670,11 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) mutex_enter(&zp->z_lock); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + zpl_generic_fillattr(user_ns, request_mask, ip, sp); +#else zpl_generic_fillattr(user_ns, ip, sp); +#endif /* * +1 link count for root inode with visible '.zfs' directory. 
*/ diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index 7786444fea35..8ee7fcecc7b7 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -124,6 +124,8 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat, generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -435,6 +437,8 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -609,6 +613,8 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, generic_fillattr(user_ns, path->dentry->d_inode, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, path->dentry->d_inode, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -623,7 +629,10 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { -#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp), + stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); #else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index ef50f8687779..96f65b9e94e2 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -435,7 +435,9 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, * XXX query_flags currently ignored. */ -#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ip, stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ip, stat); #else error = -zfs_getattr_fast(kcred->user_ns, ip, stat); From 3f67e012e4507527f9d8aae6c93efbe5660556d3 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Fri, 15 Sep 2023 01:07:03 -0400 Subject: [PATCH 58/78] Linux 6.6 compat: fsync_bdev() has been removed in favor of sync_blockdev() In Linux commit 560e20e4bf6484a0c12f9f3c7a1aa55056948e1e, the fsync_bdev() function was removed in favor of sync_blockdev() to do (roughly) the same thing, given the same input. This change conditionally attempts to call sync_blockdev() if fsync_bdev() isn't discovered during configure. 
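The zvol ioctl path then reduces to a compile-time choice such as (sketch of the
zvol_os.c hunk below):

    #ifdef HAVE_FSYNC_BDEV
            fsync_bdev(bdev);          /* kernels before 6.6 */
    #elif defined(HAVE_SYNC_BLOCKDEV)
            sync_blockdev(bdev);       /* 6.6+: equivalent for this input */
    #else
    #error "Neither fsync_bdev() nor sync_blockdev() found"
    #endif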
Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15263 --- config/kernel-fsync-bdev.m4 | 36 +++++++++++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/zfs/zvol_os.c | 6 ++++++ 3 files changed, 44 insertions(+) create mode 100644 config/kernel-fsync-bdev.m4 diff --git a/config/kernel-fsync-bdev.m4 b/config/kernel-fsync-bdev.m4 new file mode 100644 index 000000000000..c47e236f705f --- /dev/null +++ b/config/kernel-fsync-bdev.m4 @@ -0,0 +1,36 @@ +dnl # +dnl # 6.6 API change, +dnl # fsync_bdev was removed in favor of sync_blockdev +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SYNC_BDEV], [ + ZFS_LINUX_TEST_SRC([fsync_bdev], [ + #include + ],[ + fsync_bdev(NULL); + ]) + + ZFS_LINUX_TEST_SRC([sync_blockdev], [ + #include + ],[ + sync_blockdev(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SYNC_BDEV], [ + AC_MSG_CHECKING([whether fsync_bdev() exists]) + ZFS_LINUX_TEST_RESULT([fsync_bdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FSYNC_BDEV, 1, + [fsync_bdev() is declared in include/blkdev.h]) + ],[ + AC_MSG_CHECKING([whether sync_blockdev() exists]) + ZFS_LINUX_TEST_RESULT([sync_blockdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SYNC_BLOCKDEV, 1, + [sync_blockdev() is declared in include/blkdev.h]) + ],[ + ZFS_LINUX_TEST_ERROR( + [neither fsync_bdev() nor sync_blockdev() exist]) + ]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index df194ec72207..056517a841f2 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -162,6 +162,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ + ZFS_AC_KERNEL_SRC_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -303,6 +304,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ + ZFS_AC_KERNEL_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7a95b54bdf0d..f94ce69fb9e2 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -873,7 +873,13 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKFLSBUF: +#ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); +#elif defined(HAVE_SYNC_BLOCKDEV) + sync_blockdev(bdev); +#else +#error "Neither fsync_bdev() nor sync_blockdev() found" +#endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); From e82e68400a5be0e82d10044d4f70ebaed1547cb9 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 11 Oct 2023 19:37:21 -0400 Subject: [PATCH 59/78] DMU: Do not pre-read holes during write dmu_tx_check_ioerr() pre-reads blocks that are going to be dirtied as part of transaction to both prefetch them and check for errors. But it makes no sense to do it for holes, since there are no disk reads to prefetch and there can be no errors. On the other side those blocks are anonymous, and they are freed immediately by the dbuf_rele() without even being put into dbuf cache, so we just burn CPU time on decompression and overheads and get absolutely no result at the end. Use of dbuf_hold_impl() with fail_sparse parameter allows to skip the extra work, and on my tests with sequential 8KB writes to empty ZVOL with 32KB blocks shows throughput increase from 1.7 to 2GB/s. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15371 --- module/zfs/dmu_tx.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 0eb8c17e331a..8451b5082e86 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -210,10 +210,12 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); + err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); /* * PARTIAL_FIRST allows caching for uncacheable blocks. It will * be cleared after dmu_buf_will_dirty() call dbuf_read() again. From bd7a02c251d8c119937e847d5161b512913667e6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 30 Oct 2023 17:55:32 -0400 Subject: [PATCH 60/78] Tune zio buffer caches and their alignments We should not always use PAGESIZE alignment for caches bigger than it and SPA_MINBLOCKSIZE otherwise. Doing that caches for 5, 6, 7, 10 and 14KB rounded up to 8, 12 and 16KB respectively make no sense. Instead specify as alignment the biggest power-of-2 divisor. This way 2KB and 6KB caches are both aligned to 2KB, while 4KB and 8KB are aligned to 4KB. Reduce number of caches to half-power of 2 instead of quarter-power of 2. This removes caches difficult for underlying allocators to fit into page-granular slabs, such as: 2.5, 3.5, 5, 7, 10KB, etc. Since these caches are mostly used for transient allocations like ZIOs and small DBUF cache it does not worth being too aggressive. Due to the above alignment issue some of those caches were not working properly any way. 6KB cache now finally has a chance to work right, placing 2 buffers into 3 pages, that makes sense. Remove explicit alignment in Linux user-space case. I don't think it should be needed any more with the above fixes. As result on FreeBSD instead of such numbers of pages per slab: vm.uma.zio_buf_comb_16384.keg.ppera: 4 vm.uma.zio_buf_comb_14336.keg.ppera: 4 vm.uma.zio_buf_comb_12288.keg.ppera: 3 vm.uma.zio_buf_comb_10240.keg.ppera: 3 vm.uma.zio_buf_comb_8192.keg.ppera: 2 vm.uma.zio_buf_comb_7168.keg.ppera: 2 vm.uma.zio_buf_comb_6144.keg.ppera: 2 <= Broken vm.uma.zio_buf_comb_5120.keg.ppera: 2 vm.uma.zio_buf_comb_4096.keg.ppera: 1 vm.uma.zio_buf_comb_3584.keg.ppera: 7 <= Hard to free vm.uma.zio_buf_comb_3072.keg.ppera: 3 vm.uma.zio_buf_comb_2560.keg.ppera: 2 vm.uma.zio_buf_comb_2048.keg.ppera: 1 vm.uma.zio_buf_comb_1536.keg.ppera: 2 vm.uma.zio_buf_comb_1024.keg.ppera: 1 vm.uma.zio_buf_comb_512.keg.ppera: 1 I am now getting such: vm.uma.zio_buf_comb_16384.keg.ppera: 4 vm.uma.zio_buf_comb_12288.keg.ppera: 3 vm.uma.zio_buf_comb_8192.keg.ppera: 2 vm.uma.zio_buf_comb_6144.keg.ppera: 3 <= Fixed, 2 in 3 pages vm.uma.zio_buf_comb_4096.keg.ppera: 1 vm.uma.zio_buf_comb_3072.keg.ppera: 3 vm.uma.zio_buf_comb_2048.keg.ppera: 1 vm.uma.zio_buf_comb_1536.keg.ppera: 2 vm.uma.zio_buf_comb_1024.keg.ppera: 1 vm.uma.zio_buf_comb_512.keg.ppera: 1 Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15452 --- module/zfs/zio.c | 89 +++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 50 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73d8..a719e5492323 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -158,23 +158,22 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For larger buffers, we want a cache - * for each quarter-power of 2. - */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - size_t data_cflags, cflags; - - data_cflags = KMC_NODEBUG; - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; + size_t align, cflags, data_cflags; + char name[32]; + /* + * Create cache for each half-power of 2 size, starting from + * SPA_MINBLOCKSIZE. It should give us memory space efficiency + * of ~7/8, sufficient for transient allocations mostly using + * these caches. + */ + size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; + if (!IS_P2ALIGNED(size, p2 / 2)) + continue; #ifndef _KERNEL /* @@ -185,47 +184,37 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; - /* - * Here's the problem - on 4K native devices in userland on - * Linux using O_DIRECT, buffers must be 4K aligned or I/O - * will fail with EINVAL, causing zdb (and others) to coredump. - * Since userland probably doesn't need optimized buffer caches, - * we just force 4K alignment on everything. - */ - align = 8 * SPA_MINBLOCKSIZE; -#else - if (size < PAGESIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = PAGESIZE; - } #endif - if (align != 0) { - char name[36]; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, - size, align, NULL, NULL, NULL, NULL, NULL, - cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; - } - (void) snprintf(name, sizeof (name), "zio_buf_%lu", - (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); - - (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", - (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, data_cflags); + if (IS_P2ALIGNED(size, PAGESIZE)) + align = PAGESIZE; + else + align = 1 << (highbit64(size ^ (size - 1)) - 1); + + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + data_cflags = KMC_NODEBUG; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. 
+ */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { From 3ec4ea68d491a82c8de3360d50032bdecd53608f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 30 Oct 2023 19:56:04 -0400 Subject: [PATCH 61/78] Unify arc_prune_async() code There is no sense to have separate implementations for FreeBSD and Linux. Make Linux code shared as more functional and just register FreeBSD-specific prune callback with arc_add_prune_callback() API. Aside of code cleanup this should fix excessive pruning on FreeBSD: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=274698 Reviewed-by: Brian Behlendorf Reviewed-by: Mark Johnston Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15456 --- include/os/linux/zfs/sys/zpl.h | 2 +- include/sys/arc.h | 2 +- include/sys/arc_impl.h | 1 - module/os/freebsd/zfs/arc_os.c | 62 ------------------------------ module/os/freebsd/zfs/zfs_vfsops.c | 32 +++++++++++++++ module/os/linux/zfs/arc_os.c | 51 ------------------------ module/os/linux/zfs/zpl_super.c | 2 +- module/zfs/arc.c | 52 +++++++++++++++++++++++++ 8 files changed, 87 insertions(+), 117 deletions(-) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index f4f1dcf95d4c..9b729be6d74d 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -60,7 +60,7 @@ extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ -extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); +extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; extern const struct export_operations zpl_export_operations; diff --git a/include/sys/arc.h b/include/sys/arc.h index 9d67dab06ca3..05307aab99e3 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -81,7 +81,7 @@ typedef struct arc_prune arc_prune_t; typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef void arc_prune_func_t(int64_t bytes, void *priv); +typedef void arc_prune_func_t(uint64_t bytes, void *priv); /* Shared module parameters */ extern uint_t zfs_arc_average_blocksize; diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index adff42c55d05..defebe3b2fbb 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -1065,7 +1065,6 @@ extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); -extern void arc_prune_async(uint64_t); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 12f16edb1e2b..92696c0bf1ae 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -52,11 
+52,6 @@ #include #include -#if __FreeBSD_version >= 1300139 -static struct sx arc_vnlru_lock; -static struct vnode *arc_vnlru_marker; -#endif - extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; @@ -131,53 +126,6 @@ arc_default_max(uint64_t min, uint64_t allmem) return (MAX(allmem * 5 / 8, size)); } -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *arg) -{ - uint64_t nr_scan = (uintptr_t)arg; - -#ifndef __ILP32__ - if (nr_scan > INT_MAX) - nr_scan = INT_MAX; -#endif - -#if __FreeBSD_version >= 1300139 - sx_xlock(&arc_vnlru_lock); - vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); - sx_xunlock(&arc_vnlru_lock); -#else - vnlru_free(nr_scan, &zfs_vfsops); -#endif -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the metadata limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(uint64_t adjust) -{ - -#ifndef __LP64__ - if (adjust > UINTPTR_MAX) - adjust = UINTPTR_MAX; -#endif - taskq_dispatch(arc_prune_taskq, arc_prune_task, - (void *)(intptr_t)adjust, TQ_SLEEP); - ARCSTAT_BUMP(arcstat_prune); -} - uint64_t arc_all_memory(void) { @@ -228,10 +176,6 @@ arc_lowmem_init(void) { arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); -#if __FreeBSD_version >= 1300139 - arc_vnlru_marker = vnlru_alloc_marker(); - sx_init(&arc_vnlru_lock, "arc vnlru lock"); -#endif } void @@ -239,12 +183,6 @@ arc_lowmem_fini(void) { if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#if __FreeBSD_version >= 1300139 - if (arc_vnlru_marker != NULL) { - vnlru_free_marker(arc_vnlru_marker); - sx_destroy(&arc_vnlru_lock); - } -#endif } void diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index e8b9ada1316b..a972c720dfdb 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -2070,6 +2070,26 @@ zfs_vnodes_adjust_back(void) #endif } +#if __FreeBSD_version >= 1300139 +static struct sx zfs_vnlru_lock; +static struct vnode *zfs_vnlru_marker; +#endif +static arc_prune_t *zfs_prune; + +static void +zfs_prune_task(uint64_t nr_to_scan, void *arg __unused) +{ + if (nr_to_scan > INT_MAX) + nr_to_scan = INT_MAX; +#if __FreeBSD_version >= 1300139 + sx_xlock(&zfs_vnlru_lock); + vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker); + sx_xunlock(&zfs_vnlru_lock); +#else + vnlru_free(nr_to_scan, &zfs_vfsops); +#endif +} + void zfs_init(void) { @@ -2096,11 +2116,23 @@ zfs_init(void) dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); + +#if __FreeBSD_version >= 1300139 + zfs_vnlru_marker = vnlru_alloc_marker(); + sx_init(&zfs_vnlru_lock, "zfs vnlru lock"); +#endif + zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL); } void zfs_fini(void) { + arc_remove_prune_callback(zfs_prune); +#if __FreeBSD_version >= 1300139 + vnlru_free_marker(zfs_vnlru_marker); + 
sx_destroy(&zfs_vnlru_lock); +#endif + taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 29a8802b8367..43ed087e2dbb 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -489,56 +489,5 @@ arc_unregister_hotplug(void) } #endif /* _KERNEL */ -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the metadata limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(uint64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index ad52a11aada0..d98d32c1f9fb 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -375,7 +375,7 @@ zpl_kill_sb(struct super_block *sb) } void -zpl_prune_sb(int64_t nr_to_scan, void *arg) +zpl_prune_sb(uint64_t nr_to_scan, void *arg) { struct super_block *sb = (struct super_block *)arg; int objects = 0; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 06544925b5ca..dfea15b74394 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -886,6 +886,8 @@ static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); +static void arc_prune_async(uint64_t adjust); + #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ @@ -6050,6 +6052,56 @@ arc_remove_prune_callback(arc_prune_t *p) kmem_free(p, sizeof (*p)); } +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffers they reference. 
This provides a mechanism to ensure the ARC can + * honor the metadata limit and reclaim otherwise pinned ARC buffers. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +static void +arc_prune_async(uint64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + /* * Notify the arc that a block was freed, and thus will never be used again. */ From 1c7048357d91d612ebde9429a06a889a15865643 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 31 Oct 2023 21:51:54 +0500 Subject: [PATCH 62/78] Add all read-only compatible zpool features to grub2 compatibility GRUB opens the boot pool in read-only mode. All read-only compatible features for zpool can be enabled and added to grub2 compatibility, as GRUB does not open the boot-pool for write. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15459 --- cmd/zpool/compatibility.d/grub2 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmd/zpool/compatibility.d/grub2 b/cmd/zpool/compatibility.d/grub2 index fec73a269a78..8c3a3ce11d94 100644 --- a/cmd/zpool/compatibility.d/grub2 +++ b/cmd/zpool/compatibility.d/grub2 @@ -1,6 +1,9 @@ # Features which are supported by GRUB2 +allocation_classes async_destroy +block_cloning bookmarks +device_rebuild embedded_data empty_bpobj enabled_txg @@ -9,6 +12,13 @@ filesystem_limits hole_birth large_blocks livelist +log_spacemap lz4_compress +obsolete_counts +project_quota +resilver_defer spacemap_histogram +spacemap_v2 +userobj_accounting +zilsaxattr zpool_checkpoint From e534ba5ce7dea316eeab1ece6c9d7ae61f6dd26d Mon Sep 17 00:00:00 2001 From: AllKind Date: Tue, 7 Nov 2023 20:27:29 +0100 Subject: [PATCH 63/78] Fix dkms installation of deb packages created with Alien. Alien does not honour the %posttrans hook. So move the dkms uninstall/install scripts to the %pre/%post hooks in case of package install/upgrade. In case of package removal, handle that in %preun. Add removal of all old dkms modules. Add checking for broken 'dkms status'. Handle that as good as possible and warn the user about it. Also add more verbose messages about what we are doing. 
Reviewed-by: Brian Behlendorf Signed-off-by: Mart Frauenlob Closes #15415 --- rpm/generic/zfs-dkms.spec.in | 90 ++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 23c3ed6ff408..d56967d7a8b1 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -24,6 +24,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildArch: noarch Requires: dkms >= 2.2.0.3 +Requires(pre): dkms >= 2.2.0.3 Requires(post): dkms >= 2.2.0.3 Requires(preun): dkms >= 2.2.0.3 Requires: gcc, make, perl, diffutils @@ -68,9 +69,92 @@ fi %defattr(-,root,root) /usr/src/%{module}-%{version} +%pre +echo "Running pre installation script: $0. Parameters: $*" +# We don't want any other versions lingering around in dkms. +# Tests with 'dnf' showed that in case of reinstall, or upgrade +# the preun scriptlet removed the version we are trying to install. +# Because of this, find all zfs dkms sources in /var/lib/dkms and +# remove them, if we find a matching version in dkms. + +dkms_root=/var/lib/dkms +if [ -d ${dkms_root}/%{module} ]; then + cd ${dkms_root}/%{module} + for x in [[:digit:]]*; do + [ -d "$x" ] || continue + otherver="$x" + opath="${dkms_root}/%{module}/${otherver}" + if [ "$otherver" != %{version} ]; then + # This is a workaround for a broken 'dkms status', we caused in a previous version. + # One day it might be not needed anymore, but it does not hurt to keep it. + if dkms status -m %{module} -v "$otherver" 2>&1 | grep "${opath}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + if [ -L "${opath}/source" -a ! -d "${opath}/source" ] + then + echo "Trying to fix it by removing the symlink: ${opath}/source" >&2 + echo "You should manually remove ${opath}" >&2 + rm -f "${opath}/source" || echo "Removal failed!" >&2 + fi + fi + if [ `dkms status -m %{module} -v "$otherver" | grep -c %{module}` -gt 0 ]; then + echo "Removing old %{module} dkms modules version $otherver from all kernels." + dkms remove -m %{module} -v "$otherver" --all ||: + fi + fi + done +fi + +# Uninstall this version of zfs dkms modules before installation of the package. +if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all ||: +fi + +%post +echo "Running post installation script: $0. Parameters: $*" +# Add the module to dkms, as reccommended in the dkms man page. +# This is generally rpm specfic. +# But this also may help, if we have a broken 'dkms status'. +# Because, if the sources are available and only the symlink pointing +# to them is missing, this will resolve the situation +echo "Adding %{module} dkms modules version %{version} to dkms." +dkms add -m %{module} -v %{version} %{!?not_rpm:--rpm_safe_upgrade} ||: + +# After installing the package, dkms install this zfs version for the current kernel. +# Force the overwriting of old modules to avoid diff warnings in dkms status. +# Or in case of a downgrade to overwrite newer versions. +# Or if some other backed up versions have been restored before. +echo "Installing %{module} dkms modules version %{version} for the current kernel." +dkms install --force -m %{module} -v %{version} ||: + %preun -dkms remove -m %{module} -v %{version} --all +dkms_root="/var/lib/dkms/%{module}/%{version}" +echo "Running pre uninstall script: $0. 
Parameters: $*" +# In case of upgrade we do nothing. See above comment in pre hook. +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then + echo "This is an upgrade. Skipping pre uninstall action." + exit 0 +fi + +# Check if we uninstall the package. In that case remove the dkms modules. +# '0' is the value for the first parameter for rpm packages. +# 'remove' or 'purge' are the possible names for deb packages. +if [ "$1" = "0" -o "$1" = "remove" -o "$1" = "purge" ] ; then + if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} && exit 0 + fi + # If removing the modules failed, it might be because of the broken 'dkms status'. + if dkms status -m %{module} -v %{version} 2>&1 | grep "${dkms_root}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + echo "You should manually remove ${dkms_root}" >&2 + echo "WARNING: installed modules in /lib/modules/`uname -r`/extra could not be removed automatically!" >&2 + fi +else + echo "Script parameter $1 did not match any removal condition." +fi -%posttrans -/usr/lib/dkms/common.postinst %{module} %{version} +exit 0 From f0ffcc3adcd01ec60b384b0c07c8671b6fd1d982 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 Nov 2023 01:04:56 +0500 Subject: [PATCH 64/78] Remove obsolete_counts from grub2 compatibility list PR#15459 add all read-only compatible zpool features to grub2 compatibility list. 'obsolete_counts' is a read-only features that depends on 'device_removal' feature which is not read-only and is marked as ZFEATURE_FLAG_MOS. Creating a pool with grub2 compatibility enables 'device_removal' feature as well, which is not desired. This commit removes the 'obsolete_counts' feature from grub2 compatibility list, as GRUB only supports read-only compatible features. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15499 --- cmd/zpool/compatibility.d/grub2 | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/zpool/compatibility.d/grub2 b/cmd/zpool/compatibility.d/grub2 index 8c3a3ce11d94..6d60e643593b 100644 --- a/cmd/zpool/compatibility.d/grub2 +++ b/cmd/zpool/compatibility.d/grub2 @@ -14,7 +14,6 @@ large_blocks livelist log_spacemap lz4_compress -obsolete_counts project_quota resilver_defer spacemap_histogram From 44c8ff9b0ce2d94bd692087e86812f6a9f064353 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 Nov 2023 02:24:16 +0500 Subject: [PATCH 65/78] Linux 6.6 compat: fix implicit conversion error with debug build With Linux v6.6.0 and GCC 12, when debug build is configured, implicit conversion error is raised while converting 'enum ' to 'boolean_t'. Use 'B_TRUE' instead of 'true' to fix the issue. Reviewed-by: Brian Behlendorf Reviewed-by: Pavel Snajdr Reviewed-by: Brian Atkinson Signed-off-by: Umer Saleem Closes #15489 --- module/os/linux/zfs/zfs_vfsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index a1db5c57c18b..2792bc027213 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1488,7 +1488,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) * read-only flag, pretend it was set, as done for snapshots. 
*/ if (!canwrite) - vfs->vfs_readonly = true; + vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { From 76663fe3720b3ea3ba72de8ebe1bc1debb67b393 Mon Sep 17 00:00:00 2001 From: MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:06:14 -0700 Subject: [PATCH 66/78] Fix accounting error for pending sync IO ops in zpool iostat Currently vdev_queue_class_length is responsible for checking how long the queue is; however, it doesn't check the length when a list is used, but rather just returns whether the list is empty or not. To fix this, I added a counter variable to vdev_queue_class to keep track of the sync IO ops, and changed vdev_queue_class_length to reference this variable instead. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: MigeljanImeri Closes #15478 --- include/sys/vdev_impl.h | 5 ++++- module/zfs/vdev_queue.c | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index ad9dc3aefd8e..3f2312c23438 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -131,7 +131,10 @@ typedef const struct vdev_ops { * Virtual device properties */ typedef union vdev_queue_class { - list_t vqc_list; + struct { + ulong_t vqc_list_numnodes; + list_t vqc_list; + }; avl_tree_t vqc_tree; } vdev_queue_class_t; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 08d918467d03..092b3f375be0 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -273,8 +273,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; vq->vq_cqueued |= 1U << p; - if (vdev_queue_class_fifo(p)) + if (vdev_queue_class_fifo(p)) { list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } else avl_add(&vq->vq_class[p].vqc_tree, zio); } @@ -288,6 +290,7 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) list_t *list = &vq->vq_class[p].vqc_list; list_remove(list, zio); empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; } else { avl_tree_t *tree = &vq->vq_class[p].vqc_tree; avl_remove(tree, zio); @@ -1069,7 +1072,7 @@ vdev_queue_class_length(vdev_t *vd, zio_priority_t p) { vdev_queue_t *vq = &vd->vdev_queue; if (vdev_queue_class_fifo(p)) - return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + return (vq->vq_class[p].vqc_list_numnodes); else return (avl_numnodes(&vq->vq_class[p].vqc_tree)); } From f2fe4d51a8daeb2f95fb60333af0d395f3ee3fd9 Mon Sep 17 00:00:00 2001 From: Low-power Date: Thu, 9 Nov 2023 04:19:38 +0800 Subject: [PATCH 67/78] Linux: reject read/write mapping to immutable file only on VM_SHARED Private read/write mappings can't be used to modify the mapped files, so those files remain immutable. Private read/write mappings are usually used to load the data segment of executable files; rejecting them would cause immutable executable files to stop working.
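For illustration only (not part of this change), a minimal userspace sketch of the distinction is shown below; it assumes an existing, non-empty test file passed as the only argument. A writable MAP_PRIVATE mapping, which is how a loader maps an executable's data segment, only dirties private copy-on-write pages and never reaches the on-disk file, while a writable MAP_SHARED mapping is the case zfs_map() must still reject with EPERM for immutable, read-only, or append-only files.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2)
		return (1);

	/* An immutable file can still be opened for reading. */
	int fd = open(argv[1], O_RDONLY);
	if (fd == -1)
		return (1);

	/*
	 * Writable private mapping: stores go to anonymous copy-on-write
	 * pages and are never written back, so the on-disk file stays
	 * unchanged. This is the case the patch now allows.
	 */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE,
	    fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_PRIVATE)");
		return (1);
	}
	p[0] ^= 0xff;	/* modifies only this process's private copy */

	/*
	 * A writable MAP_SHARED mapping would propagate stores to the
	 * file; that case is still rejected for immutable files (and
	 * would also require the file to be open for writing).
	 */
	(void) munmap(p, 4096);
	(void) close(fd);
	return (0);
}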
Reviewed-by: Brian Behlendorf Signed-off-by: WHR Closes #15344 --- module/os/linux/zfs/zfs_vnops_os.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index d30290c69ebd..a087a5767760 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4071,8 +4071,8 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && + (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } From f6d2e5c0759413822b615475c601b7c6b628facc Mon Sep 17 00:00:00 2001 From: AllKind Date: Wed, 8 Nov 2023 19:30:46 +0100 Subject: [PATCH 68/78] Workaround to allow openzfs-zfs-dkms install on Ubuntu As shown in #15404#issuecomment-1765002181, the Ubuntu kernel has 'Provides: zfs-dkms', which will cause the kernel to be uninstalled when attempting to install openzfs-zfs-dkms. As a workaround, remove the 'Conflicts: zfs-dkms' definition from the debian control file. Reviewed-by: Brian Behlendorf Signed-off-by: Mart Frauenlob Closes #15503 --- contrib/debian/control | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/debian/control b/contrib/debian/control index f4e97fe16145..98beb900d0fa 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -197,7 +197,6 @@ Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Re Suggests: debhelper Breaks: spl-dkms (<< 0.8.0~rc1) Replaces: spl-dkms, zfs-dkms -Conflicts: zfs-dkms Provides: openzfs-zfs-modules Description: OpenZFS filesystem kernel modules for Linux OpenZFS is a storage platform that encompasses the functionality of From f863ac3d0fab6e6c842746e6c63a3a2967f07680 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 Nov 2023 12:16:24 +0500 Subject: [PATCH 69/78] Update zpool-features.7 for grub2 compatibility list updates This commit updates the zpool-features.7 man page to add the newly added zpool features to the grub2 compatibility list. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #15505 --- man/man7/zpool-features.7 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b901ce6c2935..8ca4bd927b24 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -219,8 +219,11 @@ to the end of the line is ignored. .Bd -literal -compact -offset 4n .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 # Features which are supported by GRUB2 +allocation_classes async_destroy +block_cloning bookmarks +device_rebuild embedded_data empty_bpobj enabled_txg @@ -229,8 +232,14 @@ filesystem_limits hole_birth large_blocks livelist +log_spacemap lz4_compress +project_quota +resilver_defer spacemap_histogram +spacemap_v2 +userobj_accounting +zilsaxattr zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev From f1659cc782e90d89ee0c417a56f1174656fba9ba Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 Nov 2023 16:00:59 +0500 Subject: [PATCH 70/78] ZTS: Test for all known zpool feature sets zpool_create_features_007_pos only tested the compat-2020 feature set. It would be useful to test all known feature sets.
If any additional feature is found enabled that is not present in the compatibility list or feature set, it should be caught and reported earlier. This commit also removes encryption from the openzfsonosx-1.8.1 compatibility list. Encryption enables bookmark_v2, since it is a dependency of encryption, but bookmark_v2 is not listed in the openzfsonosx-1.8.1 compatibility list. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #15505 --- cmd/zpool/compatibility.d/openzfsonosx-1.8.1 | 1 - .../zpool_create/zpool_create_features_007_pos.ksh | 10 +++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 b/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 index 162ff32a7803..125c578344f9 100644 --- a/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 +++ b/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 @@ -6,7 +6,6 @@ edonr embedded_data empty_bpobj enabled_txg -encryption extensible_dataset filesystem_limits hole_birth diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh index c35ca8e8c92c..c7c133a219cd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh @@ -34,6 +34,7 @@ # STRATEGY: # 1. Create a pool with a known feature set. # 2. Verify only those features are active/enabled. +# 3. Do this for all known feature sets # verify_runnable "global" @@ -47,8 +48,11 @@ log_onexit cleanup log_assert "creates a pool with a specified feature set enabled" -log_must zpool create -f -o compatibility=compat-2020 $TESTPOOL $DISKS -check_feature_set $TESTPOOL compat-2020 -log_must zpool destroy -f $TESTPOOL +for compat in "$ZPOOL_COMPAT_DIR"/* +do + log_must zpool create -f -o compatibility="${compat##*/}" $TESTPOOL $DISKS + check_feature_set $TESTPOOL "${compat##*/}" + log_must zpool destroy -f $TESTPOOL +done log_pass "creates a pool with a specified feature set enabled" From e92a680c7084c6a48687bed13896859e41c6d1b3 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Sun, 12 Nov 2023 16:26:07 -0800 Subject: [PATCH 71/78] Workaround UBSAN errors for variable arrays This gets around UBSAN errors when using arrays at the end of structs. It converts some zero-length arrays to C99 flexible array members and disables UBSAN checking on certain modules. It is based on the patch from #15460.
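To make the UBSAN issue concrete, here is a small stand-alone sketch (hypothetical struct and field names, not ZFS code): with a GNU-style zero-length array the declared bound is 0, so newer compilers' stricter bounds instrumentation can flag any objs[i] access, while a flexible array member has no declared bound and is the form UBSAN expects for a trailing variable-size array. The allocation pattern is identical in both cases.

#include <stdlib.h>

struct mag_old {
	unsigned int count;
	void *objs[0];		/* zero-length array: UBSAN may flag objs[i] */
};

struct mag_new {
	unsigned int count;
	void *objs[];		/* flexible array member: no bound to violate */
};

int
main(void)
{
	unsigned int n = 8;

	/* Header plus trailing array, allocated as one block. */
	struct mag_new *m = malloc(sizeof (*m) + n * sizeof (void *));
	if (m == NULL)
		return (1);

	m->count = n;
	for (unsigned int i = 0; i < n; i++)
		m->objs[i] = NULL;	/* in bounds for the allocation */

	free(m);
	return (0);
}

The hunks below apply the same [0] to [] conversion to skm_objs, rr_col and rm_row; for zap_leaf.o, zap_micro.o and sa.o the patch instead disables UBSAN per object file.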
Reviewed-by: Brian Behlendorf Tested-by: Thomas Lamprecht Co-authored-by: Thomas Lamprecht Signed-off-by: Tony Hutter Issue #15145 Closes #15510 --- include/os/linux/spl/sys/kmem_cache.h | 2 +- include/sys/vdev_raidz_impl.h | 4 ++-- module/Kbuild.in | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index 20eeadc46e10..82d50b6034c4 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -108,7 +108,7 @@ typedef struct spl_kmem_magazine { uint32_t skm_refill; /* Batch refill size */ struct spl_kmem_cache *skm_cache; /* Owned by cache */ unsigned int skm_cpu; /* Owned by cpu */ - void *skm_objs[0]; /* Object pointers */ + void *skm_objs[]; /* Object pointers */ } spl_kmem_magazine_t; typedef struct spl_kmem_obj { diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index c1037fa12e30..73c26dff1e0e 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -130,7 +130,7 @@ typedef struct raidz_row { uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ #endif - raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ + raidz_col_t rr_col[]; /* Flexible array of I/O columns */ } raidz_row_t; typedef struct raidz_map { @@ -139,7 +139,7 @@ typedef struct raidz_map { int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_row_t *rm_row[0]; /* flexible array of rows */ + raidz_row_t *rm_row[]; /* flexible array of rows */ } raidz_map_t; diff --git a/module/Kbuild.in b/module/Kbuild.in index c132171592a8..b9c284a24418 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -488,6 +488,10 @@ zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) +UBSAN_SANITIZE_zap_leaf.o := n +UBSAN_SANITIZE_zap_micro.o := n +UBSAN_SANITIZE_sa.o := n + # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y From fd836dfe24f8649e3ee40c1c61fb37b5222aa367 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 14 Nov 2023 09:55:28 -0800 Subject: [PATCH 72/78] Linux 6.6 compat: META Update the META file to reflect compatibility with the 6.6 kernel. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Umer Saleem Signed-off-by: Tony Hutter Closes #15520 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 0d7df10d47db..720500d51c06 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.5 +Linux-Maximum: 6.6 Linux-Minimum: 3.10 From 0733fe2aa56b123b153f9fe8ad86d5e67d40dc7d Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Thu, 16 Nov 2023 21:58:47 +0500 Subject: [PATCH 73/78] Packaging: Auto-generate changelog during configure (#15528) Auto-generate the changelog based on @VERSION@ during configure, so that it does not need to be updated with new releases / version updates.
Signed-off-by: Umer Saleem Reviewed-by: Tony Hutter --- .gitignore | 1 + configure.ac | 1 + contrib/debian/{changelog => changelog.in} | 6 ++++++ 3 files changed, 8 insertions(+) rename contrib/debian/{changelog => changelog.in} (74%) diff --git a/.gitignore b/.gitignore index 47d17ae16d34..a2cb92dd5406 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,7 @@ modules.order Makefile Makefile.in +changelog *.patch *.orig *.tmp diff --git a/configure.ac b/configure.ac index 4c75616e4299..f31fe1db81e4 100644 --- a/configure.ac +++ b/configure.ac @@ -67,6 +67,7 @@ ZFS_AC_DEBUG_INVARIANTS AC_CONFIG_FILES([ contrib/debian/rules + contrib/debian/changelog Makefile include/Makefile lib/libzfs/libzfs.pc diff --git a/contrib/debian/changelog b/contrib/debian/changelog.in similarity index 74% rename from contrib/debian/changelog rename to contrib/debian/changelog.in index ba42ea59fa8d..525519a73d08 100644 --- a/contrib/debian/changelog +++ b/contrib/debian/changelog.in @@ -1,3 +1,9 @@ +openzfs-linux (@VERSION@-1) unstable; urgency=low + + * OpenZFS @VERSION@ is tagged. + + -- Umer Saleem Wed, 15 Nov 2023 15:00:00 +0500 + openzfs-linux (2.2.0-0) unstable; urgency=low * OpenZFS 2.2.0 is tagged. From 87e9e828655c250ce064874ff5df16f870c0a52e Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:35:22 -0500 Subject: [PATCH 74/78] Add a tunable to disable BRT support. Copy the disable parameter that FreeBSD implemented, and extend it to work on Linux as well, until we're sure this is stable. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #15529 --- include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 1 + include/os/linux/zfs/sys/zfs_vfsops_os.h | 2 ++ man/man4/zfs.4 | 5 +++++ module/os/freebsd/zfs/zfs_vfsops.c | 4 ++++ module/os/freebsd/zfs/zfs_vnops_os.c | 5 +++++ module/os/linux/zfs/zfs_vnops_os.c | 4 ++++ module/os/linux/zfs/zpl_file_range.c | 5 +++++ tests/zfs-tests/include/libtest.shlib | 15 +++++++++++++++ tests/zfs-tests/include/tunables.cfg | 1 + .../tests/functional/block_cloning/cleanup.ksh | 4 ++++ .../tests/functional/block_cloning/setup.ksh | 5 +++++ 11 files changed, 51 insertions(+) diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index 24bb03575f33..56a0ac96ac19 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -286,6 +286,7 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; +extern int zfs_bclone_enabled; extern void zfs_init(void); extern void zfs_fini(void); diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index b4d5db21f5e5..220466550258 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -45,6 +45,8 @@ extern "C" { typedef struct zfsvfs zfsvfs_t; struct znode; +extern int zfs_bclone_enabled; + /* * This structure emulates the vfs_t from other platforms. It's purpose * is to facilitate the handling of mount options and minimize structural diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 615332bb023b..4ec52a2fb653 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1137,6 +1137,11 @@ Selecting any option other than results in vector instructions from the respective CPU instruction set being used. . +.It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable the experimental block cloning feature. 
+If this setting is 0, then even if feature@block_cloning is enabled, +attempts to clone blocks will act as though the feature is disabled. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index a972c720dfdb..f2d5391037c4 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,6 +89,10 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); +int zfs_bclone_enabled = 1; +SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, + &zfs_bclone_enabled, 0, "Enable block cloning"); + struct zfs_jailparam { int mount_snapshot; }; diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index c498a1328290..f672deed34dd 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6243,6 +6243,11 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) int error; uint64_t len = *ap->a_lenp; + if (!zfs_bclone_enabled) { + mp = NULL; + goto bad_write_fallback; + } + /* * TODO: If offset/length is not aligned to recordsize, use * vn_generic_copy_file_range() on this fragment. diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index a087a5767760..b464f615cdd3 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4242,4 +4242,8 @@ EXPORT_SYMBOL(zfs_map); module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +/* CSTYLED */ +module_param(zfs_bclone_enabled, uint, 0644); +MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); + #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index c47fe99dacff..73476ff40ebf 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -31,6 +31,8 @@ #include #include +int zfs_bclone_enabled = 1; + /* * Clone part of a file via block cloning. * @@ -50,6 +52,9 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off, fstrans_cookie_t cookie; int err; + if (!zfs_bclone_enabled) + return (-EOPNOTSUPP); + if (!spa_feature_is_enabled( dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING)) return (-EOPNOTSUPP); diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 844caa17d8ed..d5d7bb6c8360 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3334,6 +3334,21 @@ function set_tunable_impl esac } +function save_tunable +{ + [[ ! -d $TEST_BASE_DIR ]] && return 1 + [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 + echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" +} + +function restore_tunable +{ + [[ ! 
-e $TEST_BASE_DIR/tunable-$1 ]] && return 1 + val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" + set_tunable64 "$1" "$val" + rm $TEST_BASE_DIR/tunable-$1 +} + # # Get a global system tunable # diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 80e7bcb3bd09..a0edad14d028 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -90,6 +90,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq +BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh b/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh index 7ac13adb6325..b985445a5d12 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh @@ -31,4 +31,8 @@ verify_runnable "global" default_cleanup_noexit +if tunable_exists BCLONE_ENABLED ; then + log_must restore_tunable BCLONE_ENABLED +fi + log_pass diff --git a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh index 512f5a0644df..58441bf8f3ad 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -33,4 +33,9 @@ fi verify_runnable "global" +if tunable_exists BCLONE_ENABLED ; then + log_must save_tunable BCLONE_ENABLED + log_must set_tunable32 BCLONE_ENABLED 1 +fi + log_pass From 479dca51c66a731e637bd2d4f9bba01a05f9ac9f Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 16 Nov 2023 11:42:19 -0800 Subject: [PATCH 75/78] zfs-2.2.1: Disable block cloning by default Disable block cloning by default to mitigate possible data corruption (see #15529 and #15526). Signed-off-by: Tony Hutter --- module/os/freebsd/zfs/zfs_vfsops.c | 2 +- module/os/linux/zfs/zpl_file_range.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index f2d5391037c4..23b8da184535 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,7 +89,7 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); -int zfs_bclone_enabled = 1; +int zfs_bclone_enabled = 0; SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, &zfs_bclone_enabled, 0, "Enable block cloning"); diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 73476ff40ebf..139c51cf46df 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -31,7 +31,7 @@ #include #include -int zfs_bclone_enabled = 1; +int zfs_bclone_enabled = 0; /* * Clone part of a file via block cloning. From 78287023ced23c956b953a953006c0c2b884954d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Sat, 18 Nov 2023 13:07:06 -0800 Subject: [PATCH 76/78] ZTS: Fix 'could not unmount datasets' on Alma 9 Many tests are failing on AlmaLinux 9 because ZTS could not destroy the pool in cleanup. This was due to $PWD being set to '.' instead of the expected full path. This patch sets $PWD to the full path. 
Signed-off-by: Tony Hutter --- tests/zfs-tests/include/libtest.shlib | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index d5d7bb6c8360..b4d2b91dd476 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -37,6 +37,12 @@ . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib +# On AlmaLinux 9 we will see $PWD = '.' instead of the full path. This causes +# some tests to fail. Fix it up here. +if [ "$PWD" = "." ] ; then + PWD="$(readlink -f $PWD)" +fi + # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. From 55dd24c4ccee2da61d5396289ef560f9b7bc6a68 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 13 Nov 2023 11:38:57 -0800 Subject: [PATCH 77/78] Tag zfs-2.2.1 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 720500d51c06..5868838a26df 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.0 +Version: 2.2.1 Release: 1 Release-Tags: relext License: CDDL From 33e7c6558cb5454875f6fac8f986fd612a54ad32 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 18 Nov 2023 20:01:03 -0500 Subject: [PATCH 78/78] ZIL: Do not encrypt block pointers in lr_clone_range_t In case of crash cloned blocks need to be claimed on pool import. It is only possible if they (lr_bps) and their count (lr_nbps) are not encrypted but only authenticated, similar to block pointer in lr_write_t. Few other fields can be and are still encrypted. This should fix panic on ZIL claim after crash when block cloning is actively used. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. --- module/os/freebsd/zfs/zio_crypt.c | 31 +++++++++++++++++++------------ module/os/linux/zfs/zio_crypt.c | 27 +++++++++++++++++++-------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index fdbe13dbb5e9..74755eb6d9ef 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1338,19 +1338,14 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, * authenticate it. 
*/ if (txtype == TX_WRITE) { - crypt_len = sizeof (lr_write_t) - - sizeof (lr_t) - sizeof (blkptr_t); - dst_iovecs[vec].iov_base = (char *)dlrp + - sizeof (lr_t); + const size_t o = offsetof(lr_write_t, lr_blkptr); + crypt_len = o - sizeof (lr_t); + dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ - memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), - slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); - memcpy(aadp, - slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); + memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); + memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); vec++; @@ -1364,10 +1359,22 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, vec++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); + dst_iovecs[vec].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + vec++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); - dst_iovecs[vec].iov_base = (char *)dlrp + - sizeof (lr_t); + dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 55554d09ee43..55f807ccfc13 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1513,20 +1513,16 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, * authenticate it. */ if (txtype == TX_WRITE) { - crypt_len = sizeof (lr_write_t) - - sizeof (lr_t) - sizeof (blkptr_t); + const size_t o = offsetof(lr_write_t, lr_blkptr); + crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ - memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), - slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); - memcpy(aadp, - slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); + memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); + memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; @@ -1543,6 +1539,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, nr_iovecs++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + nr_iovecs++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);