diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 5da103f17783..6d334a2bf1d7 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -32,7 +32,7 @@ extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_holey(znode_t *, ulong_t, loff_t *); extern int zfs_access(znode_t *, int, int, cred_t *); extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *, - uint64_t *, cred_t *); + uint64_t *, cred_t *, boolean_t wait_sync); extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, const blkptr_t *, size_t); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index d9a8c8a0d769..0688ed65af67 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6304,7 +6304,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) goto out_locked; error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), - ap->a_outoffp, &len, ap->a_outcred); + ap->a_outoffp, &len, ap->a_outcred, B_FALSE); if (error == EXDEV || error == EAGAIN || error == EINVAL || error == EOPNOTSUPP) goto bad_locked_fallback; diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 73476ff40ebf..7eb8496a1fd3 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -38,10 +38,15 @@ int zfs_bclone_enabled = 1; * * Note that we are not required to update file offsets; the kernel will take * care of that depending on how it was called. + * + * When the caller cannot handle a shortened range request wait_sync should + * be set. This will request that zfs_clone_range() wait on transaction groups + * as needed to allow the clone to succeed. This handles the case where a file + * is modified and immediately cloned with FICLONE or FICLONERANGE. */ static ssize_t -__zpl_clone_file_range(struct file *src_file, loff_t src_off, - struct file *dst_file, loff_t dst_off, size_t len) +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len, boolean_t wait_sync) { struct inode *src_i = file_inode(src_file); struct inode *dst_i = file_inode(dst_file); @@ -67,7 +72,9 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off, cookie = spl_fstrans_mark(); err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i), - &dst_off_o, &len_o, cr); + &dst_off_o, &len_o, cr, wait_sync); + if (wait_sync && err == 0 && len != len_o) + err = -EINVAL; spl_fstrans_unmark(cookie); crfree(cr); @@ -96,12 +103,13 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, { ssize_t ret; + /* Flags is reserved for future extensions and must be zero. */ if (flags != 0) return (-EINVAL); - /* Try to do it via zfs_clone_range() */ - ret = __zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len); + /* Try to do it via zfs_clone_range() and allow shortening. */ + ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len, B_FALSE); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE /* @@ -137,6 +145,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. + * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. Its designed for filesystems that may need to shorten the length + * for alignment, EOF, or any other requirement. ZFS may shorten the request + * when there is outstanding dirty data which hasn't been written to disk. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, @@ -145,24 +158,16 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); - /* - * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given - * range if we want. Its designed for filesystems that make data past - * EOF available, and don't want it to be visible in both files. ZFS - * doesn't do that, so we just turn the flag off. - */ - flags &= ~REMAP_FILE_CAN_SHORTEN; - + /* No support for dedup yet */ if (flags & REMAP_FILE_DEDUP) - /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + return (zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len, !(REMAP_FILE_CAN_SHORTEN & flags))); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ @@ -179,8 +184,8 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + return (zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len, B_TRUE)); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ @@ -215,7 +220,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) size_t len = i_size_read(file_inode(src_file)); ssize_t ret = - __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len, B_TRUE); fput(src_file); @@ -253,8 +258,8 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, - dst_file, fcr.fcr_dest_offset, len); + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, + dst_file, fcr.fcr_dest_offset, len, B_TRUE); fput(src_file); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c8ff7b6432fd..c987b8501a6b 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1030,7 +1030,7 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, - uint64_t *outoffp, uint64_t *lenp, cred_t *cr) + uint64_t *outoffp, uint64_t *lenp, cred_t *cr, boolean_t wait_sync) { zfsvfs_t *inzfsvfs, *outzfsvfs; objset_t *inos, *outos; @@ -1292,10 +1292,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, if (error != 0) { /* * If we are trying to clone a block that was created - * in the current transaction group, error will be - * EAGAIN here, which we can just return to the caller - * so it can fallback if it likes. + * in the current transaction group, the error will be + * EAGAIN here. Based on sync_wait either return a + * shortened range to the caller so it can fallback, + * or wait for the next sync and check again. */ + if (error == EAGAIN && wait_sync) { + int64_t txg = spa_last_synced_txg( + dmu_objset_spa(inos)) + 1; + txg_wait_synced(dmu_objset_pool(inos), txg); + continue; + } + break; } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7e0990b5d9f9..05e8cdc8f234 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos', 'cp_stress'] +tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ae4aa6275465..27c0d3e6c209 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], + 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4040e60434a7..44fa0bf6869c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ + functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh new file mode 100755 index 000000000000..af89e7aaf0d7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -0,0 +1,98 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify all cp --reflink modes work with modified file. +# +# STRATEGY: +# 1. Verify "cp --reflink=never|auto|always" all correctly copy a +# freshly modified file. Depending on the setting it may be +# cloned or copied but it must always be fully intact and never +# truncated. +# + +verify_runnable "global" + +if ! is_linux; then + log_unsupported "cp --reflink is a GNU coreutils option" +fi + +function cleanup +{ + rm -rf $TESTDIR/cp-reflink +} + +function verify_copy +{ + src_cksum=$(sha256digest $1) + dst_cksum=$(sha256digest $2) + + if [[ "$src_cksum" != "$dst_cksum" ]]; then + log_must ls -l $TESTDIR/cp + log_fail "checksum mismatch ($src_cksum != $dst_cksum)" + fi +} + +log_assert "Verify all cp --reflink modes work with modified file" + +log_onexit cleanup + +RECORDSIZE=$(zfs get -Hp -o value recordsize $TESTPOOL/$TESTFS) +SRC_FILE=src.data +DST_FILE=dst.data +SRC_SIZE=$(($RANDOM % 2048)) + +log_must mkdir $TESTDIR/cp-reflink +log_must cd $TESTDIR/cp-reflink + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $SRC_FILE $DST_FILE +done + +log_pass