diff --git a/META b/META index 5868838a26df..93045ec3abe8 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.1 +Version: 2.2.2 Release: 1 Release-Tags: relext License: CDDL diff --git a/README.md b/README.md index 331889560950..af244c1fff14 100644 --- a/README.md +++ b/README.md @@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197` # Supported Kernels * The `META` file contains the officially recognized supported Linux kernel versions. - * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE. + * Supported FreeBSD versions are any supported branches and releases starting from 12.4-RELEASE. diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 005bf3f16590..3fc9fd2a9d81 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -34,6 +34,7 @@ * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, Rob Norris */ #include @@ -80,6 +81,7 @@ #include #include #include +#include #include #include @@ -899,6 +901,8 @@ usage(void) "don't print label contents\n"); (void) fprintf(stderr, " -t --txg=INTEGER " "highest txg to use when searching for uberblocks\n"); + (void) fprintf(stderr, " -T --brt-stats " + "BRT statistics\n"); (void) fprintf(stderr, " -u --uberblock " "uberblock\n"); (void) fprintf(stderr, " -U --cachefile=PATH " @@ -999,6 +1003,15 @@ zdb_nicenum(uint64_t num, char *buf, size_t buflen) nicenum(num, buf, buflen); } +static void +zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen) +{ + if (dump_opt['P']) + (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes); + else + zfs_nicebytes(bytes, buf, buflen); +} + static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; @@ -2081,6 +2094,76 @@ dump_all_ddts(spa_t *spa) dump_dedup_ratio(&dds_total); } +static void +dump_brt(spa_t *spa) +{ + if 
(!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { + printf("BRT: unsupported on this pool\n"); + return; + } + + if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + printf("BRT: empty\n"); + return; + } + + brt_t *brt = spa->spa_brt; + VERIFY(brt); + + char count[32], used[32], saved[32]; + zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); + zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); + uint64_t ratio = brt_get_ratio(spa); + printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, + (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100)); + + if (dump_opt['T'] < 2) + return; + + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd == NULL) + continue; + + if (!brtvd->bv_initiated) { + printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); + continue; + } + + zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); + zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); + zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); + printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n", + vdevid, count, used, saved); + } + + if (dump_opt['T'] < 3) + return; + + char dva[64]; + printf("\n%-16s %-10s\n", "DVA", "REFCNT"); + + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd == NULL || !brtvd->bv_initiated) + continue; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t offset = *(uint64_t *)za.za_name; + uint64_t refcnt = za.za_first_integer; + + snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid, + (u_longlong_t)offset); + printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt); + } + zap_cursor_fini(&zc); + } +} + static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { @@ -8093,6 +8176,9 @@ dump_zpool(spa_t *spa) if 
(dump_opt['D']) dump_all_ddts(spa); + if (dump_opt['T']) + dump_brt(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) @@ -8879,6 +8965,7 @@ main(int argc, char **argv) {"io-stats", no_argument, NULL, 's'}, {"simulate-dedup", no_argument, NULL, 'S'}, {"txg", required_argument, NULL, 't'}, + {"brt-stats", no_argument, NULL, 'T'}, {"uberblock", no_argument, NULL, 'u'}, {"cachefile", required_argument, NULL, 'U'}, {"verbose", no_argument, NULL, 'v'}, @@ -8892,7 +8979,7 @@ main(int argc, char **argv) }; while ((c = getopt_long(argc, argv, - "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", + "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': @@ -8914,6 +9001,7 @@ main(int argc, char **argv) case 'R': case 's': case 'S': + case 'T': case 'u': case 'y': case 'Z': @@ -9076,22 +9164,6 @@ main(int argc, char **argv) if (dump_opt['l']) return (dump_label(argv[0])); - if (dump_opt['O']) { - if (argc != 2) - usage(); - dump_opt['v'] = verbose + 3; - return (dump_path(argv[0], argv[1], NULL)); - } - if (dump_opt['r']) { - target_is_spa = B_FALSE; - if (argc != 3) - usage(); - dump_opt['v'] = verbose; - error = dump_path(argv[0], argv[1], &object); - if (error != 0) - fatal("internal error: %s", strerror(error)); - } - if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); @@ -9192,6 +9264,29 @@ main(int argc, char **argv) searchdirs = NULL; } + /* + * We need to make sure to process -O option or call + * dump_path after the -e option has been processed, + * which imports the pool to the namespace if it's + * not in the cachefile. 
+ */ + if (dump_opt['O']) { + if (argc != 2) + usage(); + dump_opt['v'] = verbose + 3; + return (dump_path(argv[0], argv[1], NULL)); + } + + if (dump_opt['r']) { + target_is_spa = B_FALSE; + if (argc != 3) + usage(); + dump_opt['v'] = verbose; + error = dump_path(argv[0], argv[1], &object); + if (error != 0) + fatal("internal error: %s", strerror(error)); + } + /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4 index 412e13b47df5..aae95abf1720 100644 --- a/config/kernel-inode-times.m4 +++ b/config/kernel-inode-times.m4 @@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ #include ],[ struct inode ip; - struct timespec64 ts; + struct timespec64 ts = {0}; memset(&ip, 0, sizeof(ip)); inode_set_ctime_to_ts(&ip, ts); diff --git a/include/Makefile.am b/include/Makefile.am index 569de6dfa781..5f38f6ac6ddb 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -33,6 +33,7 @@ COMMON_H = \ sys/bqueue.h \ sys/btree.h \ sys/brt.h \ + sys/brt_impl.h \ sys/dataset_kstats.h \ sys/dbuf.h \ sys/ddt.h \ diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h index 7f163fcfdb1e..f2196da56bc8 100644 --- a/include/os/freebsd/spl/sys/vfs.h +++ b/include/os/freebsd/spl/sys/vfs.h @@ -101,7 +101,7 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, void vfs_clearmntopt(vfs_t *vfsp, const char *name); int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp); int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, - char *fspath, char *fspec, int fsflags); + char *fspath, char *fspec, int fsflags, vfs_t *parent_vfsp); typedef uint64_t vfs_feature_t; diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h index 0779e58e4953..75c32f221ffd 100644 --- a/include/os/freebsd/spl/sys/vnode.h +++ b/include/os/freebsd/spl/sys/vnode.h @@ -56,6 +56,7 @@ 
enum symfollow { NO_FOLLOW = NOFOLLOW }; #ifndef IN_BASE #include_next #endif +#include #include #include #include @@ -104,7 +105,7 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync) zfs_vmobject_wlock(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, flags); zfs_vmobject_wunlock(vp->v_object); - VOP_UNLOCK(vp); + VOP_UNLOCK1(vp); } } #endif diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h new file mode 100644 index 000000000000..9cc06fbb2c3a --- /dev/null +++ b/include/sys/brt_impl.h @@ -0,0 +1,199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#ifndef _SYS_BRT_IMPL_H +#define _SYS_BRT_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * BRT - Block Reference Table. + */ +#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:" + +/* + * We divide each VDEV into 16MB chunks. Each chunk is represented in memory + * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B + * Each element in this array represents how many BRT entries do we have in this + * chunk of storage. We always load this entire array into memory and update as + * needed. 
By having it in memory we can quickly tell (during zio_free()) if + * there are any BRT entries that we might need to update. + * + * This value cannot be larger than 16MB, at least as long as we support + * 512 byte block sizes. With 512 byte block size we can have exactly + * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too + * many for a 16bit counter. + */ +#define BRT_RANGESIZE (16 * 1024 * 1024) +_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX, + "BRT_RANGESIZE is too large."); +/* + * We don't want to update the whole structure every time. Maintain bitmap + * of dirty blocks within the regions, so that a single bit represents a + * block size of entcounts. For example if we have a 1PB vdev then all + * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this + * 128MB array of entcounts into 32kB disk blocks, as we don't want to update + * the whole 128MB on disk when we have updated only a single entcount. + * We maintain a bitmap where each 32kB disk block within 128MB entcounts array + * is represented by a single bit. This gives us 4096 bits. A set bit in the + * bitmap means that we had a change in at least one of the 16384 entcounts + * that reside on a 32kB disk block (32kB / sizeof (uint16_t)). + */ +#define BRT_BLOCKSIZE (32 * 1024) +#define BRT_RANGESIZE_TO_NBLOCKS(size) \ + (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) + +#define BRT_LITTLE_ENDIAN 0 +#define BRT_BIG_ENDIAN 1 +#ifdef _ZFS_LITTLE_ENDIAN +#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN +#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN +#else +#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN +#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN +#endif + +typedef struct brt_vdev_phys { + uint64_t bvp_mos_entries; + uint64_t bvp_size; + uint64_t bvp_byteorder; + uint64_t bvp_totalcount; + uint64_t bvp_rangesize; + uint64_t bvp_usedspace; + uint64_t bvp_savedspace; +} brt_vdev_phys_t; + +typedef struct brt_vdev { + /* + * VDEV id. 
+ */ + uint64_t bv_vdevid; + /* + * Is the structure initiated? + * (bv_entcount and bv_bitmap are allocated?) + */ + boolean_t bv_initiated; + /* + * Object number in the MOS for the entcount array and brt_vdev_phys. + */ + uint64_t bv_mos_brtvdev; + /* + * Object number in the MOS for the entries table. + */ + uint64_t bv_mos_entries; + /* + * Entries to sync. + */ + avl_tree_t bv_tree; + /* + * Does the bv_entcount[] array need byte swapping? + */ + boolean_t bv_need_byteswap; + /* + * Number of entries in the bv_entcount[] array. + */ + uint64_t bv_size; + /* + * This is the array with BRT entry count per BRT_RANGESIZE. + */ + uint16_t *bv_entcount; + /* + * Sum of all bv_entcount[]s. + */ + uint64_t bv_totalcount; + /* + * Space on disk occupied by cloned blocks (without compression). + */ + uint64_t bv_usedspace; + /* + * How much additional space would be occupied without block cloning. + */ + uint64_t bv_savedspace; + /* + * brt_vdev_phys needs updating on disk. + */ + boolean_t bv_meta_dirty; + /* + * bv_entcount[] needs updating on disk. + */ + boolean_t bv_entcount_dirty; + /* + * bv_entcount[] potentially can be a bit too big to synchronize it all + * when we just changed few entcounts. The fields below allow us to + * track updates to bv_entcount[] array since the last sync. + * A single bit in the bv_bitmap represents as many entcounts as can + * fit into a single BRT_BLOCKSIZE. + * For example we have 65536 entcounts in the bv_entcount array + * (so the whole array is 128kB). We updated bv_entcount[2] and + * bv_entcount[5]. In that case only first bit in the bv_bitmap will + * be set and we will write only first BRT_BLOCKSIZE out of 128kB. 
+ */ + ulong_t *bv_bitmap; + uint64_t bv_nblocks; +} brt_vdev_t; + +/* + * In-core brt + */ +typedef struct brt { + krwlock_t brt_lock; + spa_t *brt_spa; +#define brt_mos brt_spa->spa_meta_objset + uint64_t brt_rangesize; + uint64_t brt_usedspace; + uint64_t brt_savedspace; + avl_tree_t brt_pending_tree[TXG_SIZE]; + kmutex_t brt_pending_lock[TXG_SIZE]; + /* Sum of all entries across all bv_trees. */ + uint64_t brt_nentries; + brt_vdev_t *brt_vdevs; + uint64_t brt_nvdevs; +} brt_t; + +/* Size of bre_offset / sizeof (uint64_t). */ +#define BRT_KEY_WORDS (1) + +/* + * In-core brt entry. + * On-disk we use bre_offset as the key and bre_refcount as the value. + */ +typedef struct brt_entry { + uint64_t bre_offset; + uint64_t bre_refcount; + avl_node_t bre_node; +} brt_entry_t; + +typedef struct brt_pending_entry { + blkptr_t bpe_bp; + int bpe_count; + avl_node_t bpe_node; +} brt_pending_entry_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BRT_IMPL_H */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 615ba8fe7496..06b4dc27dfea 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1072,8 +1072,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, - boolean_t replay); + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps); /* * Initial setup and final teardown. diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 52c8e452fa7c..d7f66d917ac7 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd June 27, 2023 +.Dd November 18, 2023 .Dt ZDB 8 .Os . 
@@ -23,7 +23,7 @@ .Nd display ZFS storage pool debugging and consistency information .Sh SYNOPSIS .Nm -.Op Fl AbcdDFGhikLMNPsvXYy +.Op Fl AbcdDFGhikLMNPsTvXYy .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl I Ar inflight-I/O-ops .Oo Fl o Ar var Ns = Ns Ar value Oc Ns … @@ -403,6 +403,13 @@ Display operation counts, bandwidth, and error counts of I/O to the pool from Simulate the effects of deduplication, constructing a DDT and then display that DDT as with .Fl DD . +.It Fl T , -brt-stats +Display block reference table (BRT) statistics, including the size of unique +blocks cloned, the space saving as a result of cloning, and the saving ratio. +.It Fl TT +Display the per-vdev BRT statistics, including total references. +.It Fl TTT +Dump the contents of the block reference tables. .It Fl u , -uberblock Display the current uberblock. .El diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c index a07098afc5b4..3f33547216eb 100644 --- a/module/os/freebsd/spl/spl_vfs.c +++ b/module/os/freebsd/spl/spl_vfs.c @@ -120,7 +120,7 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, - char *fspec, int fsflags) + char *fspec, int fsflags, vfs_t *parent_vfsp) { struct vfsconf *vfsp; struct mount *mp; @@ -220,6 +220,13 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, mp->mnt_opt = mp->mnt_optnew; (void) VFS_STATFS(mp, &mp->mnt_stat); +#ifdef VFS_SUPPORTS_EXJAIL_CLONE + /* + * Clone the mnt_exjail credentials of the parent, as required. + */ + vfs_exjail_clone(parent_vfsp, mp); +#endif + /* * Prevent external consumers of mount options from reading * mnt_optnew. 
diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c index 8bd3bdedf268..42c6d7f9df6c 100644 --- a/module/os/freebsd/spl/spl_zlib.c +++ b/module/os/freebsd/spl/spl_zlib.c @@ -32,11 +32,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#if __FreeBSD_version >= 1300041 #include -#else -#include -#endif #include @@ -90,11 +86,7 @@ zlib_inflateInit(z_stream *stream) static int zlib_inflate(z_stream *stream, int finish) { -#if __FreeBSD_version >= 1300024 return (inflate(stream, finish)); -#else - return (_zlib104_inflate(stream, finish)); -#endif } diff --git a/module/os/freebsd/zfs/event_os.c b/module/os/freebsd/zfs/event_os.c index 239d44d0cfe7..e774fbaaf867 100644 --- a/module/os/freebsd/zfs/event_os.c +++ b/module/os/freebsd/zfs/event_os.c @@ -46,6 +46,7 @@ knlist_sx_xunlock(void *arg) sx_xunlock((struct sx *)arg); } +#if __FreeBSD_version >= 1300128 static void knlist_sx_assert_lock(void *arg, int what) { @@ -55,11 +56,28 @@ knlist_sx_assert_lock(void *arg, int what) else sx_assert((struct sx *)arg, SX_UNLOCKED); } +#else +static void +knlist_sx_assert_locked(void *arg) +{ + sx_assert((struct sx *)arg, SX_LOCKED); +} +static void +knlist_sx_assert_unlocked(void *arg) +{ + sx_assert((struct sx *)arg, SX_UNLOCKED); +} +#endif void knlist_init_sx(struct knlist *knl, struct sx *lock) { +#if __FreeBSD_version >= 1300128 knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock, knlist_sx_assert_lock); +#else + knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock, + knlist_sx_assert_locked, knlist_sx_assert_unlocked); +#endif } diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 28445a18b809..a753e91da4fe 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -1026,7 +1026,8 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap) "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", dvp->v_vfsp->mnt_stat.f_mntonname, name); - err = mount_snapshot(curthread, vpp, "zfs", 
mountpoint, fullname, 0); + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0, + dvp->v_vfsp); kmem_free(mountpoint, mountpoint_len); if (err == 0) { /* diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index f672deed34dd..05f28033be6a 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6213,6 +6213,7 @@ zfs_deallocate(struct vop_deallocate_args *ap) } #endif +#if __FreeBSD_version >= 1300039 #ifndef _SYS_SYSPROTO_H_ struct vop_copy_file_range_args { struct vnode *a_invp; @@ -6319,6 +6320,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) ap->a_incred, ap->a_outcred, ap->a_fsizetd); return (error); } +#endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; @@ -6383,7 +6385,9 @@ struct vop_vector zfs_vnodeops = { #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif +#if __FreeBSD_version >= 1300039 .vop_copy_file_range = zfs_freebsd_copy_file_range, +#endif }; VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index ddd8eefe600b..759bc8d2e2b8 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -234,178 +235,15 @@ * destination dataset is mounted and its ZIL replayed. * To address this situation we leverage zil_claim() mechanism where ZFS will * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE - * entries, we will bump reference counters for their BPs in the BRT and then - * on mount and ZIL replay we will just attach BPs to the file without - * bumping reference counters. - * Note it is still possible that after zil_claim() we never mount the - * destination, so we never replay its ZIL and we destroy it. This way we would - * end up with leaked references in BRT. 
We address that too as ZFS gives us - * a chance to clean this up on dataset destroy (see zil_free_clone_range()). + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. */ -/* - * BRT - Block Reference Table. - */ -#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:" - -/* - * We divide each VDEV into 16MB chunks. Each chunk is represented in memory - * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B - * Each element in this array represents how many BRT entries do we have in this - * chunk of storage. We always load this entire array into memory and update as - * needed. By having it in memory we can quickly tell (during zio_free()) if - * there are any BRT entries that we might need to update. - * - * This value cannot be larger than 16MB, at least as long as we support - * 512 byte block sizes. With 512 byte block size we can have exactly - * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too - * many for a 16bit counter. - */ -#define BRT_RANGESIZE (16 * 1024 * 1024) -_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX, - "BRT_RANGESIZE is too large."); -/* - * We don't want to update the whole structure every time. Maintain bitmap - * of dirty blocks within the regions, so that a single bit represents a - * block size of entcounts. For example if we have a 1PB vdev then all - * entcounts take 128MB of memory ((64TB / 16MB) * 2B). 
We can divide this - * 128MB array of entcounts into 32kB disk blocks, as we don't want to update - * the whole 128MB on disk when we have updated only a single entcount. - * We maintain a bitmap where each 32kB disk block within 128MB entcounts array - * is represented by a single bit. This gives us 4096 bits. A set bit in the - * bitmap means that we had a change in at least one of the 16384 entcounts - * that reside on a 32kB disk block (32kB / sizeof (uint16_t)). - */ -#define BRT_BLOCKSIZE (32 * 1024) -#define BRT_RANGESIZE_TO_NBLOCKS(size) \ - (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) - -#define BRT_LITTLE_ENDIAN 0 -#define BRT_BIG_ENDIAN 1 -#ifdef _ZFS_LITTLE_ENDIAN -#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN -#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN -#else -#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN -#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN -#endif - -typedef struct brt_vdev_phys { - uint64_t bvp_mos_entries; - uint64_t bvp_size; - uint64_t bvp_byteorder; - uint64_t bvp_totalcount; - uint64_t bvp_rangesize; - uint64_t bvp_usedspace; - uint64_t bvp_savedspace; -} brt_vdev_phys_t; - -typedef struct brt_vdev { - /* - * VDEV id. - */ - uint64_t bv_vdevid; - /* - * Is the structure initiated? - * (bv_entcount and bv_bitmap are allocated?) - */ - boolean_t bv_initiated; - /* - * Object number in the MOS for the entcount array and brt_vdev_phys. - */ - uint64_t bv_mos_brtvdev; - /* - * Object number in the MOS for the entries table. - */ - uint64_t bv_mos_entries; - /* - * Entries to sync. - */ - avl_tree_t bv_tree; - /* - * Does the bv_entcount[] array needs byte swapping? - */ - boolean_t bv_need_byteswap; - /* - * Number of entries in the bv_entcount[] array. - */ - uint64_t bv_size; - /* - * This is the array with BRT entry count per BRT_RANGESIZE. - */ - uint16_t *bv_entcount; - /* - * Sum of all bv_entcount[]s. - */ - uint64_t bv_totalcount; - /* - * Space on disk occupied by cloned blocks (without compression). 
- */ - uint64_t bv_usedspace; - /* - * How much additional space would be occupied without block cloning. - */ - uint64_t bv_savedspace; - /* - * brt_vdev_phys needs updating on disk. - */ - boolean_t bv_meta_dirty; - /* - * bv_entcount[] needs updating on disk. - */ - boolean_t bv_entcount_dirty; - /* - * bv_entcount[] potentially can be a bit too big to sychronize it all - * when we just changed few entcounts. The fields below allow us to - * track updates to bv_entcount[] array since the last sync. - * A single bit in the bv_bitmap represents as many entcounts as can - * fit into a single BRT_BLOCKSIZE. - * For example we have 65536 entcounts in the bv_entcount array - * (so the whole array is 128kB). We updated bv_entcount[2] and - * bv_entcount[5]. In that case only first bit in the bv_bitmap will - * be set and we will write only first BRT_BLOCKSIZE out of 128kB. - */ - ulong_t *bv_bitmap; - uint64_t bv_nblocks; -} brt_vdev_t; - -/* - * In-core brt - */ -typedef struct brt { - krwlock_t brt_lock; - spa_t *brt_spa; -#define brt_mos brt_spa->spa_meta_objset - uint64_t brt_rangesize; - uint64_t brt_usedspace; - uint64_t brt_savedspace; - avl_tree_t brt_pending_tree[TXG_SIZE]; - kmutex_t brt_pending_lock[TXG_SIZE]; - /* Sum of all entries across all bv_trees. */ - uint64_t brt_nentries; - brt_vdev_t *brt_vdevs; - uint64_t brt_nvdevs; -} brt_t; - -/* Size of bre_offset / sizeof (uint64_t). */ -#define BRT_KEY_WORDS (1) - -/* - * In-core brt entry. - * On-disk we use bre_offset as the key and bre_refcount as the value. 
- */ -typedef struct brt_entry { - uint64_t bre_offset; - uint64_t bre_refcount; - avl_node_t bre_node; -} brt_entry_t; - -typedef struct brt_pending_entry { - blkptr_t bpe_bp; - int bpe_count; - avl_node_t bpe_node; -} brt_pending_entry_t; - static kmem_cache_t *brt_entry_cache; static kmem_cache_t *brt_pending_entry_cache; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f2831a0e8abf..5a7fe42b602a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2700,15 +2700,23 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) * writes and clones into this block. */ mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); VERIFY(!dbuf_undirty(db, tx)); ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; + dbuf_clear_data(db); } + + db->db_state = DB_NOFILL; + DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + + DBUF_VERIFY(db); mutex_exit(&db->db_mtx); - dmu_buf_will_not_fill(db_fake, tx); + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index ddb29020b09b..63464d747428 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2255,6 +2255,21 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, goto out; } + /* + * If the block was allocated in transaction group that is not + * yet synced, we could clone it, but we couldn't write this + * operation into ZIL, or it may be impossible to replay, since + * the block may appear not yet allocated at that point. 
+ */ + if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + error = SET_ERROR(EINVAL); + goto out; + } + if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + error = SET_ERROR(EAGAIN); + goto out; + } + bps[i] = *bp; } @@ -2267,7 +2282,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) { spa_t *spa; dmu_buf_t **dbp, *dbuf; @@ -2341,10 +2356,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. - * Also, when replaying ZIL we don't want to bump references - * in the BRT as it was already done during ZIL claim. */ - if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7cf03264dce2..ad9988366ada 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode contains any uncommitted dirty records. + * Checks if the dnode itself is dirty, or is carrying any uncommitted records. + * It is important to check both conditions, as some operations (eg appending + * to a file) can dirty both as a single logical unit, but they are not synced + * out atomically, so checking one and not the other can result in an object + * appearing to be clean mid-way through a commit. + * + * Do not change this lightly! If you get it wrong, dmu_offset_next() can + * detect a hole where there is really data, leading to silent corruption. 
*/ boolean_t dnode_is_dirty(dnode_t *dn) @@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn) mutex_enter(&dn->dn_mtx); for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i])) { + if (multilist_link_active(&dn->dn_dirty_link[i]) || + !list_is_empty(&dn->dn_dirty_records[i])) { mutex_exit(&dn->dn_mtx); return (B_TRUE); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 84e6b10ef37c..3a5fa75df2ea 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1333,7 +1333,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, - bps, nbps, B_FALSE); + bps, nbps); if (error != 0) { dmu_tx_commit(tx); break; @@ -1467,7 +1467,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, if (zp->z_blksz < blksz) zfs_grow_blocksize(zp, blksz, tx); - dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE); + dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index a11886136994..1ba3c01111c4 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -624,11 +624,12 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) } static int -zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; - spa_t *spa; + spa_t *spa = zilog->zl_spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); @@ -641,19 +642,36 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) * XXX: Do we need to byteswap lr? 
*/ - spa = zilog->zl_spa; - for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* - * When data in embedded into BP there is no need to create + * When data are embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ - if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + continue; + + /* + * We can not handle block pointers from the future, since they + * are not yet allocated. It should not normally happen, but + * just in case lets be safe and just stop here now instead of + * corrupting the pool. + */ + if (BP_PHYSICAL_BIRTH(bp) >= first_txg) + return (SET_ERROR(ENOENT)); + + /* + * Assert the block is really allocated before we reference it. + */ + metaslab_check_free(spa, bp); + } + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) brt_pending_add(spa, bp, tx); - } } return (0); @@ -668,7 +686,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: - return (zil_claim_clone_range(zilog, lrc, tx)); + return (zil_claim_clone_range(zilog, lrc, tx, first_txg)); default: return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a719e5492323..3b3b40fa73d8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -158,22 +158,23 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For larger buffers, we want a cache + * for each quarter-power of 2. + */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t align, cflags, data_cflags; - char name[32]; - - /* - * Create cache for each half-power of 2 size, starting from - * SPA_MINBLOCKSIZE. 
It should give us memory space efficiency - * of ~7/8, sufficient for transient allocations mostly using - * these caches. - */ size_t p2 = size; + size_t align = 0; + size_t data_cflags, cflags; + + data_cflags = KMC_NODEBUG; + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + while (!ISP2(p2)) p2 &= p2 - 1; - if (!IS_P2ALIGNED(size, p2 / 2)) - continue; #ifndef _KERNEL /* @@ -184,37 +185,47 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; -#endif - - if (IS_P2ALIGNED(size, PAGESIZE)) + /* + * Here's the problem - on 4K native devices in userland on + * Linux using O_DIRECT, buffers must be 4K aligned or I/O + * will fail with EINVAL, causing zdb (and others) to coredump. + * Since userland probably doesn't need optimized buffer caches, + * we just force 4K alignment on everything. + */ + align = 8 * SPA_MINBLOCKSIZE; +#else + if (size < PAGESIZE) { + align = SPA_MINBLOCKSIZE; + } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; - else - align = 1 << (highbit64(size ^ (size - 1)) - 1); + } +#endif - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; - data_cflags = KMC_NODEBUG; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, align, - NULL, NULL, NULL, NULL, NULL, cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; + if (align != 0) { + char name[36]; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. 
+ */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, + size, align, NULL, NULL, NULL, NULL, NULL, + cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; + } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, data_cflags); } - (void) snprintf(name, sizeof (name), "zio_buf_%lu", - (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, align, - NULL, NULL, NULL, NULL, NULL, cflags); - - (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", - (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, align, - NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index d56967d7a8b1..cd85dd28cf56 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -103,6 +103,7 @@ if [ -d ${dkms_root}/%{module} ]; then fi fi done + cd ${dkms_root} fi # Uninstall this version of zfs dkms modules before installation of the package. diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh index 168e7c18c3a3..688d488ceb62 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh @@ -58,7 +58,7 @@ set -A args "create" "add" "destroy" "import fakepool" \ "setvprop" "blah blah" "-%" "--?" "-*" "-=" \ "-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" \ "-t" "-w" "-z" "-E" "-H" "-I" "-J" \ - "-Q" "-R" "-T" "-W" + "-Q" "-R" "-W" log_assert "Execute zdb using invalid parameters." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib index 4a85999b4ab8..f174eeeeaae9 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib @@ -123,7 +123,10 @@ if not httpd: with open('$HTTPS_PORT_FILE', 'w') as portf: print(port, file=portf) -httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS) +sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) +sslctx.check_hostname = False +sslctx.load_cert_chain(certfile='$SSL_CA_CERT_FILE', keyfile='/$TESTPOOL/snakeoil.key') +httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True) os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key')