From d0fc187c9988699aa8aa8930e790a65c9670086f Mon Sep 17 00:00:00 2001
From: Jorgen Lundman
Date: Mon, 15 Jun 2020 09:07:01 +0900
Subject: [PATCH] macOS: Add macOS support

macOS: change update_pages

macOS traditionally used uio_copy to re-issue the write in update_pages()
to mapped memory. Following Linux, it now uses dmu_read() from the ARC,
which saves the uio allocation and cleans up zfs_write().

macOS: enhance getxattr/setxattr code to use txg

macOS: xattr writes to use txg, and correct xattr=sa

macOS: rollback to notify Finder of change

macOS: setattr(uid/gid) incorrect

macOS: replace Obj-C function with C

macOS: move AT_ to ATTR_

Linux already defines the AT_ names in its headers but does not use them.
Since we only match them to XNU values, the names themselves do not matter.

macOS: zfs_write()s were truncated

macOS: zed changes

macOS: const required after C++ fix commit

macOS: Add primitive auto-unmount of idle snapshots.

macOS: uio changes after uio PR

macOS: rebase master fixes

macOS: Add all tunables from upstream

macOS: compile fixes after vdev_disk_t change

macOS: remove semaphore.h

macOS: ASM changes after revert

macOS: correct cv_wait_sig() return values

macOS: Ensure to use -O2 on dsl_scan.c

macOS: Disambiguate condvar API contract

These are the macOS changes required for the upstream commit
8056a75672a57c85b8e10c0c6bce138146f7d213. Also correct cstyle.
Remember that cv_timedwait() and cv_timedwait_hires() take different
timeout types (see the condvar sketch below).

macOS: Add abd_cache_reap_now for abd_chunk_cache users

macOS changes needed for commit 7564073ed6344c12e6bc4ffabd130522d937fb93.

macOS: fix zfstester get_time

macOS: cannot pass componentname to ZFS

ZFS updates the name with the real (case-insensitive) name, which can
panic because the componentname buffer is often protected. We need to
pass a buffer of our own and use the returned name as the cache name.

macOS: remove more tests to fit in 60 minutes

macOS: zvol needs to verify and lock zv before use.

When called from, or calling into, C++ we need to treat the stored 'zv'
as a unique ID and verify it is still in the list of active zvols. If
found, lock it in the correct order (see the lookup sketch below).

SPL: correct cstyle.pl

Fixed kmem_bufctl_cache memory leak (#273)

macOS: cstyle fixes, remove spl-kobj.c

macOS: cstyle.pl-ify SPL

macOS: cstyle.pl all kernel header files.

macOS: cstyle kernel C files

macOS: cstyle kernel CPP files

macOS: libspl cstyle fixes.

Fixed memory leak during zfs driver unload. (#279)

macOS: fix file based pools

zfs_dev_is_whole_disk() was incorrectly ported from FreeBSD. Use the
Linux version.

macOS: undo previous decmpfs handling

We used to handle decmpfs by uncompressing after the fact, but that
added a lot of dependencies. Now, if the decmpfs XATTR is set, AND it
sets UF_COMPRESSED, AND it is followed by a setsize(0), we ignore the
truncate and delete the XATTR (see the sketch below). The only concern
is that we might stop a truncate that was supposed to happen (i.e. one
not related to decmpfs).

macOS: make codecheck pass

macOS: change send/recv pipe to fifo

Simplify kernel dependencies by only doing I/O on vnodes; userland wraps
send to pipes with a fifo (opened to get a vnode) and relays via child
processes.
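To make the condvar note concrete: cv_timedwait() expects an absolute
deadline expressed in clock ticks, while cv_timedwait_hires() takes a
relative timeout in nanoseconds plus a resolution and flags. The snippet
below is a minimal sketch assuming the usual OpenZFS SPL signatures and
the ddi_get_lbolt()/SEC_TO_TICK()/MSEC2NSEC() helpers; it is illustrative
rather than code from this patch.

    #include <sys/condvar.h>
    #include <sys/mutex.h>
    #include <sys/time.h>

    /* Caller holds mp; both calls drop and re-take it while sleeping. */
    static void
    wait_both_ways(kcondvar_t *cv, kmutex_t *mp)
    {
        /* Absolute deadline in clock ticks: "now + 5 seconds". */
        (void) cv_timedwait(cv, mp, ddi_get_lbolt() + SEC_TO_TICK(5));

        /* Relative timeout in nanoseconds, 1 ms resolution, no flags. */
        (void) cv_timedwait_hires(cv, mp, MSEC2NSEC(5000), MSEC2NSEC(1), 0);
    }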
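The decmpfs item above boils down to remembering, on the znode, that a
com.apple.decmpfs xattr write also set UF_COMPRESSED, and then swallowing
the setsize(0) that decmpfs issues next while removing the xattr again.
A rough sketch of that check follows; the flag field and the xattr-removal
helper are hypothetical names used for illustration, not the identifiers
in the patch.

    /* Called from setattr when a new size of 0 is requested (sketch). */
    static int
    maybe_skip_decmpfs_truncate(znode_t *zp)
    {
        if (zp->z_skip_truncate_undo_decmpfs) {  /* hypothetical flag */
            zp->z_skip_truncate_undo_decmpfs = B_FALSE;
            /* hypothetical helper: drop the decmpfs xattr again */
            (void) zfs_xattr_remove(zp, "com.apple.decmpfs");
            return (1);  /* caller ignores the truncate */
        }
        return (0);  /* ordinary truncate, proceed as usual */
    }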
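For the zvol item, the saved zv pointer is treated as an opaque handle:
before each use it is looked up in the list of active zvols under the
global state lock, and only if still present is the per-zvol lock taken,
preserving the usual lock order. A sketch, reusing the upstream
zvol_state_list / zvol_state_lock / zv_state_lock names but with a
hypothetical function name:

    static zvol_state_t *
    zvol_verify_and_lock_sketch(zvol_state_t *candidate)
    {
        zvol_state_t *zv;

        rw_enter(&zvol_state_lock, RW_READER);
        for (zv = list_head(&zvol_state_list); zv != NULL;
            zv = list_next(&zvol_state_list, zv)) {
            if (zv == candidate) {
                /* Still active: lock the zvol, then drop the list lock. */
                mutex_enter(&zv->zv_state_lock);
                rw_exit(&zvol_state_lock);
                return (zv);
            }
        }
        rw_exit(&zvol_state_lock);
        return (NULL);  /* stale handle: caller must not dereference it */
    }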
macOS: additional work --- appveryor.yml | 12 + cmd/zed/zed.d/Makefile.am | 4 + cmd/zed/zed.d/snapshot_mount.sh | 29 + cmd/zed/zed.d/snapshot_unmount.sh | 1 + cmd/zfs/zfs_main.c | 12 +- cmd/zpool/os/macos/zpool_vdev_os.c | 71 + configure.ac | 8 + include/os/macos/Makefile.am | 1 + include/os/macos/spl/Makefile.am | 1 + include/os/macos/spl/ia32/sys/asm_linkage.h | 297 + include/os/macos/spl/libkern/libkern.h | 39 + include/os/macos/spl/linux/init.h | 26 + include/os/macos/spl/linux/kernel.h | 25 + include/os/macos/spl/linux/module.h | 28 + include/os/macos/spl/rpc/Makefile.am | 3 + include/os/macos/spl/rpc/types.h | 32 + include/os/macos/spl/rpc/xdr.h | 175 + include/os/macos/spl/sys/Makefile.am | 49 + include/os/macos/spl/sys/acl.h | 127 + include/os/macos/spl/sys/atomic.h | 288 + include/os/macos/spl/sys/byteorder.h | 67 + include/os/macos/spl/sys/callb.h | 66 + include/os/macos/spl/sys/cmn_err.h | 54 + include/os/macos/spl/sys/condvar.h | 105 + include/os/macos/spl/sys/console.h | 41 + include/os/macos/spl/sys/cred.h | 70 + include/os/macos/spl/sys/ctype.h | 27 + include/os/macos/spl/sys/debug.h | 205 + include/os/macos/spl/sys/disp.h | 25 + include/os/macos/spl/sys/dkio.h | 527 ++ include/os/macos/spl/sys/errno.h | 29 + include/os/macos/spl/sys/fcntl.h | 39 + include/os/macos/spl/sys/file.h | 64 + include/os/macos/spl/sys/inttypes.h | 31 + include/os/macos/spl/sys/isa_defs.h | 690 ++ include/os/macos/spl/sys/kmem.h | 155 + include/os/macos/spl/sys/kmem_cache.h | 25 + include/os/macos/spl/sys/kmem_impl.h | 494 ++ include/os/macos/spl/sys/kstat.h | 217 + include/os/macos/spl/sys/list.h | 145 + include/os/macos/spl/sys/mod_os.h | 76 + include/os/macos/spl/sys/mutex.h | 148 + include/os/macos/spl/sys/param.h | 40 + include/os/macos/spl/sys/policy.h | 86 + include/os/macos/spl/sys/priv.h | 531 ++ include/os/macos/spl/sys/proc.h | 47 + include/os/macos/spl/sys/processor.h | 37 + include/os/macos/spl/sys/procfs_list.h | 64 + include/os/macos/spl/sys/random.h | 48 + include/os/macos/spl/sys/rwlock.h | 80 + include/os/macos/spl/sys/seg_kmem.h | 92 + include/os/macos/spl/sys/sha2.h | 155 + include/os/macos/spl/sys/sid.h | 104 + include/os/macos/spl/sys/signal.h | 59 + include/os/macos/spl/sys/simd.h | 712 ++ include/os/macos/spl/sys/strings.h | 26 + include/os/macos/spl/sys/stropts.h | 247 + include/os/macos/spl/sys/sunddi.h | 203 + include/os/macos/spl/sys/sysmacros.h | 265 + include/os/macos/spl/sys/systeminfo.h | 40 + include/os/macos/spl/sys/systm.h | 36 + include/os/macos/spl/sys/taskq.h | 118 + include/os/macos/spl/sys/taskq_impl.h | 181 + include/os/macos/spl/sys/thread.h | 126 + include/os/macos/spl/sys/time.h | 90 + include/os/macos/spl/sys/timer.h | 88 + include/os/macos/spl/sys/trace.h | 26 + include/os/macos/spl/sys/tsd.h | 54 + include/os/macos/spl/sys/types.h | 119 + include/os/macos/spl/sys/types32.h | 30 + include/os/macos/spl/sys/uio.h | 173 + include/os/macos/spl/sys/utsname.h | 48 + include/os/macos/spl/sys/varargs.h | 32 + include/os/macos/spl/sys/vfs.h | 84 + include/os/macos/spl/sys/vmem.h | 174 + include/os/macos/spl/sys/vmem_impl.h | 155 + include/os/macos/spl/sys/vmsystm.h | 35 + include/os/macos/spl/sys/vnode.h | 258 + include/os/macos/spl/sys/zmod.h | 122 + include/os/macos/spl/sys/zone.h | 38 + include/os/macos/zfs/Makefile.am | 1 + include/os/macos/zfs/sys/Makefile.am | 10 + include/os/macos/zfs/sys/ZFSDataset.h | 141 + include/os/macos/zfs/sys/ZFSDatasetProxy.h | 82 + include/os/macos/zfs/sys/ZFSDatasetScheme.h | 126 + include/os/macos/zfs/sys/ZFSPool.h | 127 + 
include/os/macos/zfs/sys/finderinfo.h | 36 + include/os/macos/zfs/sys/hfs_internal.h | 183 + include/os/macos/zfs/sys/kstat_osx.h | 369 + include/os/macos/zfs/sys/ldi_buf.h | 77 + include/os/macos/zfs/sys/ldi_impl_osx.h | 226 + include/os/macos/zfs/sys/ldi_osx.h | 153 + include/os/macos/zfs/sys/trace_zfs.h | 68 + include/os/macos/zfs/sys/vdev_disk_os.h | 44 + include/os/macos/zfs/sys/zfs_boot.h | 53 + include/os/macos/zfs/sys/zfs_context_os.h | 175 + include/os/macos/zfs/sys/zfs_ctldir.h | 124 + include/os/macos/zfs/sys/zfs_dir.h | 82 + include/os/macos/zfs/sys/zfs_ioctl_compat.h | 213 + include/os/macos/zfs/sys/zfs_mount.h | 73 + include/os/macos/zfs/sys/zfs_vfsops.h | 291 + include/os/macos/zfs/sys/zfs_vnops.h | 255 + include/os/macos/zfs/sys/zfs_znode_impl.h | 230 + include/os/macos/zfs/sys/zpl.h | 27 + include/os/macos/zfs/sys/zvolIO.h | 142 + include/os/macos/zfs/sys/zvol_os.h | 74 + include/sys/abd_impl.h | 8 +- include/sys/fs/zfs.h | 2 +- include/sys/mntent.h | 4 +- include/sys/sysevent/dev.h | 2 +- include/sys/zfs_sa.h | 9 +- lib/libefi/rdwr_efi_macos.c | 8 +- lib/libspl/include/os/Makefile.am | 4 + lib/libspl/include/os/macos/Makefile.am | 1 + lib/libspl/include/os/macos/dirent.h | 37 + lib/libspl/include/os/macos/ia32/Makefile.am | 0 .../include/os/macos/ia32/sys/Makefile.am | 0 .../include/os/macos/ia32/sys/asm_linkage.h | 297 + lib/libspl/include/os/macos/mach/Makefile.am | 3 + lib/libspl/include/os/macos/mach/boolean.h | 26 + lib/libspl/include/os/macos/mntent.h | 144 + lib/libspl/include/os/macos/poll.h | 31 + lib/libspl/include/os/macos/rpc/Makefile.am | 3 + lib/libspl/include/os/macos/rpc/xdr.h | 38 + lib/libspl/include/os/macos/stdlib.h | 28 + lib/libspl/include/os/macos/sys/Makefile.am | 17 + lib/libspl/include/os/macos/sys/byteorder.h | 279 + lib/libspl/include/os/macos/sys/errno.h | 31 + lib/libspl/include/os/macos/sys/fcntl.h | 35 + lib/libspl/include/os/macos/sys/file.h | 46 + .../include/os/macos/sys/kernel_types.h | 43 + lib/libspl/include/os/macos/sys/mnttab.h | 86 + lib/libspl/include/os/macos/sys/mount.h | 114 + lib/libspl/include/os/macos/sys/param.h | 63 + lib/libspl/include/os/macos/sys/stat.h | 77 + lib/libspl/include/os/macos/sys/sysmacros.h | 105 + lib/libspl/include/os/macos/sys/uio.h | 175 + lib/libspl/include/os/macos/sys/vfs.h | 26 + lib/libspl/include/os/macos/sys/xattr.h | 35 + .../include/os/macos/sys/zfs_context_os.h | 41 + lib/libspl/include/os/macos/time.h | 67 + lib/libspl/include/os/macos/unistd.h | 45 + lib/libspl/os/macos/getexecname.c | 31 + lib/libspl/os/macos/gethostid.c | 37 + lib/libspl/os/macos/getmntany.c | 462 ++ lib/libzfs/libzfs_dataset.c | 22 +- lib/libzfs/libzfs_sendrecv.c | 17 + lib/libzfs/os/macos/libzfs_mount_os.c | 486 ++ lib/libzfs/os/macos/libzfs_pool_os.c | 345 + lib/libzfs/os/macos/libzfs_util_os.c | 575 ++ lib/libzutil/os/macos/zutil_compat.c | 94 + lib/libzutil/os/macos/zutil_device_path_os.c | 194 + lib/libzutil/os/macos/zutil_import_os.c | 483 ++ .../icp/asm-x86_64/os/macos/aes/aes_aesni.S | 855 +++ .../icp/asm-x86_64/os/macos/aes/aes_amd64.S | 900 +++ .../asm-x86_64/os/macos/modes/gcm_pclmulqdq.S | 334 + .../asm-x86_64/os/macos/sha1/sha1-x86_64.S | 1353 ++++ .../asm-x86_64/os/macos/sha2/sha256_impl.S | 2058 +++++ .../asm-x86_64/os/macos/sha2/sha512_impl.S | 2082 +++++ module/os/linux/spl/spl-taskq.c | 3 +- module/os/macos/.gitignore | 1 + module/os/macos/Makefile.am | 6 + module/os/macos/README.md | 8 + module/os/macos/kernel/.gitignore | 5 + module/os/macos/kernel/Info.plist | 34 + module/os/macos/kernel/Makefile.am | 
25 + module/os/macos/kernel/README.txt | 10 + module/os/macos/kernel/kextsymboltool.c | 912 +++ module/os/macos/kernel/version.plist | 16 + module/os/macos/kernel/zfs.exports | 32 + module/os/macos/spl/Makefile.am | 59 + module/os/macos/spl/README.md | 14 + module/os/macos/spl/spl-atomic.c | 50 + module/os/macos/spl/spl-condvar.c | 232 + module/os/macos/spl/spl-cred.c | 166 + module/os/macos/spl/spl-ddi.c | 383 + module/os/macos/spl/spl-debug.c | 10 + module/os/macos/spl/spl-err.c | 83 + module/os/macos/spl/spl-kmem.c | 6825 +++++++++++++++++ module/os/macos/spl/spl-kstat.c | 1212 +++ module/os/macos/spl/spl-list.c | 197 + module/os/macos/spl/spl-mutex.c | 415 + module/os/macos/spl/spl-osx.c | 488 ++ module/os/macos/spl/spl-policy.c | 184 + module/os/macos/spl/spl-proc.c | 30 + module/os/macos/spl/spl-proc_list.c | 157 + module/os/macos/spl/spl-processor.c | 55 + module/os/macos/spl/spl-rwlock.c | 397 + module/os/macos/spl/spl-seg_kmem.c | 289 + module/os/macos/spl/spl-taskq.c | 2529 ++++++ module/os/macos/spl/spl-thread.c | 148 + module/os/macos/spl/spl-time.c | 138 + module/os/macos/spl/spl-tsd.c | 389 + module/os/macos/spl/spl-vmem.c | 3940 ++++++++++ module/os/macos/spl/spl-vnode.c | 497 ++ module/os/macos/spl/spl-xdr.c | 524 ++ module/os/macos/spl/spl-zlib.c | 199 + module/os/macos/zfs/.gitignore | 2 + module/os/macos/zfs/Info.plist | 115 + module/os/macos/zfs/InfoPlist.strings | 5 + module/os/macos/zfs/Makefile.am | 353 + module/os/macos/zfs/ZFSDataset.cpp | 854 +++ module/os/macos/zfs/ZFSDatasetProxy.cpp | 466 ++ module/os/macos/zfs/ZFSDatasetScheme.cpp | 1108 +++ module/os/macos/zfs/ZFSPool.cpp | 868 +++ module/os/macos/zfs/abd_os.c | 482 ++ module/os/macos/zfs/arc_os.c | 883 +++ module/os/macos/zfs/ldi_iokit.cpp | 1990 +++++ module/os/macos/zfs/ldi_osx.c | 2432 ++++++ module/os/macos/zfs/ldi_vnode.c | 1020 +++ module/os/macos/zfs/policy.c | 354 + module/os/macos/zfs/qat.c | 105 + module/os/macos/zfs/qat_compress.c | 569 ++ module/os/macos/zfs/qat_crypt.c | 630 ++ module/os/macos/zfs/spa_misc_os.c | 116 + module/os/macos/zfs/trace.c | 50 + module/os/macos/zfs/vdev_disk.c | 787 ++ module/os/macos/zfs/vdev_file.c | 323 + module/os/macos/zfs/zfs_acl.c | 2983 +++++++ module/os/macos/zfs/zfs_boot.cpp | 2962 +++++++ module/os/macos/zfs/zfs_ctldir.c | 1519 ++++ module/os/macos/zfs/zfs_debug.c | 264 + module/os/macos/zfs/zfs_dir.c | 1214 +++ module/os/macos/zfs/zfs_file_os.c | 405 + module/os/macos/zfs/zfs_fuid_os.c | 52 + module/os/macos/zfs/zfs_ioctl_os.c | 403 + module/os/macos/zfs/zfs_kstat_osx.c | 869 +++ module/os/macos/zfs/zfs_osx.cpp | 310 + module/os/macos/zfs/zfs_vfsops.c | 2951 +++++++ module/os/macos/zfs/zfs_vnops.c | 4560 +++++++++++ module/os/macos/zfs/zfs_vnops_osx.c | 5276 +++++++++++++ module/os/macos/zfs/zfs_vnops_osx_lib.c | 2232 ++++++ module/os/macos/zfs/zfs_znode.c | 2347 ++++++ module/os/macos/zfs/zio_crypt.c | 1995 +++++ module/os/macos/zfs/zvolIO.cpp | 1177 +++ module/os/macos/zfs/zvol_os.c | 1024 +++ module/zcommon/zfeature_common.c | 3 +- module/zcommon/zfs_prop.c | 5 +- module/zcommon/zprop_common.c | 3 +- module/zfs/dmu.c | 9 +- module/zfs/spa.c | 10 +- module/zfs/zfs_fm.c | 3 +- module/zfs/zfs_log.c | 8 +- module/zfs/zfs_replay.c | 2 +- module/zfs/zfs_sa.c | 6 +- scripts/cmd-macos.sh | 60 + scripts/debug-macos.sh | 60 + scripts/load_macos.sh | 17 + 248 files changed, 91841 insertions(+), 53 deletions(-) create mode 100644 appveryor.yml create mode 100644 cmd/zed/zed.d/snapshot_mount.sh create mode 120000 cmd/zed/zed.d/snapshot_unmount.sh create mode 100644 
cmd/zpool/os/macos/zpool_vdev_os.c create mode 100644 include/os/macos/Makefile.am create mode 100644 include/os/macos/spl/Makefile.am create mode 100644 include/os/macos/spl/ia32/sys/asm_linkage.h create mode 100644 include/os/macos/spl/libkern/libkern.h create mode 100644 include/os/macos/spl/linux/init.h create mode 100644 include/os/macos/spl/linux/kernel.h create mode 100644 include/os/macos/spl/linux/module.h create mode 100644 include/os/macos/spl/rpc/Makefile.am create mode 100644 include/os/macos/spl/rpc/types.h create mode 100644 include/os/macos/spl/rpc/xdr.h create mode 100644 include/os/macos/spl/sys/Makefile.am create mode 100644 include/os/macos/spl/sys/acl.h create mode 100644 include/os/macos/spl/sys/atomic.h create mode 100644 include/os/macos/spl/sys/byteorder.h create mode 100644 include/os/macos/spl/sys/callb.h create mode 100644 include/os/macos/spl/sys/cmn_err.h create mode 100644 include/os/macos/spl/sys/condvar.h create mode 100644 include/os/macos/spl/sys/console.h create mode 100644 include/os/macos/spl/sys/cred.h create mode 100644 include/os/macos/spl/sys/ctype.h create mode 100644 include/os/macos/spl/sys/debug.h create mode 100644 include/os/macos/spl/sys/disp.h create mode 100644 include/os/macos/spl/sys/dkio.h create mode 100644 include/os/macos/spl/sys/errno.h create mode 100644 include/os/macos/spl/sys/fcntl.h create mode 100644 include/os/macos/spl/sys/file.h create mode 100644 include/os/macos/spl/sys/inttypes.h create mode 100644 include/os/macos/spl/sys/isa_defs.h create mode 100644 include/os/macos/spl/sys/kmem.h create mode 100644 include/os/macos/spl/sys/kmem_cache.h create mode 100644 include/os/macos/spl/sys/kmem_impl.h create mode 100644 include/os/macos/spl/sys/kstat.h create mode 100644 include/os/macos/spl/sys/list.h create mode 100644 include/os/macos/spl/sys/mod_os.h create mode 100644 include/os/macos/spl/sys/mutex.h create mode 100644 include/os/macos/spl/sys/param.h create mode 100644 include/os/macos/spl/sys/policy.h create mode 100644 include/os/macos/spl/sys/priv.h create mode 100644 include/os/macos/spl/sys/proc.h create mode 100644 include/os/macos/spl/sys/processor.h create mode 100644 include/os/macos/spl/sys/procfs_list.h create mode 100644 include/os/macos/spl/sys/random.h create mode 100644 include/os/macos/spl/sys/rwlock.h create mode 100644 include/os/macos/spl/sys/seg_kmem.h create mode 100644 include/os/macos/spl/sys/sha2.h create mode 100644 include/os/macos/spl/sys/sid.h create mode 100644 include/os/macos/spl/sys/signal.h create mode 100644 include/os/macos/spl/sys/simd.h create mode 100644 include/os/macos/spl/sys/strings.h create mode 100644 include/os/macos/spl/sys/stropts.h create mode 100644 include/os/macos/spl/sys/sunddi.h create mode 100644 include/os/macos/spl/sys/sysmacros.h create mode 100644 include/os/macos/spl/sys/systeminfo.h create mode 100644 include/os/macos/spl/sys/systm.h create mode 100644 include/os/macos/spl/sys/taskq.h create mode 100644 include/os/macos/spl/sys/taskq_impl.h create mode 100644 include/os/macos/spl/sys/thread.h create mode 100644 include/os/macos/spl/sys/time.h create mode 100644 include/os/macos/spl/sys/timer.h create mode 100644 include/os/macos/spl/sys/trace.h create mode 100644 include/os/macos/spl/sys/tsd.h create mode 100644 include/os/macos/spl/sys/types.h create mode 100644 include/os/macos/spl/sys/types32.h create mode 100644 include/os/macos/spl/sys/uio.h create mode 100644 include/os/macos/spl/sys/utsname.h create mode 100644 include/os/macos/spl/sys/varargs.h create 
mode 100644 include/os/macos/spl/sys/vfs.h create mode 100644 include/os/macos/spl/sys/vmem.h create mode 100644 include/os/macos/spl/sys/vmem_impl.h create mode 100644 include/os/macos/spl/sys/vmsystm.h create mode 100644 include/os/macos/spl/sys/vnode.h create mode 100644 include/os/macos/spl/sys/zmod.h create mode 100644 include/os/macos/spl/sys/zone.h create mode 100644 include/os/macos/zfs/Makefile.am create mode 100644 include/os/macos/zfs/sys/Makefile.am create mode 100644 include/os/macos/zfs/sys/ZFSDataset.h create mode 100644 include/os/macos/zfs/sys/ZFSDatasetProxy.h create mode 100644 include/os/macos/zfs/sys/ZFSDatasetScheme.h create mode 100644 include/os/macos/zfs/sys/ZFSPool.h create mode 100644 include/os/macos/zfs/sys/finderinfo.h create mode 100644 include/os/macos/zfs/sys/hfs_internal.h create mode 100644 include/os/macos/zfs/sys/kstat_osx.h create mode 100644 include/os/macos/zfs/sys/ldi_buf.h create mode 100644 include/os/macos/zfs/sys/ldi_impl_osx.h create mode 100644 include/os/macos/zfs/sys/ldi_osx.h create mode 100644 include/os/macos/zfs/sys/trace_zfs.h create mode 100644 include/os/macos/zfs/sys/vdev_disk_os.h create mode 100644 include/os/macos/zfs/sys/zfs_boot.h create mode 100644 include/os/macos/zfs/sys/zfs_context_os.h create mode 100644 include/os/macos/zfs/sys/zfs_ctldir.h create mode 100644 include/os/macos/zfs/sys/zfs_dir.h create mode 100644 include/os/macos/zfs/sys/zfs_ioctl_compat.h create mode 100644 include/os/macos/zfs/sys/zfs_mount.h create mode 100644 include/os/macos/zfs/sys/zfs_vfsops.h create mode 100644 include/os/macos/zfs/sys/zfs_vnops.h create mode 100644 include/os/macos/zfs/sys/zfs_znode_impl.h create mode 100644 include/os/macos/zfs/sys/zpl.h create mode 100644 include/os/macos/zfs/sys/zvolIO.h create mode 100644 include/os/macos/zfs/sys/zvol_os.h create mode 100644 lib/libspl/include/os/macos/Makefile.am create mode 100644 lib/libspl/include/os/macos/dirent.h create mode 100644 lib/libspl/include/os/macos/ia32/Makefile.am create mode 100644 lib/libspl/include/os/macos/ia32/sys/Makefile.am create mode 100644 lib/libspl/include/os/macos/ia32/sys/asm_linkage.h create mode 100644 lib/libspl/include/os/macos/mach/Makefile.am create mode 100644 lib/libspl/include/os/macos/mach/boolean.h create mode 100644 lib/libspl/include/os/macos/mntent.h create mode 100644 lib/libspl/include/os/macos/poll.h create mode 100644 lib/libspl/include/os/macos/rpc/Makefile.am create mode 100644 lib/libspl/include/os/macos/rpc/xdr.h create mode 100644 lib/libspl/include/os/macos/stdlib.h create mode 100644 lib/libspl/include/os/macos/sys/Makefile.am create mode 100644 lib/libspl/include/os/macos/sys/byteorder.h create mode 100644 lib/libspl/include/os/macos/sys/errno.h create mode 100644 lib/libspl/include/os/macos/sys/fcntl.h create mode 100644 lib/libspl/include/os/macos/sys/file.h create mode 100644 lib/libspl/include/os/macos/sys/kernel_types.h create mode 100644 lib/libspl/include/os/macos/sys/mnttab.h create mode 100644 lib/libspl/include/os/macos/sys/mount.h create mode 100644 lib/libspl/include/os/macos/sys/param.h create mode 100644 lib/libspl/include/os/macos/sys/stat.h create mode 100644 lib/libspl/include/os/macos/sys/sysmacros.h create mode 100644 lib/libspl/include/os/macos/sys/uio.h create mode 100644 lib/libspl/include/os/macos/sys/vfs.h create mode 100644 lib/libspl/include/os/macos/sys/xattr.h create mode 100644 lib/libspl/include/os/macos/sys/zfs_context_os.h create mode 100644 lib/libspl/include/os/macos/time.h create mode 100644 
lib/libspl/include/os/macos/unistd.h create mode 100644 lib/libspl/os/macos/getexecname.c create mode 100644 lib/libspl/os/macos/gethostid.c create mode 100644 lib/libspl/os/macos/getmntany.c create mode 100644 lib/libzfs/os/macos/libzfs_mount_os.c create mode 100644 lib/libzfs/os/macos/libzfs_pool_os.c create mode 100644 lib/libzfs/os/macos/libzfs_util_os.c create mode 100644 lib/libzutil/os/macos/zutil_compat.c create mode 100644 lib/libzutil/os/macos/zutil_device_path_os.c create mode 100644 lib/libzutil/os/macos/zutil_import_os.c create mode 100644 module/icp/asm-x86_64/os/macos/aes/aes_aesni.S create mode 100644 module/icp/asm-x86_64/os/macos/aes/aes_amd64.S create mode 100644 module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S create mode 100644 module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S create mode 100644 module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S create mode 100644 module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S create mode 100644 module/os/macos/.gitignore create mode 100644 module/os/macos/Makefile.am create mode 100644 module/os/macos/README.md create mode 100644 module/os/macos/kernel/.gitignore create mode 100644 module/os/macos/kernel/Info.plist create mode 100644 module/os/macos/kernel/Makefile.am create mode 100644 module/os/macos/kernel/README.txt create mode 100644 module/os/macos/kernel/kextsymboltool.c create mode 100644 module/os/macos/kernel/version.plist create mode 100644 module/os/macos/kernel/zfs.exports create mode 100644 module/os/macos/spl/Makefile.am create mode 100644 module/os/macos/spl/README.md create mode 100644 module/os/macos/spl/spl-atomic.c create mode 100644 module/os/macos/spl/spl-condvar.c create mode 100644 module/os/macos/spl/spl-cred.c create mode 100644 module/os/macos/spl/spl-ddi.c create mode 100644 module/os/macos/spl/spl-debug.c create mode 100644 module/os/macos/spl/spl-err.c create mode 100644 module/os/macos/spl/spl-kmem.c create mode 100644 module/os/macos/spl/spl-kstat.c create mode 100644 module/os/macos/spl/spl-list.c create mode 100644 module/os/macos/spl/spl-mutex.c create mode 100644 module/os/macos/spl/spl-osx.c create mode 100644 module/os/macos/spl/spl-policy.c create mode 100644 module/os/macos/spl/spl-proc.c create mode 100644 module/os/macos/spl/spl-proc_list.c create mode 100644 module/os/macos/spl/spl-processor.c create mode 100644 module/os/macos/spl/spl-rwlock.c create mode 100644 module/os/macos/spl/spl-seg_kmem.c create mode 100644 module/os/macos/spl/spl-taskq.c create mode 100644 module/os/macos/spl/spl-thread.c create mode 100644 module/os/macos/spl/spl-time.c create mode 100644 module/os/macos/spl/spl-tsd.c create mode 100644 module/os/macos/spl/spl-vmem.c create mode 100644 module/os/macos/spl/spl-vnode.c create mode 100644 module/os/macos/spl/spl-xdr.c create mode 100644 module/os/macos/spl/spl-zlib.c create mode 100644 module/os/macos/zfs/.gitignore create mode 100644 module/os/macos/zfs/Info.plist create mode 100644 module/os/macos/zfs/InfoPlist.strings create mode 100644 module/os/macos/zfs/Makefile.am create mode 100644 module/os/macos/zfs/ZFSDataset.cpp create mode 100644 module/os/macos/zfs/ZFSDatasetProxy.cpp create mode 100644 module/os/macos/zfs/ZFSDatasetScheme.cpp create mode 100644 module/os/macos/zfs/ZFSPool.cpp create mode 100644 module/os/macos/zfs/abd_os.c create mode 100644 module/os/macos/zfs/arc_os.c create mode 100644 module/os/macos/zfs/ldi_iokit.cpp create mode 100644 module/os/macos/zfs/ldi_osx.c create mode 100644 module/os/macos/zfs/ldi_vnode.c create mode 100644 
module/os/macos/zfs/policy.c create mode 100644 module/os/macos/zfs/qat.c create mode 100644 module/os/macos/zfs/qat_compress.c create mode 100644 module/os/macos/zfs/qat_crypt.c create mode 100644 module/os/macos/zfs/spa_misc_os.c create mode 100644 module/os/macos/zfs/trace.c create mode 100644 module/os/macos/zfs/vdev_disk.c create mode 100644 module/os/macos/zfs/vdev_file.c create mode 100644 module/os/macos/zfs/zfs_acl.c create mode 100644 module/os/macos/zfs/zfs_boot.cpp create mode 100644 module/os/macos/zfs/zfs_ctldir.c create mode 100644 module/os/macos/zfs/zfs_debug.c create mode 100644 module/os/macos/zfs/zfs_dir.c create mode 100644 module/os/macos/zfs/zfs_file_os.c create mode 100644 module/os/macos/zfs/zfs_fuid_os.c create mode 100644 module/os/macos/zfs/zfs_ioctl_os.c create mode 100644 module/os/macos/zfs/zfs_kstat_osx.c create mode 100644 module/os/macos/zfs/zfs_osx.cpp create mode 100644 module/os/macos/zfs/zfs_vfsops.c create mode 100644 module/os/macos/zfs/zfs_vnops.c create mode 100644 module/os/macos/zfs/zfs_vnops_osx.c create mode 100644 module/os/macos/zfs/zfs_vnops_osx_lib.c create mode 100644 module/os/macos/zfs/zfs_znode.c create mode 100644 module/os/macos/zfs/zio_crypt.c create mode 100644 module/os/macos/zfs/zvolIO.cpp create mode 100644 module/os/macos/zfs/zvol_os.c create mode 100755 scripts/cmd-macos.sh create mode 100755 scripts/debug-macos.sh create mode 100755 scripts/load_macos.sh diff --git a/appveryor.yml b/appveryor.yml new file mode 100644 index 0000000000..725f8fd87e --- /dev/null +++ b/appveryor.yml @@ -0,0 +1,12 @@ +version: 1.0.{build} +branches: + only: + - macos +image: macOS +build_script: +- sh: >- + pwd + ls -l + sh autoconf.sh + ./configure CPPFLAGS="-I/usr/local/opt/gettext/include -I/usr/local/opt/openssl@1.1/include" LDFLAGS="-L/usr/local/opt/gettext/lib/ -L/usr/local/opt/openssl@1.1/lib" + make diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am index 8b2d0c2002..57d24aa206 100644 --- a/cmd/zed/zed.d/Makefile.am +++ b/cmd/zed/zed.d/Makefile.am @@ -20,6 +20,8 @@ dist_zedexec_SCRIPTS = \ scrub_finish-notify.sh \ statechange-led.sh \ statechange-notify.sh \ + snapshot_mount.sh \ + snapshot_unmount.sh \ vdev_clear-led.sh \ vdev_attach-led.sh \ pool_import-led.sh \ @@ -38,6 +40,8 @@ zedconfdefaults = \ scrub_finish-notify.sh \ statechange-led.sh \ statechange-notify.sh \ + snapshot_mount.sh \ + snapshot_unmount.sh \ vdev_clear-led.sh \ vdev_attach-led.sh \ pool_import-led.sh \ diff --git a/cmd/zed/zed.d/snapshot_mount.sh b/cmd/zed/zed.d/snapshot_mount.sh new file mode 100644 index 0000000000..5cf807aa99 --- /dev/null +++ b/cmd/zed/zed.d/snapshot_mount.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# +# Helper to mount and unmount snapshots when asked to by kernel. +# +# Mostly used in macOS. +# +set -ef + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_SNAPSHOT_NAME}" ] || exit 1 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 2 + +if [ "${ZEVENT_SUBCLASS}" = "snapshot_mount" ]; then + action="mount" +elif [ "${ZEVENT_SUBCLASS}" = "snapshot_unmount" ]; then + action="unmount" +else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 3 +fi + +zed_exit_if_ignoring_this_event +zed_check_cmd "${ZFS}" || exit 4 + +"${ZFS}" "${action}" "${ZEVENT_SNAPSHOT_NAME}" + +finished diff --git a/cmd/zed/zed.d/snapshot_unmount.sh b/cmd/zed/zed.d/snapshot_unmount.sh new file mode 120000 index 0000000000..9f74a29e61 --- /dev/null +++ b/cmd/zed/zed.d/snapshot_unmount.sh @@ -0,0 +1 @@ +snapshot_mount.sh \ No newline at end of file diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 34f1a2c263..98b5d62a4a 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -767,11 +767,11 @@ zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); - ret = 1; + ret = 0; } else if (zfs_share(zhp) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); - ret = 1; + ret = 0; } zfs_commit_all_shares(); } @@ -7003,7 +7003,7 @@ share_mount(int op, int argc, char **argv) } } else { -#if defined (__APPLE__) +#if defined(__APPLE__) /* * OsX can not mount from kernel, users are expected to mount * by hand using "zfs mount dataset@snapshot". @@ -7027,8 +7027,8 @@ share_mount(int op, int argc, char **argv) } else { - ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, - options); + ret = share_mount_one(zhp, op, flags, NULL, + B_TRUE, options); } zfs_close(zhp); @@ -7430,7 +7430,7 @@ unshare_unmount(int op, int argc, char **argv) return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); -#if defined (__APPLE__) +#if defined(__APPLE__) /* Temporarily, allow mounting snapshots on OS X */ if ((zhp = zfs_open(g_zfs, argv[0], diff --git a/cmd/zpool/os/macos/zpool_vdev_os.c b/cmd/zpool/os/macos/zpool_vdev_os.c new file mode 100644 index 0000000000..b95e4deea6 --- /dev/null +++ b/cmd/zpool/os/macos/zpool_vdev_os.c @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zpool_util.h" +#include + +#include +#include +#include +#include +#include + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + return (B_FALSE); +} + +void +zpool_vdev_enable_file(struct stat64 *statbuf, boolean_t *wholedisk) +{ + if (S_ISCHR(statbuf->st_mode)) { + statbuf->st_mode &= ~S_IFCHR; + statbuf->st_mode |= S_IFBLK; + *wholedisk = B_FALSE; + } +} + +int +check_device(const char *name, boolean_t force, + boolean_t isspare, boolean_t iswholedisk) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0) + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof (path)); + + return (check_file(path, force, isspare)); +} diff --git a/configure.ac b/configure.ac index a4bd69e92c..dec23ac030 100644 --- a/configure.ac +++ b/configure.ac @@ -139,6 +139,8 @@ AC_CONFIG_FILES([ include/os/macos/spl/Makefile include/os/macos/spl/rpc/Makefile include/os/macos/spl/sys/Makefile + include/os/macos/zfs/Makefile + include/os/macos/zfs/sys/Makefile include/sys/Makefile include/sys/crypto/Makefile include/sys/fm/Makefile @@ -162,6 +164,12 @@ AC_CONFIG_FILES([ lib/libspl/include/os/freebsd/sys/Makefile lib/libspl/include/os/linux/Makefile lib/libspl/include/os/linux/sys/Makefile + lib/libspl/include/os/macos/Makefile + lib/libspl/include/os/macos/ia32/Makefile + lib/libspl/include/os/macos/ia32/sys/Makefile + lib/libspl/include/os/macos/mach/Makefile + lib/libspl/include/os/macos/rpc/Makefile + lib/libspl/include/os/macos/sys/Makefile lib/libspl/include/rpc/Makefile lib/libspl/include/sys/Makefile lib/libspl/include/sys/dktp/Makefile diff --git a/include/os/macos/Makefile.am b/include/os/macos/Makefile.am new file mode 100644 index 0000000000..a9564c3e3c --- /dev/null +++ b/include/os/macos/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = spl zfs \ No newline at end of file diff --git a/include/os/macos/spl/Makefile.am b/include/os/macos/spl/Makefile.am new file mode 100644 index 0000000000..75cad0836e --- /dev/null +++ b/include/os/macos/spl/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys rpc diff --git a/include/os/macos/spl/ia32/sys/asm_linkage.h b/include/os/macos/spl/ia32/sys/asm_linkage.h new file mode 100644 index 0000000000..0009705ad6 --- /dev/null +++ b/include/os/macos/spl/ia32/sys/asm_linkage.h @@ -0,0 +1,297 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_ASM_LINKAGE_H +#define _IA32_SYS_ASM_LINKAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +/* + * D16 and A16 are used to insert instructions prefixes; the + * macros help the assembler code be slightly more portable. + */ +#if !defined(__GNUC_AS__) +/* + * /usr/ccs/bin/as prefixes are parsed as separate instructions + */ +#define D16 data16; +#define A16 addr16; + +/* + * (There are some weird constructs in constant expressions) + */ +#define _CONST(const) [const] +#define _BITNOT(const) -1!_CONST(const) +#define _MUL(a, b) _CONST(a \* b) + +#else +/* + * Why not use the 'data16' and 'addr16' prefixes .. well, the + * assembler doesn't quite believe in real mode, and thus argues with + * us about what we're trying to do. + */ +#define D16 .byte 0x66; +#define A16 .byte 0x67; + +#define _CONST(const) (const) +#define _BITNOT(const) ~_CONST(const) +#define _MUL(a, b) _CONST(a * b) + +#endif + +/* + * C pointers are different sizes between i386 and amd64. + * These constants can be used to compute offsets into pointer arrays. + */ +#if defined(__amd64) +#define CLONGSHIFT 3 +#define CLONGSIZE 8 +#define CLONGMASK 7 +#elif defined(__i386) +#define CLONGSHIFT 2 +#define CLONGSIZE 4 +#define CLONGMASK 3 +#endif + +/* + * Since we know we're either ILP32 or LP64 .. + */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 4, 0x90 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 +#define XMM_ALIGN_LOG 4, 0x90 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. 
These are similar to the ANSI-C + * #pragma weak name = _name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ + .weak sym; \ +/* CSTYLED */ \ +sym = _/**/sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: + + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. + */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/include/os/macos/spl/libkern/libkern.h b/include/os/macos/spl/libkern/libkern.h new file mode 100644 index 0000000000..5d4fa410b7 --- /dev/null +++ b/include/os/macos/spl/libkern/libkern.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2020 Jorgen Lundman + * + */ + +#ifndef _SPL_LIBKERN_H +#define _SPL_LIBKERN_H + +/* + * We wrap this header to handle that copyinstr()'s final argument is + * mandatory on OSX. Wrap it to call our ddi_copyinstr to make it optional. + */ +#include_next +#undef copyinstr +#define copyinstr(U, K, L, D) ddi_copyinstr((U), (K), (L), (D)) + +#endif diff --git a/include/os/macos/spl/linux/init.h b/include/os/macos/spl/linux/init.h new file mode 100644 index 0000000000..4ab1523c16 --- /dev/null +++ b/include/os/macos/spl/linux/init.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LINUX_INIT_H +#define _LINUX_INIT_H + + +#endif diff --git a/include/os/macos/spl/linux/kernel.h b/include/os/macos/spl/linux/kernel.h new file mode 100644 index 0000000000..73a2b2eaad --- /dev/null +++ b/include/os/macos/spl/linux/kernel.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LINUX_KERNEL_H +#define _LINUX_KERNEL_H + +#endif diff --git a/include/os/macos/spl/linux/module.h b/include/os/macos/spl/linux/module.h new file mode 100644 index 0000000000..264d6c058d --- /dev/null +++ b/include/os/macos/spl/linux/module.h @@ -0,0 +1,28 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LINUX_MODULE_H +#define _LINUX_MODULE_H + +#include +#include + +#endif diff --git a/include/os/macos/spl/rpc/Makefile.am b/include/os/macos/spl/rpc/Makefile.am new file mode 100644 index 0000000000..770d26812e --- /dev/null +++ b/include/os/macos/spl/rpc/Makefile.am @@ -0,0 +1,3 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/macos/spl/rpc/types.h \ + $(top_srcdir)/include/os/macos/spl/rpc/xdr.h diff --git a/include/os/macos/spl/rpc/types.h b/include/os/macos/spl/rpc/types.h new file mode 100644 index 0000000000..e089e0ed8c --- /dev/null +++ b/include/os/macos/spl/rpc/types.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H + +typedef int bool_t; + +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/macos/spl/rpc/xdr.h b/include/os/macos/spl/rpc/xdr.h new file mode 100644 index 0000000000..7b8074b05c --- /dev/null +++ b/include/os/macos/spl/rpc/xdr.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (c) 1989, 2011, Oracle and/or its affiliates. All rights reserved. + */ +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Portions of this source code were derived from Berkeley + * 4.3 BSD under license from the Regents of the University of + * California. + */ + +/* + * xdr.h, External Data Representation Serialization Routines. 
+ * + */ + +#ifndef _SPL_RPC_XDR_H +#define _SPL_RPC_XDR_H + + +#include +#include + +/* + * XDR enums and types. + */ +enum xdr_op { + XDR_ENCODE, + XDR_DECODE +}; + +struct xdr_ops; + +typedef struct { + struct xdr_ops *x_ops; /* Also used to let caller know if */ + /* xdrmem_create() succeeds (sigh..) */ + caddr_t x_addr; /* Current buffer addr */ + caddr_t x_addr_end; /* End of the buffer */ + enum xdr_op x_op; /* Stream direction */ +} XDR; + +typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr); + +struct xdr_ops { + bool_t (*xdr_control)(XDR *, int, void *); + + bool_t (*xdr_char)(XDR *, char *); + bool_t (*xdr_u_short)(XDR *, unsigned short *); + bool_t (*xdr_u_int)(XDR *, unsigned *); + bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *); + + bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t); + bool_t (*xdr_string)(XDR *, char **, const uint_t); + bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t, + const uint_t, const xdrproc_t); +}; + +/* + * XDR control operator. + */ +#define XDR_GET_BYTES_AVAIL 1 + +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; + +/* + * XDR functions. + */ +void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op); +#define xdr_destroy(xdrs) ((void) 0) + +#define xdr_control(xdrs, req, info) \ + (xdrs)->x_ops->xdr_control((xdrs), (req), (info)) + +/* + * For precaution, the following are defined as static inlines instead of macros + * to get some amount of type safety. + * + * Also, macros wouldn't work in the case where typecasting is done, because it + * must be possible to reference the functions' addresses by these names. + */ +static inline bool_t +xdr_char(XDR *xdrs, char *cp) +{ + return (xdrs->x_ops->xdr_char(xdrs, cp)); +} + +static inline bool_t +xdr_u_short(XDR *xdrs, unsigned short *usp) +{ + return (xdrs->x_ops->xdr_u_short(xdrs, usp)); +} + +static inline bool_t +xdr_short(XDR *xdrs, short *sp) +{ + return (xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp)); +} + +static inline bool_t +xdr_u_int(XDR *xdrs, unsigned *up) +{ + return (xdrs->x_ops->xdr_u_int(xdrs, up)); +} + +static inline bool_t +xdr_int(XDR *xdrs, int *ip) +{ + return (xdrs->x_ops->xdr_u_int(xdrs, (unsigned *)ip)); +} + +static inline bool_t +xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp) +{ + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp)); +} + +static inline bool_t +xdr_longlong_t(XDR *xdrs, longlong_t *llp) +{ + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *)llp)); +} + +/* + * Fixed-length opaque data. + */ +static inline bool_t +xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + return (xdrs->x_ops->xdr_opaque(xdrs, cp, cnt)); +} + +/* + * Variable-length string. + * The *sp buffer must have (maxsize + 1) bytes. + */ +static inline bool_t +xdr_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + return (xdrs->x_ops->xdr_string(xdrs, sp, maxsize)); +} + +/* + * Variable-length arrays. 
+ */ +static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, + const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc) +{ + return (xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize, + elproc)); +} + +#endif /* SPL_RPC_XDR_H */ diff --git a/include/os/macos/spl/sys/Makefile.am b/include/os/macos/spl/sys/Makefile.am new file mode 100644 index 0000000000..5703774569 --- /dev/null +++ b/include/os/macos/spl/sys/Makefile.am @@ -0,0 +1,49 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/macos/spl/sys/atomic.h \ + $(top_srcdir)/include/os/macos/spl/sys/byteorder.h \ + $(top_srcdir)/include/os/macos/spl/sys/callb.h \ + $(top_srcdir)/include/os/macos/spl/sys/cmn_err.h \ + $(top_srcdir)/include/os/macos/spl/sys/condvar.h \ + $(top_srcdir)/include/os/macos/spl/sys/console.h \ + $(top_srcdir)/include/os/macos/spl/sys/cred.h \ + $(top_srcdir)/include/os/macos/spl/sys/debug.h \ + $(top_srcdir)/include/os/macos/spl/sys/errno.h \ + $(top_srcdir)/include/os/macos/spl/sys/fcntl.h \ + $(top_srcdir)/include/os/macos/spl/sys/file.h \ + $(top_srcdir)/include/os/macos/spl/sys/inttypes.h \ + $(top_srcdir)/include/os/macos/spl/sys/isa_defs.h \ + $(top_srcdir)/include/os/macos/spl/sys/kmem.h \ + $(top_srcdir)/include/os/macos/spl/sys/kmem_impl.h \ + $(top_srcdir)/include/os/macos/spl/sys/kstat.h \ + $(top_srcdir)/include/os/macos/spl/sys/list.h \ + $(top_srcdir)/include/os/macos/spl/sys/mod_os.h \ + $(top_srcdir)/include/os/macos/spl/sys/mutex.h \ + $(top_srcdir)/include/os/macos/spl/sys/param.h \ + $(top_srcdir)/include/os/macos/spl/sys/policy.h \ + $(top_srcdir)/include/os/macos/spl/sys/priv.h \ + $(top_srcdir)/include/os/macos/spl/sys/proc.h \ + $(top_srcdir)/include/os/macos/spl/sys/processor.h \ + $(top_srcdir)/include/os/macos/spl/sys/random.h \ + $(top_srcdir)/include/os/macos/spl/sys/rwlock.h \ + $(top_srcdir)/include/os/macos/spl/sys/seg_kmem.h \ + $(top_srcdir)/include/os/macos/spl/sys/signal.h \ + $(top_srcdir)/include/os/macos/spl/sys/stropts.h \ + $(top_srcdir)/include/os/macos/spl/sys/sunddi.h \ + $(top_srcdir)/include/os/macos/spl/sys/sysmacros.h \ + $(top_srcdir)/include/os/macos/spl/sys/systeminfo.h \ + $(top_srcdir)/include/os/macos/spl/sys/systm.h \ + $(top_srcdir)/include/os/macos/spl/sys/taskq.h \ + $(top_srcdir)/include/os/macos/spl/sys/taskq_impl.h \ + $(top_srcdir)/include/os/macos/spl/sys/thread.h \ + $(top_srcdir)/include/os/macos/spl/sys/time.h \ + $(top_srcdir)/include/os/macos/spl/sys/timer.h \ + $(top_srcdir)/include/os/macos/spl/sys/tsd.h \ + $(top_srcdir)/include/os/macos/spl/sys/types.h \ + $(top_srcdir)/include/os/macos/spl/sys/utsname.h \ + $(top_srcdir)/include/os/macos/spl/sys/varargs.h \ + $(top_srcdir)/include/os/macos/spl/sys/vfs.h \ + $(top_srcdir)/include/os/macos/spl/sys/vmem.h \ + $(top_srcdir)/include/os/macos/spl/sys/vmem_impl.h \ + $(top_srcdir)/include/os/macos/spl/sys/vmsystm.h \ + $(top_srcdir)/include/os/macos/spl/sys/vnode.h \ + $(top_srcdir)/include/os/macos/spl/sys/zone.h diff --git a/include/os/macos/spl/sys/acl.h b/include/os/macos/spl/sys/acl.h new file mode 100644 index 0000000000..840ba7f43c --- /dev/null +++ b/include/os/macos/spl/sys/acl.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_ACL_H +#define _SPL_ACL_H + +#include + +typedef struct ace { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; +} ace_t; + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define MAX_ACL_ENTRIES 1024 + +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) + +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP) + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS|\ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES|\ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + 
ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 + +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; + +#endif /* _SPL_ACL_H */ diff --git a/include/os/macos/spl/sys/atomic.h b/include/os/macos/spl/sys/atomic.h new file mode 100644 index 0000000000..0fcc072680 --- /dev/null +++ b/include/os/macos/spl/sys/atomic.h @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * OSX Atomic functions using clang builtins. + * + * Jorgen Lundman + * + */ + +#ifndef _SPL_ATOMIC_H +#define _SPL_ATOMIC_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Increment target + */ +static inline void +atomic_inc_8(volatile uint8_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline void +atomic_inc_16(volatile uint16_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline void +atomic_inc_32(volatile uint32_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline void +atomic_inc_64(volatile uint64_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline int32_t +atomic_inc_32_nv(volatile uint32_t *target) +{ + return (__sync_add_and_fetch(target, 1)); +} + +static inline int64_t +atomic_inc_64_nv(volatile uint64_t *target) +{ + return (__sync_add_and_fetch(target, 1)); +} + + + +/* + * Decrement target + */ +static inline void +atomic_dec_8(volatile uint8_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline void +atomic_dec_16(volatile uint16_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline void +atomic_dec_32(volatile uint32_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline void +atomic_dec_64(volatile uint64_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline int32_t +atomic_dec_32_nv(volatile uint32_t *target) +{ + return (__sync_sub_and_fetch(target, 1)); +} + +static inline int64_t +atomic_dec_64_nv(volatile uint64_t *target) +{ + return (__sync_sub_and_fetch(target, 1)); +} + +/* + * Add delta to target + */ +static inline void +atomic_add_8(volatile uint8_t *target, int8_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline void +atomic_add_16(volatile uint16_t *target, int16_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline void +atomic_add_32(volatile uint32_t *target, int32_t delta) +{ + 
__sync_add_and_fetch(target, delta); +} + +static inline uint32_t +atomic_add_32_nv(volatile uint32_t *target, int32_t delta) +{ + return (__sync_add_and_fetch(target, delta)); +} + +static inline void +atomic_add_64(volatile uint64_t *target, int64_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + return (__sync_add_and_fetch(target, delta)); +} + + +/* + * Subtract delta to target + */ +static inline void +atomic_sub_8(volatile uint8_t *target, int8_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline void +atomic_sub_16(volatile uint16_t *target, int16_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline void +atomic_sub_32(volatile uint32_t *target, int32_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline void +atomic_sub_64(volatile uint64_t *target, int64_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline uint64_t +atomic_sub_64_nv(volatile uint64_t *target, int64_t delta) +{ + return (__sync_sub_and_fetch(target, delta)); +} + +/* + * logical OR bits with target + */ +static inline void +atomic_or_8(volatile uint8_t *target, uint8_t mask) +{ + __sync_or_and_fetch(target, mask); +} + +static inline void +atomic_or_16(volatile uint16_t *target, uint16_t mask) +{ + __sync_or_and_fetch(target, mask); +} + +static inline void +atomic_or_32(volatile uint32_t *target, uint32_t mask) +{ + __sync_or_and_fetch(target, mask); +} + +/* + * logical AND bits with target + */ +static inline void +atomic_and_8(volatile uint8_t *target, uint8_t mask) +{ + __sync_and_and_fetch(target, mask); +} + +static inline void +atomic_and_16(volatile uint16_t *target, uint16_t mask) +{ + __sync_and_and_fetch(target, mask); +} + +static inline void +atomic_and_32(volatile uint32_t *target, uint32_t mask) +{ + __sync_and_and_fetch(target, mask); +} + +/* + * Compare And Set + * if *arg1 == arg2, then set *arg1 = arg3; return old value. 
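Illustrative only, not part of the patch: a minimal sketch of how the compare-and-swap helpers declared just below are typically used, here as a retry loop that increments a counter only while it stays under a cap. The function and variable names are hypothetical.

#include <sys/atomic.h>

/* Sketch: bump *counter by one unless it has already reached cap. */
static int
demo_increment_below_cap(volatile uint32_t *counter, uint32_t cap)
{
	uint32_t old, new;

	do {
		old = *counter;
		if (old >= cap)
			return (-1);	/* cap reached, give up */
		new = old + 1;
		/* atomic_cas_32() returns the previous value of *counter */
	} while (atomic_cas_32(counter, old, new) != old);

	return (0);
}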
+ */ +static inline uint8_t +atomic_cas_8(volatile uint8_t *_target, uint8_t _cmp, uint8_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint16_t +atomic_cas_16(volatile uint16_t *_target, uint16_t _cmp, uint16_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint32_t +atomic_cas_32(volatile uint32_t *_target, uint32_t _cmp, uint32_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint64_t +atomic_cas_64(volatile uint64_t *_target, uint64_t _cmp, uint64_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint32_t +atomic_swap_32(volatile uint32_t *_target, uint32_t _new) +{ + return (__sync_lock_test_and_set(_target, _new)); +} + +static inline uint64_t +atomic_swap_64(volatile uint64_t *_target, uint64_t _new) +{ + return (__sync_lock_test_and_set(_target, _new)); +} + +extern void *atomic_cas_ptr(volatile void *_target, void *_cmp, void *_new); + +static inline void +membar_producer(void) +{ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_ATOMIC_H */ diff --git a/include/os/macos/spl/sys/byteorder.h b/include/os/macos/spl/sys/byteorder.h new file mode 100644 index 0000000000..71631f1611 --- /dev/null +++ b/include/os/macos/spl/sys/byteorder.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_BYTEORDER_H +#define _SPL_BYTEORDER_H + +#include +#include + +#define LE_16(x) OSSwapHostToLittleInt16(x) +#define LE_32(x) OSSwapHostToLittleInt32(x) +#define LE_64(x) OSSwapHostToLittleInt64(x) +#define BE_16(x) OSSwapHostToBigInt16(x) +#define BE_32(x) OSSwapHostToBigInt32(x) +#define BE_64(x) OSSwapHostToBigInt64(x) + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + + +/* 10.8 is lacking in htonll */ +#if !defined(htonll) +#define htonll(x) __DARWIN_OSSwapInt64(x) +#endif +#if !defined(ntohll) +#define ntohll(x) __DARWIN_OSSwapInt64(x) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define _ZFS_LITTLE_ENDIAN +#endif + +#ifdef __BIG_ENDIAN__ +#define _ZFS_BIG_ENDIAN +#endif + +#endif /* SPL_BYTEORDER_H */ diff --git a/include/os/macos/spl/sys/callb.h b/include/os/macos/spl/sys/callb.h new file mode 100644 index 0000000000..3d86b9d41c --- /dev/null +++ b/include/os/macos/spl/sys/callb.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_CALLB_H +#define _SPL_CALLB_H + +#include + +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + + +#define CALLOUT_FLAG_ROUNDUP 0x1 +#define CALLOUT_FLAG_ABSOLUTE 0x2 +#define CALLOUT_FLAG_HRESTIME 0x4 +#define CALLOUT_FLAG_32BIT 0x8 + +/* Move me to more correct "sys/callo.h" file when convenient. */ +#define CALLOUT_NORMAL 1 +typedef uint64_t callout_id_t; +callout_id_t timeout_generic(int, void (*)(void *), void *, hrtime_t, + hrtime_t, int); + +#endif /* _SPL_CALLB_H */ diff --git a/include/os/macos/spl/sys/cmn_err.h b/include/os/macos/spl/sys/cmn_err.h new file mode 100644 index 0000000000..e4343a97a7 --- /dev/null +++ b/include/os/macos/spl/sys/cmn_err.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
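Illustrative only, not part of the patch: the usual Solaris-style logging idiom that the cmn_err()/CE_* declarations just below provide on macOS. The helper name and message are hypothetical.

#include <sys/types.h>
#include <sys/cmn_err.h>

/* Sketch: warn (but do not panic) on a short read. */
static void
demo_report_short_read(uint64_t wanted, uint64_t got)
{
	if (got < wanted)
		cmn_err(CE_WARN, "short read: wanted %llu got %llu",
		    (unsigned long long)wanted, (unsigned long long)got);
}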
+ */ + +#ifndef _SPL_CMN_ERR_H +#define _SPL_CMN_ERR_H + +#include +#include + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifdef _KERNEL + +extern void vcmn_err(int, const char *, __va_list); +extern void cmn_err(int, const char *, ...); + +#endif /* _KERNEL */ + +#define fm_panic panic + +#endif /* SPL_CMN_ERR_H */ diff --git a/include/os/macos/spl/sys/condvar.h b/include/os/macos/spl/sys/condvar.h new file mode 100644 index 0000000000..e1f50da21e --- /dev/null +++ b/include/os/macos/spl/sys/condvar.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef OSX_CONDVAR_H +#define OSX_CONDVAR_H + +#include +#include + +#define hz 10 /* frequency when using gethrtime() >> 23 for lbolt */ + +typedef enum { + CV_DEFAULT, + CV_DRIVER +} kcv_type_t; + + +struct cv { + uint64_t pad; +}; + +typedef struct cv kcondvar_t; + +void spl_cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg); +void spl_cv_destroy(kcondvar_t *cvp); +void spl_cv_signal(kcondvar_t *cvp); +void spl_cv_broadcast(kcondvar_t *cvp); +int spl_cv_wait(kcondvar_t *cvp, kmutex_t *mp, int flags, const char *msg); +int spl_cv_timedwait(kcondvar_t *, kmutex_t *, clock_t, int, const char *msg); +int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, + hrtime_t tim, hrtime_t res, int flag); + +/* + * Use these wrapper macros to obtain the CV variable + * name to make ZFS more gdb debugging friendly! + * This name shows up as a thread's wait_event string. 
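Illustrative only, not part of the patch: the classic mutex-plus-condvar wait pattern these wrappers support. demo_state_t and its fields are hypothetical.

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

typedef struct demo_state {
	kmutex_t	ds_lock;
	kcondvar_t	ds_cv;
	boolean_t	ds_ready;
} demo_state_t;

/* Sketch: block until another thread marks the state ready. */
static void
demo_wait_ready(demo_state_t *ds)
{
	mutex_enter(&ds->ds_lock);
	while (!ds->ds_ready)
		cv_wait(&ds->ds_cv, &ds->ds_lock);	/* drops and retakes the lock */
	mutex_exit(&ds->ds_lock);
}

/* Sketch: mark the state ready and wake one waiter. */
static void
demo_set_ready(demo_state_t *ds)
{
	mutex_enter(&ds->ds_lock);
	ds->ds_ready = B_TRUE;
	cv_signal(&ds->ds_cv);
	mutex_exit(&ds->ds_lock);
}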
+ */ +#define cv_wait(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO, #cvp) + +#define cv_wait_io(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO, #cvp) + +#define cv_wait_idle(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO, #cvp) + +#define cv_timedwait(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO, #cvp) + +#define cv_timedwait_io(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO, #cvp) + +#define cv_timedwait_idle(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO, #cvp) + +#define cv_wait_interruptible(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO|PCATCH, #cvp) + +#define cv_timedwait_interruptible(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO|PCATCH, #cvp) + +/* cv_wait_sig is the correct name for cv_wait_interruptible */ +#define cv_wait_sig(cvp, mp) \ + spl_cv_wait((cvp), (mp), PRIBIO|PCATCH, #cvp) + +#define cv_wait_io_sig(cvp, mp) \ + spl_cv_wait((cvp), (mp), PRIBIO|PCATCH, #cvp) + +#define cv_timedwait_sig(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO|PCATCH, #cvp) + +#define TICK_TO_NSEC(tick) ((hrtime_t)(tick) * 1000000000 / hz) +#define cv_reltimedwait(cvp, mp, tim, type) \ + cv_timedwait_hires((cvp), (mp), TICK_TO_NSEC((tim)), 0, 0) + +#define cv_timedwait_sig_hires(cvp, mp, tim, res, flag) \ + cv_timedwait_hires(cvp, mp, tim, res, (flag)|PCATCH) + +#define cv_timedwait_idle_hires(cvp, mp, tim, res, flag) \ + cv_timedwait_hires(cvp, mp, tim, res, (flag)|PCATCH) + +#define cv_init spl_cv_init +#define cv_destroy spl_cv_destroy +#define cv_broadcast spl_cv_broadcast +#define cv_signal spl_cv_signal + +#endif diff --git a/include/os/macos/spl/sys/console.h b/include/os/macos/spl/sys/console.h new file mode 100644 index 0000000000..57c9622105 --- /dev/null +++ b/include/os/macos/spl/sys/console.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SPL_SYS_CONSOLE_H +#define _SPL_SYS_CONSOLE_H + +static inline void +console_vprintf(const char *fmt, va_list args) +{ + vprintf(fmt, args); +} + +static inline void +console_printf(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + console_vprintf(fmt, args); + va_end(args); +} + +#endif /* _SPL_SYS_CONSOLE_H */ diff --git a/include/os/macos/spl/sys/cred.h b/include/os/macos/spl/sys/cred.h new file mode 100644 index 0000000000..9dd1640b5b --- /dev/null +++ b/include/os/macos/spl/sys/cred.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_CRED_H +#define _SPL_CRED_H + +#include +#include +#include + +typedef struct ucred cred_t; + +#define kcred (cred_t *)NOCRED +#define CRED() (cred_t *)kauth_cred_get() +#define KUID_TO_SUID(x) (x) +#define KGID_TO_SGID(x) (x) + +#include + +// Older OSX API +#if !(MAC_OS_X_VERSION_MIN_REQUIRED >= 1070) +#define kauth_cred_getruid(x) (x)->cr_ruid +#define kauth_cred_getrgid(x) (x)->cr_rgid +#define kauth_cred_getsvuid(x) (x)->cr_svuid +#define kauth_cred_getsvgid(x) (x)->cr_svgid +#endif + + +extern void crhold(cred_t *cr); +extern void crfree(cred_t *cr); +extern uid_t crgetuid(const cred_t *cr); +extern uid_t crgetruid(const cred_t *cr); +extern uid_t crgetsuid(const cred_t *cr); +extern uid_t crgetfsuid(const cred_t *cr); +extern gid_t crgetgid(const cred_t *cr); +extern gid_t crgetrgid(const cred_t *cr); +extern gid_t crgetsgid(const cred_t *cr); +extern gid_t crgetfsgid(const cred_t *cr); +extern int crgetngroups(const cred_t *cr); +extern gid_t *crgetgroups(const cred_t *cr); +extern void crgetgroupsfree(gid_t *gids); +extern int spl_cred_ismember_gid(cred_t *cr, gid_t gid); + +#define crgetsid(cred, i) (NULL) + +#endif /* _SPL_CRED_H */ diff --git a/include/os/macos/spl/sys/ctype.h b/include/os/macos/spl/sys/ctype.h new file mode 100644 index 0000000000..7455487330 --- /dev/null +++ b/include/os/macos/spl/sys/ctype.h @@ -0,0 +1,27 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_CTYPE_H +#define _SPL_CTYPE_H + +#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) + +#endif diff --git a/include/os/macos/spl/sys/debug.h b/include/os/macos/spl/sys/debug.h new file mode 100644 index 0000000000..a761709ce7 --- /dev/null +++ b/include/os/macos/spl/sys/debug.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + +#include + +/* SPL has own 'dprintf' as zfs_debug.c version uses mutex */ +#ifdef __cplusplus +extern "C" { +#endif + +extern int zfs_flags; + +/* Simple dprintf for SPL only */ +#ifndef dprintf +#define dprintf(...) \ + if (zfs_flags & 1) \ + printf(__VA_ARGS__) +#endif + +/* + * Common DEBUG functionality. + */ +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +void spl_backtrace(char *thesignal); +int getpcstack(uintptr_t *pcstack, int pcstack_limit); +void print_symbol(uintptr_t symbol); + +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) 
\ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + boolean_t _verify3_left = (boolean_t)(LEFT); \ + boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ + } while (0) + +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + int64_t _verify3_left = (int64_t)(LEFT); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + uint64_t _verify3_left = (uint64_t)(LEFT); \ + uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + int64_t _verify3_left = (int64_t)(0); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left == _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) + +#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) +#define CTASSERT(x) { _CTASSERT(x, __LINE__); } +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) \ + typedef char __attribute__ ((unused)) \ + __compile_time_assertion__ ## y[(x) ? 
1 : -1] + + + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define ASSERT(x) ((void)0) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define ASSERTV(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY +#define ASSERTV(X) X +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* SPL_DEBUG_H */ diff --git a/include/os/macos/spl/sys/disp.h b/include/os/macos/spl/sys/disp.h new file mode 100644 index 0000000000..3b1bcbb25c --- /dev/null +++ b/include/os/macos/spl/sys/disp.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_DISP_H +#define _SPL_DISP_H + +#endif diff --git a/include/os/macos/spl/sys/dkio.h b/include/os/macos/spl/sys/dkio.h new file mode 100644 index 0000000000..d10314b3e4 --- /dev/null +++ b/include/os/macos/spl/sys/dkio.h @@ -0,0 +1,527 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _OPENSOLARIS_SYS_DKIO_H_ +#define _OPENSOLARIS_SYS_DKIO_H_ + +#include /* Needed for NDKMAP define */ +#include /* Needed for NDKMAP define */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_SUNOS_VTOC_16) +#define NDKMAP 16 /* # of logical partitions */ +#define DK_LABEL_LOC 1 /* location of disk label */ +#elif defined(_SUNOS_VTOC_8) +#define NDKMAP 8 /* # of logical partitions */ +#define DK_LABEL_LOC 0 /* location of disk label */ +#else +#error "No VTOC format defined." +#endif + +/* + * Structures and definitions for disk io control commands + */ + +/* + * Structures used as data by ioctl calls. + */ + +#define DK_DEVLEN 16 /* device name max length, including */ + /* unit # & NULL (ie - "xyc1") */ + +/* + * Used for controller info + */ +struct dk_cinfo { + char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */ + ushort_t dki_ctype; /* controller type */ + ushort_t dki_flags; /* flags */ + ushort_t dki_cnum; /* controller number */ + uint_t dki_addr; /* controller address */ + uint_t dki_space; /* controller bus type */ + uint_t dki_prio; /* interrupt priority */ + uint_t dki_vec; /* interrupt vector */ + char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ + uint_t dki_unit; /* unit number */ + uint_t dki_slave; /* slave number */ + ushort_t dki_partition; /* partition number */ + ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */ +}; + +/* + * Controller types + */ +#define DKC_UNKNOWN 0 +#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */ +#define DKC_WDC2880 2 +#define DKC_XXX_0 3 /* unassigned */ +#define DKC_XXX_1 4 /* unassigned */ +#define DKC_DSD5215 5 +#define DKC_ACB4000 7 +#define DKC_MD21 8 +#define DKC_XXX_2 9 /* unassigned */ +#define DKC_NCRFLOPPY 10 +#define DKC_SMSFLOPPY 12 +#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */ +#define DKC_INTEL82072 14 /* native floppy chip */ +#define DKC_MD 16 /* meta-disk (virtual-disk) driver */ +#define DKC_INTEL82077 19 /* 82077 floppy disk controller */ +#define DKC_DIRECT 20 /* Intel direct attached device i.e. 
IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ +#define DKC_VBD 23 /* virtual block device */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * partition headers: section 1 + * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I)) + */ +struct dk_map { + uint64_t dkl_cylno; /* starting cylinder */ + uint64_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * supported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. + * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. + * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. 
+ * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. + * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. + * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +/* bit flag definitions for dkc_flag */ +#define FLUSH_VOLATILE 0x1 /* Bit 0: if set, only flush */ + /* volatile cache; otherwise, flush */ + /* volatile and non-volatile cache */ + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. + */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + uint32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + uint64_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. + */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. + */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. 
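Illustrative only, not part of the patch: a sketch of the asynchronous DKIOCFLUSHWRITECACHE protocol described above; the callback fires once the flush completes (possibly before the ioctl returns). The demo_* names are hypothetical and the actual ioctl dispatch is elided.

#include <sys/dkio.h>
#include <sys/cmn_err.h>

/* Completion callback matching the dk_callback contract above. */
static void
demo_flush_done(void *cookie, int error)
{
	if (error != 0)
		cmn_err(CE_WARN, "cache flush failed: %d", error);
}

/* Fill in a dk_callback for an asynchronous flush request. */
static void
demo_prepare_flush(struct dk_callback *dkc, void *cookie)
{
	dkc->dkc_callback = demo_flush_done;	/* non-NULL => asynchronous */
	dkc->dkc_cookie = cookie;
	dkc->dkc_flag = FLUSH_VOLATILE;		/* flush only the volatile cache */
	/*
	 * The DKIOCFLUSHWRITECACHE ioctl itself (issued with FKIOCTL from
	 * kernel context) is left to whatever device-access layer is in use.
	 */
}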
+ */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 +#define DKT_INVALID_TEMP 0xFFFF + + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. + */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. + * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. + */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. + * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. 
+ */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. + */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + +#define DKIOC (0x04 << 8) +#define DKIOCTRIM (DKIOC | 35) + +/* + * ioctl to free space (e.g. SCSI UNMAP) off a disk. + * Pass a dkioc_free_list_t containing a list of extents to be freed. + */ +#define DKIOCFREE (DKIOC|50) + +#define DF_WAIT_SYNC 0x00000001 /* Wait for full write-out of free. */ + +typedef struct dkioc_free_list_ext_s { + uint64_t dfle_start; + uint64_t dfle_length; +} dkioc_free_list_ext_t; + +typedef struct dkioc_free_list_s { + uint64_t dfl_flags; + uint64_t dfl_num_exts; + uint64_t dfl_offset; + dkioc_free_list_ext_t dfl_exts[1]; +} dkioc_free_list_t; +#define DFL_SZ(num_exts) \ + (sizeof (dkioc_free_list_t) + \ + (num_exts - 1) * sizeof (dkioc_free_list_ext_t)) + +/* Frees a variable-length dkioc_free_list_t structure. */ +static inline void +dfl_free(dkioc_free_list_t *dfl) +{ + kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ diff --git a/include/os/macos/spl/sys/errno.h b/include/os/macos/spl/sys/errno.h new file mode 100644 index 0000000000..67574721cc --- /dev/null +++ b/include/os/macos/spl/sys/errno.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include_next + +#define EBADE EBADMACHO +#define ECKSUM EBADE +#define EFRAGS EIDRM +#define EREMOTEIO ENOLINK +#define ENOTACTIVE ENOPOLICY +#define ECHRNG EMULTIHOP diff --git a/include/os/macos/spl/sys/fcntl.h b/include/os/macos/spl/sys/fcntl.h new file mode 100644 index 0000000000..5f13d304fc --- /dev/null +++ b/include/os/macos/spl/sys/fcntl.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_FCNTL_H +#define _SPL_FCNTL_H + +#include_next + +#define F_FREESP 11 + +#define O_LARGEFILE 0 +#define O_RSYNC 0 +#define O_DIRECT 0 + +#endif /* _SPL_FCNTL_H */ diff --git a/include/os/macos/spl/sys/file.h b/include/os/macos/spl/sys/file.h new file mode 100644 index 0000000000..136e8f3bfb --- /dev/null +++ b/include/os/macos/spl/sys/file.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_FILE_H +#define _SPL_FILE_H + +#define FIGNORECASE 0x00080000 +#define FKIOCTL 0x80000000 +#define ED_CASE_CONFLICT 0x10 + +#include + +/* + * XNU has all the proc structs as opaque and with no functions we + * are allowed to call, so we implement file IO from within the kernel + * as vnode operations. + * The second mode is when we are given a "fd" from userland, which we + * map in here, using getf()/releasef(). + * When it comes to IO, if "fd" is set, we use it (fo_rdwr()) as it + * can handle both files, and pipes. + * In kernel space file ops, we use vn_rdwr on the vnode. 
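Illustrative only, not part of the patch: the getf()/releasef() pattern described above for a file descriptor passed in from userland. demo_with_user_fd() is hypothetical and the actual I/O against the mapped file is elided.

#include <sys/errno.h>
#include <sys/file.h>

/* Sketch: map a userland fd, use it, and always balance with releasef(). */
static int
demo_with_user_fd(int fd)
{
	struct spl_fileproc *fp;

	fp = getf(fd);		/* map the userland file descriptor */
	if (fp == NULL)
		return (EBADF);

	/* ... do I/O via fo_rdwr() on the fd, or vn_rdwr() on fp->f_vnode ... */

	releasef(fd);		/* drop the reference taken by getf() */
	return (0);
}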
+ */ +struct spl_fileproc { + void *f_vnode; /* underlying vnode */ + list_node_t f_next; /* * next getf() link for releasef() */ + int f_fd; /* * userland file descriptor */ + off_t f_offset; /* offset for stateful IO */ + void *f_proc; /* opaque */ + void *f_fp; /* opaque */ + int f_writes; /* did write? for close sync */ + minor_t f_file; /* minor of the file */ + void *f_private; /* zfsdev_state_t */ +}; +/* Members with '*' are not used when 'fd' is not given */ + +void *getf(int fd); +void releasef(int fd); +struct vnode *getf_vnode(void *fp); + +#endif /* SPL_FILE_H */ diff --git a/include/os/macos/spl/sys/inttypes.h b/include/os/macos/spl/sys/inttypes.h new file mode 100644 index 0000000000..c9f6a316aa --- /dev/null +++ b/include/os/macos/spl/sys/inttypes.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_INTTYPES_H +#define _SPL_INTTYPES_H + +#endif /* SPL_INTTYPES_H */ diff --git a/include/os/macos/spl/sys/isa_defs.h b/include/os/macos/spl/sys/isa_defs.h new file mode 100644 index 0000000000..f702dc51e1 --- /dev/null +++ b/include/os/macos/spl/sys/isa_defs.h @@ -0,0 +1,690 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. 
+ * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). + * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. + * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. 
+ * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards is the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. + * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. + * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! + * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. + * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + * + * _OBP + * This indicates the firmware interface is OBP. + * + * _SOFT_HOSTID + * This indicates that the implementation obtains the hostid + * from the file /etc/hostid, rather than from hardware. 
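Illustrative only, not part of the patch: how consumers typically key off the symbols documented above, combined here with the BE_IN32() helper from the byteorder.h added earlier. The demo_* names are hypothetical.

#include <sys/types.h>
#include <sys/isa_defs.h>
#include <sys/byteorder.h>

#if defined(_LP64)
typedef uint64_t demo_machword_t;	/* longs and pointers are 64-bit */
#else
typedef uint32_t demo_machword_t;	/* _ILP32: 32-bit longs and pointers */
#endif

/* Assemble a big-endian 32-bit value from an on-disk buffer. */
static inline uint32_t
demo_read_be32(uint8_t *buf)
{
	return (BE_IN32(buf));
}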
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. + */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__aarch64__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__riscv) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__arm__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__mips__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#if defined(__mips_n64) +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#else +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__powerpc__) + +#if defined(__BIG_ENDIAN__) +#define _BIT_FIELDS_HTOL +#else +#define _BIT_FIELDS_LTOH +#endif + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. + */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. + */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. 
+ */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _OBP + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. + */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. + */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#define ____cacheline_aligned __attribute__((aligned(64))) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/include/os/macos/spl/sys/kmem.h b/include/os/macos/spl/sys/kmem.h new file mode 100644 index 0000000000..86287060cc --- /dev/null +++ b/include/os/macos/spl/sys/kmem.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS Project + * Copyright (C) 2013 Jorgen Lundman + * Copyright (C) 2017 Sean Doran + * + */ + +#ifndef _SPL_KMEM_H +#define _SPL_KMEM_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// XNU total amount of memory +extern uint64_t physmem; + +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PANIC 0x0002 /* if memory cannot be allocated, panic */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_NORMALPRI 0x0008 /* with KM_NOSLEEP, lower priority allocation */ +#define KM_NODEBUG 0x0010 /* NOT IMPLEMENTED ON OSX */ +#define KM_NO_VBA 0x0020 /* OSX: don't descend to the bucket layer */ +#define KM_VMFLAGS 0x00ff /* flags that must match VM_* flags */ + +#define KM_FLAGS 0xffff /* all settable kmem flags */ + +/* + * Kernel memory allocator: DDI interfaces. + * See kmem_alloc(9F) for details. + */ + +// Work around symbol collisions in XNU +#define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) +#define kmem_zalloc(size, kmflags) zfs_kmem_zalloc((size), (kmflags)) +#define kmem_free(buf, size) zfs_kmem_free((buf), (size)) + +void *zfs_kmem_alloc(size_t size, int kmflags); +void *zfs_kmem_zalloc(size_t size, int kmflags); +void zfs_kmem_free(void *buf, size_t size); + +void spl_kmem_init(uint64_t); +void spl_kmem_thread_init(void); +void spl_kmem_mp_init(void); +void spl_kmem_thread_fini(void); +void spl_kmem_fini(void); + +size_t kmem_size(void); +size_t kmem_used(void); +int64_t kmem_avail(void); +size_t kmem_num_pages_wanted(void); +int spl_vm_pool_low(void); +int32_t spl_minimal_physmem_p(void); +int64_t spl_adjust_pressure(int64_t); +int64_t spl_free_wrapper(void); +int64_t spl_free_manual_pressure_wrapper(void); +boolean_t spl_free_fast_pressure_wrapper(void); +void spl_free_set_pressure(int64_t); +void spl_free_set_fast_pressure(boolean_t); +uint64_t spl_free_last_pressure_wrapper(void); + +#define KMC_NOTOUCH 0x00010000 +#define KMC_NODEBUG 0x00020000 +#define KMC_NOMAGAZINE 0x00040000 +#define KMC_NOHASH 0x00080000 +#define KMC_QCACHE 0x00100000 +#define KMC_KMEM_ALLOC 0x00200000 /* internal use only */ +#define KMC_IDENTIFIER 0x00400000 /* internal use only */ +#define KMC_PREFILL 0x00800000 +#define KMC_ARENA_SLAB 0x01000000 /* use a bigger kmem cache */ + +struct kmem_cache; + +typedef struct kmem_cache kmem_cache_t; + +/* Client response to kmem move callback */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW +} kmem_cbrc_t; + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), + void (*destructor)(void *, void *), + void (*reclaim)(void *), + void *_private, vmem_t *vmp, int cflags); +void kmem_cache_destroy(kmem_cache_t *cache); +void *kmem_cache_alloc(kmem_cache_t *cache, int flags); +void kmem_cache_free(kmem_cache_t *cache, void *buf); +void kmem_cache_free_to_slab(kmem_cache_t *cache, void *buf); +extern boolean_t 
kmem_cache_reap_active(void); +void kmem_cache_reap_now(kmem_cache_t *cache); +void kmem_depot_ws_zero(kmem_cache_t *cache); +void kmem_reap(void); +void kmem_reap_idspace(void); +kmem_cache_t *kmem_cache_buf_in_cache(kmem_cache_t *, void *); + +int kmem_debugging(void); +void kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); + +char *kmem_asprintf(const char *fmt, ...); +extern char *kmem_strdup(const char *str); +extern void kmem_strfree(char *str); +char *kmem_vasprintf(const char *fmt, va_list ap); +char *kmem_strstr(const char *in, const char *str); +void strident_canon(char *s, size_t n); + +boolean_t spl_arc_no_grow(size_t, boolean_t, kmem_cache_t **); + +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_KMEM_H */ diff --git a/include/os/macos/spl/sys/kmem_cache.h b/include/os/macos/spl/sys/kmem_cache.h new file mode 100644 index 0000000000..2dc08b1712 --- /dev/null +++ b/include/os/macos/spl/sys/kmem_cache.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#endif diff --git a/include/os/macos/spl/sys/kmem_impl.h b/include/os/macos/spl/sys/kmem_impl.h new file mode 100644 index 0000000000..2f3fa7f9da --- /dev/null +++ b/include/os/macos/spl/sys/kmem_impl.h @@ -0,0 +1,494 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_KMEM_IMPL_H +#define _SYS_KMEM_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * kernel memory allocator: implementation-private data structures + * + * Lock order: + * 1. cache_lock + * 2. cc_lock in order by CPU ID + * 3. 
cache_depot_lock + * + * Do not call kmem_cache_alloc() or taskq_dispatch() while holding any of the + * above locks. + */ + +#define KMF_AUDIT 0x00000001 /* transaction auditing */ +#define KMF_DEADBEEF 0x00000002 /* deadbeef checking */ +#define KMF_REDZONE 0x00000004 /* redzone checking */ +#define KMF_CONTENTS 0x00000008 /* freed-buffer content logging */ +#define KMF_STICKY 0x00000010 /* if set, override /etc/system */ +#define KMF_NOMAGAZINE 0x00000020 /* disable per-cpu magazines */ +#define KMF_FIREWALL 0x00000040 /* put all bufs before unmapped pages */ +#define KMF_LITE 0x00000100 /* lightweight debugging */ + +#define KMF_HASH 0x00000200 /* cache has hash table */ +#define KMF_RANDOMIZE 0x00000400 /* randomize other kmem_flags */ + +#define KMF_DUMPDIVERT 0x00001000 /* use alternate memory at dump time */ +#define KMF_DUMPUNSAFE 0x00002000 /* flag caches used at dump time */ +#define KMF_PREFILL 0x00004000 /* Prefill the slab when created. */ + +#define KMF_BUFTAG (KMF_DEADBEEF | KMF_REDZONE) +#define KMF_TOUCH (KMF_BUFTAG | KMF_LITE | KMF_CONTENTS) +#define KMF_RANDOM (KMF_TOUCH | KMF_AUDIT | KMF_NOMAGAZINE) +#define KMF_DEBUG (KMF_RANDOM | KMF_FIREWALL) + +#define KMEM_STACK_DEPTH 15 + +#define KMEM_FREE_PATTERN 0xdeadbeefdeadbeefULL +#define KMEM_UNINITIALIZED_PATTERN 0xbaddcafebaddcafeULL +#define KMEM_REDZONE_PATTERN 0xfeedfacefeedfaceULL +#define KMEM_REDZONE_BYTE 0xbb + +/* + * Upstream platforms handle size == 0 as valid alloc, we + * can not return NULL, as that invalidates KM_SLEEP. So + * we return a valid hardcoded address, instead of actually taking up + * memory by fudging size to 1 byte. If read/writes are + * attempted, we will get page fault (which is correct, they + * asked for zero bytes after all) + */ +#define KMEM_ZERO_SIZE_PTR ((void *)16) + +/* + * Redzone size encodings for kmem_alloc() / kmem_free(). We encode the + * allocation size, rather than storing it directly, so that kmem_free() + * can distinguish frees of the wrong size from redzone violations. + * + * A size of zero is never valid. + */ +#define KMEM_SIZE_ENCODE(x) (251 * (x) + 1) +#define KMEM_SIZE_DECODE(x) ((x) / 251) +#define KMEM_SIZE_VALID(x) ((x) % 251 == 1 && (x) != 1) + + +#define KMEM_ALIGN 8 /* min guaranteed alignment */ +#define KMEM_ALIGN_SHIFT 3 /* log2(KMEM_ALIGN) */ +#define KMEM_VOID_FRACTION 8 /* never waste more than 1/8 of slab */ + +#define KMEM_SLAB_IS_PARTIAL(sp) \ + ((sp)->slab_refcnt > 0 && (sp)->slab_refcnt < (sp)->slab_chunks) +#define KMEM_SLAB_IS_ALL_USED(sp) \ + ((sp)->slab_refcnt == (sp)->slab_chunks) + +/* + * The bufctl (buffer control) structure keeps some minimal information + * about each buffer: its address, its slab, and its current linkage, + * which is either on the slab's freelist (if the buffer is free), or + * on the cache's buf-to-bufctl hash table (if the buffer is allocated). + * In the case of non-hashed, or "raw", caches (the common case), only + * the freelist linkage is necessary: the buffer address is at a fixed + * offset from the bufctl address, and the slab is at the end of the page. + * + * NOTE: bc_next must be the first field; raw buffers have linkage only. + */ +typedef struct kmem_bufctl { + struct kmem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct kmem_slab *bc_slab; /* controlling slab */ +} kmem_bufctl_t; + +/* + * The KMF_AUDIT version of the bufctl structure. The beginning of this + * structure must be identical to the normal bufctl structure so that + * pointers are interchangeable. 
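For a concrete feel of the redzone size encoding above (a sketch only): a 64-byte allocation encodes to 251 * 64 + 1 = 16065, decodes back to 16065 / 251 = 64, and passes KMEM_SIZE_VALID() because 16065 % 251 == 1 and 16065 != 1, whereas a corrupted value such as 16066 fails the check. Expressed as hypothetical compile-time checks:

    _Static_assert(KMEM_SIZE_ENCODE(64) == 16065, "encode");
    _Static_assert(KMEM_SIZE_DECODE(KMEM_SIZE_ENCODE(64)) == 64, "decode");
    _Static_assert(KMEM_SIZE_VALID(KMEM_SIZE_ENCODE(64)), "valid");
    /* KMEM_SIZE_VALID(1) is false by design: a size of zero is never valid. */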
+ */ +typedef struct kmem_bufctl_audit { + struct kmem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct kmem_slab *bc_slab; /* controlling slab */ + kmem_cache_t *bc_cache; /* controlling cache */ + hrtime_t bc_timestamp; /* transaction time */ + kthread_t *bc_thread; /* thread doing transaction */ + struct kmem_bufctl *bc_lastlog; /* last log entry */ + void *bc_contents; /* contents at last free */ + int bc_depth; /* stack depth */ + pc_t bc_stack[KMEM_STACK_DEPTH]; /* pc stack */ +} kmem_bufctl_audit_t; + +/* + * A kmem_buftag structure is appended to each buffer whenever any of the + * KMF_BUFTAG flags (KMF_DEADBEEF, KMF_REDZONE, KMF_VERIFY) are set. + */ +typedef struct kmem_buftag { + uint64_t bt_redzone; /* 64-bit redzone pattern */ + kmem_bufctl_t *bt_bufctl; /* bufctl */ + intptr_t bt_bxstat; /* bufctl ^ (alloc/free) */ +} kmem_buftag_t; + +/* + * A variant of the kmem_buftag structure used for KMF_LITE caches. + * Previous callers are stored in reverse chronological order. (i.e. most + * recent first) + */ +typedef struct kmem_buftag_lite { + kmem_buftag_t bt_buftag; /* a normal buftag */ + pc_t bt_history[1]; /* zero or more callers */ +} kmem_buftag_lite_t; + +#define KMEM_BUFTAG_LITE_SIZE(f) \ + (offsetof(kmem_buftag_lite_t, bt_history[f])) + +#define KMEM_BUFTAG(cp, buf) \ + ((kmem_buftag_t *)((char *)(buf) + (cp)->cache_buftag)) + +#define KMEM_BUFCTL(cp, buf) \ + ((kmem_bufctl_t *)((char *)(buf) + (cp)->cache_bufctl)) + +#define KMEM_BUF(cp, bcp) \ + ((void *)((char *)(bcp) - (cp)->cache_bufctl)) + +#define KMEM_SLAB(cp, buf) \ + ((kmem_slab_t *)P2END((uintptr_t)(buf), (cp)->cache_slabsize) - 1) + +/* + * Test for using alternate memory at dump time. + */ +#define KMEM_DUMP(cp) ((cp)->cache_flags & KMF_DUMPDIVERT) +#define KMEM_DUMPCC(ccp) ((ccp)->cc_flags & KMF_DUMPDIVERT) + +/* + * The "CPU" macro loads a cpu_t that refers to the cpu that the current + * thread is running on at the time the macro is executed. A context switch + * may occur immediately after loading this data structure, leaving this + * thread pointing at the cpu_t for the previous cpu. This is not a problem; + * we'd just end up checking the previous cpu's per-cpu cache, and then check + * the other layers of the kmem cache if need be. + * + * It's not even a problem if the old cpu gets DR'ed out during the context + * switch. The cpu-remove DR operation bzero()s the cpu_t, but doesn't free + * it. So the cpu_t's cpu_cache_offset would read as 0, causing us to use + * cpu 0's per-cpu cache. + * + * So, there is no need to disable kernel preemption while using the CPU macro + * below since if we have been context switched, there will not be any + * correctness problem, just a momentary use of a different per-cpu cache. 
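Stepping back from these internals, a minimal consumer of the public allocator API declared in kmem.h above might look like the following sketch (foo_t and all identifiers are hypothetical; error handling omitted):

    #include <sys/kmem.h>

    typedef struct foo {
            uint64_t f_value;
    } foo_t;

    static kmem_cache_t *foo_cache;

    static void
    foo_subsystem_init(void)
    {
            /* name, bufsize, align, ctor, dtor, reclaim, private, vmem, cflags */
            foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
                NULL, NULL, NULL, NULL, NULL, 0);
    }

    static foo_t *
    foo_alloc(void)
    {
            /* KM_SLEEP: may block until memory is available. */
            return (kmem_cache_alloc(foo_cache, KM_SLEEP));
    }

    static void
    foo_free(foo_t *fp)
    {
            kmem_cache_free(foo_cache, fp);
    }

    static void
    foo_subsystem_fini(void)
    {
            kmem_cache_destroy(foo_cache);
    }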
+ */ + +#define KMEM_CPU_CACHE(cp) \ + (&cp->cache_cpu[cpu_number()]) + +#define KMOM_MAGAZINE_VALID(cp, mp) \ + (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ + (cp)->cache_magtype->mt_cache) + +#define KMEM_MAGAZINE_VALID(cp, mp) \ + (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ + (cp)->cache_magtype->mt_cache) + +#define KMEM_SLAB_OFFSET(sp, buf) \ + ((size_t)((uintptr_t)(buf) - (uintptr_t)((sp)->slab_base))) + +#define KMEM_SLAB_MEMBER(sp, buf) \ + (KMEM_SLAB_OFFSET(sp, buf) < (sp)->slab_cache->cache_slabsize) + +#define KMEM_BUFTAG_ALLOC 0xa110c8edUL +#define KMEM_BUFTAG_FREE 0xf4eef4eeUL + +/* slab_later_count thresholds */ +#define KMEM_DISBELIEF 3 + +/* slab_flags */ +#define KMEM_SLAB_NOMOVE 0x1 +#define KMEM_SLAB_MOVE_PENDING 0x2 + +typedef struct kmem_slab { + struct kmem_cache *slab_cache; /* controlling cache */ + void *slab_base; /* base of allocated memory */ + avl_node_t slab_link; /* slab linkage */ + struct kmem_bufctl *slab_head; /* first free buffer */ + long slab_refcnt; /* outstanding allocations */ + long slab_chunks; /* chunks (bufs) in this slab */ + uint32_t slab_stuck_offset; /* unmoved buffer offset */ + uint16_t slab_later_count; /* cf KMEM_CBRC_LATER */ + uint16_t slab_flags; /* bits to mark the slab */ + hrtime_t slab_create_time; /* when was slab created? */ +} kmem_slab_t; + +#define KMEM_HASH_INITIAL 64 + +#define KMEM_HASH(cp, buf) \ + ((cp)->cache_hash_table + \ + (((uintptr_t)(buf) >> (cp)->cache_hash_shift) & (cp)->cache_hash_mask)) + +#define KMEM_CACHE_NAMELEN 31 + +typedef struct kmem_magazine { + void *mag_next; + void *mag_round[1]; /* one or more rounds */ +} kmem_magazine_t; + +/* + * The magazine types for fast per-cpu allocation + */ +typedef struct kmem_magtype { + short mt_magsize; /* magazine size (number of rounds) */ + int mt_align; /* magazine alignment */ + size_t mt_minbuf; /* all smaller buffers qualify */ + size_t mt_maxbuf; /* no larger buffers qualify */ + kmem_cache_t *mt_cache; /* magazine cache */ +} kmem_magtype_t; + +#define KMEM_CPU_CACHE_SIZE 128 /* must be power of 2 */ +#define KMEM_CPU_PAD (KMEM_CPU_CACHE_SIZE - sizeof (kmutex_t) - \ + 2 * sizeof (uint64_t) - 2 * sizeof (void *) - sizeof (int) - \ + 5 * sizeof (short)) +#define KMEM_CACHE_SIZE(ncpus) \ + __builtin_offsetof(kmem_cache_t, cache_cpu[ncpus]) + + /* Offset from kmem_cache->cache_cpu for per cpu caches */ +#define KMEM_CPU_CACHE_OFFSET(cpuid) \ + __builtin_offsetof(kmem_cache_t, cache_cpu[cpuid]) - \ + __builtin_offsetof(kmem_cache_t, cache_cpu) + +/* + * Per CPU cache data + */ +typedef struct kmem_cpu_cache { + kmutex_t cc_lock; /* protects this cpu's local cache */ + uint64_t cc_alloc; /* allocations from this cpu */ + uint64_t cc_free; /* frees to this cpu */ + kmem_magazine_t *cc_loaded; /* the currently loaded magazine */ + kmem_magazine_t *cc_ploaded; /* the previously loaded magazine */ + int cc_flags; /* CPU-local copy of cache_flags */ + short cc_rounds; /* number of objects in loaded mag */ + short cc_prounds; /* number of objects in previous mag */ + short cc_magsize; /* number of rounds in a full mag */ + short cc_dump_rounds; /* dump time copy of cc_rounds */ + short cc_dump_prounds; /* dump time copy of cc_prounds */ + char cc_pad[KMEM_CPU_PAD]; /* for nice alignment */ +} kmem_cpu_cache_t; + +/* + * The magazine lists used in the depot. 
+ */ +typedef struct kmem_maglist { + kmem_magazine_t *ml_list; /* magazine list */ + long ml_total; /* number of magazines */ + long ml_min; /* min since last update */ + long ml_reaplimit; /* max reapable magazines */ + uint64_t ml_alloc; /* allocations from this list */ +} kmem_maglist_t; + +typedef struct kmem_defrag { + /* + * Statistics + */ + uint64_t kmd_callbacks; /* move callbacks */ + uint64_t kmd_yes; /* KMEM_CBRC_YES responses */ + uint64_t kmd_no; /* NO responses */ + uint64_t kmd_later; /* LATER responses */ + uint64_t kmd_dont_need; /* DONT_NEED responses */ + uint64_t kmd_dont_know; /* DONT_KNOW responses */ + uint64_t kmd_slabs_freed; /* slabs freed by moves */ + uint64_t kmd_defrags; /* kmem_cache_defrag() */ + uint64_t kmd_scans; /* kmem_cache_scan() */ + + /* + * Consolidator fields + */ + avl_tree_t kmd_moves_pending; /* buffer moves pending */ + list_t kmd_deadlist; /* deferred slab frees */ + size_t kmd_deadcount; /* # of slabs in kmd_deadlist */ + uint8_t kmd_reclaim_numer; /* slab usage threshold */ + uint8_t kmd_pad1; /* compiler padding */ + uint16_t kmd_consolidate; /* triggers consolidator */ + uint32_t kmd_pad2; /* compiler padding */ + size_t kmd_slabs_sought; /* reclaimable slabs sought */ + size_t kmd_slabs_found; /* reclaimable slabs found */ + size_t kmd_tries; /* nth scan interval counter */ + /* + * Fields used to ASSERT that the client does not kmem_cache_free() + * objects passed to the move callback. + */ + void *kmd_from_buf; /* object to move */ + void *kmd_to_buf; /* move destination */ + kthread_t *kmd_thread; /* thread calling move */ +} kmem_defrag_t; + +/* + * Cache callback function types + */ +typedef int (*constructor_fn_t)(void*, void*, int); +typedef void (*destructor_fn_t)(void*, void*); +typedef void (*reclaim_fn_t)(void*); + +/* + * Cache + */ +struct kmem_cache { + +/* + * Statistics + */ + uint64_t cache_slab_create; /* slab creates */ + uint64_t cache_slab_destroy; /* slab destroys */ + uint64_t cache_slab_alloc; /* slab layer allocations */ + uint64_t cache_slab_free; /* slab layer frees */ + uint64_t cache_alloc_fail; /* total failed allocations */ + uint64_t cache_buftotal; /* total buffers */ + uint64_t cache_bufmax; /* max buffers ever */ + uint64_t cache_bufslab; /* buffers free in slab layer */ + uint64_t cache_reap; /* cache reaps */ + uint64_t cache_rescale; /* hash table rescales */ + uint64_t cache_lookup_depth; /* hash lookup depth */ + uint64_t cache_depot_contention; /* mutex contention count */ + uint64_t cache_depot_contention_prev; /* previous snapshot */ + uint64_t cache_alloc_count; /* Number of allocations in cache */ + /* successful calls with KM_NO_VBA flag set */ + uint64_t no_vba_success; + uint64_t no_vba_fail; + /* number of times we set arc growth suppression time */ + uint64_t arc_no_grow_set; + /* number of times spl_zio_is_suppressed returned true for this cache */ + uint64_t arc_no_grow; + + /* + * Cache properties + */ + char cache_name[KMEM_CACHE_NAMELEN + 1]; + size_t cache_bufsize; /* object size */ + size_t cache_align; /* object alignment */ + int (*cache_constructor)(void *, void *, int); + void (*cache_destructor)(void *, void *); + void (*cache_reclaim)(void *); + kmem_cbrc_t (*cache_move)(void *, void *, size_t, void *); + void *cache_private; /* opaque arg to callbacks */ + vmem_t *cache_arena; /* vmem source for slabs */ + int cache_cflags; /* cache creation flags */ + int cache_flags; /* various cache state info */ + uint32_t cache_mtbf; /* induced alloc failure rate */ + uint32_t 
cache_pad1; /* compiler padding */ + kstat_t *cache_kstat; /* exported statistics */ + list_node_t cache_link; /* cache linkage */ + + /* + * Slab layer + */ + kmutex_t cache_lock; /* protects slab layer */ + + size_t cache_chunksize; /* buf + alignment [+ debug] */ + size_t cache_slabsize; /* size of a slab */ + size_t cache_maxchunks; /* max buffers per slab */ + size_t cache_bufctl; /* buf-to-bufctl distance */ + size_t cache_buftag; /* buf-to-buftag distance */ + size_t cache_verify; /* bytes to verify */ + size_t cache_contents; /* bytes of saved content */ + size_t cache_color; /* next slab color */ + size_t cache_mincolor; /* maximum slab color */ + size_t cache_maxcolor; /* maximum slab color */ + size_t cache_hash_shift; /* get to interesting bits */ + size_t cache_hash_mask; /* hash table mask */ + list_t cache_complete_slabs; /* completely allocated slabs */ + size_t cache_complete_slab_count; + avl_tree_t cache_partial_slabs; /* partial slab freelist */ + size_t cache_partial_binshift; /* for AVL sort bins */ + kmem_cache_t *cache_bufctl_cache; /* source of bufctls */ + kmem_bufctl_t **cache_hash_table; /* hash table base */ + kmem_defrag_t *cache_defrag; /* slab consolidator fields */ + + /* + * Depot layer + */ + kmutex_t cache_depot_lock; /* protects depot */ + kmem_magtype_t *cache_magtype; /* magazine type */ + kmem_maglist_t cache_full; /* full magazines */ + kmem_maglist_t cache_empty; /* empty magazines */ + void *cache_dumpfreelist; /* heap during crash dump */ + void *cache_dumplog; /* log entry during dump */ + + /* + * Per CPU structures + */ + // XNU adjust to suit __builtin_offsetof + kmem_cpu_cache_t cache_cpu[1]; /* per-cpu data */ + +}; + +typedef struct kmem_cpu_log_header { + kmutex_t clh_lock; + char *clh_current; + size_t clh_avail; + int clh_chunk; + int clh_hits; +#if defined(SPL_DEBUG_MUTEX) + char clh_pad[128 - sizeof (kmutex_t) - sizeof (char *) - + sizeof (size_t) - 2 * sizeof (int)]; +#else + char clh_pad[64 - sizeof (kmutex_t) - sizeof (char *) - + sizeof (size_t) - 2 * sizeof (int)]; +#endif +} kmem_cpu_log_header_t; + +typedef struct kmem_log_header { + kmutex_t lh_lock; + char *lh_base; + int *lh_free; + size_t lh_chunksize; + int lh_nchunks; + int lh_head; + int lh_tail; + int lh_hits; + kmem_cpu_log_header_t lh_cpu[1]; /* ncpus actually allocated */ +} kmem_log_header_t; + +/* kmem_move kmm_flags */ +#define KMM_DESPERATE 0x1 +#define KMM_NOTIFY 0x2 +#define KMM_DEBUG 0x4 + +typedef struct kmem_move { + kmem_slab_t *kmm_from_slab; + void *kmm_from_buf; + void *kmm_to_buf; + avl_node_t kmm_entry; + int kmm_flags; +} kmem_move_t; + +/* + * In order to consolidate partial slabs, it must be possible for the cache to + * have partial slabs. + */ +#define KMEM_IS_MOVABLE(cp) \ + (((cp)->cache_chunksize * 2) <= (cp)->cache_slabsize) + +#endif diff --git a/include/os/macos/spl/sys/kstat.h b/include/os/macos/spl/sys/kstat.h new file mode 100644 index 0000000000..1c2b62d578 --- /dev/null +++ b/include/os/macos/spl/sys/kstat.h @@ -0,0 +1,217 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H + +#include +#include +#include +#include +#include + +#define KSTAT_STRLEN 255 +#define KSTAT_RAW_MAX (128*1024) + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_TYPE_TXG 5 /* txg sync; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 6 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_UNSUPPORTED (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \ + KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT) +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 +#define KSTAT_FLAG_NO_HEADERS 0x80 + +#define KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +struct seq_file { + char *sf_buf; + size_t sf_size; +}; + +void seq_printf(struct seq_file *m, const char *fmt, ...); + +typedef struct kstat_raw_ops { + int (*headers)(char *buf, size_t size); + int (*seq_headers)(struct seq_file *); + int (*data)(char *buf, size_t size, void *data); + void *(*addr)(struct kstat_s *ksp, loff_t index); +} kstat_raw_ops_t; + +typedef struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + int ks_instance; /* provider module instance */ + char ks_name[KSTAT_STRLEN+1]; /* kstat name */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of type-specific data records */ + size_t ks_data_size; /* size of kstat data section */ + struct proc_dir_entry *ks_proc; /* proc linkage */ + kstat_update_t *ks_update; /* dynamic updates */ + void *ks_private; /* private data */ + void *ks_private1; /* private data */ + kmutex_t ks_private_lock; /* kstat private data lock */ + kmutex_t *ks_lock; /* kstat data lock */ + kstat_raw_ops_t ks_raw_ops; /* 
ops table for raw type */ + char *ks_raw_buf; /* buf used for raw ops */ + size_t ks_raw_bufsize; /* size of raw ops buffer */ +} kstat_t; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int */ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len * time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +void spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern void __kstat_set_seq_raw_ops(kstat_t *ksp, + int (*headers)(struct seq_file *), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, + const char *ks_name, const char *ks_class, + uchar_t ks_type, ulong_t ks_ndata, + uchar_t ks_flags); +extern void __kstat_install(kstat_t *ksp); +extern void __kstat_delete(kstat_t *ksp); + +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) + +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); + +#define kstat_set_seq_raw_ops(k, h, d, a) __kstat_set_seq_raw_ops(k, h, d, a) +#define kstat_set_raw_ops(k, h, d, a) __kstat_set_raw_ops(k, h, d, a) +void kstat_named_setstr(kstat_named_t *knp, const char *src); + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/macos/spl/sys/list.h b/include/os/macos/spl/sys/list.h new file mode 100644 index 
0000000000..c9a72a53a6 --- /dev/null +++ b/include/os/macos/spl/sys/list.h @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _SPL_LIST_H +#define _SPL_LIST_H + +#include +#include + +/* + * NOTE: This is a self-contained macOS implementation of the Solaris list + * API. The internals differ slightly from Solaris, which is not a problem + * as long as all callers stick to the published API. The one notable + * convention is that a list_node_t which is not attached to a list is + * denoted by having its list_next/list_prev pointers set to LIST_POISON1 + * and LIST_POISON2, both of which are defined as NULL on this platform. + * At this moment this only impacts the implementation + * of the list_link_init() and list_link_active() functions.
+ */ + +typedef struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +} list_node_t; + + + +typedef struct list { + size_t list_size; + size_t list_offset; + list_node_t list_head; +} list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); + +int list_link_active(list_node_t *); +int list_is_empty(list_t *); + +#define LIST_POISON1 NULL +#define LIST_POISON2 NULL + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + + +static inline void +list_link_init(list_node_t *node) +{ + node->list_next = LIST_POISON1; + node->list_prev = LIST_POISON2; +} + +static inline void +__list_del(list_node_t *prev, list_node_t *next) +{ + next->list_prev = prev; + prev->list_next = next; +} + +static inline void list_del(list_node_t *entry) +{ + __list_del(entry->list_prev, entry->list_next); + entry->list_next = LIST_POISON1; + entry->list_prev = LIST_POISON2; +} + +static inline void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + + list_del(head); + return (list_object(list, head)); +} + +static inline void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + + list_del(tail); + return (list_object(list, tail)); +} + +static inline void +list_link_replace(list_node_t *old_node, list_node_t *new_node) +{ + ASSERT(list_link_active(old_node)); + ASSERT(!list_link_active(new_node)); + + new_node->list_next = old_node->list_next; + new_node->list_prev = old_node->list_prev; + old_node->list_prev->list_next = new_node; + old_node->list_next->list_prev = new_node; + list_link_init(old_node); +} + +#endif /* SPL_LIST_H */ diff --git a/include/os/macos/spl/sys/mod_os.h b/include/os/macos/spl/sys/mod_os.h new file mode 100644 index 0000000000..7c974c775c --- /dev/null +++ b/include/os/macos/spl/sys/mod_os.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
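As a usage sketch for the list API in list.h above (item_t and all names are hypothetical; offsetof() tells the list where the linkage lives inside each element):

    #include <sys/list.h>
    #include <sys/kmem.h>

    typedef struct item {
            uint64_t        i_value;
            list_node_t     i_node;
    } item_t;

    static void
    item_list_example(void)
    {
            list_t lst;
            item_t *ip;

            list_create(&lst, sizeof (item_t), offsetof(item_t, i_node));

            ip = kmem_zalloc(sizeof (item_t), KM_SLEEP);
            ip->i_value = 42;
            list_insert_tail(&lst, ip);

            /* Drain the list, freeing each element as it is removed. */
            while ((ip = list_remove_head(&lst)) != NULL)
                    kmem_free(ip, sizeof (item_t));

            list_destroy(&lst);
    }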
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SPL_MOD_H +#define _SPL_MOD_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define MODULE_INIT(s) +#define MODULE_AUTHOR(s) +#define MODULE_LICENSE(s) +#define MODULE_VERSION(s) +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) + +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ + getfunc, perm, desc) + +#define __init __attribute__((unused)) +#define __exit __attribute__((unused)) + +/* + * The init/fini functions need to be called, but they are all static + */ +#define module_init(fn) \ + int wrap_ ## fn(void) \ + { \ + return (fn()); \ + } + +#define module_exit(fn) \ + void wrap_ ## fn(void) \ + { \ + fn(); \ + } + +#define ZFS_MODULE_PARAM_ARGS void + +#define ZFS_MODULE_PARAM(A, B, C, D, E, F) +#define module_param_call(a, b, c, d, e) +#define module_param_named(a, b, c, d) + +kern_return_t spl_start(kmod_info_t *ki, void *d); +kern_return_t spl_stop(kmod_info_t *ki, void *d); + +struct zfs_kernel_param_s; +typedef struct zfs_kernel_param_s zfs_kernel_param_t; + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* SPL_MOD_H */ diff --git a/include/os/macos/spl/sys/mutex.h b/include/os/macos/spl/sys/mutex.h new file mode 100644 index 0000000000..11a4a89da6 --- /dev/null +++ b/include/os/macos/spl/sys/mutex.h @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * OSX mutex functions + * + * Jorgen Lundman + * + */ + +#ifndef OSX_MUTEX_H +#define OSX_MUTEX_H + +#include + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + MUTEX_ADAPTIVE = 0, /* spin if owner is running, otherwise block */ + MUTEX_SPIN = 1, /* block interrupts and spin */ + MUTEX_DRIVER = 4, /* driver (DDI) mutex */ + MUTEX_DEFAULT = 6 /* kernel default mutex */ +} kmutex_type_t; + +#define MUTEX_NOLOCKDEP 0 + +/* + * Alas, lck_mtx_t is opaque and not available at compile time, and we + * really want to embed it. Luckily, the mutex size has not changed in + * many versions of OSX. We should probably do a startup check of + * the size, though. + */ +typedef struct { + uint32_t opaque[4]; +} wrapper_mutex_t; + +/* + * To enable the watchdog that keeps an eye on mutexes held for too long, + * define this debug variable.
+ */ + +#define SPL_DEBUG_MUTEX + +#ifdef SPL_DEBUG_MUTEX +#define SPL_MUTEX_WATCHDOG_SLEEP 10 /* How long to sleep between checking */ +#define SPL_MUTEX_WATCHDOG_TIMEOUT 60 /* When is a mutex held too long? */ +#endif + +/* + * Solaris kmutex defined. + * + * and is embedded into ZFS structures (see dbuf) so we need to match the + * size carefully. It appears to be 32 bytes. Or rather, it needs to be + * aligned. + */ + +typedef struct kmutex { + void *m_owner; + wrapper_mutex_t m_lock; + +#ifdef SPL_DEBUG_MUTEX + void *leak; + uint64_t m_initialised; +#define MUTEX_INIT 0x123456789abcdef0ULL +#define MUTEX_DESTROYED 0xaabbccddaabbccddULL +#endif + +} kmutex_t; + +#include + +#define MUTEX_HELD(x) (mutex_owned(x)) +#define MUTEX_NOT_HELD(x) (!mutex_owned(x)) + +/* + * On OS X, CoreStorage provides these symbols, so we have to redefine them, + * preferably without having to modify SPL users. + */ +#ifdef SPL_DEBUG_MUTEX + +#define mutex_init(A, B, C, D) \ + spl_mutex_init(A, B, C, D, __FILE__, __FUNCTION__, __LINE__) +void spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, + void *ibc, const char *f, const char *fn, int l); + +#else + +#define mutex_init spl_mutex_init +void spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc); + +#endif + +#ifdef SPL_DEBUG_MUTEX +#define mutex_enter(X) spl_mutex_enter((X), __FILE__, __LINE__) +void spl_mutex_enter(kmutex_t *mp, char *file, int line); +#else +#define mutex_enter spl_mutex_enter +void spl_mutex_enter(kmutex_t *mp); +#endif + +#define mutex_enter_nested(A, B) mutex_enter(A) + +#define mutex_destroy spl_mutex_destroy +#define mutex_exit spl_mutex_exit +#define mutex_tryenter spl_mutex_tryenter +#define mutex_owned spl_mutex_owned +#define mutex_owner spl_mutex_owner + +void spl_mutex_destroy(kmutex_t *mp); +void spl_mutex_exit(kmutex_t *mp); +int spl_mutex_tryenter(kmutex_t *mp); +int spl_mutex_owned(kmutex_t *mp); + +struct thread *spl_mutex_owner(kmutex_t *mp); + +int spl_mutex_subsystem_init(void); +void spl_mutex_subsystem_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/os/macos/spl/sys/param.h b/include/os/macos/spl/sys/param.h new file mode 100644 index 0000000000..2819402be9 --- /dev/null +++ b/include/os/macos/spl/sys/param.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
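A quick usage sketch of the kmutex API from mutex.h above (hypothetical names; note that with SPL_DEBUG_MUTEX defined, mutex_init() and mutex_enter() also record file and line information for the watchdog):

    #include <sys/mutex.h>

    static kmutex_t demo_lock;
    static uint64_t demo_counter;

    static void
    demo_init(void)
    {
            mutex_init(&demo_lock, NULL, MUTEX_DEFAULT, NULL);
    }

    static void
    demo_increment(void)
    {
            mutex_enter(&demo_lock);
            /* MUTEX_HELD(&demo_lock) is true here. */
            demo_counter++;
            mutex_exit(&demo_lock);
    }

    static void
    demo_fini(void)
    {
            mutex_destroy(&demo_lock);
    }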
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_PARAM_H +#define _SPL_PARAM_H + +#include_next +#include + +/* Pages to bytes and back */ +#define ptob(pages) (pages << PAGE_SHIFT) +#define btop(bytes) (bytes >> PAGE_SHIFT) + +#define MAXUID UINT32_MAX + +#endif /* SPL_PARAM_H */ diff --git a/include/os/macos/spl/sys/policy.h b/include/os/macos/spl/sys/policy.h new file mode 100644 index 0000000000..b8953209db --- /dev/null +++ b/include/os/macos/spl/sys/policy.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_POLICY_H +#define _SPL_POLICY_H + +#ifdef _KERNEL + +#include +#include + +struct vattr; + +int secpolicy_fs_unmount(cred_t *, struct mount *); +int secpolicy_nfs(const cred_t *); +int secpolicy_sys_config(const cred_t *, boolean_t); +int secpolicy_zfs(const cred_t *); +int secpolicy_zinject(const cred_t *); + +/* + * This function to be called from xxfs_setattr(). + * Must be called with the node's attributes read-write locked. + * + * cred_t * - acting credentials + * struct vnode * - vnode we're operating on + * struct vattr *va - new attributes, va_mask may be + * changed on return from a call + * struct vattr *oldva - old attributes, need include owner + * and mode only + * int flags - setattr flags + * int iaccess(void *node, int mode, cred_t *cr) + * - non-locking internal access function + * mode be checked + * w/ VREAD|VWRITE|VEXEC, not fs + * internal mode encoding. 
+ * + * void *node - internal node (inode, tmpnode) to + * pass as arg to iaccess + */ +int secpolicy_vnode_setattr(cred_t *, struct vnode *, vattr_t *, + const vattr_t *, int, int (void *, int, cred_t *), void *); + +int secpolicy_vnode_stky_modify(const cred_t *); +int secpolicy_setid_setsticky_clear(struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, cred_t *cr); + +int secpolicy_vnode_remove(struct vnode *, const cred_t *); +int secpolicy_vnode_create_gid(const cred_t *); +int secpolicy_vnode_setids_setgids(struct vnode *, const cred_t *, gid_t); +int secpolicy_vnode_setdac(struct vnode *, const cred_t *, uid_t); +int secpolicy_vnode_chown(struct vnode *, const cred_t *, uid_t); +int secpolicy_vnode_setid_retain(const cred_t *, boolean_t); +int secpolicy_xvattr(vattr_t *, uid_t, const cred_t *, mode_t); +int secpolicy_setid_clear(vattr_t *, const cred_t *); +int secpolicy_basic_link(const cred_t *); +int secpolicy_fs_mount_clearopts(const cred_t *, struct mount *); +int secpolicy_fs_mount(const cred_t *, struct vnode *, struct mount *); + +#endif /* _KERNEL */ + +#endif /* SPL_POLICY_H */ diff --git a/include/os/macos/spl/sys/priv.h b/include/os/macos/spl/sys/priv.h new file mode 100644 index 0000000000..a8da8f101e --- /dev/null +++ b/include/os/macos/spl/sys/priv.h @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2006 nCircle Network Security, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Privilege checking interface for BSD kernel. + */ +#ifndef _SPL_PRIV_H +#define _SPL_PRIV_H + +/* + * Privilege list, sorted loosely by kernel subsystem. + * + * Think carefully before adding or reusing one of these privileges -- are + * there existing instances referring to the same privilege? Third party + * vendors may request the assignment of privileges to be used in loadable + * modules. Particular numeric privilege assignments are part of the + * loadable kernel module ABI, and should not be changed across minor + * releases. 
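The comment above spells out the iaccess() callback contract for secpolicy_vnode_setattr(): a non-locking, filesystem-internal permission check driven by VREAD|VWRITE|VEXEC. A sketch of how a filesystem's setattr path might wire it up; the myfs_ names are hypothetical and error handling is elided:

#include <sys/policy.h>

/*
 * Hypothetical non-locking access callback matching the documented
 * signature: 'node' is the filesystem's internal node, 'mode' is a
 * VREAD|VWRITE|VEXEC mask, return 0 to allow access.
 */
static int
myfs_iaccess(void *node, int mode, cred_t *cr)
{
	(void) node;
	(void) mode;
	(void) cr;
	return (0);
}

static int
myfs_setattr(struct vnode *vp, void *node, vattr_t *vap,
    const vattr_t *oldva, int flags, cred_t *cr)
{
	/*
	 * va_mask may be trimmed on return, so the caller applies only
	 * whatever attribute bits remain after the policy check.
	 */
	return (secpolicy_vnode_setattr(cr, vp, vap, oldva, flags,
	    myfs_iaccess, node));
}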
+ * + * When adding a new privilege, remember to determine if it's appropriate for + * use in jail, and update the privilege switch in kern_jail.c as necessary. + */ + +/* + * Track beginning of privilege list. + */ +#define _PRIV_LOWEST 1 + +/* + * The remaining privileges typically correspond to one or a small + * number of specific privilege checks, and have (relatively) precise + * meanings. They are loosely sorted into a set of base system + * privileges, such as the ability to reboot, and then loosely by + * subsystem, indicated by a subsystem name. + */ +#define _PRIV_ROOT 1 /* Removed. */ +#define PRIV_ACCT 2 /* Manage process accounting. */ +#define PRIV_MAXFILES 3 /* Exceed system open files limit. */ +#define PRIV_MAXPROC 4 /* Exceed system processes limit. */ +#define PRIV_KTRACE 5 /* Set/clear KTRFAC_ROOT on ktrace. */ +#define PRIV_SETDUMPER 6 /* Configure dump device. */ +#define PRIV_REBOOT 8 /* Can reboot system. */ +#define PRIV_SWAPON 9 /* Can swapon(). */ +#define PRIV_SWAPOFF 10 /* Can swapoff(). */ +#define PRIV_MSGBUF 11 /* Can read kernel message buffer. */ +#define PRIV_IO 12 /* Can perform low-level I/O. */ +#define PRIV_KEYBOARD 13 /* Reprogram keyboard. */ +#define PRIV_DRIVER 14 /* Low-level driver privilege. */ +#define PRIV_ADJTIME 15 /* Set time adjustment. */ +#define PRIV_NTP_ADJTIME 16 /* Set NTP time adjustment. */ +#define PRIV_CLOCK_SETTIME 17 /* Can call clock_settime. */ +#define PRIV_SETTIMEOFDAY 18 /* Can call settimeofday. */ +#define _PRIV_SETHOSTID 19 /* Removed. */ +#define _PRIV_SETDOMAINNAME 20 /* Removed. */ + +/* + * Audit subsystem privileges. + */ +#define PRIV_AUDIT_CONTROL 40 /* Can configure audit. */ +#define PRIV_AUDIT_FAILSTOP 41 /* Can run during audit fail stop. */ +#define PRIV_AUDIT_GETAUDIT 42 /* Can get proc audit properties. */ +#define PRIV_AUDIT_SETAUDIT 43 /* Can set proc audit properties. */ +#define PRIV_AUDIT_SUBMIT 44 /* Can submit an audit record. */ + +/* + * Credential management privileges. + */ +#define PRIV_CRED_SETUID 50 /* setuid. */ +#define PRIV_CRED_SETEUID 51 /* seteuid to !ruid and !svuid. */ +#define PRIV_CRED_SETGID 52 /* setgid. */ +#define PRIV_CRED_SETEGID 53 /* setgid to !rgid and !svgid. */ +#define PRIV_CRED_SETGROUPS 54 /* Set process additional groups. */ +#define PRIV_CRED_SETREUID 55 /* setreuid. */ +#define PRIV_CRED_SETREGID 56 /* setregid. */ +#define PRIV_CRED_SETRESUID 57 /* setresuid. */ +#define PRIV_CRED_SETRESGID 58 /* setresgid. */ +#define PRIV_SEEOTHERGIDS 59 /* Exempt bsd.seeothergids. */ +#define PRIV_SEEOTHERUIDS 60 /* Exempt bsd.seeotheruids. */ + +/* + * Debugging privileges. + */ +#define PRIV_DEBUG_DIFFCRED 80 /* Exempt debugging other users. */ +#define PRIV_DEBUG_SUGID 81 /* Exempt debugging setuid proc. */ +#define PRIV_DEBUG_UNPRIV 82 /* Exempt unprivileged debug limit. */ +#define PRIV_DEBUG_DENIED 83 /* Exempt P2_NOTRACE. */ + +/* + * Dtrace privileges. + */ +#define PRIV_DTRACE_KERNEL 90 /* Allow use of DTrace on the kernel. */ +#define PRIV_DTRACE_PROC 91 /* Allow attaching DTrace to process. */ +#define PRIV_DTRACE_USER 92 /* Process may submit DTrace events. */ + +/* + * Firmware privilegs. + */ +#define PRIV_FIRMWARE_LOAD 100 /* Can load firmware. */ + +/* + * Jail privileges. + */ +#define PRIV_JAIL_ATTACH 110 /* Attach to a jail. */ +#define PRIV_JAIL_SET 111 /* Set jail parameters. */ +#define PRIV_JAIL_REMOVE 112 /* Remove a jail. */ + +/* + * Kernel environment priveleges. + */ +#define PRIV_KENV_SET 120 /* Set kernel env. variables. 
*/ +#define PRIV_KENV_UNSET 121 /* Unset kernel env. variables. */ + +/* + * Loadable kernel module privileges. + */ +#define PRIV_KLD_LOAD 130 /* Load a kernel module. */ +#define PRIV_KLD_UNLOAD 131 /* Unload a kernel module. */ + +/* + * Privileges associated with the MAC Framework and specific MAC policy + * modules. + */ +#define PRIV_MAC_PARTITION 140 /* Privilege in mac_partition policy. */ +#define PRIV_MAC_PRIVS 141 /* Privilege in the mac_privs policy. */ + +/* + * Process-related privileges. + */ +#define PRIV_PROC_LIMIT 160 /* Exceed user process limit. */ +#define PRIV_PROC_SETLOGIN 161 /* Can call setlogin. */ +#define PRIV_PROC_SETRLIMIT 162 /* Can raise resources limits. */ +#define PRIV_PROC_SETLOGINCLASS 163 /* Can call setloginclass(2). */ + +/* + * System V IPC privileges. + */ +#define PRIV_IPC_READ 170 /* Can override IPC read perm. */ +#define PRIV_IPC_WRITE 171 /* Can override IPC write perm. */ +#define PRIV_IPC_ADMIN 172 /* Can override IPC owner-only perm. */ +#define PRIV_IPC_MSGSIZE 173 /* Exempt IPC message queue limit. */ + +/* + * POSIX message queue privileges. + */ +#define PRIV_MQ_ADMIN 180 /* Can override msgq owner-only perm. */ + +/* + * Performance monitoring counter privileges. + */ +#define PRIV_PMC_MANAGE 190 /* Can administer PMC. */ +#define PRIV_PMC_SYSTEM 191 /* Can allocate a system-wide PMC. */ + +/* + * Scheduling privileges. + */ +#define PRIV_SCHED_DIFFCRED 200 /* Exempt scheduling other users. */ +#define PRIV_SCHED_SETPRIORITY 201 /* Can set lower nice value for proc. */ +#define PRIV_SCHED_RTPRIO 202 /* Can set real time scheduling. */ +#define PRIV_SCHED_SETPOLICY 203 /* Can set scheduler policy. */ +#define PRIV_SCHED_SET 204 /* Can set thread scheduler. */ +#define PRIV_SCHED_SETPARAM 205 /* Can set thread scheduler params. */ +#define PRIV_SCHED_CPUSET 206 /* Can manipulate cpusets. */ +#define PRIV_SCHED_CPUSET_INTR 207 /* Can adjust IRQ to CPU binding. */ + +/* + * POSIX semaphore privileges. + */ +#define PRIV_SEM_WRITE 220 /* Can override sem write perm. */ + +/* + * Signal privileges. + */ +#define PRIV_SIGNAL_DIFFCRED 230 /* Exempt signalling other users. */ +#define PRIV_SIGNAL_SUGID 231 /* Non-conserv signal setuid proc. */ + +/* + * Sysctl privileges. + */ +#define PRIV_SYSCTL_DEBUG 240 /* Can invoke sysctl.debug. */ +#define PRIV_SYSCTL_WRITE 241 /* Can write sysctls. */ +#define PRIV_SYSCTL_WRITEJAIL 242 /* Can write sysctls, jail permitted. */ + +/* + * TTY privileges. + */ +#define PRIV_TTY_CONSOLE 250 /* Set console to tty. */ +#define PRIV_TTY_DRAINWAIT 251 /* Set tty drain wait time. */ +#define PRIV_TTY_DTRWAIT 252 /* Set DTR wait on tty. */ +#define PRIV_TTY_EXCLUSIVE 253 /* Override tty exclusive flag. */ +#define _PRIV_TTY_PRISON 254 /* Removed. */ +#define PRIV_TTY_STI 255 /* Simulate input on another tty. */ +#define PRIV_TTY_SETA 256 /* Set tty termios structure. */ + +/* + * UFS-specific privileges. + */ +#define PRIV_UFS_EXTATTRCTL 270 /* Can configure EAs on UFS1. */ +#define PRIV_UFS_QUOTAOFF 271 /* quotaoff(). */ +#define PRIV_UFS_QUOTAON 272 /* quotaon(). */ +#define PRIV_UFS_SETUSE 273 /* setuse(). */ + +/* + * ZFS-specific privileges. + */ +#define PRIV_ZFS_POOL_CONFIG 280 /* Can configure ZFS pools. */ + +/* Can inject faults in the ZFS fault injection framework. */ +#define PRIV_ZFS_INJECT 281 + +/* Can attach/detach ZFS file systems to/from jails. */ +#define PRIV_ZFS_JAIL 282 + +/* + * NFS-specific privileges. + */ +#define PRIV_NFS_DAEMON 290 /* Can become the NFS daemon. 
*/ +#define PRIV_NFS_LOCKD 291 /* Can become NFS lock daemon. */ + +/* + * VFS privileges. + */ +#define PRIV_VFS_READ 310 /* Override vnode DAC read perm. */ +#define PRIV_VFS_WRITE 311 /* Override vnode DAC write perm. */ +#define PRIV_VFS_ADMIN 312 /* Override vnode DAC admin perm. */ +#define PRIV_VFS_EXEC 313 /* Override vnode DAC exec perm. */ +#define PRIV_VFS_LOOKUP 314 /* Override vnode DAC lookup perm. */ +#define PRIV_VFS_BLOCKRESERVE 315 /* Can use free block reserve. */ +#define PRIV_VFS_CHFLAGS_DEV 316 /* Can chflags() a device node. */ +#define PRIV_VFS_CHOWN 317 /* Can set user; group to non-member. */ +#define PRIV_VFS_CHROOT 318 /* chroot(). */ +#define PRIV_VFS_RETAINSUGID 319 /* Can retain sugid bits on change. */ +#define PRIV_VFS_EXCEEDQUOTA 320 /* Exempt from quota restrictions. */ +#define PRIV_VFS_EXTATTR_SYSTEM 321 /* Operate on system EA namespace. */ +#define PRIV_VFS_FCHROOT 322 /* fchroot(). */ +#define PRIV_VFS_FHOPEN 323 /* Can fhopen(). */ +#define PRIV_VFS_FHSTAT 324 /* Can fhstat(). */ +#define PRIV_VFS_FHSTATFS 325 /* Can fhstatfs(). */ +#define PRIV_VFS_GENERATION 326 /* stat() returns generation number. */ +#define PRIV_VFS_GETFH 327 /* Can retrieve file handles. */ +#define PRIV_VFS_GETQUOTA 328 /* getquota(). */ +#define PRIV_VFS_LINK 329 /* bsd.hardlink_check_uid */ +#define PRIV_VFS_MKNOD_BAD 330 /* Can mknod() to mark bad inodes. */ +#define PRIV_VFS_MKNOD_DEV 331 /* Can mknod() to create dev nodes. */ +#define PRIV_VFS_MKNOD_WHT 332 /* Can mknod() to create whiteout. */ +#define PRIV_VFS_MOUNT 333 /* Can mount(). */ +#define PRIV_VFS_MOUNT_OWNER 334 /* Can manage other users' fsystems. */ +#define PRIV_VFS_MOUNT_EXPORTED 335 /* Can set MNT_EXPORTED on mount. */ +#define PRIV_VFS_MOUNT_PERM 336 /* Override dev node perms at mount. */ +#define PRIV_VFS_MOUNT_SUIDDIR 337 /* Can set MNT_SUIDDIR on mount. */ +#define PRIV_VFS_MOUNT_NONUSER 338 /* Can perform a non-user mount. */ +#define PRIV_VFS_SETGID 339 /* Can setgid if not in group. */ +#define PRIV_VFS_SETQUOTA 340 /* setquota(). */ +#define PRIV_VFS_STICKYFILE 341 /* Can set sticky bit on file. */ +#define PRIV_VFS_SYSFLAGS 342 /* Can modify system flags. */ +#define PRIV_VFS_UNMOUNT 343 /* Can unmount(). */ +#define PRIV_VFS_STAT 344 /* Override vnode MAC stat perm. */ + +/* + * Virtual memory privileges. + */ +#define PRIV_VM_MADV_PROTECT 360 /* Can set MADV_PROTECT. */ +#define PRIV_VM_MLOCK 361 /* Can mlock(), mlockall(). */ +#define PRIV_VM_MUNLOCK 362 /* Can munlock(), munlockall(). */ +/* Can override the global swap reservation limits. */ +#define PRIV_VM_SWAP_NOQUOTA 363 +/* Can override the per-uid swap reservation limits. */ +#define PRIV_VM_SWAP_NORLIMIT 364 + +/* + * Device file system privileges. + */ +#define PRIV_DEVFS_RULE 370 /* Can manage devfs rules. */ +#define PRIV_DEVFS_SYMLINK 371 /* Can create symlinks in devfs. */ + +/* + * Random number generator privileges. + */ +#define PRIV_RANDOM_RESEED 380 /* Closing /dev/random reseeds. */ + +/* + * Network stack privileges. + */ +#define PRIV_NET_BRIDGE 390 /* Administer bridge. */ +#define PRIV_NET_GRE 391 /* Administer GRE. */ +#define _PRIV_NET_PPP 392 /* Removed. */ +#define _PRIV_NET_SLIP 393 /* Removed. */ +#define PRIV_NET_BPF 394 /* Monitor BPF. */ +#define PRIV_NET_RAW 395 /* Open raw socket. */ +#define PRIV_NET_ROUTE 396 /* Administer routing. */ +#define PRIV_NET_TAP 397 /* Can open tap device. */ +#define PRIV_NET_SETIFMTU 398 /* Set interface MTU. */ +#define PRIV_NET_SETIFFLAGS 399 /* Set interface flags. 
*/ +#define PRIV_NET_SETIFCAP 400 /* Set interface capabilities. */ +#define PRIV_NET_SETIFNAME 401 /* Set interface name. */ +#define PRIV_NET_SETIFMETRIC 402 /* Set interface metrics. */ +#define PRIV_NET_SETIFPHYS 403 /* Set interface physical layer prop. */ +#define PRIV_NET_SETIFMAC 404 /* Set interface MAC label. */ +#define PRIV_NET_ADDMULTI 405 /* Add multicast addr. to ifnet. */ +#define PRIV_NET_DELMULTI 406 /* Delete multicast addr. from ifnet. */ +#define PRIV_NET_HWIOCTL 407 /* Issue hardware ioctl on ifnet. */ +#define PRIV_NET_SETLLADDR 408 /* Set interface link-level address. */ +#define PRIV_NET_ADDIFGROUP 409 /* Add new interface group. */ +#define PRIV_NET_DELIFGROUP 410 /* Delete interface group. */ +#define PRIV_NET_IFCREATE 411 /* Create cloned interface. */ +#define PRIV_NET_IFDESTROY 412 /* Destroy cloned interface. */ +#define PRIV_NET_ADDIFADDR 413 /* Add protocol addr to interface. */ +#define PRIV_NET_DELIFADDR 414 /* Delete protocol addr on interface. */ +#define PRIV_NET_LAGG 415 /* Administer lagg interface. */ +#define PRIV_NET_GIF 416 /* Administer gif interface. */ +#define PRIV_NET_SETIFVNET 417 /* Move interface to vnet. */ +#define PRIV_NET_SETIFDESCR 418 /* Set interface description. */ +#define PRIV_NET_SETIFFIB 419 /* Set interface fib. */ +#define PRIV_NET_VXLAN 420 /* Administer vxlan. */ + +/* + * 802.11-related privileges. + */ +#define PRIV_NET80211_GETKEY 440 /* Query 802.11 keys. */ +#define PRIV_NET80211_MANAGE 441 /* Administer 802.11. */ + +/* + * Placeholder for AppleTalk privileges, not supported anymore. + */ +#define _PRIV_NETATALK_RESERVEDPORT 450 /* Bind low port number. */ + +/* + * ATM privileges. + */ +#define PRIV_NETATM_CFG 460 +#define PRIV_NETATM_ADD 461 +#define PRIV_NETATM_DEL 462 +#define PRIV_NETATM_SET 463 + +/* + * Bluetooth privileges. + */ +#define PRIV_NETBLUETOOTH_RAW 470 /* Open raw bluetooth socket. */ + +/* + * Netgraph and netgraph module privileges. + */ +#define PRIV_NETGRAPH_CONTROL 480 /* Open netgraph control socket. */ +#define PRIV_NETGRAPH_TTY 481 /* Configure tty for netgraph. */ + +/* + * IPv4 and IPv6 privileges. + */ +#define PRIV_NETINET_RESERVEDPORT 490 /* Bind low port number. */ +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ +#define PRIV_NETINET_DIVERT 492 /* Open IP divert socket. */ +#define PRIV_NETINET_PF 493 /* Administer pf firewall. */ +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ +#define PRIV_NETINET_CARP 495 /* Administer CARP. */ +#define PRIV_NETINET_MROUTE 496 /* Administer multicast routing. */ +#define PRIV_NETINET_RAW 497 /* Open netinet raw socket. */ +#define PRIV_NETINET_GETCRED 498 /* Query netinet pcb credentials. */ +#define PRIV_NETINET_ADDRCTRL6 499 /* Administer IPv6 address scopes. */ +#define PRIV_NETINET_ND6 500 /* Administer IPv6 neighbor disc. */ +#define PRIV_NETINET_SCOPE6 501 /* Administer IPv6 address scopes. */ +#define PRIV_NETINET_ALIFETIME6 502 /* Administer IPv6 address lifetimes. */ +#define PRIV_NETINET_IPSEC 503 /* Administer IPSEC. */ +#define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. */ +#define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ +#define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. */ +#define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6. */ + +/* + * Placeholders for IPX/SPX privileges, not supported any more. + */ +#define _PRIV_NETIPX_RESERVEDPORT 520 /* Bind low port number. */ +#define _PRIV_NETIPX_RAW 521 /* Open netipx raw socket. 
*/ + +/* + * NCP privileges. + */ +#define PRIV_NETNCP 530 /* Use another user's connection. */ + +/* + * SMB privileges. + */ +#define PRIV_NETSMB 540 /* Use another user's connection. */ + +/* + * VM86 privileges. + */ +#define PRIV_VM86_INTCALL 550 /* Allow invoking vm86 int handlers. */ + +/* + * Set of reserved privilege values, which will be allocated to code as + * needed, in order to avoid renumbering later privileges due to insertion. + */ +#define _PRIV_RESERVED0 560 +#define _PRIV_RESERVED1 561 +#define _PRIV_RESERVED2 562 +#define _PRIV_RESERVED3 563 +#define _PRIV_RESERVED4 564 +#define _PRIV_RESERVED5 565 +#define _PRIV_RESERVED6 566 +#define _PRIV_RESERVED7 567 +#define _PRIV_RESERVED8 568 +#define _PRIV_RESERVED9 569 +#define _PRIV_RESERVED10 570 +#define _PRIV_RESERVED11 571 +#define _PRIV_RESERVED12 572 +#define _PRIV_RESERVED13 573 +#define _PRIV_RESERVED14 574 +#define _PRIV_RESERVED15 575 + +/* + * Define a set of valid privilege numbers that can be used by loadable + * modules that don't yet have privilege reservations. Ideally, these should + * not be used, since their meaning is opaque to any policies that are aware + * of specific privileges, such as jail, and as such may be arbitrarily + * denied. + */ +#define PRIV_MODULE0 600 +#define PRIV_MODULE1 601 +#define PRIV_MODULE2 602 +#define PRIV_MODULE3 603 +#define PRIV_MODULE4 604 +#define PRIV_MODULE5 605 +#define PRIV_MODULE6 606 +#define PRIV_MODULE7 607 +#define PRIV_MODULE8 608 +#define PRIV_MODULE9 609 +#define PRIV_MODULE10 610 +#define PRIV_MODULE11 611 +#define PRIV_MODULE12 612 +#define PRIV_MODULE13 613 +#define PRIV_MODULE14 614 +#define PRIV_MODULE15 615 + +/* + * DDB(4) privileges. + */ +#define PRIV_DDB_CAPTURE 620 /* Allow reading of DDB capture log. */ + +/* + * Arla/nnpfs privileges. + */ +#define PRIV_NNPFS_DEBUG 630 /* Perforn ARLA_VIOC_NNPFSDEBUG. */ + +/* + * cpuctl(4) privileges. + */ +#define PRIV_CPUCTL_WRMSR 640 /* Write model-specific register. */ +#define PRIV_CPUCTL_UPDATE 641 /* Update cpu microcode. */ + +/* + * Capi4BSD privileges. + */ +#define PRIV_C4B_RESET_CTLR 650 /* Load firmware, reset controller. */ +#define PRIV_C4B_TRACE 651 /* Unrestricted CAPI message tracing. */ + +/* + * OpenAFS privileges. + */ +#define PRIV_AFS_ADMIN 660 /* Can change AFS client settings. */ +#define PRIV_AFS_DAEMON 661 /* Can become the AFS daemon. */ + +/* + * Resource Limits privileges. + */ +#define PRIV_RCTL_GET_RACCT 670 +#define PRIV_RCTL_GET_RULES 671 +#define PRIV_RCTL_GET_LIMITS 672 +#define PRIV_RCTL_ADD_RULE 673 +#define PRIV_RCTL_REMOVE_RULE 674 + +/* + * mem(4) privileges. + */ +#define PRIV_KMEM_READ 680 /* Open mem/kmem for reading. */ +#define PRIV_KMEM_WRITE 681 /* Open mem/kmem for writing. */ + +/* + * Track end of privilege list. + */ +#define _PRIV_HIGHEST 682 + +/* + * Validate that a named privilege is known by the privilege system. Invalid + * privileges presented to the privilege system by a priv_check interface + * will result in a panic. This is only approximate due to sparse allocation + * of the privilege space. + */ +#define PRIV_VALID(x) ((x) > _PRIV_LOWEST && (x) < _PRIV_HIGHEST) + +#ifdef _KERNEL +/* + * Privilege check interfaces, modeled after historic suser() interfaces, but + * with the addition of a specific privilege name. No flags are currently + * defined for the API. Historically, flags specified using the real uid + * instead of the effective uid, and whether or not the check should be + * allowed in jail. 
+ */ +struct thread; +struct ucred; +int priv_check(struct thread *td, int priv); +int priv_check_cred(struct ucred *cred, int priv, int flags); +#endif + +#endif /* _SPL_PRIV_H */ diff --git a/include/os/macos/spl/sys/proc.h b/include/os/macos/spl/sys/proc.h new file mode 100644 index 0000000000..132964d9c1 --- /dev/null +++ b/include/os/macos/spl/sys/proc.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#ifndef _SPL_PROC_H +#define _SPL_PROC_H + +#include +#include +#include_next +#include +#include + +#define proc_t struct proc + +extern proc_t p0; /* process 0 */ + +static inline boolean_t +zfs_proc_is_caller(proc_t *p) +{ + return (p == curproc); +} + +#endif /* SPL_PROC_H */ diff --git a/include/os/macos/spl/sys/processor.h b/include/os/macos/spl/sys/processor.h new file mode 100644 index 0000000000..d077817e94 --- /dev/null +++ b/include/os/macos/spl/sys/processor.h @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_PROCESSOR_H +#define _SPL_PROCESSOR_H + +#include + +extern uint32_t getcpuid(void); + +typedef int processorid_t; + +#endif /* _SPL_PROCESSOR_H */ diff --git a/include/os/macos/spl/sys/procfs_list.h b/include/os/macos/spl/sys/procfs_list.h new file mode 100644 index 0000000000..d77fb72c30 --- /dev/null +++ b/include/os/macos/spl/sys/procfs_list.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
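priv_check_cred() and PRIV_VALID() combine into a simple caller-side pattern. The sketch below is purely illustrative: the helper name and the mapping of the operation onto PRIV_ZFS_POOL_CONFIG are assumptions, not necessarily how this port implements its secpolicy_* functions.

#include <sys/errno.h>
#include <sys/priv.h>

static int
can_configure_pool(struct ucred *cred)
{
	/*
	 * PRIV_VALID() is trivially true for a compile-time constant, but
	 * it is the guard to use for a privilege number computed at run
	 * time, since an invalid value panics inside the priv_check
	 * interfaces.
	 */
	if (!PRIV_VALID(PRIV_ZFS_POOL_CONFIG))
		return (EINVAL);

	/* Returns 0 when the credential holds the privilege. */
	return (priv_check_cred(cred, PRIV_ZFS_POOL_CONFIG, 0));
}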
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_PROCFS_LIST_H +#define _SPL_PROCFS_LIST_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct procfs_list procfs_list_t; +struct procfs_list { + void *pl_private; + void *pl_next_data; + kmutex_t pl_lock; + list_t pl_list; + uint64_t pl_next_id; + int (*pl_show)(struct seq_file *f, void *p); + int (*pl_show_header)(struct seq_file *f); + int (*pl_clear)(procfs_list_t *procfs_list); + size_t pl_node_offset; +}; + +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *submodule, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); +void procfs_list_add(procfs_list_t *procfs_list, void *p); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/os/macos/spl/sys/random.h b/include/os/macos/spl/sys/random.h new file mode 100644 index 0000000000..c69184cc84 --- /dev/null +++ b/include/os/macos/spl/sys/random.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_RANDOM_H +#define _SPL_RANDOM_H + +#include_next + + +static inline int +random_get_bytes(uint8_t *ptr, size_t len) +{ + read_random(ptr, len); + return (0); +} + +static inline int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + read_random(ptr, len); + return (0); +} + +#endif /* _SPL_RANDOM_H */ diff --git a/include/os/macos/spl/sys/rwlock.h b/include/os/macos/spl/sys/rwlock.h new file mode 100644 index 0000000000..aa69db9bd3 --- /dev/null +++ b/include/os/macos/spl/sys/rwlock.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
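procfs_list_t ties a generic kernel list to per-consumer show/show_header/clear callbacks, and pl_node_offset tells the framework where the embedded procfs_list_node_t lives inside each record. A consumer sketch with hypothetical my_stats names; formatting of each entry is left as a stub since it depends on the port's seq_file helpers:

#include <sys/procfs_list.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/sysmacros.h>

/* Hypothetical record type; the framework requires the embedded node. */
typedef struct my_stat_entry {
	procfs_list_node_t	mse_node;
	uint64_t		mse_value;
} my_stat_entry_t;

static procfs_list_t my_stats;

static int
my_stats_show(struct seq_file *f, void *p)
{
	my_stat_entry_t *e = p;

	/* Format one line for 'e' here with the port's seq_file helpers. */
	(void) f;
	(void) e;
	return (0);
}

static int
my_stats_show_header(struct seq_file *f)
{
	(void) f;
	return (0);
}

static int
my_stats_clear(procfs_list_t *pl)
{
	(void) pl;
	return (0);
}

static void
my_stats_init(void)
{
	procfs_list_install("zfs", NULL, "my_stats", 0644, &my_stats,
	    my_stats_show, my_stats_show_header, my_stats_clear,
	    offsetof(my_stat_entry_t, mse_node));
}

static void
my_stats_record(uint64_t v)
{
	my_stat_entry_t *e = kmem_zalloc(sizeof (*e), KM_SLEEP);

	e->mse_value = v;
	mutex_enter(&my_stats.pl_lock);	/* additions go under pl_lock */
	procfs_list_add(&my_stats, e);
	mutex_exit(&my_stats.pl_lock);
}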
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_RWLOCK_H +#define _SPL_RWLOCK_H + +#include +#include + +typedef enum { + RW_DRIVER = 2, + RW_DEFAULT = 4 +} krw_type_t; + +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; + +#define RW_NOLOCKDEP 0 + +struct krwlock { + uint32_t rw_lock[4]; /* opaque lck_rw_t data */ + void *rw_owner; /* writer (exclusive) lock only */ + int rw_readers; /* reader lock only */ + int rw_pad; /* */ +#ifdef SPL_DEBUG_RWLOCK + void *leak; +#endif +}; +typedef struct krwlock krwlock_t; + +#define RW_WRITE_HELD(x) (rw_write_held((x))) +#define RW_LOCK_HELD(x) (rw_lock_held((x))) + +#ifdef SPL_DEBUG_RWLOCK +#define rw_init(A, B, C, D) \ + rw_initx(A, B, C, D, __FILE__, __FUNCTION__, __LINE__) +extern void rw_initx(krwlock_t *, char *, krw_type_t, void *, + const char *, const char *, int); +#else +extern void rw_init(krwlock_t *, char *, krw_type_t, void *); +#endif +extern void rw_destroy(krwlock_t *); +extern void rw_enter(krwlock_t *, krw_t); +extern int rw_tryenter(krwlock_t *, krw_t); +extern void rw_exit(krwlock_t *); +extern void rw_downgrade(krwlock_t *); +extern int rw_tryupgrade(krwlock_t *); +extern int rw_write_held(krwlock_t *); +extern int rw_lock_held(krwlock_t *); +extern int rw_isinit(krwlock_t *); + +int spl_rwlock_init(void); +void spl_rwlock_fini(void); + +#endif /* _SPL_RWLOCK_H */ diff --git a/include/os/macos/spl/sys/seg_kmem.h b/include/os/macos/spl/sys/seg_kmem.h new file mode 100644 index 0000000000..8ef014c129 --- /dev/null +++ b/include/os/macos/spl/sys/seg_kmem.h @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
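rw_tryupgrade() can fail whenever other readers still hold the lock, so the usual pattern is to drop the reader hold and re-enter as writer. A small sketch against the krwlock_t interface above; the my_table structure is hypothetical:

#include <sys/types.h>
#include <sys/rwlock.h>

typedef struct my_table {
	krwlock_t	mt_lock;
	uint64_t	mt_generation;
} my_table_t;

static void
my_table_init(my_table_t *mt)
{
	rw_init(&mt->mt_lock, NULL, RW_DEFAULT, NULL);
}

static uint64_t
my_table_read_gen(my_table_t *mt)
{
	uint64_t gen;

	rw_enter(&mt->mt_lock, RW_READER);
	gen = mt->mt_generation;
	rw_exit(&mt->mt_lock);
	return (gen);
}

static void
my_table_bump_gen(my_table_t *mt)
{
	rw_enter(&mt->mt_lock, RW_READER);
	if (!rw_tryupgrade(&mt->mt_lock)) {
		/* Upgrade can fail; drop and retake as writer. */
		rw_exit(&mt->mt_lock);
		rw_enter(&mt->mt_lock, RW_WRITER);
	}
	mt->mt_generation++;
	rw_exit(&mt->mt_lock);
}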
+ */ + +#ifndef _VM_SEG_KMEM_H +#define _VM_SEG_KMEM_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * VM - Kernel Segment Driver + */ + +#if defined(_KERNEL) + +extern uint64_t segkmem_total_allocated; + +/* qcaching for zio arenas and abd arena */ +extern vmem_t *zio_arena_parent; +/* arena for zio caches for file blocks */ +extern vmem_t *zio_arena; +/* arena for zio caches for (zfs) metadata blocks */ +extern vmem_t *zio_metadata_arena; + +/* + * segkmem page vnodes + */ +#define kvp (kvps[KV_KVP]) +#define zvp (kvps[KV_ZVP]) +#if defined(__sparc) +#define mpvp (kvps[KV_MPVP]) +#define promvp (kvps[KV_PROMVP]) +#endif /* __sparc */ + +void *segkmem_alloc(vmem_t *, size_t, int); +extern void segkmem_free(vmem_t *, void *, size_t); +extern void kernelheap_init(void); +extern void kernelheap_fini(void); + +extern void *segkmem_zio_alloc(vmem_t *, size_t, int); +extern void segkmem_zio_free(vmem_t *, void *, size_t); +extern void segkmem_zio_init(void); +extern void segkmem_zio_fini(void); + +/* + * Flags for segkmem_xalloc(). + * + * SEGKMEM_SHARELOCKED requests pages which are locked SE_SHARED to be + * returned rather than unlocked which is now the default. Note that + * memory returned by SEGKMEM_SHARELOCKED cannot be freed by segkmem_free(). + * This is a hack for seg_dev that should be cleaned up in the future. + */ +#define SEGKMEM_SHARELOCKED 0x20000 + +#define SEGKMEM_USE_LARGEPAGES (segkmem_lpsize > PAGESIZE) + +#define IS_KMEM_VA_LARGEPAGE(vaddr) \ + (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KMEM_H */ diff --git a/include/os/macos/spl/sys/sha2.h b/include/os/macos/spl/sys/sha2.h new file mode 100644 index 0000000000..9039835f18 --- /dev/null +++ b/include/os/macos/spl/sys/sha2.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. 
*/ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#ifdef _KERNEL +#include /* for uint_* */ +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ +#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ + +#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ +#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ +#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ + +/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ +#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ +#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ + +#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ +#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* + * SHA2 context. + * The contents of this structure are a private interface between the + * Init/Update/Final calls of the functions defined below. + * Callers must never attempt to read or write any of the fields + * in this structure directly. + */ +typedef struct { + uint32_t algotype; /* Algorithm Type */ + + /* state (ABCDEFGH) */ + union { + uint32_t s32[8]; /* for SHA256 */ + uint64_t s64[8]; /* for SHA384/512 */ + } state; + /* number of bits */ + union { + uint32_t c32[2]; /* for SHA256 , modulo 2^64 */ + uint64_t c64[2]; /* for SHA384/512, modulo 2^128 */ + } count; + union { + uint8_t buf8[128]; /* undigested input */ + uint32_t buf32[32]; /* realigned input */ + uint64_t buf64[16]; /* realigned input */ + } buf_un; +} SHA2_CTX; + +typedef SHA2_CTX SHA256_CTX; +typedef SHA2_CTX SHA384_CTX; +typedef SHA2_CTX SHA512_CTX; + +extern void SHA2Init(uint64_t mech, SHA2_CTX *); + +extern void SHA2Update(SHA2_CTX *, const void *, size_t); + +extern void SHA2Final(void *, SHA2_CTX *); + +extern void SHA256Init(SHA256_CTX *); + +extern void SHA256Update(SHA256_CTX *, const void *, size_t); + +extern void SHA256Final(void *, SHA256_CTX *); + +extern void SHA384Init(SHA384_CTX *); + +extern void SHA384Update(SHA384_CTX *, const void *, size_t); + +extern void SHA384Final(void *, SHA384_CTX *); + +extern void SHA512Init(SHA512_CTX *); + +extern void SHA512Update(SHA512_CTX *, const void *, size_t); + +extern void SHA512Final(void *, SHA512_CTX *); + +#ifdef _SHA2_IMPL +/* + * The following types/functions are all private to the implementation + * of the SHA2 functions and must not be used by consumers of the interface + */ + +/* + * List of support mechanisms in this module. 
+ * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or additional mechanisms should be done + * carefully + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/os/macos/spl/sys/sid.h b/include/os/macos/spl/sys/sid.h new file mode 100644 index 0000000000..ac8c58b885 --- /dev/null +++ b/include/os/macos/spl/sys/sid.h @@ -0,0 +1,104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
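The SHA2_CTX above is variant-agnostic: SHA2Init() selects the algorithm through the mechanism constants (SHA256, SHA384, SHA512_256, ...), after which the same Update/Final calls apply to any of them. A minimal one-shot digest helper as an illustration:

#include <sys/sha2.h>

static void
my_sha256(const void *buf, size_t len,
    uint8_t digest[SHA256_DIGEST_LENGTH])
{
	SHA2_CTX ctx;

	/* SHA256 here is the mechanism constant, not a digest length. */
	SHA2Init(SHA256, &ctx);
	SHA2Update(&ctx, buf, len);
	SHA2Final(digest, &ctx);
}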
+ */ + + +#ifndef _SPL_SID_H +#define _SPL_SID_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define crgetzoneid(x) (GLOBAL_ZONEID) + +typedef struct ksiddomain { + char *kd_name; +} ksiddomain_t; + +typedef enum ksid_index { + KSID_USER, + KSID_GROUP, + KSID_OWNER, + KSID_COUNT +} ksid_index_t; + +typedef int ksid_t; + +/* Should be in kidmap.h */ +typedef int32_t idmap_stat; + +static inline ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + int len = strlen(dom); + + kd = (ksiddomain_t *)kmem_zalloc(sizeof (ksiddomain_t), KM_SLEEP); + kd->kd_name = (char *)kmem_zalloc(len + 1, KM_SLEEP); + memcpy(kd->kd_name, dom, len); + + return (kd); +} + +static inline void +ksiddomain_rele(ksiddomain_t *ksid) +{ + kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1); + kmem_free(ksid, sizeof (ksiddomain_t)); +} + +#define UID_NOBODY 65534 +#define GID_NOBODY 65534 + +static __inline uint_t +ksid_getid(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return (0); +} + +static __inline const char * +ksid_getdomain(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return (0); +} + +static __inline uint_t +ksid_getrid(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return (0); +} + +#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1) +#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_SID_H */ diff --git a/include/os/macos/spl/sys/signal.h b/include/os/macos/spl/sys/signal.h new file mode 100644 index 0000000000..f4bf1845e9 --- /dev/null +++ b/include/os/macos/spl/sys/signal.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
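ksid_lookupdomain() above always allocates a fresh ksiddomain_t and copies the domain string, so every lookup must be paired with ksiddomain_rele() or the duplicated name leaks. A trivial sketch; the domain string is just an example value:

#include <sys/sid.h>

static void
my_domain_example(void)
{
	ksiddomain_t *kd;

	kd = ksid_lookupdomain("BUILTIN");
	/* ... use kd->kd_name while the reference is held ... */
	ksiddomain_rele(kd);
}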
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2020 Jorgen Lundman + * + */ + +#ifndef _SPL_SYS_SIGNAL_H +#define _SPL_SYS_SIGNAL_H + +#include +#include_next +#include + +#define FORREAL 0 /* Usual side-effects */ +#define JUSTLOOKING 1 /* Don't stop the process */ + +struct proc; + +extern int thread_issignal(struct proc *, thread_t, sigset_t); + +#define THREADMASK (sigmask(SIGILL)|sigmask(SIGTRAP)|\ + sigmask(SIGIOT)|sigmask(SIGEMT)|\ + sigmask(SIGFPE)|sigmask(SIGBUS)|\ + sigmask(SIGSEGV)|sigmask(SIGSYS)|\ + sigmask(SIGPIPE)|sigmask(SIGKILL)|\ + sigmask(SIGTERM)|sigmask(SIGINT)) + +static __inline__ int +issig(int why) +{ + return (thread_issignal(current_proc(), current_thread(), + THREADMASK)); +} + +/* Always called with curthread */ +#define signal_pending(p) issig(0) + +#endif /* SPL_SYS_SIGNAL_H */ diff --git a/include/os/macos/spl/sys/simd.h b/include/os/macos/spl/sys/simd.h new file mode 100644 index 0000000000..37c06fc78a --- /dev/null +++ b/include/os/macos/spl/sys/simd.h @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Neskovic . + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_begin() + * kfpu_end() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_sse_available() + * zfs_sse2_available() + * zfs_sse3_available() + * zfs_ssse3_available() + * zfs_sse4_1_available() + * zfs_sse4_2_available() + * + * zfs_avx_available() + * zfs_avx2_available() + * + * zfs_bmi1_available() + * zfs_bmi2_available() + * + * zfs_avx512f_available() + * zfs_avx512cd_available() + * zfs_avx512er_available() + * zfs_avx512pf_available() + * zfs_avx512bw_available() + * zfs_avx512dq_available() + * zfs_avx512vl_available() + * zfs_avx512ifma_available() + * zfs_avx512vbmi_available() + * + * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers + * also add zfs_avx512vl_available() to feature check. 
+ */ + +#ifndef _SIMD_X86_H +#define _SIMD_X86_H + +#include + +/* only for __x86 */ +#if defined(__x86) + +#include + +#if defined(_KERNEL) +#include +#include + +#ifdef __APPLE__ +// XNU fpu.h +static inline uint64_t +xgetbv(uint32_t c) +{ + uint32_t mask_hi, mask_lo; + __asm__ __volatile__("xgetbv" : "=a"(mask_lo), "=d"(mask_hi) : "c" (c)); + return (((uint64_t)mask_hi<<32) + (uint64_t)mask_lo); +} + +#endif + +extern uint64_t spl_cpuid_features(void); +extern uint64_t spl_cpuid_leaf7_features(void); + +#define ZFS_ASM_BUG() { ASSERT(0); } break + +#define kfpu_allowed() 1 + +#endif + +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +#define kfpu_begin() ((void)0) +#define kfpu_end() ((void)0) + +/* + * CPUID feature tests for user-space. Linux kernel provides an interface for + * CPU feature testing. + */ +#if !defined(_KERNEL) + +#include + +#define ZFS_ASM_BUG() { assert(0); } break + +/* + * x86 registers used implicitly by CPUID + */ +typedef enum cpuid_regs { + EAX = 0, + EBX, + ECX, + EDX, + CPUID_REG_CNT = 4 +} cpuid_regs_t; + +/* + * List of instruction sets identified by CPUID + */ +typedef enum cpuid_inst_sets { + SSE = 0, + SSE2, + SSE3, + SSSE3, + SSE4_1, + SSE4_2, + OSXSAVE, + AVX, + AVX2, + BMI1, + BMI2, + AVX512F, + AVX512CD, + AVX512DQ, + AVX512BW, + AVX512IFMA, + AVX512VBMI, + AVX512PF, + AVX512ER, + AVX512VL, + AES, + PCLMULQDQ +} cpuid_inst_sets_t; + +/* + * Instruction set descriptor. + */ +typedef struct cpuid_feature_desc { + uint32_t leaf; /* CPUID leaf */ + uint32_t subleaf; /* CPUID sub-leaf */ + uint32_t flag; /* bit mask of the feature */ + cpuid_regs_t reg; /* which CPUID return register to test */ +} cpuid_feature_desc_t; + +#define _AVX512F_BIT (1U << 16) +#define _AVX512CD_BIT (_AVX512F_BIT | (1U << 28)) +#define _AVX512DQ_BIT (_AVX512F_BIT | (1U << 17)) +#define _AVX512BW_BIT (_AVX512F_BIT | (1U << 30)) +#define _AVX512IFMA_BIT (_AVX512F_BIT | (1U << 21)) +#define _AVX512VBMI_BIT (1U << 1) /* AVX512F_BIT is on another leaf */ +#define _AVX512PF_BIT (_AVX512F_BIT | (1U << 26)) +#define _AVX512ER_BIT (_AVX512F_BIT | (1U << 27)) +#define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ +#define _AES_BIT (1U << 25) +#define _PCLMULQDQ_BIT (1U << 1) + +/* + * Descriptions of supported instruction sets + */ +static const cpuid_feature_desc_t spl_cpuid_features[] = { + [SSE] = {1U, 0U, 1U << 25, EDX }, + [SSE2] = {1U, 0U, 1U << 26, EDX }, + [SSE3] = {1U, 0U, 1U << 0, ECX }, + [SSSE3] = {1U, 0U, 1U << 9, ECX }, + [SSE4_1] = {1U, 0U, 1U << 19, ECX }, + [SSE4_2] = {1U, 0U, 1U << 20, ECX }, + [OSXSAVE] = {1U, 0U, 1U << 27, ECX }, + [AVX] = {1U, 0U, 1U << 28, ECX }, + [AVX2] = {7U, 0U, 1U << 5, EBX }, + [BMI1] = {7U, 0U, 1U << 3, EBX }, + [BMI2] = {7U, 0U, 1U << 8, EBX }, + [AVX512F] = {7U, 0U, _AVX512F_BIT, EBX }, + [AVX512CD] = {7U, 0U, _AVX512CD_BIT, EBX }, + [AVX512DQ] = {7U, 0U, _AVX512DQ_BIT, EBX }, + [AVX512BW] = {7U, 0U, _AVX512BW_BIT, EBX }, + [AVX512IFMA] = {7U, 0U, _AVX512IFMA_BIT, EBX }, + [AVX512VBMI] = {7U, 0U, _AVX512VBMI_BIT, ECX }, + [AVX512PF] = {7U, 0U, _AVX512PF_BIT, EBX }, + [AVX512ER] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AVX512VL] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AES] = {1U, 0U, _AES_BIT, ECX }, + [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, +}; + +/* + * Check if OS supports AVX and AVX2 by checking XCR0 + * Only call this function if CPUID indicates that AVX feature is + * supported by the CPU, otherwise it might be an illegal instruction. 
+ */ +static inline uint64_t +xgetbv(uint32_t index) +{ + uint32_t eax, edx; + /* xgetbv - instruction byte code */ + __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (index)); + + return ((((uint64_t)edx)<<32) | (uint64_t)eax); +} + +/* + * Check if CPU supports a feature + */ +static inline boolean_t +__cpuid_check_feature(const cpuid_feature_desc_t *desc) +{ + uint32_t r[CPUID_REG_CNT]; + + if (__get_cpuid_max(0, NULL) >= desc->leaf) { + /* + * __cpuid_count is needed to properly check + * for AVX2. It is a macro, so return parameters + * are passed by value. + */ + __cpuid_count(desc->leaf, desc->subleaf, + r[EAX], r[EBX], r[ECX], r[EDX]); + return ((r[desc->reg] & desc->flag) == desc->flag); + } + return (B_FALSE); +} + +#define CPUID_FEATURE_CHECK(name, id) \ +static inline boolean_t \ +__cpuid_has_ ## name(void) \ +{ \ + return (__cpuid_check_feature(&spl_cpuid_features[id])); \ +} + +/* + * Define functions for user-space CPUID features testing + */ +CPUID_FEATURE_CHECK(sse, SSE); +CPUID_FEATURE_CHECK(sse2, SSE2); +CPUID_FEATURE_CHECK(sse3, SSE3); +CPUID_FEATURE_CHECK(ssse3, SSSE3); +CPUID_FEATURE_CHECK(sse4_1, SSE4_1); +CPUID_FEATURE_CHECK(sse4_2, SSE4_2); +CPUID_FEATURE_CHECK(avx, AVX); +CPUID_FEATURE_CHECK(avx2, AVX2); +CPUID_FEATURE_CHECK(osxsave, OSXSAVE); +CPUID_FEATURE_CHECK(bmi1, BMI1); +CPUID_FEATURE_CHECK(bmi2, BMI2); +CPUID_FEATURE_CHECK(avx512f, AVX512F); +CPUID_FEATURE_CHECK(avx512cd, AVX512CD); +CPUID_FEATURE_CHECK(avx512dq, AVX512DQ); +CPUID_FEATURE_CHECK(avx512bw, AVX512BW); +CPUID_FEATURE_CHECK(avx512ifma, AVX512IFMA); +CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI); +CPUID_FEATURE_CHECK(avx512pf, AVX512PF); +CPUID_FEATURE_CHECK(avx512er, AVX512ER); +CPUID_FEATURE_CHECK(avx512vl, AVX512VL); +CPUID_FEATURE_CHECK(aes, AES); +CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); + +#endif /* !defined(_KERNEL) */ + + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + +#if defined(_KERNEL) + has_osxsave = !!(spl_cpuid_features() & CPUID_FEATURE_OSXSAVE); +#elif !defined(_KERNEL) + has_osxsave = __cpuid_has_osxsave(); +#endif + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse()); +#endif +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE2)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse2()); +#endif +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE3)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse3()); +#endif +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSSE3)); +#elif !defined(_KERNEL) + return 
(__cpuid_has_ssse3()); +#endif +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE4_1)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse4_1()); +#endif +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE4_2)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse4_2()); +#endif +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + boolean_t has_avx; +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_AVX1_0)); +#elif !defined(_KERNEL) + has_avx = __cpuid_has_avx(); +#endif + + return (has_avx && __ymm_enabled()); +} + +/* + * Check if AVX2 instruction set is available + */ +static inline boolean_t +zfs_avx2_available(void) +{ + boolean_t has_avx2; +#if defined(_KERNEL) +#if defined(HAVE_AVX2) + has_avx2 = (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX2)); +#else + has_avx2 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx2 = __cpuid_has_avx2(); +#endif + + return (has_avx2 && __ymm_enabled()); +} + +/* + * Check if BMI1 instruction set is available + */ +static inline boolean_t +zfs_bmi1_available(void) +{ +#if defined(_KERNEL) +#if defined(CPUID_LEAF7_FEATURE_BMI1) + return (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_BMI1)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_bmi1()); +#endif +} + +/* + * Check if BMI2 instruction set is available + */ +static inline boolean_t +zfs_bmi2_available(void) +{ +#if defined(_KERNEL) +#if defined(CPUID_LEAF7_FEATURE_BMI2) + return (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_BMI2)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_bmi2()); +#endif +} + +/* + * Check if AES instruction set is available + */ +static inline boolean_t +zfs_aes_available(void) +{ +#if defined(_KERNEL) +#if defined(HAVE_AES) + return (!!(spl_cpuid_features() & CPUID_FEATURE_AES)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_aes()); +#endif +} + +/* + * Check if PCLMULQDQ instruction set is available + */ +static inline boolean_t +zfs_pclmulqdq_available(void) +{ +#if defined(_KERNEL) +#if defined(HAVE_PCLMULQDQ) + return (!!(spl_cpuid_features() & CPUID_FEATURE_PCLMULQDQ)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_pclmulqdq()); +#endif +} + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + + +/* Check if AVX512F instruction set is available */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512F) && defined(CPUID_LEAF7_FEATURE_AVX512F) + return (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F)); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512f(); +#endif + + return (has_avx512 
&& __zmm_enabled()); +} + +/* Check if AVX512CD instruction set is available */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512CD) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512CD) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512CD)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512CD); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512cd(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512ER instruction set is available */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512ER) && \ + defined(CPUID_LEAF7_FEATURE_AVX512ER) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512ER)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512ER); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512er(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512PF instruction set is available */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512PF) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512PF) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512PF)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512PF); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512pf(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512BW instruction set is available */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512BW) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512BW) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512BW)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512BW); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512bw(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512DQ instruction set is available */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512DQ) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512DQ) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512DQ)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512DQ); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512dq(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VL instruction set is available */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512VL) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512VL) + has_avx512 = (spl_cpuid_leaf7_features() & + 
(CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VL)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VL); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512vl(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512IFMA instruction set is available */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512IFMA) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512IFMA) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512IFMA)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512IFMA); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512ifma(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VBMI instruction set is available */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512VBMI) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512VBMI) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VBMI)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VBMI); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512f() && + __cpuid_has_avx512vbmi(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +#endif /* defined(__x86) */ + +#endif /* _SIMD_X86_H */ diff --git a/include/os/macos/spl/sys/strings.h b/include/os/macos/spl/sys/strings.h new file mode 100644 index 0000000000..af643259c8 --- /dev/null +++ b/include/os/macos/spl/sys/strings.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_STRINGS_H +#define _SPL_STRINGS_H + + +#endif diff --git a/include/os/macos/spl/sys/stropts.h b/include/os/macos/spl/sys/stropts.h new file mode 100644 index 0000000000..bdce60f327 --- /dev/null +++ b/include/os/macos/spl/sys/stropts.h @@ -0,0 +1,247 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + + +#ifndef _SPL_STROPTS_H +#define _SPL_STROPTS_H + +#define LOCORE +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define isprint(c) ((c) >= ' ' && (c) <= '~') + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + * High order bit is 31 (or 63 in _LP64 kernel). + */ +static inline int +highbit64(unsigned long long i) +{ + register int h = 1; + if (i == 0) + return (0); + if (i & 0xffffffff00000000ull) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +static inline int +highbit(unsigned long long i) +{ + register int h = 1; + if (i == 0) + return (0); + if (i & 0xffffffff00000000ull) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * Low order bit is 0. + */ +static inline int +lowbit(unsigned long long i) +{ + register int h = 1; + + if (i == 0) + return (0); + + if (!(i & 0xffffffff)) { + h += 32; i >>= 32; + } + if (!(i & 0xffff)) { + h += 16; i >>= 16; + } + if (!(i & 0xff)) { + h += 8; i >>= 8; + } + if (!(i & 0xf)) { + h += 4; i >>= 4; + } + if (!(i & 0x3)) { + h += 2; i >>= 2; + } + if (!(i & 0x1)) { + h += 1; + } + return (h); +} + +static inline int +isdigit(char c) +{ + return (c >= ' ' && c <= '9'); +} + + +static inline char * +strpbrk(const char *s, const char *b) +{ + const char *p; + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + return (NULL); +} + + +static inline char * +strrchr(const char *p, int ch) +{ + union { + const char *cp; + char *p; + } u; + char *save; + + u.cp = p; + for (save = NULL; /* empty */; ++u.p) { + if (*u.p == ch) + save = u.p; + if (*u.p == '\0') + return (save); + } + /* NOTREACHED */ +} + +static inline int +is_ascii_str(const char *str) +{ + unsigned char ch; + + while ((ch = (unsigned char)*str++) != '\0') { + if (ch >= 0x80) + return (0); + } + return (1); +} + + +static inline void * +kmemchr(const void *s, int c, size_t n) +{ + if (n != 0) { + const unsigned char *p = (const unsigned char *)s; + do { + if (*p++ == (unsigned char)c) + return ((void *)(uintptr_t)(p - 1)); + } while (--n != 0); + } + return (NULL); +} + +#ifndef memchr +#define memchr kmemchr +#endif + +#define IDX(c) ((unsigned char)(c) / LONG_BIT) +#define BIT(c) ((unsigned long)1 << ((unsigned char)(c) % LONG_BIT)) + +static inline size_t +strcspn(const char *__restrict s, const char *__restrict charset) +{ + /* + * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to + * generate better code. Without them, gcc gets a little confused. 
+ */ + const char *s1; + unsigned long bit; + unsigned long tbl[(UCHAR_MAX + 1) / LONG_BIT]; + int idx; + + if (*s == '\0') + return (0); + + tbl[0] = 1; + tbl[3] = tbl[2] = tbl[1] = 0; + + for (; *charset != '\0'; charset++) { + idx = IDX(*charset); + bit = BIT(*charset); + tbl[idx] |= bit; + } + + for (s1 = s; ; s1++) { + idx = IDX(*s1); + bit = BIT(*s1); + if ((tbl[idx] & bit) != 0) + break; + } + return (s1 - s); +} + +#ifdef __cplusplus +} +#endif + +#endif /* SPL_STROPTS_H */ diff --git a/include/os/macos/spl/sys/sunddi.h b/include/os/macos/spl/sys/sunddi.h new file mode 100644 index 0000000000..ff299eeaf9 --- /dev/null +++ b/include/os/macos/spl/sys/sunddi.h @@ -0,0 +1,203 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Garrett D'Amore . All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + + + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include +#include +#include +#include +#include +#include +#include + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL) +#define ddi_prop_free(x) (void)0 +#define ddi_root_node() (void)0 + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define isalpha(ch) (isupper(ch) || islower(ch)) +#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') +#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') +#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ + ((ch) == '\t') || ((ch) == '\f')) +#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z') +#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ + ((ch) >= 'A' && (ch) <= 'F')) +#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C)) +#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? 
(C) - 'a' + 'A': (C)) +#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) +#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \ + ((C) >= 0x3A && (C) <= 0x40) || \ + ((C) >= 0x5B && (C) <= 0x60) || \ + ((C) >= 0x7B && (C) <= 0x7E)) + +// Define proper Solaris API calls, and clean ZFS up to use +int ddi_copyin(const void *from, void *to, size_t len, int flags); +int ddi_copyout(const void *from, void *to, size_t len, int flags); +int ddi_copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done); + +static inline int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + *result = strtol(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == LONG_MIN || *result == LONG_MAX) + return (ERANGE); + return (0); +} + +static inline int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + *result = strtoul(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULONG_MAX) + return (ERANGE); + return (0); +} + +static inline int +ddi_strtoull(const char *str, char **nptr, int base, + unsigned long long *result) +{ + *result = (unsigned long long)strtouq(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULLONG_MAX) + return (ERANGE); + return (0); +} + +static inline int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + *result = (unsigned long long)strtoq(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULLONG_MAX) + return (ERANGE); + return (0); +} + +#ifndef OTYPCNT +#define OTYPCNT 5 +#define OTYP_BLK 0 +#define OTYP_MNT 1 +#define OTYP_CHR 2 +#define OTYP_SWP 3 +#define OTYP_LYR 4 +#endif + +#define P2END(x, align) (-(~(x) & -(align))) + +#define ddi_name_to_major(name) devsw_name2blk(name, NULL, 0) + +struct dev_info { + dev_t dev; // Major / Minor + void *devc; + void *devb; +}; +typedef struct dev_info dev_info_t; + + +int ddi_strtoul(const char *, char **, int, unsigned long *); +int ddi_strtol(const char *, char **, int, long *); +int ddi_soft_state_init(void **, size_t, size_t); +int ddi_soft_state_zalloc(void *, int); +void *ddi_get_soft_state(void *, int); +void ddi_soft_state_free(void *, int); +void ddi_soft_state_fini(void **); +int ddi_create_minor_node(dev_info_t *, char *, int, + minor_t, char *, int); +void ddi_remove_minor_node(dev_info_t *, char *); + +int ddi_driver_major(dev_info_t *); + +typedef void *ldi_ident_t; + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define DDI_PSEUDO "" + +#define ddi_prop_update_int64(a, b, c, d) DDI_SUCCESS +#define ddi_prop_update_string(a, b, c, d) DDI_SUCCESS + +#define bioerror(bp, er) (buf_seterror((bp), (er))) +#define biodone(bp) buf_biodone(bp) + +#define ddi_ffs ffs +static inline long ddi_fls(long mask) { \ + /* Algorithm courtesy of Steve Chessin. */ \ + while (mask) { \ + long nx; \ + if ((nx = (mask & (mask - 1))) == 0) \ + break; \ + mask = nx; \ + } \ + return (ffs(mask)); \ +} + +#define getminor(X) minor((X)) + + + +/* + * This data structure is entirely private to the soft state allocator. 
+ */ +struct i_ddi_soft_state { + void **array; /* the array of pointers */ + kmutex_t lock; /* serialize access to this struct */ + size_t size; /* how many bytes per state struct */ + size_t n_items; /* how many structs herein */ + struct i_ddi_soft_state *next; /* 'dirty' elements */ +}; + +#define MIN_N_ITEMS 8 /* 8 void *'s == 32 bytes */ + +extern int strspn(const char *string, register char *charset); + + +#endif /* SPL_SUNDDI_H */ diff --git a/include/os/macos/spl/sys/sysmacros.h b/include/os/macos/spl/sys/sysmacros.h new file mode 100644 index 0000000000..8858f8ae4c --- /dev/null +++ b/include/os/macos/spl/sys/sysmacros.h @@ -0,0 +1,265 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSMACROS_H +#define _SPL_SYSMACROS_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL +#define _KERNEL __KERNEL__ +#endif + +#define FALSE 0 +#define TRUE 1 + +#if 0 +#define INT8_MAX (127) +#define INT8_MIN (-128) +#define UINT8_MAX (255) +#define UINT8_MIN (0) + +#define INT16_MAX (32767) +#define INT16_MIN (-32768) +#define UINT16_MAX (65535) +#define UINT16_MIN (0) + +#define INT32_MAX INT_MAX +#define INT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX +#define UINT32_MIN UINT_MIN + +#define INT64_MAX LLONG_MAX +#define INT64_MIN LLONG_MIN +#define UINT64_MAX ULLONG_MAX +#define UINT64_MIN ULLONG_MIN + +#define NBBY 8 +#define MAXBSIZE 8192 +#endif + +#define MAXMSGLEN 256 +#define MAXNAMELEN 256 +#define MAXPATHLEN PATH_MAX +#define MAXOFFSET_T LLONG_MAX +#define DEV_BSIZE 512 +#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ + +#define proc_pageout NULL +#define curproc (struct proc *)current_proc() + +extern int cpu_number(void); +#define CPU_SEQID (cpu_number()) +#define is_system_labeled() 0 + +extern unsigned int max_ncpus; +#define boot_ncpus max_ncpus + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +/* + * 0..MAX_PRIO-1: Process priority + * 0..MAX_RT_PRIO-1: RT priority tasks + * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks + * + * Treat shim tasks as SCHED_NORMAL tasks + */ + +/* + * In OSX, the kernel thread priorities start at 81 and goes to + * 95 MAXPRI_KERNEL. BASEPRI_REALTIME starts from 96. Since + * swap priority is at 92. Most ZFS priorities should probably + * stay below this, but kmem_reap needs to be higher. 
+ */ +#define minclsyspri 81 /* BASEPRI_KERNEL */ +#define defclsyspri 81 /* BASEPRI_KERNEL */ +#define maxclsyspri 95 + + +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) + +/* + * Missing macros + */ +#define PAGESIZE PAGE_SIZE + +/* from Solaris sys/byteorder.h */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + + +/* Dtrace probes do not exist in the linux kernel */ +#ifdef DTRACE_PROBE +#undef DTRACE_PROBE +#endif /* DTRACE_PROBE */ +#define DTRACE_PROBE(a) ((void)0) + +#ifdef DTRACE_PROBE1 +#undef DTRACE_PROBE1 +#endif /* DTRACE_PROBE1 */ +#define DTRACE_PROBE1(a, b, c) ((void)0) + +#ifdef DTRACE_PROBE2 +#undef DTRACE_PROBE2 +#endif /* DTRACE_PROBE2 */ +#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) + +#ifdef DTRACE_PROBE3 +#undef DTRACE_PROBE3 +#endif /* DTRACE_PROBE3 */ +#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) + +#ifdef DTRACE_PROBE4 +#undef DTRACE_PROBE4 +#endif /* DTRACE_PROBE4 */ +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) + +/* Missing globals */ +extern char spl_version[32]; +extern unsigned long spl_hostid; +extern char hw_serial[11]; + +/* Missing misc functions */ +extern uint32_t zone_get_hostid(void *zone); +extern void spl_setup(void); +extern void spl_cleanup(void); + +#define makedevice(maj, min) makedev(maj, min) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. 
For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +/* + * P2* Macros from Illumos + */ + +/* + * return x rounded up to the next phase (offset) within align. + * phase should be < align. + * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase) + * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase) + */ +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) + +/* + * Return TRUE if they have the same highest bit set. + * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000) + * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000) + */ +#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y))) + +/* + * End Illumos copy-fest + */ + +/* avoid any possibility of clashing with version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) +/* + * Use the correct builtin mechanism. The Traditional macro is + * not safe on this platform. + */ +#define offsetof(s, m) __builtin_offsetof(s, m) +#endif + +#define SET_ERROR(X) (X) + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_SYSMACROS_H */ diff --git a/include/os/macos/spl/sys/systeminfo.h b/include/os/macos/spl/sys/systeminfo.h new file mode 100644 index 0000000000..d1c15744ec --- /dev/null +++ b/include/os/macos/spl/sys/systeminfo.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSTEMINFO_H +#define _SPL_SYSTEMINFO_H + +#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ + +const char *spl_panicstr(void); +int spl_system_inshutdown(void); + + +#endif /* SPL_SYSTEMINFO_H */ diff --git a/include/os/macos/spl/sys/systm.h b/include/os/macos/spl/sys/systm.h new file mode 100644 index 0000000000..54b75e29b5 --- /dev/null +++ b/include/os/macos/spl/sys/systm.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSTM_H +#define _SPL_SYSTM_H + +#include_next +#include + +typedef uintptr_t pc_t; + +#endif /* SPL_SYSTM_H */ diff --git a/include/os/macos/spl/sys/taskq.h b/include/os/macos/spl/sys/taskq.h new file mode 100644 index 0000000000..9dd6776108 --- /dev/null +++ b/include/os/macos/spl/sys/taskq.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * Copyright (C) 2015 Jorgen Lundman + */ + +#ifndef _SYS_TASKQ_H +#define _SYS_TASKQ_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +typedef struct taskq taskq_t; +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +struct proc; +struct taskq_ent; + +/* New ZFS expects to find taskq_ent_t as well */ +#include + +/* + * Public flags for taskq_create(): bit range 0-15 + */ +#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. + */ +#define TQ_SLEEP 0x00 /* Can block for memory */ +#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ + +#define TASKQID_INVALID ((taskqid_t)0) + +#ifdef _KERNEL + +extern taskq_t *system_taskq; +/* Global dynamic task queue for long delay */ +extern taskq_t *system_delay_taskq; + +extern int spl_taskq_init(void); +extern void spl_taskq_fini(void); +extern void taskq_mp_init(void); + +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, + int, uint_t); +extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + proc_t *, uint_t); +extern taskq_t *taskq_create_sysdc(const char *, int, int, int, + proc_t *, uint_t, uint_t); +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern void nulltask(void *); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait(taskq_t *); +#define HAVE_TASKQ_WAIT_ID +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_suspend(taskq_t *); +extern int taskq_suspended(taskq_t *); +extern void taskq_resume(taskq_t *); +extern int taskq_member(taskq_t *, kthread_t *); +extern boolean_t taskq_empty(taskq_t *tq); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern taskq_t *taskq_of_curthread(void); +extern int taskq_empty_ent(struct taskq_ent *); + +#define taskq_wait_outstanding(T, D) taskq_wait((T)) + +extern void system_taskq_init(void); +extern void system_taskq_fini(void); + +#endif /* _KERNEL */ + +extern int EMPTY_TASKQ(taskq_t *tq); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_H */ diff --git a/include/os/macos/spl/sys/taskq_impl.h b/include/os/macos/spl/sys/taskq_impl.h new file mode 100644 index 0000000000..60e86b3673 --- /dev/null +++ b/include/os/macos/spl/sys/taskq_impl.h @@ -0,0 +1,181 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright (C) 2015 Jorgen Lundman + */ + + +#ifndef _SYS_TASKQ_IMPL_H +#define _SYS_TASKQ_IMPL_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct taskq_bucket taskq_bucket_t; + +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + union { + taskq_bucket_t *tqent_bucket; + uintptr_t tqent_flags; + } tqent_un; + kthread_t *tqent_thread; + kcondvar_t tqent_cv; +#ifdef __APPLE__ + /* Used to simulate TS_STOPPED */ + kmutex_t tqent_thread_lock; + kcondvar_t tqent_thread_cv; +#endif +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 + +/* + * Taskq Statistics fields are not protected by any locks. + */ +typedef struct tqstat { + uint_t tqs_hits; + uint_t tqs_misses; + uint_t tqs_overflow; /* no threads to allocate */ + uint_t tqs_tcreates; /* threads created */ + uint_t tqs_tdeaths; /* threads died */ + uint_t tqs_maxthreads; /* max # of alive threads */ + uint_t tqs_nomem; /* # of times there were no memory */ + uint_t tqs_disptcreates; +} tqstat_t; + +/* + * Per-CPU hash bucket manages taskq_bent_t structures using freelist. + */ +struct taskq_bucket { + kmutex_t tqbucket_lock; + taskq_t *tqbucket_taskq; /* Enclosing taskq */ + taskq_ent_t tqbucket_freelist; + uint_t tqbucket_nalloc; /* # of allocated entries */ + uint_t tqbucket_nfree; /* # of free entries */ + kcondvar_t tqbucket_cv; + ushort_t tqbucket_flags; + hrtime_t tqbucket_totaltime; + tqstat_t tqbucket_stat; +}; + +/* + * Bucket flags. + */ +#define TQBUCKET_CLOSE 0x01 +#define TQBUCKET_SUSPEND 0x02 + +#define TASKQ_INTERFACE_FLAGS 0x0000ffff /* defined in */ + +/* + * taskq implementation flags: bit range 16-31 + */ +#define TASKQ_CHANGING 0x00010000 /* nthreads != target */ +#define TASKQ_SUSPENDED 0x00020000 /* taskq is suspended */ +#define TASKQ_NOINSTANCE 0x00040000 /* no instance number */ +#define TASKQ_THREAD_CREATED 0x00080000 /* a thread has been created */ +#define TASKQ_DUTY_CYCLE 0x00100000 /* using the SDC class */ + +struct taskq { + char tq_name[TASKQ_NAMELEN + 1]; + kmutex_t tq_lock; + krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; + kcondvar_t tq_exit_cv; + pri_t tq_pri; /* Scheduling priority */ + uint_t tq_flags; + int tq_active; + int tq_nthreads; + int tq_nthreads_target; + int tq_nthreads_max; + int tq_threads_ncpus_pct; + int tq_nalloc; + int tq_minalloc; + int tq_maxalloc; + kcondvar_t tq_maxalloc_cv; + int tq_maxalloc_wait; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; + int tq_maxsize; + taskq_bucket_t *tq_buckets; /* Per-cpu array of buckets */ + int tq_instance; + uint_t tq_nbuckets; /* # of buckets (2^n) */ + union { + kthread_t *_tq_thread; + kthread_t **_tq_threadlist; + } tq_thr; + + list_node_t tq_cpupct_link; /* linkage for taskq_cpupct_list */ + proc_t *tq_proc; /* process for taskq threads */ + int tq_cpupart; /* cpupart id bound to */ + uint_t tq_DC; /* duty cycle for SDC */ + + /* + * Statistics. 
+ */ + kstat_t *tq_kstat; /* Exported statistics */ + hrtime_t tq_totaltime; /* Time spent processing tasks */ + uint64_t tq_tasks; /* Total # of tasks posted */ + uint64_t tq_executed; /* Total # of tasks executed */ + int tq_maxtasks; /* Max number of tasks in the queue */ + int tq_tcreates; + int tq_tdeaths; +}; + +/* Special form of taskq dispatch that uses preallocated entries. */ +void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); + + +#define tq_thread tq_thr._tq_thread +#define tq_threadlist tq_thr._tq_threadlist + +/* The MAX guarantees we have at least one thread */ +#define TASKQ_THREADS_PCT(ncpus, pct) MAX(((ncpus) * (pct)) / 100, 1) + +/* Extra ZOL / Apple */ +extern void taskq_init_ent(taskq_ent_t *t); +extern taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_IMPL_H */ diff --git a/include/os/macos/spl/sys/thread.h b/include/os/macos/spl/sys/thread.h new file mode 100644 index 0000000000..d9762afe64 --- /dev/null +++ b/include/os/macos/spl/sys/thread.h @@ -0,0 +1,126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_THREAD_H +#define _SPL_THREAD_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * OsX thread type is + * typedef struct thread *thread_t; + * + * Map that to the ZFS thread type: kthread_t + */ +#define kthread thread +#define kthread_t struct kthread + +/* + * Thread interfaces + */ +#define TP_MAGIC 0x53535353 + +#define TS_FREE 0x00 /* Thread at loose ends */ +#define TS_SLEEP 0x01 /* Awaiting an event */ +#define TS_RUN 0x02 /* Runnable, but not yet on a processor */ +#define TS_ONPROC 0x04 /* Thread is being run on a processor */ +#define TS_ZOMB 0x08 /* Thread has died but hasn't been reaped */ +#define TS_STOPPED 0x10 /* Stopped, initial state */ +#define TS_WAIT 0x20 /* Waiting to become runnable */ + + +typedef void (*thread_func_t)(void *); + + +#define curthread ((kthread_t *)current_thread()) /* current thread pointer */ +#define curproj (ttoproj(curthread)) /* current project pointer */ + +#define thread_join(t) VERIFY(0) + +// Drop the p0 argument, not used. 
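For illustration only, a minimal sketch (not part of the patch) of how a caller exercises the thread_create() shim defined above, assuming the surrounding thread.h/sysmacros.h definitions (thread_create, thread_exit, TS_RUN, minclsyspri); the worker function and its argument are hypothetical. Note that the sixth parameter (the proc_t * slot, typically &p0 in Illumos-derived callers) is accepted but discarded by the macro, per the comment above.

	/* hypothetical worker: runs once, then exits through the shim */
	static void
	example_worker(void *arg)
	{
		int *value = arg;

		(void) *value;		/* ... real work would go here ... */
		thread_exit();		/* expands to spl_thread_exit() */
	}

	static void
	example_start(void)
	{
		static int argument = 42;

		/* NULL stack / 0 stksize let the SPL pick defaults; slot F is dropped */
		(void) thread_create(NULL, 0, example_worker, &argument, 0,
		    NULL, TS_RUN, minclsyspri);
	}
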
+ +#ifdef SPL_DEBUG_THREAD + +#define thread_create(A, B, C, D, E, F, G, H) \ + spl_thread_create(A, B, C, D, E, G, __FILE__, __LINE__, H) +#define thread_create_named(name, A, B, C, D, E, F, G, H) \ + spl_thread_create(A, B, C, D, E, G, __FILE__, __LINE__, H) +extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, + void (*proc)(void *), void *arg, size_t len, /* proc_t *pp, */ int state, + char *, int, pri_t pri); + +#else + +#define thread_create(A, B, C, D, E, F, G, H) \ + spl_thread_create(A, B, C, D, E, G, H) +#define thread_create_named(name, A, B, C, D, E, F, G, H) \ + spl_thread_create(A, B, C, D, E, G, H) +extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, + void (*proc)(void *), void *arg, size_t len, /* proc_t *pp, */ int state, + pri_t pri); + +#endif + +#define thread_exit spl_thread_exit +extern void spl_thread_exit(void); + +extern kthread_t *spl_current_thread(void); + +#define delay osx_delay +extern void osx_delay(int); + +#define KPREEMPT_SYNC 0 +static inline void kpreempt(int flags) +{ + (void) thread_block(THREAD_CONTINUE_NULL); +} + +static inline char * +getcomm(void) +{ + static char name[MAXCOMLEN + 1]; + proc_selfname(name, sizeof (name)); + /* Not thread safe */ + return (name); +} + +#define getpid() proc_selfpid() + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_THREAD_H */ diff --git a/include/os/macos/spl/sys/time.h b/include/os/macos/spl/sys/time.h new file mode 100644 index 0000000000..d027112802 --- /dev/null +++ b/include/os/macos/spl/sys/time.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TIME_H +#define _SPL_TIME_H + +#include +#include_next +#include +#include + +#if defined(CONFIG_64BIT) +#define TIME_MAX INT64_MAX +#define TIME_MIN INT64_MIN +#else +#define TIME_MAX INT32_MAX +#define TIME_MIN INT32_MIN +#endif + +#define SEC 1 +#define MILLISEC 1000 +#define MICROSEC 1000000 +#define NANOSEC 1000000000 + +/* Already defined in include/linux/time.h */ +#undef CLOCK_THREAD_CPUTIME_ID +#undef CLOCK_REALTIME +#undef CLOCK_MONOTONIC +#undef CLOCK_PROCESS_CPUTIME_ID + +typedef enum clock_type { + __CLOCK_REALTIME0 = 0, /* obsolete; same as CLOCK_REALTIME */ + CLOCK_VIRTUAL = 1, /* thread's user-level CPU clock */ + CLOCK_THREAD_CPUTIME_ID = 2, /* thread's user+system CPU clock */ + CLOCK_REALTIME = 3, /* wall clock */ + CLOCK_MONOTONIC = 4, /* high resolution monotonic clock */ + CLOCK_PROCESS_CPUTIME_ID = 5, /* process's user+system CPU clock */ + CLOCK_HIGHRES = CLOCK_MONOTONIC, /* alternate name */ + CLOCK_PROF = CLOCK_THREAD_CPUTIME_ID, /* alternate name */ +} clock_type_t; + +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX) + +typedef long long hrtime_t; + +extern hrtime_t gethrtime(void); +extern void gethrestime(struct timespec *); +extern time_t gethrestime_sec(void); +extern void hrt2ts(hrtime_t hrt, struct timespec *tsp); + +#define SEC_TO_TICK(sec) ((sec) * hz) +#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + + +#endif /* _SPL_TIME_H */ diff --git a/include/os/macos/spl/sys/timer.h b/include/os/macos/spl/sys/timer.h new file mode 100644 index 0000000000..ccdf64f5df --- /dev/null +++ b/include/os/macos/spl/sys/timer.h @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TIMER_H +#define _SPL_TIMER_H + +#include + +/* Open Solaris lbolt is in hz */ +static inline uint64_t +zfs_lbolt(void) +{ + struct timeval tv; + uint64_t lbolt_hz; + microuptime(&tv); + lbolt_hz = ((uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10000; + return (lbolt_hz); +} + + +#define lbolt zfs_lbolt() +#define lbolt64 zfs_lbolt() + +#define ddi_get_lbolt() (zfs_lbolt()) +#define ddi_get_lbolt64() (zfs_lbolt()) + +#define typecheck(type, x) \ + ( \ + { type __dummy; \ + typeof(x) __dummy2; \ + (void) (&__dummy == &__dummy2); \ + 1; \ + }) + + + +#define ddi_time_before(a, b) (typecheck(clock_t, a) && \ + typecheck(clock_t, b) && \ + ((a) - (b) < 0)) +#define ddi_time_after(a, b) ddi_time_before(b, a) + +#define ddi_time_before64(a, b) (typecheck(int64_t, a) && \ + typecheck(int64_t, b) && \ + ((a) - (b) < 0)) +#define ddi_time_after64(a, b) ddi_time_before64(b, a) + + + +extern void delay(clock_t ticks); + +#define usleep_range(wakeup, whocares) \ + do { \ + hrtime_t delta = wakeup - gethrtime(); \ + if (delta > 0) { \ + struct timespec ts; \ + ts.tv_sec = delta / NANOSEC; \ + ts.tv_nsec = delta % NANOSEC; \ + (void) msleep(NULL, NULL, PWAIT, "usleep_range", &ts); \ + } \ + } while (0) + + +#endif /* _SPL_TIMER_H */ diff --git a/include/os/macos/spl/sys/trace.h b/include/os/macos/spl/sys/trace.h new file mode 100644 index 0000000000..7b72d3a98d --- /dev/null +++ b/include/os/macos/spl/sys/trace.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_TRACE_H +#define _SPL_TRACE_H + + +#endif diff --git a/include/os/macos/spl/sys/tsd.h b/include/os/macos/spl/sys/tsd.h new file mode 100644 index 0000000000..cfc48000a5 --- /dev/null +++ b/include/os/macos/spl/sys/tsd.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + + +#ifndef _SPL_TSD_H +#define _SPL_TSD_H + +#include + +#define TSD_HASH_TABLE_BITS_DEFAULT 9 +#define TSD_KEYS_MAX 32768 +#define DTOR_PID (PID_MAX_LIMIT+1) +#define PID_KEY (TSD_KEYS_MAX+1) + +typedef void (*dtor_func_t)(void *); + +extern int tsd_set(uint_t, void *); +extern void *tsd_get(uint_t); +extern void *tsd_get_by_thread(uint_t, thread_t); +extern void tsd_create(uint_t *, dtor_func_t); +extern void tsd_destroy(uint_t *); +extern void tsd_exit(void); + +uint64_t spl_tsd_size(void); +void tsd_thread_exit(void); +int spl_tsd_init(void); +void spl_tsd_fini(void); + +#endif /* _SPL_TSD_H */ diff --git a/include/os/macos/spl/sys/types.h b/include/os/macos/spl/sys/types.h new file mode 100644 index 0000000000..38faab8367 --- /dev/null +++ b/include/os/macos/spl/sys/types.h @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TYPES_H +#define _SPL_TYPES_H + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#include_next +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Avoid kcdata.h header error */ +extern unsigned long strnlen(const char *, unsigned long); + +#ifdef __cplusplus +} +#endif + +#include + +#include + +#ifndef ULLONG_MAX +#define ULLONG_MAX (~0ULL) +#endif + +#ifndef LLONG_MAX +#define LLONG_MAX ((long long)(~0ULL>>1)) +#endif + +enum { B_FALSE = 0, B_TRUE = 1 }; +typedef short pri_t; +typedef unsigned long ulong_t; +typedef unsigned long long u_longlong_t; +typedef unsigned long long rlim64_t; +typedef unsigned long long loff_t; +typedef long long longlong_t; +typedef unsigned char uchar_t; +typedef unsigned int uint_t; +typedef unsigned short ushort_t; +typedef void *spinlock_t; +typedef long long offset_t; +typedef struct timespec timestruc_t; /* definition per SVr4 */ +typedef struct timespec timespec_t; +typedef ulong_t pgcnt_t; +typedef unsigned int umode_t; +#define NODEV32 (dev32_t)(-1) +typedef uint32_t dev32_t; +typedef uint_t minor_t; +typedef short index_t; + +#include +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FEXCL O_EXCL +#define FNOCTTY O_NOCTTY +#define FNOFOLLOW O_NOFOLLOW + +#ifdef __APPLE__ +#define FSYNC O_SYNC /* file (data+inode) integrity while writing */ +#define FDSYNC O_DSYNC /* file data only integrity while writing */ +#define 
FOFFMAX 0x0000 /* not used */ +#define FRSYNC 0x0000 /* not used */ +#else +#define FRSYNC 0x8000 /* sync read operations at same level of */ + /* integrity as specified for writes by */ + /* FSYNC and FDSYNC flags */ +#define FOFFMAX 0x2000 /* large file */ +#endif + +#define EXPORT_SYMBOL(X) +#define module_param(X, Y, Z) +#define MODULE_PARM_DESC(X, Y) + +#ifdef __GNUC__ +#define member_type(type, member) __typeof__(((type *)0)->member) +#else +#define member_type(type, member) void +#endif + +#define container_of(ptr, type, member) ((type *) \ + ((char *)(member_type(type, member) *) \ + { ptr } - offsetof(type, member))) + +typedef struct timespec inode_timespec_t; + +#endif /* _SPL_TYPES_H */ diff --git a/include/os/macos/spl/sys/types32.h b/include/os/macos/spl/sys/types32.h new file mode 100644 index 0000000000..799a956b5e --- /dev/null +++ b/include/os/macos/spl/sys/types32.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* SPL_TYPE32_H */ diff --git a/include/os/macos/spl/sys/uio.h b/include/os/macos/spl/sys/uio.h new file mode 100644 index 0000000000..2327de209d --- /dev/null +++ b/include/os/macos/spl/sys/uio.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + + +#ifndef _SPL_UIO_H +#define _SPL_UIO_H + + +// OSX defines "uio_t" as "struct uio *" +// ZFS defines "uio_t" as "struct uio" +#undef uio_t +#include_next +#define uio_t struct uio + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct iovec iovec_t; + +typedef enum uio_seg uio_seg_t; +typedef enum uio_rw uio_rw_t; + +typedef struct aio_req { + uio_t *aio_uio; + void *aio_private; +} aio_req_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t *xu_uio; + enum xuio_type xu_type; + union { + struct { + uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +#define uio_segflg(U) \ + (uio_isuserspace((struct uio *)(U))?UIO_USERSPACE:UIO_SYSSPACE) +#define uio_advance(U, N) uio_update((struct uio *)(U), (N)) + +static inline uint64_t +uio_iovlen(const struct uio *u, unsigned int i) +{ + user_size_t iov_len; + uio_getiov((struct uio *)u, i, NULL, &iov_len); + return (iov_len); +} + +static inline void * +uio_iovbase(const struct uio *u, unsigned int i) +{ + user_addr_t iov_base; + uio_getiov((struct uio *)u, i, &iov_base, NULL); + return ((void *)iov_base); +} + +static inline void +uio_iov_at_index(uio_t *uio, unsigned int idx, void **base, uint64_t *len) +{ + (void) uio_getiov(uio, idx, (user_addr_t *)base, len); +} + +static inline long long +uio_index_at_offset(struct uio *uio, long long off, unsigned int *vec_idx) +{ + uint64_t len; + *vec_idx = 0; + while (*vec_idx < uio_iovcnt(uio) && off >= + (len = uio_iovlen(uio, *vec_idx))) { + off -= len; + (*vec_idx)++; + } + return (off); +} + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +static inline int +uiocopy(const char *p, size_t n, enum uio_rw rw, struct uio *uio, + size_t *cbytes) +{ + int result; + struct uio *nuio = uio_duplicate(uio); + unsigned long long x = uio_resid(uio); + if (!nuio) + return (ENOMEM); + uio_setrw(nuio, rw); + result = uiomove(p, n, nuio); + *cbytes = x-uio_resid(nuio); + uio_free(nuio); + return (result); +} + + +// Apple's uiomove puts the uio_rw in uio_create +#define uiomove(A, B, C, D) uiomove((A), (B), (D)) +#define uioskip(A, B) uio_update((A), (B)) + +extern int uio_prefaultpages(ssize_t, uio_t *); + +#ifdef __cplusplus +} +#endif +#endif /* SPL_UIO_H */ diff --git a/include/os/macos/spl/sys/utsname.h b/include/os/macos/spl/sys/utsname.h new file mode 100644 index 0000000000..b6bcab77bb --- /dev/null +++ b/include/os/macos/spl/sys/utsname.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _SPL_UTSNAME_H +#define _SPL_UTSNAME_H + +#define _SYS_NMLN 257 +struct opensolaris_utsname { + char sysname[_SYS_NMLN]; + char nodename[_SYS_NMLN]; + char release[_SYS_NMLN]; + char version[_SYS_NMLN]; + char machine[_SYS_NMLN]; +}; + +typedef struct opensolaris_utsname utsname_t; + +extern utsname_t *utsname(void); + +#endif /* SPL_UTSNAME_H */ diff --git a/include/os/macos/spl/sys/varargs.h b/include/os/macos/spl/sys/varargs.h new file mode 100644 index 0000000000..b7371a1f2a --- /dev/null +++ b/include/os/macos/spl/sys/varargs.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ +#ifndef _SPL_VARARGS_H +#define _SPL_VARARGS_H + +#define __va_list va_list + +#endif /* SPL_VARARGS_H */ diff --git a/include/os/macos/spl/sys/vfs.h b/include/os/macos/spl/sys/vfs.h new file mode 100644 index 0000000000..aa78dc4347 --- /dev/null +++ b/include/os/macos/spl/sys/vfs.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. 
+ */ + +#ifndef _SPL_ZFS_H +#define _SPL_ZFS_H + +#include +#include + +#define MAXFIDSZ 64 + +typedef struct mount vfs_t; + +#define vn_vfswlock(vp) (0) +#define vn_vfsunlock(vp) +#define VFS_HOLD(vfsp) +#define VFS_RELE(vfsp) + + + +/* + * File identifier. Should be unique per filesystem on a single + * machine. This is typically called by a stateless file server + * in order to generate "file handles". + * + * Do not change the definition of struct fid ... fid_t without + * letting the CacheFS group know about it! They will have to do at + * least two things, in the same change that changes this structure: + * 1. change CFSVERSION in usr/src/uts/common/sys/fs/cachefs_fs.h + * 2. put the old version # in the canupgrade array + * in cachfs_upgrade() in usr/src/cmd/fs.d/cachefs/fsck/fsck.c + * This is necessary because CacheFS stores FIDs on disk. + * + * Many underlying file systems cast a struct fid into other + * file system dependent structures which may require 4 byte alignment. + * Because a fid starts with a short it may not be 4 byte aligned, the + * fid_pad will force the alignment. + */ +#define MAXFIDSZ 64 +#define OLD_MAXFIDSZ 16 + +typedef struct fid { + union { + long fid_pad; + struct { + ushort_t len; /* length of data in bytes */ + char data[MAXFIDSZ]; /* data (variable len) */ + } _fid; + } un; +} fid_t; + + +extern void (*mountroot_post_hook)(void); + +#endif /* SPL_ZFS_H */ diff --git a/include/os/macos/spl/sys/vmem.h b/include/os/macos/spl/sys/vmem.h new file mode 100644 index 0000000000..6ff8a6e146 --- /dev/null +++ b/include/os/macos/spl/sys/vmem.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VMEM_H +#define _SYS_VMEM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +#define KMEM_QUANTUM (PAGESIZE) + + + /* + * Per-allocation flags + */ +#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */ +#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */ +#define VM_PANIC 0x00000002 /* same as KM_PANIC */ +#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */ +#define VM_NORMALPRI 0x00000008 /* same as KM_NORMALPRI */ +#define VM_NODEBUG 0x00000010 /* matches KM_NODE~BUG, */ + /* not implemented on OSX */ +#define VM_NO_VBA 0x00000020 /* OSX: do not descend to the bucket layer */ +#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */ + +#define VM_BESTFIT 0x00000100 +#define VM_FIRSTFIT 0x00000200 +#define VM_NEXTFIT 0x00000400 + +/* + * The following flags are restricted for use only within the kernel. + * VM_MEMLOAD is for use by the HAT to avoid infinite recursion. 
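/*
 * Illustrative sketch, not part of the patch: how a filesystem typically
 * packs an object number into the fid_t above for stateless file handles.
 * The layout shown (a bare 64-bit object id) is hypothetical; ZFS uses its
 * own zfid encoding.
 */
fid_t fid;
uint64_t object = 42;

fid.un._fid.len = sizeof (object);
bcopy(&object, fid.un._fid.data, sizeof (object));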
+ * VM_NORELOC is used by the kernel when static VA->PA mappings are required. + */ +#define VM_MEMLOAD 0x00000800 +#define VM_NORELOC 0x00001000 + +/* + * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags + * and forgo reaping if the allocation or attempted import, fails. This + * flag is a segkmem-specific flag, and should not be used by anyone else. + */ +#define VM_ABORT 0x00002000 + +/* + * VM_ENDALLOC requests that large addresses be preferred in allocations. + * Has no effect if VM_NEXTFIT is active. + */ +#define VM_ENDALLOC 0x00004000 + +#define VM_FLAGS 0x0000FFFF + +/* + * Arena creation flags + */ +#define VMC_POPULATOR 0x00010000 +#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */ +#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */ +// VMC_XALLOC 0x00080000 below +// VMC_XALIGN 0x00100000 below +#define VMC_DUMPSAFE 0x00200000 /* can use alternate dump memory */ +// KMC_IDENTIFIER == 0x00400000 +// KMC_PREFILL == 0x00800000 +#define VMC_TIMEFREE 0x01000000 /* keep span creation time, */ + /* newest spans to front */ +#define VMC_OLDFIRST 0x02000000 /* must accompany VMC_TIMEFREE, */ + /* oldest spans to front */ + +/* + * internal use only; the import function uses the vmem_ximport_t interface + * and may increase the request size if it so desires. + * VMC_XALIGN, for use with vmem_xcreate, specifies that + * the address returned by the import function will be + * aligned according to the alignment argument. + */ +#define VMC_XALLOC 0x00080000 +#define VMC_XALIGN 0x00100000 +#define VMC_FLAGS 0xFFFF0000 + +/* + * Public segment types + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +/* + * Implementation-private segment types + */ +#define VMEM_SPAN 0x10 +#define VMEM_ROTOR 0x20 +#define VMEM_WALKER 0x40 + +/* + * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may + * call back into the arena being walked, so vmem_walk() must drop the + * arena lock before each callback. The caveat is that since the arena + * isn't locked, its state can change. Therefore it is up to the callback + * routine to handle cases where the segment isn't of the expected type. + * For example, we use this to walk heap_arena when generating a crash dump; + * see segkmem_dump() for sample usage. + */ +#define VMEM_REENTRANT 0x80000000 + +struct vmem; + +typedef struct vmem vmem_t; +typedef void *(vmem_alloc_t)(vmem_t *, size_t, int); +typedef void (vmem_free_t)(vmem_t *, void *, size_t); + +/* + * Alternate import style; the requested size is passed in a pointer, + * which can be increased by the import function if desired. 
+ */ +typedef void *(vmem_ximport_t)(vmem_t *, size_t *, size_t, int); + +#ifdef _KERNEL +extern vmem_t *vmem_init(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *); +extern void vmem_fini(vmem_t *); +extern void vmem_update(void *); +extern int vmem_is_populator(void); +extern size_t vmem_seg_size; +#endif + +extern vmem_t *vmem_create(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *, vmem_t *, size_t, int); +extern vmem_t *vmem_xcreate(const char *, void *, size_t, size_t, + vmem_ximport_t *, vmem_free_t *, vmem_t *, size_t, int); +extern void vmem_destroy(vmem_t *); +extern void *vmem_alloc(vmem_t *, size_t, int); +extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t, + void *, void *, int); +extern void vmem_free(vmem_t *, void *, size_t); +extern void vmem_xfree(vmem_t *, void *, size_t); +extern void *vmem_add(vmem_t *, void *, size_t, int); +extern int vmem_contains(vmem_t *, void *, size_t); +extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), void *); +extern size_t vmem_size(vmem_t *, int); +extern size_t vmem_size_locked(vmem_t *, int); +extern size_t vmem_size_semi_atomic(vmem_t *, int); +extern void vmem_qcache_reap(vmem_t *vmp); +extern int64_t vmem_buckets_size(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_H */ diff --git a/include/os/macos/spl/sys/vmem_impl.h b/include/os/macos/spl/sys/vmem_impl.h new file mode 100644 index 0000000000..233a6b33ae --- /dev/null +++ b/include/os/macos/spl/sys/vmem_impl.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1999-2001, 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VMEM_IMPL_H +#define _SYS_VMEM_IMPL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vmem_seg vmem_seg_t; + +#define VMEM_STACK_DEPTH 20 + +struct vmem_seg { + /* + * The first four fields must match vmem_freelist_t exactly. + */ + uintptr_t vs_start; /* start of segment (inclusive) */ + uintptr_t vs_end; /* end of segment (exclusive) */ + vmem_seg_t *vs_knext; /* next of kin (alloc, free, span) */ + vmem_seg_t *vs_kprev; /* prev of kin */ + + vmem_seg_t *vs_anext; /* next in arena */ + vmem_seg_t *vs_aprev; /* prev in arena */ + uint8_t vs_type; /* alloc, free, span */ + uint8_t vs_import; /* non-zero if segment was imported */ + uint8_t vs_depth; /* stack depth if KMF_AUDIT active */ + /* + * if VM_FREESORT is set on the arena, then + * this field is set at span creation time. + */ + hrtime_t vs_span_createtime; + /* + * The following fields are present only when KMF_AUDIT is set. 
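/*
 * Illustrative sketch, not part of the patch: creating a sub-arena that
 * imports page-sized spans from a parent arena and allocating from it.
 * 'parent_arena' is a placeholder for an existing arena such as heap_arena.
 */
vmem_t *sub;
void *p;

sub = vmem_create("example_arena", NULL, 0, PAGESIZE,
    vmem_alloc, vmem_free, parent_arena, 0, VM_SLEEP);
p = vmem_alloc(sub, 4 * PAGESIZE, VM_SLEEP | VM_BESTFIT);
/* ... use p ... */
vmem_free(sub, p, 4 * PAGESIZE);
vmem_destroy(sub);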
+ */ + kthread_t *vs_thread; + hrtime_t vs_timestamp; + pc_t vs_stack[VMEM_STACK_DEPTH]; +}; + +typedef struct vmem_freelist { + uintptr_t vs_start; /* always zero */ + uintptr_t vs_end; /* segment size */ + vmem_seg_t *vs_knext; /* next of kin */ + vmem_seg_t *vs_kprev; /* prev of kin */ +} vmem_freelist_t; + +#define VS_SIZE(vsp) ((vsp)->vs_end - (vsp)->vs_start) + +/* + * Segment hashing + */ +#define VMEM_HASH_INDEX(a, s, q, m) \ + ((((a) + ((a) >> (s)) + ((a) >> ((s) << 1))) >> (q)) & (m)) + +#define VMEM_HASH(vmp, addr) \ + (&(vmp)->vm_hash_table[VMEM_HASH_INDEX(addr, \ + (vmp)->vm_hash_shift, (vmp)->vm_qshift, (vmp)->vm_hash_mask)]) + +#define VMEM_QCACHE_SLABSIZE(max) \ + MAX(1 << highbit(3 * (max)), 64) + +#define VMEM_NAMELEN 30 +#define VMEM_HASH_INITIAL 16 +#define VMEM_NQCACHE_MAX 16 +#define VMEM_FREELISTS (sizeof (void *) * 8) + +typedef struct vmem_kstat { + kstat_named_t vk_mem_inuse; /* memory in use */ + kstat_named_t vk_mem_import; /* memory imported */ + kstat_named_t vk_mem_total; /* total memory in arena */ + kstat_named_t vk_source_id; /* vmem id of vmem source */ + kstat_named_t vk_alloc; /* number of allocations */ + kstat_named_t vk_free; /* number of frees */ + kstat_named_t vk_wait; /* number of allocations that waited */ + kstat_named_t vk_fail; /* number of allocations that failed */ + kstat_named_t vk_lookup; /* hash lookup count */ + kstat_named_t vk_search; /* freelist search count */ + kstat_named_t vk_populate_fail; /* populates that failed */ + kstat_named_t vk_contains; /* vmem_contains() calls */ + kstat_named_t vk_contains_search; /* vmem_contains() search cnt */ + kstat_named_t vk_parent_alloc; /* called the source allocator */ + kstat_named_t vk_parent_free; /* called the source free function */ + kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */ + /* allocator function */ + kstat_named_t vk_excess; /* count of retained excess imports */ +} vmem_kstat_t; + +struct vmem { + char vm_name[VMEM_NAMELEN]; /* arena name */ + kcondvar_t vm_cv; /* cv for blocking allocations */ + kmutex_t vm_lock; /* arena lock */ + uint32_t vm_id; /* vmem id */ + hrtime_t vm_createtime; + uint32_t vm_mtbf; /* induced alloc failure rate */ + int vm_cflags; /* arena creation flags */ + int vm_qshift; /* log2(vm_quantum) */ + size_t vm_quantum; /* vmem quantum */ + size_t vm_qcache_max; /* maximum size to front by kmem */ + size_t vm_min_import; /* smallest amount to import */ + void *(*vm_source_alloc)(vmem_t *, size_t, int); + void (*vm_source_free)(vmem_t *, void *, size_t); + vmem_t *vm_source; /* vmem source for imported memory */ + vmem_t *vm_next; /* next in vmem_list */ + kstat_t *vm_ksp; /* kstat */ + ssize_t vm_nsegfree; /* number of free vmem_seg_t's */ + vmem_seg_t *vm_segfree; /* free vmem_seg_t list */ + vmem_seg_t **vm_hash_table; /* allocated-segment hash table */ + size_t vm_hash_mask; /* hash_size - 1 */ + size_t vm_hash_shift; /* log2(vm_hash_mask + 1) */ + ulong_t vm_freemap; /* bitmap of non-empty freelists */ + vmem_seg_t vm_seg0; /* anchor segment */ + vmem_seg_t vm_rotor; /* rotor for VM_NEXTFIT allocations */ + vmem_seg_t *vm_hash0[VMEM_HASH_INITIAL]; /* initial hash table */ + void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */ + vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */ + vmem_kstat_t vm_kstat; /* kstat data */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_IMPL_H */ diff --git a/include/os/macos/spl/sys/vmsystm.h b/include/os/macos/spl/sys/vmsystm.h new file mode 100644 index 
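/*
 * Illustrative sketch, not part of the patch: resolving an allocated address
 * back to its segment through the hashing macros above, the way vmem_free()
 * style code walks a hash chain. 'vmp' and 'addr' are assumed valid.
 */
vmem_seg_t *vsp;

for (vsp = *VMEM_HASH(vmp, (uintptr_t)addr); vsp != NULL;
    vsp = vsp->vs_knext) {
        if (vsp->vs_start == (uintptr_t)addr)
                break;          /* VS_SIZE(vsp) is the allocation size */
}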
0000000000..421e26364c --- /dev/null +++ b/include/os/macos/spl/sys/vmsystm.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_VMSYSTM_H +#define _SPL_VMSYSTM_H + +#include + +#define xcopyout copyout + +#endif /* SPL_VMSYSTM_H */ diff --git a/include/os/macos/spl/sys/vnode.h b/include/os/macos/spl/sys/vnode.h new file mode 100644 index 0000000000..37b7e41e75 --- /dev/null +++ b/include/os/macos/spl/sys/vnode.h @@ -0,0 +1,258 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_VNODE_H +#define _SPL_VNODE_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +// Be aware that Apple defines "typedef struct vnode *vnode_t" and +// ZFS uses "typedef struct vnode vnode_t". 
+#undef uio_t +#undef vnode_t +#include_next +#define vnode_t struct vnode +#define uio_t struct uio + + +struct caller_context; +typedef struct caller_context caller_context_t; +typedef int vcexcl_t; + +enum vcexcl { NONEXCL, EXCL }; + +#define B_INVAL 0x01 +#define B_TRUNC 0x02 + +#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ + +#define IS_DEVVP(vp) \ + (vnode_ischr(vp) || vnode_isblk(vp) || vnode_isfifo(vp)) + +enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ +enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ + +#define va_mask va_active +#define va_nodeid va_fileid +#define va_nblocks va_filerev + +/* + * vnode attr translations + */ +#define ATTR_TYPE VNODE_ATTR_va_type +#define ATTR_MODE VNODE_ATTR_va_mode +#define ATTR_ACL VNODE_ATTR_va_acl +#define ATTR_UID VNODE_ATTR_va_uid +#define ATTR_GID VNODE_ATTR_va_gid +#define ATTR_ATIME VNODE_ATTR_va_access_time +#define ATTR_MTIME VNODE_ATTR_va_modify_time +#define ATTR_CTIME VNODE_ATTR_va_change_time +#define ATTR_CRTIME VNODE_ATTR_va_create_time +#define ATTR_SIZE VNODE_ATTR_va_data_size +#define ATTR_NOSET 0 +/* + * OSX uses separate vnop getxattr and setxattr to deal with XATTRs, so + * we never get vop&XVATTR set from VFS. All internal checks for it in + * ZFS is not required. + */ +#define ATTR_XVATTR 0 +#define AT_XVATTR ATTR_XVATTR + +#define va_size va_data_size +#define va_atime va_access_time +#define va_mtime va_modify_time +#define va_ctime va_change_time +#define va_crtime va_create_time +#define va_bytes va_data_size + +typedef struct vnode_attr vattr; +typedef struct vnode_attr vattr_t; + +/* vsa_mask values */ +#define VSA_ACL 0x0001 +#define VSA_ACLCNT 0x0002 +#define VSA_DFACL 0x0004 +#define VSA_DFACLCNT 0x0008 +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ + + +extern struct vnode *vn_alloc(int flag); + +extern int vn_open(char *pnamep, enum uio_seg seg, int filemode, + int createmode, struct vnode **vpp, enum create crwhy, mode_t umask); +extern int vn_openat(char *pnamep, enum uio_seg seg, int filemode, + int createmode, struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp); + +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) +#define vn_pages_remove(vp, fl, op) do { } while (0) + +/* XNU is a vn_rdwr, so we work around it to match arguments */ +/* This should be deprecated, if not now, soon. */ +extern int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, + ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, + rlim64_t ulimit, cred_t *cr, ssize_t *residp); + +#define vn_rdwr(rw, vp, b, l, o, s, flg, li, cr, resid) \ + zfs_vn_rdwr((rw), (vp), (b), (l), (o), (s), (flg), (li), (cr), (resid)) + +/* Other vn_rdwr for zfs_file_t ops */ +struct spl_fileproc; +extern int spl_vn_rdwr(enum uio_rw rw, struct spl_fileproc *, caddr_t base, + ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, + rlim64_t ulimit, cred_t *cr, ssize_t *residp); + +extern int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag); +extern int vn_rename(char *from, char *to, enum uio_seg seg); + +#define LK_RETRY 0 +#define LK_SHARED 0 +#define VN_UNLOCK(vp) +static inline int +vn_lock(struct vnode *vp, int fl) +{ + return (0); +} + +/* + * XNU reserves fileID 1-15, so we remap them high. + * 2 is root-of-the-mount. + * If ID is same as root, return 2. 
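/*
 * Illustrative sketch, not part of the patch: reading from a held vnode with
 * the vn_rdwr() wrapper above. 'vp' is assumed to be a valid, referenced
 * vnode and kcred the kernel credential.
 */
char buf[512];
ssize_t resid = 0;
int error;

error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, sizeof (buf), 0 /* offset */,
    UIO_SYSSPACE, 0 /* ioflag */, RLIM64_INFINITY, kcred, &resid);
/* On success, sizeof (buf) - resid bytes were placed in buf. */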
Otherwise, if it is 0-15, return + * adjusted, otherwise, return as-is. + * See hfs_format.h: kHFSRootFolderID, kHFSExtentsFileID, ... + */ +#define INO_ROOT 2ULL +#define INO_RESERVED 16ULL /* [0-15] reserved. */ +#define INO_ISRESERVED(ID) ((ID) < (INO_RESERVED)) +/* 0xFFFFFFFFFFFFFFF0 */ +#define INO_MAP ((uint64_t)-INO_RESERVED) /* -16, -15, .., -1 */ + +#define INO_ZFSTOXNU(ID, ROOT) \ + ((ID) == (ROOT)?INO_ROOT:(INO_ISRESERVED(ID)?INO_MAP+(ID):(ID))) + +/* + * This macro relies on *unsigned*. + * If asking for 2, return rootID. If in special range, adjust to + * normal, otherwise, return as-is. + */ +#define INO_XNUTOZFS(ID, ROOT) \ + ((ID) == INO_ROOT)?(ROOT): \ + (INO_ISRESERVED((ID)-INO_MAP))?((ID)-INO_MAP):(ID) + +#define VN_HOLD(vp) vnode_getwithref(vp) +#define VN_RELE(vp) vnode_put(vp) + +void spl_rele_async(void *arg); +void vn_rele_async(struct vnode *vp, void *taskq); + +extern int vnode_iocount(struct vnode *); + +#define VN_RELE_ASYNC(vp, tq) vn_rele_async((vp), (tq)) + +#define vn_exists(vp) +#define vn_is_readonly(vp) vnode_vfsisrdonly(vp) + +#define vnode_pager_setsize(vp, sz) ubc_setsize((vp), (sz)) + +#define VATTR_NULL(v) do { } while (0) + +extern int VOP_CLOSE(struct vnode *vp, int flag, int count, + offset_t off, void *cr, void *); +extern int VOP_FSYNC(struct vnode *vp, int flags, void* unused, void *); +extern int VOP_SPACE(struct vnode *vp, int cmd, struct flock *fl, + int flags, offset_t off, cred_t *cr, void *ctx); + +extern int VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, + void *x3, void *x4); + +#define VOP_UNLOCK(vp, fl) do { } while (0) + +void vfs_mountedfrom(struct mount *vfsp, char *osname); + +#define build_path(A, B, C, D, E, F) spl_build_path(A, B, C, D, E, F) +extern int spl_build_path(struct vnode *vp, char *buff, int buflen, + int *outlen, int flags, vfs_context_t ctx); + +extern struct vnode *rootdir; + +static inline int chklock(struct vnode *vp, int iomode, + unsigned long long offset, ssize_t len, int fmode, void *ct) +{ + return (0); +} + +#define vn_ismntpt(vp) (vnode_mountedhere(vp) != NULL) + +extern errno_t VOP_LOOKUP (struct vnode *, struct vnode **, + struct componentname *, vfs_context_t); +extern errno_t VOP_MKDIR (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + vfs_context_t); +extern errno_t VOP_REMOVE (struct vnode *, struct vnode *, + struct componentname *, int, vfs_context_t); +extern errno_t VOP_SYMLINK (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + char *, vfs_context_t); + +void spl_vnode_fini(void); +int spl_vnode_init(void); + + +extern int spl_vfs_root(mount_t mount, struct vnode **vp); +#define VFS_ROOT(V, L, VP) spl_vfs_root((V), (VP)) + +extern void cache_purgevfs(mount_t mp); + +vfs_context_t vfs_context_kernel(void); +vfs_context_t spl_vfs_context_kernel(void); +extern int spl_vnode_notify(struct vnode *vp, uint32_t type, + struct vnode_attr *vap); +extern int spl_vfs_get_notify_attributes(struct vnode_attr *vap); +extern void spl_hijack_mountroot(void *func); +extern void spl_setrootvnode(struct vnode *vp); + +struct vnode *getrootdir(void); +void spl_vfs_start(void); + +#endif /* SPL_VNODE_H */ diff --git a/include/os/macos/spl/sys/zmod.h b/include/os/macos/spl/sys/zmod.h new file mode 100644 index 0000000000..6965c91f3d --- /dev/null +++ b/include/os/macos/spl/sys/zmod.h @@ -0,0 +1,122 @@ +/* + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 
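/*
 * Illustrative sketch, not part of the patch: remapping object IDs across the
 * ZFS/XNU boundary with the INO_* macros above. 'root_obj' and 'obj' are
 * placeholders for the dataset root object and a file's object number.
 */
uint64_t xnu_id, back;

xnu_id = INO_ZFSTOXNU(obj, root_obj);   /* what Finder/stat(2) will see */
back = INO_XNUTOZFS(xnu_id, root_obj);  /* and the reverse mapping */
ASSERT3U(back, ==, obj);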
Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly + * Mark Adler + */ + +#ifndef _SPL_ZMOD_H +#define _SPL_ZMOD_H + + +#include +#include +#include + +struct _zmemheader { + uint64_t length; + char data[0]; +}; + +static inline void * +zfs_zalloc(void* opaque, uInt items, uInt size) +{ + struct _zmemheader *hdr; + size_t alloc_size = (items * size) + sizeof (uint64_t); + hdr = kmem_zalloc(alloc_size, KM_SLEEP); + hdr->length = alloc_size; + return (&hdr->data); +} + +static inline void +zfs_zfree(void *opaque, void *addr) +{ + struct _zmemheader *hdr; + hdr = addr; + hdr--; + kmem_free(hdr, hdr->length); +} + +/* + * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store + * the expected decompressed data size externally so it can be passed in. + * The resulting decompressed size is then returned through dstlen. This + * function return Z_OK on success, or another error code on failure. + */ +static inline int + z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + zs.zalloc = zfs_zalloc; + zs.zfree = zfs_zfree; + if ((err = inflateInit(&zs)) != Z_OK) + return (err); + if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) inflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *dstlen = zs.total_out; + return (inflateEnd(&zs)); +} + +static inline int +z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, + int level) +{ + z_stream zs; + int err; + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + zs.zalloc = zfs_zalloc; + zs.zfree = zfs_zfree; + if ((err = deflateInit(&zs, level)) != Z_OK) + return (err); + if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) deflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *dstlen = zs.total_out; + return (deflateEnd(&zs)); +} + +static inline int +z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + return (z_compress_level(dst, dstlen, src, srclen, + Z_DEFAULT_COMPRESSION)); +} + + +int spl_zlib_init(void); +void spl_zlib_fini(void); + +#endif /* SPL_ZMOD_H */ diff --git a/include/os/macos/spl/sys/zone.h b/include/os/macos/spl/sys/zone.h new file mode 100644 index 0000000000..deefad54a1 --- /dev/null +++ b/include/os/macos/spl/sys/zone.h @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
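/*
 * Illustrative sketch, not part of the patch: a compress/uncompress round
 * trip with the wrappers above. As noted in the header comment, the caller
 * seeds 'dlen' with the expected decompressed size. 'src', 'srclen', 'cbuf'
 * and 'dbuf' are placeholder buffers and sizes.
 */
size_t clen = sizeof (cbuf);
size_t dlen = srclen;

if (z_compress(cbuf, &clen, src, srclen) == Z_OK &&
    z_uncompress(dbuf, &dlen, cbuf, clen) == Z_OK)
        ASSERT3U(dlen, ==, srclen);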
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_ZONE_H +#define _SPL_ZONE_H + +#include + +#define GLOBAL_ZONEID 0 + +#define zone_dataset_visible(x, y) (1) +#define INGLOBALZONE(z) (1) + +#endif /* SPL_ZONE_H */ diff --git a/include/os/macos/zfs/Makefile.am b/include/os/macos/zfs/Makefile.am new file mode 100644 index 0000000000..081839c48c --- /dev/null +++ b/include/os/macos/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/macos/zfs/sys/Makefile.am b/include/os/macos/zfs/sys/Makefile.am new file mode 100644 index 0000000000..9779de08e0 --- /dev/null +++ b/include/os/macos/zfs/sys/Makefile.am @@ -0,0 +1,10 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/macos/zfs/sys/kstat_osx.h \ + $(top_srcdir)/include/os/macos/spl/sys/ldi_buf.h \ + $(top_srcdir)/include/os/macos/spl/sys/ldi_impl_osx.h \ + $(top_srcdir)/include/os/macos/spl/sys/ldi_osx.h \ + $(top_srcdir)/include/os/macos/spl/sys/trace_zfs.h \ + $(top_srcdir)/include/os/macos/spl/sys/vdev_disk_os.h \ + $(top_srcdir)/include/os/macos/spl/sys/zfs_ioctl_compat.h \ + $(top_srcdir)/include/os/macos/spl/sys/zfs_vfsops.h \ + $(top_srcdir)/include/os/macos/spl/sys/zfs_znode_impl.h diff --git a/include/os/macos/zfs/sys/ZFSDataset.h b/include/os/macos/zfs/sys/ZFSDataset.h new file mode 100644 index 0000000000..06fa8fdfb0 --- /dev/null +++ b/include/os/macos/zfs/sys/ZFSDataset.h @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. 
+ */ + +#ifndef ZFSDATASET_H_INCLUDED +#define ZFSDATASET_H_INCLUDED + +#ifdef __cplusplus + +#include +#include + +#ifdef super +#undef super +#endif +#define super IOMedia + +// #define kZFSContentHint "6A898CC3-1DD2-11B2-99A6-080020736631" +#define kZFSContentHint "ZFS_Dataset" + +#define kZFSIOMediaPrefix "ZFS " +#define kZFSIOMediaSuffix " Media" +#define kZFSDatasetNameKey "ZFS Dataset" +#define kZFSDatasetClassKey "ZFSDataset" + +class ZFSDataset : public IOMedia +{ + OSDeclareDefaultStructors(ZFSDataset) +public: +#if 0 + /* XXX Only for debug tracing */ + virtual bool open(IOService *client, + IOOptionBits options, IOStorageAccess access = 0); + virtual bool isOpen(const IOService *forClient = 0) const; + virtual void close(IOService *client, + IOOptionBits options); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *access); + virtual bool handleIsOpen(const IOService *client) const; + virtual void handleClose(IOService *client, + IOOptionBits options); + + virtual bool attach(IOService *provider); + virtual void detach(IOService *provider); + + virtual bool start(IOService *provider); + virtual void stop(IOService *provider); +#endif + + virtual bool init(UInt64 base, UInt64 size, + UInt64 preferredBlockSize, + IOMediaAttributeMask attributes, + bool isWhole, bool isWritable, + const char *contentHint = 0, + OSDictionary *properties = 0); + virtual void free(); + + static ZFSDataset * withDatasetNameAndSize(const char *name, + uint64_t size); + + virtual void read(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + virtual void write(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + virtual IOReturn synchronize(IOService *client, + UInt64 byteStart, UInt64 byteCount, + IOStorageSynchronizeOptions options = 0); +#else + virtual IOReturn synchronizeCache(IOService *client); +#endif + + virtual IOReturn unmap(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options = 0); +#else + UInt32 options = 0); +#endif + + virtual bool lockPhysicalExtents(IOService *client); + virtual IOStorage *copyPhysicalExtent(IOService *client, + UInt64 *byteStart, UInt64 *byteCount); + virtual void unlockPhysicalExtents(IOService *client); + +#if defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) + virtual IOReturn setPriority(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, + IOStoragePriority priority); +#endif + + virtual UInt64 getPreferredBlockSize() const; + virtual UInt64 getSize() const; + virtual UInt64 getBase() const; + + virtual bool isEjectable() const; + virtual bool isFormatted() const; + virtual bool isWhole() const; + virtual bool isWritable() const; + + virtual const char *getContent() const; + virtual const char *getContentHint() const; + virtual IOMediaAttributeMask getAttributes() const; + +protected: +private: + bool setDatasetName(const char *); +}; + +#endif /* __cplusplus */ + +#endif /* ZFSDATASET_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/ZFSDatasetProxy.h b/include/os/macos/zfs/sys/ZFSDatasetProxy.h new file mode 100644 index 0000000000..e220cdcf9a --- 
/dev/null +++ b/include/os/macos/zfs/sys/ZFSDatasetProxy.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFSDATASETPROXY_H_INCLUDED +#define ZFSDATASETPROXY_H_INCLUDED + +#include + +class ZFSDatasetProxy : public IOBlockStorageDevice +{ + OSDeclareDefaultStructors(ZFSDatasetProxy); +public: + + virtual void free(void); + virtual bool init(OSDictionary *properties); + virtual bool start(IOService *provider); + + /* IOBlockStorageDevice */ + virtual IOReturn doSynchronizeCache(void); + virtual IOReturn doAsyncReadWrite(IOMemoryDescriptor *, + UInt64, UInt64, IOStorageAttributes *, + IOStorageCompletion *); + virtual UInt32 doGetFormatCapacities(UInt64 *, + UInt32) const; + virtual IOReturn doFormatMedia(UInt64 byteCapacity); + virtual IOReturn doEjectMedia(); + virtual char *getVendorString(); + virtual char *getProductString(); + virtual char *getRevisionString(); + virtual char *getAdditionalDeviceInfoString(); + virtual IOReturn reportWriteProtection(bool *); + virtual IOReturn reportRemovability(bool *); + virtual IOReturn reportMediaState(bool *, bool *); + virtual IOReturn reportBlockSize(UInt64 *); + virtual IOReturn reportEjectability(bool *); + virtual IOReturn reportMaxValidBlock(UInt64 *); + + virtual IOReturn setWriteCacheState(bool enabled); + virtual IOReturn getWriteCacheState(bool *enabled); +#if 0 + virtual void read(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); + virtual void write(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); +#endif + +protected: +private: + /* These are declared class static to share across instances */ + const char *vendorString; + const char *revisionString; + const char *infoString; + /* These are per-instance */ + const char *productString; + uint64_t _pool_bcount; + bool isReadOnly; +}; + +#endif /* ZFSDATASETPROXY_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/ZFSDatasetScheme.h b/include/os/macos/zfs/sys/ZFSDatasetScheme.h new file mode 100644 index 0000000000..eaa8bb368d --- /dev/null +++ b/include/os/macos/zfs/sys/ZFSDatasetScheme.h @@ -0,0 +1,126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFSDATASETSCHEME_H_INCLUDED +#define ZFSDATASETSCHEME_H_INCLUDED + +#define kZFSDatasetSchemeClass "ZFSDatasetScheme" + +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +int zfs_osx_proxy_get_osname(const char *bsdname, + char *osname, int len); +int zfs_osx_proxy_get_bsdname(const char *osname, + char *bsdname, int len); + + +void zfs_osx_proxy_remove(const char *osname); +int zfs_osx_proxy_create(const char *osname); + +#ifdef __cplusplus +} /* extern "C" */ + +/* Not C external */ +ZFSDataset * zfs_osx_proxy_get(const char *osname); + +class ZFSDatasetScheme : public IOPartitionScheme +{ + OSDeclareDefaultStructors(ZFSDatasetScheme); +public: + + virtual void free(void); + virtual bool init(OSDictionary *properties); + virtual bool start(IOService *provider); + virtual IOService *probe(IOService *provider, SInt32 *score); + + bool addDataset(const char *osname); + bool removeDataset(const char *osname, bool force); + + /* Compatibility shims */ + virtual void read(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + + virtual void write(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + virtual IOReturn synchronize(IOService *client, + UInt64 byteStart, + UInt64 byteCount, + IOStorageSynchronizeOptions options = 0); +#else + virtual IOReturn synchronizeCache(IOService *client); +#endif + + virtual IOReturn unmap(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options = 0); +#else + UInt32 options = 0); +#endif + + virtual bool lockPhysicalExtents(IOService *client); + + virtual IOStorage *copyPhysicalExtent(IOService *client, + UInt64 * byteStart, + UInt64 * byteCount); + + virtual void unlockPhysicalExtents(IOService *client); + +#if defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) + virtual IOReturn setPriority(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, + IOStoragePriority priority); +#endif + +protected: +private: + OSSet *_datasets; + OSOrderedSet *_holes; + uint64_t _max_id; + + uint32_t getNextPartitionID(); + void returnPartitionID(uint32_t part_id); +}; + +#endif /* __cplusplus */ +#endif /* ZFSDATASETSCHEME_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/ZFSPool.h b/include/os/macos/zfs/sys/ZFSPool.h new file mode 100644 index 0000000000..56a190a0c6 --- /dev/null +++ b/include/os/macos/zfs/sys/ZFSPool.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
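/*
 * Illustrative sketch, not part of the patch: the C interface above lets the
 * mount path publish a dataset proxy and resolve the osname/bsdname mapping
 * in both directions. "tank/Users" is an example dataset name.
 */
char bsdname[128];

if (zfs_osx_proxy_create("tank/Users") == 0 &&
    zfs_osx_proxy_get_bsdname("tank/Users", bsdname, sizeof (bsdname)) == 0)
        printf("dataset published as /dev/%s\n", bsdname);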
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFSPOOL_H_INCLUDED +#define ZFSPOOL_H_INCLUDED + +#ifdef __cplusplus +#include + +#pragma mark - ZFSPool + +#define kZFSPoolNameKey "ZFS Pool Name" +#define kZFSPoolSizeKey "ZFS Pool Size" +#define kZFSPoolGUIDKey "ZFS Pool GUID" +#define kZFSPoolReadOnlyKey "ZFS Pool Read-Only" + +typedef struct spa spa_t; + +class ZFSPool : public IOService { + OSDeclareDefaultStructors(ZFSPool); + +protected: +#if 0 + /* XXX Only for debug tracing */ + virtual bool open(IOService *client, + IOOptionBits options, void *arg = 0); + virtual bool isOpen(const IOService *forClient = 0) const; + virtual void close(IOService *client, + IOOptionBits options); +#endif + + bool setPoolName(const char *name); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *arg); + virtual bool handleIsOpen(const IOService *client) const; + virtual void handleClose(IOService *client, + IOOptionBits options); + + virtual bool init(OSDictionary *properties, spa_t *spa); + virtual void free(); + +#if 0 + /* IOBlockStorageDevice */ + virtual IOReturn doSynchronizeCache(void); + virtual IOReturn doAsyncReadWrite(IOMemoryDescriptor *, + UInt64, UInt64, IOStorageAttributes *, + IOStorageCompletion *); + virtual UInt32 doGetFormatCapacities(UInt64 *, + UInt32) const; + virtual IOReturn doFormatMedia(UInt64 byteCapacity); + virtual IOReturn doEjectMedia(); + virtual char *getVendorString(); + virtual char *getProductString(); + virtual char *getRevisionString(); + virtual char *getAdditionalDeviceInfoString(); + virtual IOReturn reportWriteProtection(bool *); + virtual IOReturn reportRemovability(bool *); + virtual IOReturn reportMediaState(bool *, bool *); + virtual IOReturn reportBlockSize(UInt64 *); + virtual IOReturn reportEjectability(bool *); + virtual IOReturn reportMaxValidBlock(UInt64 *); + +public: + virtual void read(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); + virtual void write(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); +#endif +public: + static ZFSPool * withProviderAndPool(IOService *, spa_t *); + +private: + OSSet *_openClients; + spa_t *_spa; + +#if 0 + /* These are declared class static to share across instances */ + static const char *vendorString; + static const char *revisionString; + static const char *infoString; + /* These are per-instance */ + const char *productString; + bool isReadOnly; +#endif +}; + +/* C++ wrapper, C uses opaque pointer reference */ +typedef struct spa_iokit { + ZFSPool *proxy; +} spa_iokit_t; + +extern "C" { +#endif /* __cplusplus */ + +/* C functions */ +void spa_iokit_pool_proxy_destroy(spa_t *spa); +int spa_iokit_pool_proxy_create(spa_t *spa); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* 
ZFSPOOL_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/finderinfo.h b/include/os/macos/zfs/sys/finderinfo.h new file mode 100644 index 0000000000..ee3b48017b --- /dev/null +++ b/include/os/macos/zfs/sys/finderinfo.h @@ -0,0 +1,36 @@ +#ifndef FINDERINFO_H +#define FINDERINFO_H + + +struct FndrExtendedDirInfo { + u_int32_t document_id; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved3; + u_int32_t write_gen_counter; +} __attribute__((aligned(2), packed)); + +struct FndrExtendedFileInfo { + u_int32_t document_id; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved2; + u_int32_t write_gen_counter; +} __attribute__((aligned(2), packed)); + +/* Finder information */ +struct FndrFileInfo { + u_int32_t fdType; + u_int32_t fdCreator; + u_int16_t fdFlags; + struct { + int16_t v; + int16_t h; + } fdLocation; + int16_t opaque; +} __attribute__((aligned(2), packed)); +typedef struct FndrFileInfo FndrFileInfo; + + + +#endif diff --git a/include/os/macos/zfs/sys/hfs_internal.h b/include/os/macos/zfs/sys/hfs_internal.h new file mode 100644 index 0000000000..db8c76f7a3 --- /dev/null +++ b/include/os/macos/zfs/sys/hfs_internal.h @@ -0,0 +1,183 @@ + +#ifndef HFS_INTERNAL_H +#define HFS_INTERNAL_H + +// BGH - Definitions of HFS vnops that we will need to emulate +// including supporting structures. + +struct hfs_journal_info { + off_t jstart; + off_t jsize; +}; + +struct user32_access_t { + uid_t uid; + short flags; + short num_groups; + int num_files; + user32_addr_t file_ids; + user32_addr_t groups; + user32_addr_t access; +}; + +struct user64_access_t { + uid_t uid; + short flags; + short num_groups; + int num_files; + user64_addr_t file_ids; + user64_addr_t groups; + user64_addr_t access; +}; + +struct user32_ext_access_t { + uint32_t flags; + uint32_t num_files; + uint32_t map_size; + user32_addr_t file_ids; + user32_addr_t bitmap; + user32_addr_t access; + uint32_t num_parents; + user32_addr_t parents; +}; + +struct user64_ext_access_t { + uint32_t flags; + uint32_t num_files; + uint32_t map_size; + user64_addr_t file_ids; + user64_addr_t bitmap; + user64_addr_t access; + uint32_t num_parents; + user64_addr_t parents; +}; + +/* + * HFS specific fcntl()'s + */ +#define HFS_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00001) +#define HFS_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) +#define HFS_GET_LAST_MTIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) +#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) +#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) + +/* HFS FS CONTROL COMMANDS */ + +#define HFSIOC_RESIZE_PROGRESS _IOR('h', 1, u_int32_t) +#define HFS_RESIZE_PROGRESS IOCBASECMD(HFSIOC_RESIZE_PROGRESS) + +#define HFSIOC_RESIZE_VOLUME _IOW('h', 2, u_int64_t) +#define HFS_RESIZE_VOLUME IOCBASECMD(HFSIOC_RESIZE_VOLUME) + +#define HFSIOC_CHANGE_NEXT_ALLOCATION _IOWR('h', 3, u_int32_t) +#define HFS_CHANGE_NEXT_ALLOCATION IOCBASECMD(HFSIOC_CHANGE_NEXT_ALLOCATION) +/* + * Magic value for next allocation to use with fcntl to set next allocation + * to zero and never update it again on new block allocation. 
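/*
 * Illustrative sketch, not part of the patch: the FCNTL_FS_SPECIFIC_BASE
 * commands above are issued from userland through fcntl(2) on an open file
 * within the filesystem; whether ZFS answers a given command depends on its
 * HFS emulation. The path and command below are examples only.
 */
time_t mount_time = 0;
int fd = open("/Volumes/tank", O_RDONLY);

if (fd >= 0 && fcntl(fd, HFS_GET_MOUNT_TIME, &mount_time) == 0)
        printf("mounted at %ld\n", (long)mount_time);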
+ */ +#define HFS_NO_UPDATE_NEXT_ALLOCATION 0xffffFFFF + +#define HFSIOC_GETCREATETIME _IOR('h', 4, time_t) +#define HFS_GETCREATETIME IOCBASECMD(HFSIOC_GETCREATETIME) + +#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) +#define HFS_SETBACKINGSTOREINFO IOCBASECMD(HFSIOC_SETBACKINGSTOREINFO) + +#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) +#define HFS_CLRBACKINGSTOREINFO IOCBASECMD(HFSIOC_CLRBACKINGSTOREINFO) + +#define HFSIOC_BULKACCESS _IOW('h', 9, struct user32_access_t) +#define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS) + +#define HFSIOC_SETACLSTATE _IOW('h', 10, int32_t) +#define HFS_SETACLSTATE IOCBASECMD(HFSIOC_SETACLSTATE) + +#define HFSIOC_PREV_LINK _IOWR('h', 11, u_int32_t) +#define HFS_PREV_LINK IOCBASECMD(HFSIOC_PREV_LINK) + +#define HFSIOC_NEXT_LINK _IOWR('h', 12, u_int32_t) +#define HFS_NEXT_LINK IOCBASECMD(HFSIOC_NEXT_LINK) + +#define HFSIOC_GETPATH _IOWR('h', 13, pathname_t) +#define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH) +#define HFS_GETPATH_VOLUME_RELATIVE 0x1 + +/* This define is deemed secret by Apple */ +#define BUILDPATH_VOLUME_RELATIVE 0x8 + +/* Enable/disable extent-based extended attributes */ +#define HFSIOC_SET_XATTREXTENTS_STATE _IOW('h', 14, u_int32_t) +#define HFS_SET_XATTREXTENTS_STATE IOCBASECMD(HFSIOC_SET_XATTREXTENTS_STATE) + +#define HFSIOC_EXT_BULKACCESS _IOW('h', 15, struct user32_ext_access_t) +#define HFS_EXT_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_EXT_BULKACCESS) + +#define HFSIOC_MARK_BOOT_CORRUPT _IO('h', 16) +#define HFS_MARK_BOOT_CORRUPT IOCBASECMD(HFSIOC_MARK_BOOT_CORRUPT) + +#define HFSIOC_GET_JOURNAL_INFO _IOR('h', 17, struct hfs_journal_info) +#define HFS_FSCTL_GET_JOURNAL_INFO IOCBASECMD(HFSIOC_GET_JOURNAL_INFO) + +#define HFSIOC_SET_VERY_LOW_DISK _IOW('h', 20, u_int32_t) +#define HFS_FSCTL_SET_VERY_LOW_DISK IOCBASECMD(HFSIOC_SET_VERY_LOW_DISK) + +#define HFSIOC_SET_LOW_DISK _IOW('h', 21, u_int32_t) +#define HFS_FSCTL_SET_LOW_DISK IOCBASECMD(HFSIOC_SET_LOW_DISK) + +#define HFSIOC_SET_DESIRED_DISK _IOW('h', 22, u_int32_t) +#define HFS_FSCTL_SET_DESIRED_DISK IOCBASECMD(HFSIOC_SET_DESIRED_DISK) + +#define HFSIOC_SET_ALWAYS_ZEROFILL _IOW('h', 23, int32_t) +#define HFS_SET_ALWAYS_ZEROFILL IOCBASECMD(HFSIOC_SET_ALWAYS_ZEROFILL) + +#define HFSIOC_VOLUME_STATUS _IOR('h', 24, u_int32_t) +#define HFS_VOLUME_STATUS IOCBASECMD(HFSIOC_VOLUME_STATUS) + +/* Disable metadata zone for given volume */ +#define HFSIOC_DISABLE_METAZONE _IO('h', 25) +#define HFS_DISABLE_METAZONE IOCBASECMD(HFSIOC_DISABLE_METAZONE) + +/* Change the next CNID value */ +#define HFSIOC_CHANGE_NEXTCNID _IOWR('h', 26, u_int32_t) +#define HFS_CHANGE_NEXTCNID IOCBASECMD(HFSIOC_CHANGE_NEXTCNID) + +/* Get the low disk space values */ +#define HFSIOC_GET_VERY_LOW_DISK _IOR('h', 27, u_int32_t) +#define HFS_FSCTL_GET_VERY_LOW_DISK IOCBASECMD(HFSIOC_GET_VERY_LOW_DISK) + +#define HFSIOC_GET_LOW_DISK _IOR('h', 28, u_int32_t) +#define HFS_FSCTL_GET_LOW_DISK IOCBASECMD(HFSIOC_GET_LOW_DISK) + +#define HFSIOC_GET_DESIRED_DISK _IOR('h', 29, u_int32_t) +#define HFS_FSCTL_GET_DESIRED_DISK IOCBASECMD(HFSIOC_GET_DESIRED_DISK) + +/* + * revisiond only uses this when something transforms in a way + * the kernel can't track such as "foo.rtf" -> "foo.rtfd" + */ +#define HFSIOC_TRANSFER_DOCUMENT_ID _IOW('h', 32, u_int32_t) +#define HFS_TRANSFER_DOCUMENT_ID IOCBASECMD(HFSIOC_TRANSFER_DOCUMENT_ID) + + +/* fcntl.h */ +#define F_MAKECOMPRESSED 80 + +/* Get file system information for the given volume */ +// #define HFSIOC_GET_FSINFO _IOWR('h', 45, hfs_fsinfo) +// #define 
HFS_GET_FSINFO IOCBASECMD(HFSIOC_GET_FSINFO) + +/* Re-pin hotfile data; argument controls what state gets repinned */ +#define HFSIOC_REPIN_HOTFILE_STATE _IOWR('h', 46, u_int32_t) +#define HFS_REPIN_HOTFILE_STATE IOCBASECMD(HFSIOC_REPIN_HOTFILE_STATE) + +/* Mark a directory or file as worth caching on any underlying "fast" device */ +#define HFSIOC_SET_HOTFILE_STATE _IOWR('h', 47, u_int32_t) +#define HFS_SET_HOTFILE_STATE IOCBASECMD(HFSIOC_SET_HOTFILE_STATE) + +#define APFSIOC_SET_NEAR_LOW_DISK _IOW('J', 17, u_int32_t) +#define APFSIOC_GET_NEAR_LOW_DISK _IOR('J', 18, u_int32_t) + + +// END of definitions + +#endif diff --git a/include/os/macos/zfs/sys/kstat_osx.h b/include/os/macos/zfs/sys/kstat_osx.h new file mode 100644 index 0000000000..693e0ff5da --- /dev/null +++ b/include/os/macos/zfs/sys/kstat_osx.h @@ -0,0 +1,369 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014, 2016 Jorgen Lundman + */ + +#ifndef KSTAT_OSX_INCLUDED +#define KSTAT_OSX_INCLUDED + +typedef struct osx_kstat { + kstat_named_t spa_version; + kstat_named_t zpl_version; + + kstat_named_t darwin_active_vnodes; + kstat_named_t darwin_debug; + kstat_named_t darwin_reclaim_nodes; + kstat_named_t darwin_ignore_negatives; + kstat_named_t darwin_ignore_positives; + kstat_named_t darwin_create_negatives; + kstat_named_t darwin_force_formd_normalized; + kstat_named_t darwin_skip_unlinked_drain; + kstat_named_t darwin_use_system_sync; + + kstat_named_t arc_zfs_arc_max; + kstat_named_t arc_zfs_arc_min; + kstat_named_t arc_zfs_arc_meta_limit; + kstat_named_t arc_zfs_arc_meta_min; + kstat_named_t arc_zfs_arc_grow_retry; + kstat_named_t arc_zfs_arc_shrink_shift; + kstat_named_t arc_zfs_arc_p_min_shift; + kstat_named_t arc_zfs_arc_average_blocksize; + + kstat_named_t l2arc_write_max; + kstat_named_t l2arc_write_boost; + kstat_named_t l2arc_headroom; + kstat_named_t l2arc_headroom_boost; + kstat_named_t l2arc_feed_secs; + kstat_named_t l2arc_feed_min_ms; + + kstat_named_t zfs_vdev_max_active; + kstat_named_t zfs_vdev_sync_read_min_active; + kstat_named_t zfs_vdev_sync_read_max_active; + kstat_named_t zfs_vdev_sync_write_min_active; + kstat_named_t zfs_vdev_sync_write_max_active; + kstat_named_t zfs_vdev_async_read_min_active; + kstat_named_t zfs_vdev_async_read_max_active; + kstat_named_t zfs_vdev_async_write_min_active; + kstat_named_t zfs_vdev_async_write_max_active; + kstat_named_t zfs_vdev_scrub_min_active; + kstat_named_t zfs_vdev_scrub_max_active; + kstat_named_t zfs_vdev_async_write_active_min_dirty_percent; + kstat_named_t zfs_vdev_async_write_active_max_dirty_percent; + kstat_named_t zfs_vdev_aggregation_limit; + kstat_named_t zfs_vdev_read_gap_limit; + kstat_named_t zfs_vdev_write_gap_limit; + + kstat_named_t 
arc_lotsfree_percent; + kstat_named_t zfs_dirty_data_max; + kstat_named_t zfs_delay_max_ns; + kstat_named_t zfs_delay_min_dirty_percent; + kstat_named_t zfs_delay_scale; + kstat_named_t spa_asize_inflation; + kstat_named_t zfs_prefetch_disable; + kstat_named_t zfetch_max_streams; + kstat_named_t zfetch_min_sec_reap; + kstat_named_t zfetch_array_rd_sz; + kstat_named_t zfs_default_bs; + kstat_named_t zfs_default_ibs; + kstat_named_t metaslab_aliquot; + kstat_named_t spa_max_replication_override; + kstat_named_t spa_mode_global; + kstat_named_t zfs_flags; + kstat_named_t zfs_txg_timeout; + kstat_named_t zfs_vdev_cache_max; + kstat_named_t zfs_vdev_cache_size; + kstat_named_t zfs_vdev_cache_bshift; + kstat_named_t vdev_mirror_shift; + kstat_named_t zfs_scrub_limit; + kstat_named_t zfs_no_scrub_io; + kstat_named_t zfs_no_scrub_prefetch; + kstat_named_t fzap_default_block_shift; + kstat_named_t zfs_immediate_write_sz; + kstat_named_t zfs_read_chunk_size; + kstat_named_t zfs_nocacheflush; + kstat_named_t zil_replay_disable; + kstat_named_t metaslab_df_alloc_threshold; + kstat_named_t metaslab_df_free_pct; + kstat_named_t zio_injection_enabled; + kstat_named_t zvol_immediate_write_sz; + + kstat_named_t l2arc_noprefetch; + kstat_named_t l2arc_feed_again; + kstat_named_t l2arc_norw; + + kstat_named_t zfs_recover; + + kstat_named_t zfs_free_bpobj_enabled; + + kstat_named_t zfs_send_corrupt_data; + kstat_named_t zfs_send_queue_length; + kstat_named_t zfs_recv_queue_length; + + kstat_named_t zvol_inhibit_dev; + kstat_named_t zfs_send_set_freerecords_bit; + + kstat_named_t zfs_write_implies_delete_child; + kstat_named_t zfs_send_holes_without_birth_time; + + kstat_named_t dbuf_cache_max_bytes; + + kstat_named_t zfs_vdev_queue_depth_pct; + kstat_named_t zio_dva_throttle_enabled; + + kstat_named_t zfs_lua_max_instrlimit; + kstat_named_t zfs_lua_max_memlimit; + + kstat_named_t zfs_trim_extent_bytes_max; + kstat_named_t zfs_trim_extent_bytes_min; + kstat_named_t zfs_trim_metaslab_skip; + kstat_named_t zfs_trim_txg_batch; + kstat_named_t zfs_trim_queue_limit; + + kstat_named_t zfs_send_unmodified_spill_blocks; + kstat_named_t zfs_special_class_metadata_reserve_pct; + + kstat_named_t zfs_vdev_raidz_impl; + kstat_named_t icp_gcm_impl; + kstat_named_t icp_aes_impl; + kstat_named_t zfs_fletcher_4_impl; + + kstat_named_t zfs_expire_snapshot; + kstat_named_t zfs_admin_snapshot; + kstat_named_t zfs_auto_snapshot; + + kstat_named_t zfs_spa_discard_memory_limit; + kstat_named_t zfs_async_block_max_blocks; + kstat_named_t zfs_initialize_chunk_size; + kstat_named_t zfs_scan_suspend_progress; + kstat_named_t zfs_removal_suspend_progress; + kstat_named_t zfs_livelist_max_entries; + + kstat_named_t zfs_allow_redacted_dataset_mount; + kstat_named_t zfs_checksum_events_per_second; + kstat_named_t zfs_commit_timeout_pct; + kstat_named_t zfs_compressed_arc_enabled; + kstat_named_t zfs_condense_indirect_commit_entry_delay_ms; + kstat_named_t zfs_condense_min_mapping_bytes; + kstat_named_t zfs_deadman_checktime_ms; + kstat_named_t zfs_deadman_failmode; + kstat_named_t zfs_deadman_synctime_ms; + kstat_named_t zfs_deadman_ziotime_ms; + kstat_named_t zfs_disable_ivset_guid_check; + kstat_named_t zfs_initialize_value; + kstat_named_t zfs_keep_log_spacemaps_at_export; + kstat_named_t l2arc_rebuild_blocks_min_l2size; + kstat_named_t l2arc_rebuild_enabled; + kstat_named_t l2arc_trim_ahead; + kstat_named_t zfs_livelist_condense_new_alloc; + kstat_named_t zfs_livelist_condense_sync_cancel; + kstat_named_t 
zfs_livelist_condense_sync_pause; + kstat_named_t zfs_livelist_condense_zthr_cancel; + kstat_named_t zfs_livelist_condense_zthr_pause; + kstat_named_t zfs_livelist_min_percent_shared; + kstat_named_t zfs_max_dataset_nesting; + kstat_named_t zfs_max_missing_tvds; + kstat_named_t metaslab_debug_load; + kstat_named_t metaslab_force_ganging; + kstat_named_t zfs_multihost_fail_intervals; + kstat_named_t zfs_multihost_import_intervals; + kstat_named_t zfs_multihost_interval; + kstat_named_t zfs_override_estimate_recordsize; + kstat_named_t zfs_remove_max_segment; + kstat_named_t zfs_resilver_min_time_ms; + kstat_named_t zfs_scan_legacy; + kstat_named_t zfs_scan_vdev_limit; + kstat_named_t zfs_slow_io_events_per_second; + kstat_named_t spa_load_verify_data; + kstat_named_t spa_load_verify_metadata; + kstat_named_t zfs_unlink_suspend_progress; + kstat_named_t zfs_vdev_min_ms_count; + kstat_named_t vdev_validate_skip; + kstat_named_t zfs_zevent_len_max; + kstat_named_t zio_slow_io_ms; +} osx_kstat_t; + +extern unsigned int zfs_vnop_ignore_negatives; +extern unsigned int zfs_vnop_ignore_positives; +extern unsigned int zfs_vnop_create_negatives; +extern unsigned int zfs_vnop_skip_unlinked_drain; +extern uint64_t zfs_vfs_sync_paranoia; +extern uint64_t vnop_num_vnodes; +extern uint64_t vnop_num_reclaims; + +extern unsigned long zfs_arc_max; +extern unsigned long zfs_arc_min; +extern unsigned long zfs_arc_meta_limit; +extern uint64_t zfs_arc_meta_min; +extern int zfs_arc_grow_retry; +extern int zfs_arc_shrink_shift; +extern int zfs_arc_p_min_shift; +extern int zfs_arc_average_blocksize; + +extern uint64_t l2arc_write_max; +extern uint64_t l2arc_write_boost; +extern uint64_t l2arc_headroom; +extern uint64_t l2arc_headroom_boost; +extern uint64_t l2arc_feed_secs; +extern uint64_t l2arc_feed_min_ms; + +extern uint32_t zfs_vdev_max_active; +extern uint32_t zfs_vdev_sync_read_min_active; +extern uint32_t zfs_vdev_sync_read_max_active; +extern uint32_t zfs_vdev_sync_write_min_active; +extern uint32_t zfs_vdev_sync_write_max_active; +extern uint32_t zfs_vdev_async_read_min_active; +extern uint32_t zfs_vdev_async_read_max_active; +extern uint32_t zfs_vdev_async_write_min_active; +extern uint32_t zfs_vdev_async_write_max_active; +extern uint32_t zfs_vdev_scrub_min_active; +extern uint32_t zfs_vdev_scrub_max_active; +extern int zfs_vdev_async_write_active_min_dirty_percent; +extern int zfs_vdev_async_write_active_max_dirty_percent; +extern int zfs_vdev_aggregation_limit; +extern int zfs_vdev_read_gap_limit; +extern int zfs_vdev_write_gap_limit; + +extern uint_t arc_reduce_dnlc_percent; +extern int arc_lotsfree_percent; +extern hrtime_t zfs_delay_max_ns; +extern int spa_asize_inflation; +extern unsigned int zfetch_max_streams; +extern unsigned int zfetch_min_sec_reap; +extern int zfs_default_bs; +extern int zfs_default_ibs; +extern uint64_t metaslab_aliquot; +extern int zfs_vdev_cache_max; +extern int spa_max_replication_override; +extern int zfs_no_scrub_io; +extern int zfs_no_scrub_prefetch; +extern ssize_t zfs_immediate_write_sz; +extern offset_t zfs_read_chunk_size; +extern uint64_t metaslab_df_alloc_threshold; +extern int metaslab_df_free_pct; +extern ssize_t zvol_immediate_write_sz; + +extern boolean_t l2arc_noprefetch; +extern boolean_t l2arc_feed_again; +extern boolean_t l2arc_norw; + +extern int zfs_top_maxinflight; +extern int zfs_resilver_delay; +extern int zfs_scrub_delay; +extern int zfs_scan_idle; + +extern int64_t zfs_free_bpobj_enabled; + +extern int zfs_send_corrupt_data; +extern int 
zfs_send_queue_length; +extern int zfs_recv_queue_length; + +extern uint64_t zvol_inhibit_dev; +extern int zfs_send_set_freerecords_bit; + +extern uint64_t zfs_write_implies_delete_child; +extern uint32_t send_holes_without_birth_time; +extern uint64_t zfs_send_holes_without_birth_time; + +extern uint64_t dbuf_cache_max_bytes; + +extern int zfs_vdev_queue_depth_pct; +extern boolean_t zio_dva_throttle_enabled; + +extern uint64_t zfs_lua_max_instrlimit; +extern uint64_t zfs_lua_max_memlimit; + + +extern uint64_t zfs_trim_extent_bytes_max; +extern uint64_t zfs_trim_extent_bytes_min; +extern unsigned int zfs_trim_metaslab_skip; +extern uint64_t zfs_trim_txg_batch; +extern uint64_t zfs_trim_queue_limit; + +extern int zfs_send_unmodified_spill_blocks; +extern int zfs_special_class_metadata_reserve_pct; + +extern int zfs_vnop_force_formd_normalized_output; + +extern int zfs_arc_min_prefetch_ms; +extern int zfs_arc_min_prescient_prefetch_ms; + +extern int zfs_expire_snapshot; +extern int zfs_admin_snapshot; +extern int zfs_auto_snapshot; + +extern unsigned long zfs_spa_discard_memory_limit; +extern unsigned long zfs_async_block_max_blocks; +extern unsigned long zfs_initialize_chunk_size; +extern int zfs_scan_suspend_progress; +extern int zfs_removal_suspend_progress; +extern unsigned long zfs_livelist_max_entries; + +extern int zfs_allow_redacted_dataset_mount; +extern unsigned int zfs_checksum_events_per_second; +extern int zfs_commit_timeout_pct; +extern int zfs_compressed_arc_enabled; +extern int zfs_condense_indirect_commit_entry_delay_ms; +extern unsigned long zfs_condense_min_mapping_bytes; +extern unsigned long zfs_deadman_checktime_ms; +extern char *zfs_deadman_failmode; +extern unsigned long zfs_deadman_synctime_ms; +extern unsigned long zfs_deadman_ziotime_ms; +extern int zfs_disable_ivset_guid_check; +extern unsigned long zfs_initialize_value; +extern int zfs_keep_log_spacemaps_at_export; +extern unsigned long l2arc_rebuild_blocks_min_l2size; +extern int l2arc_rebuild_enabled; +extern unsigned long l2arc_trim_ahead; +extern int zfs_livelist_condense_new_alloc; +extern int zfs_livelist_condense_sync_cancel; +extern int zfs_livelist_condense_sync_pause; +extern int zfs_livelist_condense_zthr_cancel; +extern int zfs_livelist_condense_zthr_pause; +extern int zfs_livelist_min_percent_shared; +extern int zfs_max_dataset_nesting; +extern unsigned long zfs_max_missing_tvds; +extern int metaslab_debug_load; +extern unsigned long metaslab_force_ganging; +extern unsigned int zfs_multihost_fail_intervals; +extern unsigned int zfs_multihost_import_intervals; +extern unsigned long zfs_multihost_interval; +extern int zfs_override_estimate_recordsize; +extern int zfs_remove_max_segment; +extern int zfs_resilver_min_time_ms; +extern int zfs_scan_legacy; +extern unsigned long zfs_scan_vdev_limit; +extern unsigned int zfs_slow_io_events_per_second; +extern int spa_load_verify_data; +extern int spa_load_verify_metadata; +extern int zfs_unlink_suspend_progress; +extern int zfs_vdev_min_ms_count; +extern int vdev_validate_skip; +extern int zfs_zevent_len_max; +extern int zio_slow_io_ms; + +int kstat_osx_init(void); +void kstat_osx_fini(void); + +int arc_kstat_update(kstat_t *ksp, int rw); +int arc_kstat_update_osx(kstat_t *ksp, int rw); + +#endif diff --git a/include/os/macos/zfs/sys/ldi_buf.h b/include/os/macos/zfs/sys/ldi_buf.h new file mode 100644 index 0000000000..9b69b0610a --- /dev/null +++ b/include/os/macos/zfs/sys/ldi_buf.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file 
are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + * + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +#ifndef _SYS_LDI_BUF_H +#define _SYS_LDI_BUF_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* + * Buffer context for LDI strategy + */ +typedef struct ldi_buf { + /* For client use */ + int (*b_iodone)(struct ldi_buf *); /* Callback */ + union { + void *b_addr; /* Passed buffer address */ + } b_un; /* Union to match illumos */ + uint64_t b_bcount; /* Size of IO */ + uint64_t b_bufsize; /* Size of buffer */ + uint64_t b_lblkno; /* logical block number */ + uint64_t b_resid; /* Remaining IO size */ + int b_flags; /* Read or write, options */ + int b_error; /* IO error code */ + uint64_t pad; /* Pad to 64 bytes */ +} ldi_buf_t; /* XXX Currently 64b */ + +ldi_buf_t *ldi_getrbuf(int); +void ldi_freerbuf(ldi_buf_t *); +void ldi_bioinit(ldi_buf_t *); + +/* Define macros to get and release a buffer */ +#define getrbuf(flags) ldi_getrbuf(flags) +#define freerbuf(lbp) ldi_freerbuf(lbp) +#define bioinit(lbp) ldi_bioinit(lbp) +#define geterror(lbp) (lbp->b_error) +#define biowait(lbp) (0) + +#define lbtodb(bytes) \ + (bytes >> DEV_BSHIFT) +#define dbtolb(blkno) \ + (blkno << DEV_BSHIFT) +#define ldbtob(blkno) dbtolb(blkno) + +/* Redefine B_BUSY */ +#define B_BUSY B_PHYS + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _SYS_LDI_BUF_H */ diff --git a/include/os/macos/zfs/sys/ldi_impl_osx.h b/include/os/macos/zfs/sys/ldi_impl_osx.h new file mode 100644 index 0000000000..68c8d121ab --- /dev/null +++ b/include/os/macos/zfs/sys/ldi_impl_osx.h @@ -0,0 +1,226 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. 
All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +#ifndef _SYS_LDI_IMPL_OSX_H +#define _SYS_LDI_IMPL_OSX_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* + * OS X + */ +#define LDI_TYPE_INVALID 0x0 /* uninitialized */ +#define LDI_TYPE_IOKIT 0x1 /* IOMedia device */ +#define LDI_TYPE_VNODE 0x2 /* vnode (bdev) device */ + +/* + * OS X + */ +#define LDI_STATUS_OFFLINE 0x0 /* device offline (dead-end) */ +#define LDI_STATUS_CLOSED 0x1 /* just initialized or closed */ +#define LDI_STATUS_CLOSING 0x2 /* close in-progress */ +#define LDI_STATUS_OPENING 0x3 /* open in-progress */ +#define LDI_STATUS_ONLINE 0x4 /* device is open and active */ +typedef uint_t ldi_status_t; + +/* + * LDI hash definitions + */ +#define LH_HASH_SZ 32 /* number of hash lists */ + +/* + * Flag for LDI handle's lh_flags field + */ +#define LH_FLAGS_NOTIFY 0x0001 /* invoked in context of a notify */ + + +/* + * LDI handle (OS X) + */ +typedef struct _handle_iokit *handle_iokit_t; +typedef struct _handle_vnode *handle_vnode_t; +typedef struct _handle_notifier *handle_notifier_t; + +struct ldi_handle { + /* protected by ldi_handle_hash_lock */ + list_node_t lh_node; /* list membership */ + uint_t lh_ref; /* active references */ + uint_t lh_flags; /* for notify event */ + + /* protected by handle lh_lock */ + kmutex_t lh_lock; /* internal lock */ + kcondvar_t lh_cv; /* for concurrent open */ + ldi_status_t lh_status; /* Closed, Offline, Online */ + uint_t lh_openref; /* open client count */ + + /* unique/static fields in the handle */ + union ldi_handle_tsd { + handle_iokit_t iokit_tsd; + handle_vnode_t vnode_tsd; + } lh_tsd; /* union */ + handle_notifier_t lh_notifier; /* pointer */ + uint_t lh_type; /* IOKit or vnode */ + uint_t lh_fmode; /* FREAD | FWRITE */ + dev_t lh_dev; /* device number */ + uint_t pad; /* pad to 96 bytes */ +}; /* XXX Currently 96b */ + +/* Shared functions */ +struct ldi_handle *handle_alloc_common(uint_t, dev_t, int); +struct ldi_handle *handle_find(dev_t, int, boolean_t); +struct ldi_handle *handle_add(struct ldi_handle *); +int handle_status_change(struct ldi_handle *, int); +void handle_hold(struct ldi_handle *); +void handle_release(struct ldi_handle *); +ldi_status_t handle_open_start(struct ldi_handle *); +void handle_open_done(struct ldi_handle *, ldi_status_t); + +/* Handle IOKit functions */ +void handle_free_iokit(struct ldi_handle *); +struct ldi_handle *handle_alloc_iokit(dev_t, int); +int handle_register_notifier(struct ldi_handle *); +int handle_close_iokit(struct ldi_handle *); +int handle_free_ioservice(struct ldi_handle *); +int handle_alloc_ioservice(struct ldi_handle *); +int handle_remove_notifier(struct ldi_handle *); +int handle_set_wce_iokit(struct ldi_handle *, int *); +int handle_get_size_iokit(struct ldi_handle *, uint64_t *); +int handle_get_dev_path_iokit(struct ldi_handle *lh, + char *path, int len); +int handle_get_media_info_iokit(struct ldi_handle *, + struct dk_minfo *); +int handle_get_media_info_ext_iokit(struct ldi_handle *, + struct dk_minfo_ext *); +int handle_check_media_iokit(struct ldi_handle *, int *); +int handle_is_solidstate_iokit(struct ldi_handle *, int *); +int handle_sync_iokit(struct ldi_handle *); +int buf_strategy_iokit(ldi_buf_t *, struct ldi_handle *); +int 
ldi_open_media_by_dev(dev_t, int, ldi_handle_t *); +int ldi_open_media_by_path(char *, int, ldi_handle_t *); +int handle_get_bootinfo_iokit(struct ldi_handle *, + struct io_bootinfo *); +int handle_features_iokit(struct ldi_handle *, + uint32_t *); +int handle_unmap_iokit(struct ldi_handle *, + dkioc_free_list_ext_t *); + +/* Handle vnode functions */ +dev_t dev_from_path(char *); +void handle_free_vnode(struct ldi_handle *); +struct ldi_handle *handle_alloc_vnode(dev_t, int); +int handle_close_vnode(struct ldi_handle *); +int handle_get_size_vnode(struct ldi_handle *, uint64_t *); +int handle_get_dev_path_vnode(struct ldi_handle *lh, + char *path, int len); +int handle_get_media_info_vnode(struct ldi_handle *, + struct dk_minfo *); +int handle_get_media_info_ext_vnode(struct ldi_handle *, + struct dk_minfo_ext *); +int handle_check_media_vnode(struct ldi_handle *, int *); +int handle_is_solidstate_vnode(struct ldi_handle *, int *); +int handle_sync_vnode(struct ldi_handle *); +int buf_strategy_vnode(ldi_buf_t *, struct ldi_handle *); +int ldi_open_vnode_by_path(char *, dev_t, int, ldi_handle_t *); +int handle_get_bootinfo_vnode(struct ldi_handle *, + struct io_bootinfo *); +int handle_features_vnode(struct ldi_handle *, + uint32_t *); +int handle_unmap_vnode(struct ldi_handle *, + dkioc_free_list_ext_t *); + +/* + * LDI event information + */ +typedef struct ldi_ev_callback_impl { + struct ldi_handle *lec_lhp; +#ifdef illumos + dev_info_t *lec_dip; +#endif + dev_t lec_dev; + int lec_spec; + int (*lec_notify)(ldi_handle_t, ldi_ev_cookie_t, void *, void *); + void (*lec_finalize)(ldi_handle_t, ldi_ev_cookie_t, int, + void *, void *); + void *lec_arg; + void *lec_cookie; + void *lec_id; + list_node_t lec_list; +} ldi_ev_callback_impl_t; /* XXX Currently 72b */ + +/* + * Members of "struct ldi_ev_callback_list" are protected by their le_lock + * member. The struct is currently only used once, as a file-level global, + * and the locking protocol is currently implemented in ldi_ev_lock() and + * ldi_ev_unlock(). + * + * When delivering events to subscribers, ldi_invoke_notify() and + * ldi_invoke_finalize() will walk the list of callbacks: le_head. It is + * possible that an invoked callback function will need to unregister an + * arbitrary number of callbacks from this list. + * + * To enable ldi_ev_remove_callbacks() to remove elements from the list + * without breaking the walk-in-progress, we store the next element in the + * walk direction on the struct as le_walker_next and le_walker_prev. + */ +struct ldi_ev_callback_list { + kmutex_t le_lock; + kcondvar_t le_cv; + uint64_t le_busy; + void *le_thread; + list_t le_head; + ldi_ev_callback_impl_t *le_walker_next; + ldi_ev_callback_impl_t *le_walker_prev; +}; /* XXX Currently 96b, but only used once */ + +int ldi_invoke_notify(dev_info_t *, dev_t, int, char *, void *); +void ldi_invoke_finalize(dev_info_t *, dev_t, int, char *, int, void *); +int e_ddi_offline_notify(dev_info_t *); +void e_ddi_offline_finalize(dev_info_t *, int); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _SYS_LDI_IMPL_OSX_H */ diff --git a/include/os/macos/zfs/sys/ldi_osx.h b/include/os/macos/zfs/sys/ldi_osx.h new file mode 100644 index 0000000000..2d78017c42 --- /dev/null +++ b/include/os/macos/zfs/sys/ldi_osx.h @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +#ifndef _SYS_LDI_OSX_H +#define _SYS_LDI_OSX_H + +#include + +/* + * OS X - The initialization/destructor functions are available + * for zfs-osx.cpp to call during zfs_init/zfs_fini. + */ +#ifdef __cplusplus +extern "C" { + +int ldi_init(void *); /* passes IOService provider */ +void ldi_fini(); /* teardown */ +#endif /* __cplusplus */ + +/* + * Opaque layered driver data structures. + * vdev_disk and other C callers may use these LDI interfaces + * ldi_ident_t is already defined as typedef void* by spl sunddi.h + */ +typedef struct __ldi_handle *ldi_handle_t; +typedef struct __ldi_callback_id *ldi_callback_id_t; +typedef struct __ldi_ev_cookie *ldi_ev_cookie_t; + +/* + * LDI event interface related + */ +#define LDI_EV_SUCCESS 0 +#define LDI_EV_FAILURE (-1) +#define LDI_EV_NONE (-2) /* no matching callbacks registered */ +#define LDI_EV_OFFLINE "LDI:EVENT:OFFLINE" +#define LDI_EV_DEGRADE "LDI:EVENT:DEGRADE" +#define LDI_EV_DEVICE_REMOVE "LDI:EVENT:DEVICE_REMOVE" + +#define LDI_EV_CB_VERS_1 1 +#define LDI_EV_CB_VERS LDI_EV_CB_VERS_1 + +typedef struct ldi_ev_callback { + uint_t cb_vers; + int (*cb_notify)(ldi_handle_t, ldi_ev_cookie_t, void *, void *); + void (*cb_finalize)(ldi_handle_t, ldi_ev_cookie_t, int, + void *, void *); +} ldi_ev_callback_t; + +/* Structs passed to media_get_info */ +struct dk_minfo { + uint32_t dki_capacity; /* Logical block count */ + uint32_t dki_lbsize; /* Logical block size */ +}; /* (8b) */ + +struct dk_minfo_ext { + uint64_t dki_capacity; /* Logical block count */ + uint32_t dki_lbsize; /* Logical block size */ + uint32_t dki_pbsize; /* Physical block size */ +}; /* (16b) */ + +struct io_bootinfo { + char dev_path[MAXPATHLEN]; /* IODeviceTree path */ + uint64_t dev_size; /* IOMedia device size */ +}; + +/* + * XXX This struct is defined in spl but was unused until now. + * There is a reference in zvol.c zvol_ioctl, commented out. 
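+ *
+ * Aside: a hedged usage sketch (not part of this header) of the LDI calls
+ * declared below, fetching media geometry into the dk_minfo_ext struct
+ * defined above.  The device path, FREAD/FKIOCTL flags, kcred and the NULL
+ * ldi_ident argument are illustrative placeholders only; the real callers
+ * live in the vdev_disk and ldi_osx sources:
+ *
+ *	ldi_handle_t lh = NULL;
+ *	struct dk_minfo_ext dkmext = { 0 };
+ *	uint64_t size = 0;
+ *	int rval = 0;
+ *
+ *	if (ldi_open_by_name("/dev/disk2", FREAD, kcred, &lh, NULL) == 0) {
+ *		(void) ldi_ioctl(lh, DKIOCGMEDIAINFOEXT,
+ *		    (intptr_t)&dkmext, FKIOCTL, kcred, &rval);
+ *		(void) ldi_get_size(lh, &size);
+ *		(void) ldi_close(lh, FREAD, kcred);
+ *	}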
+ */ +#if 0 +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; /* XXX Currently 20b */ +#endif + +/* XXX Already defined in spl dkio.h (used elsewhere) */ +#if 0 +#define DKIOCFLUSHWRITECACHE (DKIOC | 34) +#endif + +#define FLUSH_VOLATILE 0x1 +#define DKIOCGMEDIAINFOEXT (DKIOC | 48) + +/* XXX Created this additional ioctl */ +#define DKIOCGETBOOTINFO (DKIOC | 99) + +/* + * LDI Handle manipulation functions + */ +int ldi_open_by_dev(dev_t, int, int, cred_t *, + ldi_handle_t *, __unused ldi_ident_t); +int ldi_open_by_name(char *, int, cred_t *, + ldi_handle_t *, __unused ldi_ident_t); + +int ldi_close(ldi_handle_t, int, cred_t *); + +int ldi_sync(ldi_handle_t); +int ldi_get_size(ldi_handle_t, uint64_t *); +int ldi_ioctl(ldi_handle_t, int, intptr_t, int, cred_t *, int *); +int ldi_strategy(ldi_handle_t, ldi_buf_t *); + +/* + * LDI events related declarations + */ +extern int ldi_ev_get_cookie(ldi_handle_t, char *, ldi_ev_cookie_t *); +extern char *ldi_ev_get_type(ldi_ev_cookie_t); +extern int ldi_ev_register_callbacks(ldi_handle_t, ldi_ev_cookie_t, + ldi_ev_callback_t *, void *, ldi_callback_id_t *); +extern int ldi_ev_remove_callbacks(ldi_callback_id_t); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _SYS_LDI_OSX_H */ diff --git a/include/os/macos/zfs/sys/trace_zfs.h b/include/os/macos/zfs/sys/trace_zfs.h new file mode 100644 index 0000000000..f32ba529ec --- /dev/null +++ b/include/os/macos/zfs/sys/trace_zfs.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#if !defined(_TRACE_ZFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZFS_H + +#include +#include + +/* + * The sys/trace_dbgmsg.h header defines tracepoint events for + * dprintf(), dbgmsg(), and SET_ERROR(). 
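+ *
+ * A worked example of the mapping (the probe name here is hypothetical):
+ * with the DTRACE_PROBE* redefinitions below, a call such as
+ *
+ *	DTRACE_PROBE2(free__long__range, uint64_t, off, uint64_t, len);
+ *
+ * expands to
+ *
+ *	trace_zfs_free__long__range((off), (len));
+ *
+ * so a matching trace_zfs_* tracepoint (or stub) must exist for every
+ * probe name that is compiled while this header is in effect.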
+ */ +#define _SYS_TRACE_DBGMSG_INDIRECT +#include +#undef _SYS_TRACE_DBGMSG_INDIRECT + +/* + * Redefine the DTRACE_PROBE* functions to use Linux tracepoints + */ +#undef DTRACE_PROBE1 +#define DTRACE_PROBE1(name, t1, arg1) \ + trace_zfs_##name((arg1)) + +#undef DTRACE_PROBE2 +#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \ + trace_zfs_##name((arg1), (arg2)) + +#undef DTRACE_PROBE3 +#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \ + trace_zfs_##name((arg1), (arg2), (arg3)) + +#undef DTRACE_PROBE4 +#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \ + trace_zfs_##name((arg1), (arg2), (arg3), (arg4)) + +#endif /* _TRACE_ZFS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace +#include + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/os/macos/zfs/sys/vdev_disk_os.h b/include/os/macos/zfs/sys/vdev_disk_os.h new file mode 100644 index 0000000000..79b68c7ee6 --- /dev/null +++ b/include/os/macos/zfs/sys/vdev_disk_os.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _ZFS_VDEV_DISK_OS_H +#define _ZFS_VDEV_DISK_OS_H + +#include + +typedef struct vdev_disk { + ldi_handle_t vd_lh; + list_t vd_ldi_cbs; + boolean_t vd_ldi_offline; +} vdev_disk_t; + +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + ldi_buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; + + +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); + +#endif diff --git a/include/os/macos/zfs/sys/zfs_boot.h b/include/os/macos/zfs/sys/zfs_boot.h new file mode 100644 index 0000000000..cad5c0bdfd --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_boot.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. 
All rights reserved. + */ + +#ifndef ZFS_BOOT_H_INCLUDED +#define ZFS_BOOT_H_INCLUDED + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Link data vdevs to virtual devices */ +int zfs_boot_update_bootinfo(spa_t *spa); + +int zfs_attach_devicedisk(zfsvfs_t *zfsvfs); +int zfs_detach_devicedisk(zfsvfs_t *zfsvfs); +int zfs_devdisk_get_path(void *, char *, int); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + + + +#ifdef __cplusplus +#include +bool zfs_boot_init(IOService *); +void zfs_boot_fini(); +#endif /* __cplusplus */ + + +#endif /* ZFS_BOOT_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/zfs_context_os.h b/include/os/macos/zfs/sys/zfs_context_os.h new file mode 100644 index 0000000000..097152f26e --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_context_os.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_ZFS_CONTEXT_OS_H +#define _SPL_ZFS_CONTEXT_OS_H + +#include +#include +#include + +#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz)) + +#define KMALLOC_MAX_SIZE MAXPHYS + +#define MNTTYPE_ZFS_SUBTYPE ('Z'<<24|'F'<<16|'S'<<8) + +#ifndef MAX_UPL_TRANSFER +#define MAX_UPL_TRANSFER 256 +#endif + +#define flock64_t struct flock + +struct spa_iokit; +typedef struct spa_iokit spa_iokit_t; + +#define noinline __attribute__((noinline)) + +/* really? 
*/ +#define kpreempt_disable() ((void)0) +#define kpreempt_enable() ((void)0) +#define cond_resched() (void)thread_block(THREAD_CONTINUE_NULL); +#define schedule() (void)thread_block(THREAD_CONTINUE_NULL); + +#define current curthread + +extern boolean_t ml_set_interrupts_enabled(boolean_t); + +/* Make sure kmem and vmem are already included */ +#include +#include + +/* Since Linux code uses vmem_free() and we already have one: */ +#define vmem_free(A, B) zfs_kmem_free((A), (B)) +#define vmem_alloc(A, B) zfs_kmem_alloc((A), (B)) +#define vmem_zalloc(A, B) zfs_kmem_zalloc((A), (B)) + +typedef int fstrans_cookie_t; +#define spl_fstrans_mark() (0) +#define spl_fstrans_unmark(x) (x = 0) + +#ifdef _KERNEL + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +typedef struct { + volatile int counter; +} atomic_t; + +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define smp_rmb() barrier() + +#define READ_ONCE(x) ( \ +{ \ + __typeof(x) __var = ( \ + { \ + barrier(); \ + ACCESS_ONCE(x); \ + }); \ + barrier(); \ + __var; \ + }) + +#define WRITE_ONCE(x, v) do { \ + barrier(); \ + ACCESS_ONCE(x) = (v); \ + barrier(); \ + } while (0) + +/* BEGIN CSTYLED */ +#define hlist_for_each(p, head) \ + for (p = (head)->first; p; p = (p)->next) + +#define hlist_entry(ptr, type, field) container_of(ptr, type, field) +/* END CSTYLED */ + +static inline void +hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + n->next = h->first; + if (h->first != NULL) + h->first->pprev = &n->next; + WRITE_ONCE(h->first, n); + n->pprev = &h->first; +} + +static inline void +hlist_del(struct hlist_node *n) +{ + WRITE_ONCE(*(n->pprev), n->next); + if (n->next != NULL) + n->next->pprev = n->pprev; +} + + +#define HLIST_HEAD_INIT { } +#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT +#define INIT_HLIST_HEAD(head) (head)->first = NULL + +/* BEGIN CSTYLED */ +#define INIT_HLIST_NODE(node) \ + do { \ + (node)->next = NULL; \ + (node)->pprev = NULL; \ + } while (0) + +/* END CSTYLED */ + +static inline int +atomic_read(const atomic_t *v) +{ + return (READ_ONCE(v->counter)); +} + +static inline int +atomic_inc(atomic_t *v) +{ + return (__sync_fetch_and_add(&v->counter, 1) + 1); +} + +static inline int +atomic_dec(atomic_t *v) +{ + return (__sync_fetch_and_add(&v->counter, -1) - 1); +} + +extern void kx_qsort(void *array, size_t nm, size_t member_size, + int (*cmpf)(const void *, const void *)); +#define qsort kx_qsort + +#define strstr kmem_strstr + +void spa_create_os(void *spa); +void spa_export_os(void *spa); +void spa_activate_os(void *spa); +void spa_deactivate_os(void *spa); + +#endif // _KERNEL + +#endif diff --git a/include/os/macos/zfs/sys/zfs_ctldir.h b/include/os/macos/zfs/sys/zfs_ctldir.h new file mode 100644 index 0000000000..4cacf1aefe --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_ctldir.h @@ -0,0 +1,124 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri + * Brian Behlendorf + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#include +#include +#include +#include + +#define ZFS_CTLDIR_NAME ".zfs" +#define ZFS_SNAPDIR_NAME "snapshot" +#define ZFS_SHAREDIR_NAME "shares" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == ZTOZSB(zdp)->z_root && \ + (ZTOZSB(zdp)->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + (ZTOZSB(zdp)->z_show_ctldir)) + +struct path; + +extern int zfs_expire_snapshot; + +/* zfsctl generic functions */ +extern int zfsctl_create(zfsvfs_t *); +extern void zfsctl_destroy(zfsvfs_t *); +extern struct vnode *zfsctl_root(znode_t *); +extern void zfsctl_init(void); +extern void zfsctl_fini(void); +extern boolean_t zfsctl_is_node(struct vnode *ip); +extern boolean_t zfsctl_is_snapdir(struct vnode *ip); +extern int zfsctl_fid(struct vnode *ip, fid_t *fidp); + +/* zfsctl '.zfs' functions */ +extern int zfsctl_root_lookup(struct vnode *dip, char *name, + struct vnode **ipp, int flags, cred_t *cr, int *direntflags, + struct componentname *realpnp); + +/* zfsctl '.zfs/snapshot' functions */ +extern int zfsctl_snapdir_lookup(struct vnode *dip, char *name, + struct vnode **ipp, int flags, cred_t *cr, int *direntflags, + struct componentname *realpnp); +extern int zfsctl_snapdir_rename(struct vnode *sdip, char *sname, + struct vnode *tdip, char *tname, cred_t *cr, int flags); +extern int zfsctl_snapdir_remove(struct vnode *dip, char *name, cred_t *cr, + int flags); +extern int zfsctl_snapdir_mkdir(struct vnode *dip, char *dirname, vattr_t *vap, + struct vnode **ipp, cred_t *cr, int flags); +extern int zfsctl_snapshot_mount(struct vnode *, int flags); +extern int zfsctl_snapshot_unmount(const char *, int flags); +extern int zfsctl_snapshot_unmount_node(struct vnode *, const char *, + int flags); +extern int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, + int delay); +extern int zfsctl_snapdir_vget(struct mount *sb, uint64_t objsetid, + int gen, struct vnode **ipp); + +/* zfsctl '.zfs/shares' functions */ +extern int zfsctl_shares_lookup(struct vnode *dip, char *name, + struct vnode **ipp, int flags, cred_t *cr, int *direntflags, + struct componentname *realpnp); + +extern int zfsctl_vnop_lookup(struct vnop_lookup_args *); +extern int zfsctl_vnop_getattr(struct vnop_getattr_args *); +extern int zfsctl_vnop_readdir(struct vnop_readdir_args *); +extern int zfsctl_vnop_mkdir(struct vnop_mkdir_args *); +extern int zfsctl_vnop_rmdir(struct vnop_rmdir_args *); +extern int zfsctl_vnop_access(struct vnop_access_args *); +extern int zfsctl_vnop_open(struct vnop_open_args *); +extern int zfsctl_vnop_close(struct vnop_close_args *); +extern int zfsctl_vnop_inactive(struct vnop_inactive_args *); +extern int zfsctl_vnop_reclaim(struct vnop_reclaim_args *); + +extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, + const char 
*name); + +extern void zfsctl_mount_signal(char *, boolean_t); + + +/* + * These vnode numbers are reserved for the .zfs control directory. + * It is important that they be no larger than 48-bits because only + * 6 bytes are reserved in the NFS file handle for the object number. + * However, they should be as large as possible to avoid conflicts + * with the objects which are assigned monotonically by the dmu. + */ +#define ZFSCTL_INO_ROOT 0x0000FFFFFFFFFFFFULL +#define ZFSCTL_INO_SHARES 0x0000FFFFFFFFFFFEULL +#define ZFSCTL_INO_SNAPDIR 0x0000FFFFFFFFFFFDULL +#define ZFSCTL_INO_SNAPDIRS 0x0000FFFFFFFFFFFCULL + +#define ZFSCTL_EXPIRE_SNAPSHOT 300 + +#endif /* _ZFS_CTLDIR_H */ diff --git a/include/os/macos/zfs/sys/zfs_dir.h b/include/os/macos/zfs/sys/zfs_dir.h new file mode 100644 index 0000000000..cfee82308a --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_dir.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_REPLAY 0x04 /* we are replaying intent log */ + +extern int zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, + znode_t **zpp, int flag, int *direntflags, + struct componentname *realpnp); + +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, + boolean_t *); + +extern int zfs_dirlook(znode_t *, char *name, znode_t **, int, + int *deflg, struct componentname *rpnp); + +extern void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids); + +extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); + +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, znode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/include/os/macos/zfs/sys/zfs_ioctl_compat.h b/include/os/macos/zfs/sys/zfs_ioctl_compat.h new file mode 100644 index 0000000000..15f12b34fe --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_ioctl_compat.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Jorgen Lundan . All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ZFS_IOCTL_COMPAT_H +#define _SYS_ZFS_IOCTL_COMPAT_H + +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Backwards ioctl compatibility + */ + +/* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 +#define ZFS_IOCVER_1_9_4 1 +#define ZFS_IOCVER_ZOF 15 + +/* compatibility conversion flag */ +#define ZFS_CMD_COMPAT_NONE 0 +#define ZFS_CMD_COMPAT_V15 1 +#define ZFS_CMD_COMPAT_V28 2 + +#define ZFS_IOC_COMPAT_PASS 254 +#define ZFS_IOC_COMPAT_FAIL 255 + +#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) + +typedef struct zfs_iocparm { + uint32_t zfs_ioctl_version; + uint64_t zfs_cmd; + uint64_t zfs_cmd_size; + + /* + * ioctl() return codes can not be used to communicate - + * as XNU will skip copyout() if there is an error, so it + * is passed along in this wrapping structure. + */ + int zfs_ioc_error; /* ioctl error value */ +} zfs_iocparm_t; + +typedef struct zfs_cmd_1_9_4 +{ + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad3[3]; + boolean_t zc_resumable; + uint32_t zc_pad4; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; + int zc_ioc_error; /* ioctl error value */ + uint64_t zc_dev; /* OSX doesn't have ddi_driver_major */ +} zfs_cmd_1_9_4_t; + +// Figure this out +unsigned static long zfs_ioctl_1_9_4[] = +{ + // ZFS_IOC_POOL_CREATE = _IOWR('Z', 0, struct zfs_cmd), + + 0, /* 0 ZFS_IOC_POOL_CREATE */ + 1, /* 1 ZFS_IOC_POOL_DESTROY */ + 2, /* 2 ZFS_IOC_POOL_IMPORT */ + 3, /* 3 ZFS_IOC_POOL_EXPORT */ + 4, /* 4 ZFS_IOC_POOL_CONFIGS */ + 5, /* 5 ZFS_IOC_POOL_STATS */ + 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ + 7, /* 7 ZFS_IOC_POOL_SCRUB */ + 8, /* 8 ZFS_IOC_POOL_FREEZE */ + 9, /* 9 ZFS_IOC_POOL_UPGRADE */ + 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ + 11, /* 11 ZFS_IOC_VDEV_ADD */ + 12, /* 12 ZFS_IOC_VDEV_REMOVE */ + 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ + 14, /* 14 ZFS_IOC_VDEV_ATTACH */ + 15, /* 15 ZFS_IOC_VDEV_DETACH */ + 16, /* 16 ZFS_IOC_VDEV_SETPATH */ + 18, /* 17 ZFS_IOC_OBJSET_STATS */ + 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ + 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ + 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ + 22, /* 21 ZFS_IOC_SET_PROP */ + ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ + ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ + 23, /* 24 ZFS_IOC_CREATE */ + 24, /* 25 ZFS_IOC_DESTROY */ + 25, /* 26 ZFS_IOC_ROLLBACK */ + 26, /* 27 ZFS_IOC_RENAME */ + 27, 
/* 28 ZFS_IOC_RECV */ + 28, /* 29 ZFS_IOC_SEND */ + 29, /* 30 ZFS_IOC_INJECT_FAULT */ + 30, /* 31 ZFS_IOC_CLEAR_FAULT */ + 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ + 32, /* 33 ZFS_IOC_ERROR_LOG */ + 33, /* 34 ZFS_IOC_CLEAR */ + 34, /* 35 ZFS_IOC_PROMOTE */ + 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ + 36, /* 37 ZFS_IOC_SNAPSHOT */ + 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */ + 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ + 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ + 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ + 41, /* 42 ZFS_IOC_SET_FSACL */ + 42, /* 43 ZFS_IOC_GET_FSACL */ + ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ + 43, /* 45 ZFS_IOC_SHARE */ + 44, /* 46 ZFS_IOC_IHNERIT_PROP */ + 58, /* 47 ZFS_IOC_JAIL */ + 59, /* 48 ZFS_IOC_UNJAIL */ + 45, /* 49 ZFS_IOC_SMB_ACL */ + 46, /* 50 ZFS_IOC_USERSPACE_ONE */ + 47, /* 51 ZFS_IOC_USERSPACE_MANY */ + 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ + 17, /* 53 ZFS_IOC_SETFRU */ +}; + +#ifdef _KERNEL +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +#endif /* _KERNEL */ +void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); +void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); + +int wrap_avl_init(void); +int wrap_unicode_init(void); +int wrap_nvpair_init(void); +int wrap_zcommon_init(void); +int wrap_icp_init(void); +int wrap_lua_init(void); +void wrap_avl_fini(void); +void wrap_unicode_fini(void); +void wrap_nvpair_fini(void); +void wrap_zcommon_fini(void); +void wrap_icp_fini(void); +void wrap_lua_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/include/os/macos/zfs/sys/zfs_mount.h b/include/os/macos/zfs/sys/zfs_mount.h new file mode 100644 index 0000000000..b69229c929 --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_mount.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZFS_MOUNT_H_ +#define _SYS_ZFS_MOUNT_H_ + +struct zfs_mount_args { + const char *fspec; + int mflag; + char *optptr; + int optlen; + int struct_size; +}; + +/* + * Flag bits passed to mount(2). 
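+ *
+ * A hedged illustration (not part of this header) of how userland is
+ * expected to combine these bits: the dataset name, option string and the
+ * MNT_DONTBROWSE flag below are placeholders, only bits covered by
+ * MS_MASK are accepted from mount(2), and MS_FORCE is honoured only for
+ * unmount (see MS_UMOUNT_MASK):
+ *
+ *	struct zfs_mount_args mnt_args = {
+ *		.fspec = "tank/home",
+ *		.mflag = MS_OVERLAY | MS_OPTIONSTR,
+ *		.optptr = options,
+ *		.optlen = (int)strlen(options) + 1,
+ *		.struct_size = sizeof (mnt_args),
+ *	};
+ *	ret = mount("zfs", mountpoint, MNT_DONTBROWSE, &mnt_args);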
+ */ +#define MS_RDONLY 0x0001 /* Read-only */ +#define MS_FSS 0x0002 /* Old (4-argument) mount (compatibility) */ +#define MS_DATA 0x0004 /* 6-argument mount */ +#define MS_NOSUID 0x0010 /* Setuid programs disallowed */ +#define MS_REMOUNT 0x0020 /* Remount */ +#define MS_NOTRUNC 0x0040 /* Return ENAMETOOLONG for long filenames */ +#define MS_OVERLAY 0x0080 /* Allow overlay mounts */ +#define MS_OPTIONSTR 0x0100 /* Data is a an in/out option string */ +#define MS_GLOBAL 0x0200 /* Clustering: Mount into global name space */ +#define MS_FORCE 0x0400 /* Forced unmount */ +#define MS_NOMNTTAB 0x0800 /* Don't show mount in mnttab */ +/* + * Additional flag bits that domount() is prepared to interpret, but that + * can't be passed through mount(2). + */ +#define MS_SYSSPACE 0x0008 /* Mounta already in kernel space */ +#define MS_NOSPLICE 0x1000 /* Don't splice fs instance into name space */ +#define MS_NOCHECK 0x2000 /* Clustering: suppress mount busy checks */ +/* + * Mask to sift out flag bits allowable from mount(2). + */ +#define MS_MASK \ + (MS_RDONLY|MS_FSS|MS_DATA|MS_NOSUID|MS_REMOUNT|MS_NOTRUNC|MS_OVERLAY|\ + MS_OPTIONSTR|MS_GLOBAL|MS_NOMNTTAB) + +/* + * Mask to sift out flag bits allowable from umount2(2). + */ + +#define MS_UMOUNT_MASK (MS_FORCE) + +/* + * Maximum option string length accepted or returned by mount(2). + */ +#define MAX_MNTOPT_STR 1024 /* max length of mount options string */ + + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/include/os/macos/zfs/sys/zfs_vfsops.h b/include/os/macos/zfs/sys/zfs_vfsops.h new file mode 100644 index 0000000000..0e36b2fad4 --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_vfsops.h @@ -0,0 +1,291 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfs_sb; +struct znode; + +#ifdef __APPLE__ +#define APPLE_SA_RECOVER +/* #define WITH_SEARCHFS */ +/* #define WITH_READDIRATTR */ +#define HAVE_NAMED_STREAMS 1 +#define HAVE_PAGEOUT_V2 1 +#define HIDE_TRIVIAL_ACL 1 +#endif + +/* + * Status of the zfs_unlinked_drain thread. 
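+ *
+ * The intended hand-off (a sketch only; see zfs_unlinked_drain_stop_wait(),
+ * declared in zfs_dir.h, for the authoritative logic) is that the async
+ * drain task moves ZFS_DRAIN_SHUTDOWN -> ZFS_DRAIN_RUNNING, and a caller
+ * that needs it stopped flags ZFS_DRAIN_SHUTDOWN_REQ and waits on
+ * z_drain_cv under z_drain_lock, roughly:
+ *
+ *	mutex_enter(&zfsvfs->z_drain_lock);
+ *	while (zfsvfs->z_drain_state == ZFS_DRAIN_RUNNING) {
+ *		zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN_REQ;
+ *		cv_wait(&zfsvfs->z_drain_cv, &zfsvfs->z_drain_lock);
+ *	}
+ *	mutex_exit(&zfsvfs->z_drain_lock);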
+ */ +typedef enum drain_state { + ZFS_DRAIN_SHUTDOWN = 0, + ZFS_DRAIN_RUNNING, + ZFS_DRAIN_SHUTDOWN_REQ +} drain_state_t; + + +typedef struct zfsvfs zfsvfs_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrmlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + struct vnode *z_ctldir; /* .zfs directory pointer */ + uint64_t z_ctldir_startid; /* Start of snapdir range */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ + uint64_t z_version; + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + + /* for controlling async zfs_unlinked_drain */ + kmutex_t z_drain_lock; + kcondvar_t z_drain_cv; + drain_state_t z_drain_state; + + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; + uint64_t z_userobjquota_obj; + uint64_t z_groupobjquota_obj; + uint64_t z_projectquota_obj; + uint64_t z_projectobjquota_obj; + +#ifdef __APPLE__ + dev_t z_rdev; /* proxy device for mount */ + boolean_t z_rdonly; /* is mount read-only? 
*/ + time_t z_mount_time; /* mount timestamp (for Spotlight) */ + time_t z_last_unmount_time; /* unmount timestamp (for Spotlight) */ + boolean_t z_xattr; /* enable atimes mount option */ + + avl_tree_t z_hardlinks; /* linkid hash avl tree for vget */ + avl_tree_t z_hardlinks_linkid; /* sorted on linkid */ + krwlock_t z_hardlinks_lock; /* lock to access z_hardlinks */ + + uint64_t z_notification_conditions; /* HFSIOC_VOLUME_STATUS */ + uint64_t z_freespace_notify_warninglimit; + uint64_t z_freespace_notify_dangerlimit; + uint64_t z_freespace_notify_desiredlevel; + + void *z_devdisk; /* Hold fake disk if prop devdisk is on */ + + uint64_t z_findernotify_space; + +#endif + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ + + uint64_t z_hold_size; /* znode hold array size */ + avl_tree_t *z_hold_trees; /* znode hold trees */ + kmutex_t *z_hold_locks; /* znode hold locks */ + taskqid_t z_drain_task; /* task id for the unlink drain task */ +}; +#define ZFS_OBJ_MTX_SZ 64 + +#ifdef __APPLE__ +struct hardlinks_struct { + avl_node_t hl_node; + avl_node_t hl_node_linkid; + uint64_t hl_parent; // parentid of entry + uint64_t hl_fileid; // the fileid (z_id) for vget + uint32_t hl_linkid; // the linkid, persistent over renames + char hl_name[PATH_MAX]; // cached name for vget +}; +typedef struct hardlinks_struct hardlinks_t; + +int zfs_vfs_uuid_unparse(uuid_t uuid, char *dst); +int zfs_vfs_uuid_gen(const char *osname, uuid_t uuid); +#endif + + +#define ZFS_SUPER_MAGIC 0x2fc12fc1 + +#define ZSB_XATTR 0x0001 /* Enable user xattrs */ + +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. 
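+ *
+ * For illustration, the encoding implied by the "obj[i] = obj >> (8 * i)"
+ * field comments is a plain little-endian byte spill; a sketch of filling
+ * a long fid (the actual encoding is done by the fid/vptofh code, e.g.
+ * zfs_vfs_vptofh() declared below) would be:
+ *
+ *	zlfid->z_fid.zf_len = LONG_FID_LEN;
+ *	for (i = 0; i < sizeof (zlfid->z_fid.zf_object); i++)
+ *		zlfid->z_fid.zf_object[i] = (uint8_t)(object >> (8 * i));
+ *	for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ *		zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+ *	for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ *		zlfid->zf_setgen[i] = 0;	/* "currently just zero" */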
+ */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, + boolean_t isgroup); +extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, + uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); + +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, + uint64_t *value); + +extern int zfs_sb_create(const char *name, zfsvfs_t **zfsvfsp); +extern int zfs_sb_setup(zfsvfs_t *zfsvfs, boolean_t mounting); +extern void zfs_sb_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); + + + + +extern int zfs_vfs_init(struct vfsconf *vfsp); +extern int zfs_vfs_start(struct mount *mp, int flags, vfs_context_t context); +extern int zfs_vfs_mount(struct mount *mp, vnode_t *devvp, + user_addr_t data, vfs_context_t context); +extern int zfs_vfs_unmount(struct mount *mp, int mntflags, + vfs_context_t context); +extern int zfs_vfs_root(struct mount *mp, vnode_t **vpp, + vfs_context_t context); +extern int zfs_vfs_vget(struct mount *mp, ino64_t ino, vnode_t **vpp, + vfs_context_t context); +extern int zfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, + vfs_context_t context); +extern int zfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, + vfs_context_t context); +extern int zfs_vfs_sync(struct mount *mp, int waitfor, vfs_context_t context); +extern int zfs_vfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, + vnode_t **vpp, vfs_context_t context); +extern int zfs_vfs_vptofh(vnode_t *vp, int *fhlenp, unsigned char *fhp, + vfs_context_t context); +extern int zfs_vfs_sysctl(int *name, uint_t namelen, user_addr_t oldp, + size_t *oldlenp, user_addr_t newp, size_t newlen, vfs_context_t context); +extern int zfs_vfs_quotactl(struct mount *mp, int cmds, uid_t uid, + caddr_t datap, vfs_context_t context); +extern int zfs_vfs_mountroot(struct mount *mp, struct vnode *vp, + vfs_context_t context); + +extern void zfs_init(void); +extern void zfs_fini(void); + +extern int zfs_vnode_lock(vnode_t *vp, int flags); +extern void zfs_freevfs(struct mount *vfsp); + +extern int zfsvfs_create(const char *name, boolean_t rd, zfsvfs_t **zfvp); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); + +extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, + uint64_t *val, char *setpoint); + +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/include/os/macos/zfs/sys/zfs_vnops.h b/include/os/macos/zfs/sys/zfs_vnops.h new file mode 100644 index 
0000000000..6d7df203ff --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_vnops.h @@ -0,0 +1,255 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VNOPS_H +#define _SYS_FS_ZFS_VNOPS_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Spotlight specific fcntl()'s + */ + +// Older defines +#define SPOTLIGHT_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) +#define SPOTLIGHT_GET_UNMOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) + +// Newer defines, will these need a OSX version test to compile on older? +#define SPOTLIGHT_IOC_GET_MOUNT_TIME _IOR('h', 18, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_MOUNT_TIME \ + IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME) +#define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_LAST_MTIME \ + IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) + +/* + * Account for user timespec structure differences + */ +#ifdef ZFS_LEOPARD_ONLY +typedef struct timespec timespec_user32_t; +typedef struct user_timespec timespec_user64_t; +#else +typedef struct user32_timespec timespec_user32_t; +typedef struct user64_timespec timespec_user64_t; +#endif + +#define UNKNOWNUID ((uid_t)99) +#define UNKNOWNGID ((gid_t)99) + +#define DTTOVT(dtype) (iftovt_tab[(dtype)]) +#define kTextEncodingMacUnicode 0x7e +#define ZAP_AVENAMELEN (ZAP_MAXNAMELEN / 4) + +/* Finder information */ +struct finderinfo { + u_int32_t fi_type; /* files only */ + u_int32_t fi_creator; /* files only */ + u_int16_t fi_flags; + struct { + int16_t v; + int16_t h; + } fi_location; + int8_t fi_opaque[18]; +} __attribute__((aligned(2), packed)); +typedef struct finderinfo finderinfo_t; + +enum { + /* Finder Flags */ + kHasBeenInited = 0x0100, + kHasCustomIcon = 0x0400, + kIsStationery = 0x0800, + kNameLocked = 0x1000, + kHasBundle = 0x2000, + kIsInvisible = 0x4000, + kIsAlias = 0x8000 +}; + +/* Attribute packing information */ +typedef struct attrinfo { + struct attrlist *ai_attrlist; + void **ai_attrbufpp; + void **ai_varbufpp; + void *ai_varbufend; + vfs_context_t ai_context; +} attrinfo_t; + +/* + * Attributes that we can get for free from the zap (ie without a znode) + */ +#define ZFS_DIR_ENT_ATTRS ( \ + ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | \ + ATTR_CMN_OBJTYPE | ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | \ + ATTR_CMN_OBJPERMANENTID | ATTR_CMN_SCRIPT | \ + ATTR_CMN_FILEID) + +/* + * Attributes that we support + */ +#define ZFS_ATTR_BIT_MAP_COUNT 5 + +#define ZFS_ATTR_CMN_VALID ( \ + ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | \ + ATTR_CMN_OBJTYPE | ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | \ + ATTR_CMN_OBJPERMANENTID | ATTR_CMN_PAROBJID | \ + 
ATTR_CMN_SCRIPT | ATTR_CMN_CRTIME | ATTR_CMN_MODTIME | \ + ATTR_CMN_CHGTIME | ATTR_CMN_ACCTIME | \ + ATTR_CMN_BKUPTIME | ATTR_CMN_FNDRINFO | \ + ATTR_CMN_OWNERID | ATTR_CMN_GRPID | \ + ATTR_CMN_ACCESSMASK | ATTR_CMN_FLAGS | \ + ATTR_CMN_USERACCESS | ATTR_CMN_FILEID | \ + ATTR_CMN_PARENTID) + +#define ZFS_ATTR_DIR_VALID ( \ + ATTR_DIR_LINKCOUNT | ATTR_DIR_ENTRYCOUNT | \ + ATTR_DIR_MOUNTSTATUS) + +#define ZFS_ATTR_FILE_VALID ( \ + ATTR_FILE_LINKCOUNT |ATTR_FILE_TOTALSIZE | \ + ATTR_FILE_ALLOCSIZE | ATTR_FILE_IOBLOCKSIZE | \ + ATTR_FILE_DEVTYPE | ATTR_FILE_DATALENGTH | \ + ATTR_FILE_DATAALLOCSIZE | ATTR_FILE_RSRCLENGTH | \ + ATTR_FILE_RSRCALLOCSIZE) + +extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, + char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, + char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_link(znode_t *tdzp, znode_t *sp, + char *name, cred_t *cr, int flags); +extern int zfs_space(znode_t *zp, int cmd, struct flock *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); + +extern int zfs_open(struct vnode *ip, int mode, int flag, cred_t *cr); +extern int zfs_close(struct vnode *ip, int flag, cred_t *cr); +extern int zfs_read(struct vnode *ip, uio_t *uio, int ioflag, cred_t *cr); +extern int zfs_write(struct vnode *ip, uio_t *uio, int ioflag, cred_t *cr); +extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, + int flags, cred_t *cr, int *direntflags, struct componentname *realpnp); +extern int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, + cred_t *cred, int *rvalp, caller_context_t *ct); +extern int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + int flags, int *a_numdirent); +extern int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr); +extern int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct); +extern int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr); + +extern int zfs_access(struct vnode *ip, int mode, int flag, cred_t *cr); +extern void zfs_inactive(vnode_t *vp); + +/* zfs_vops_osx.c calls */ +extern int zfs_znode_getvnode(znode_t *zp, zfsvfs_t *zfsvfs); + +extern void getnewvnode_reserve(int num); +extern void getnewvnode_drop_reserve(void); +extern int zfs_vfsops_init(void); +extern int zfs_vfsops_fini(void); +extern int zfs_znode_asyncgetvnode(znode_t *zp, zfsvfs_t *zfsvfs); +extern void zfs_znode_asyncput(znode_t *zp); +extern int zfs_znode_asyncwait(znode_t *zp); + +/* zfs_vnops_osx_lib calls */ +extern int zfs_ioflags(int ap_ioflag); +extern int zfs_getattr_znode_unlocked(struct vnode *vp, vattr_t *vap); +extern int ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)); +extern void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); +extern 
int zpl_obtain_xattr(struct znode *, const char *name, mode_t mode, + cred_t *cr, struct vnode **vpp, int flag); + +extern void commonattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp, + const char *name, ino64_t objnum, enum vtype vtype, + boolean_t user64); +extern void dirattrpack(attrinfo_t *aip, znode_t *zp); +extern void fileattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp); +extern void nameattrpack(attrinfo_t *aip, const char *name, int namelen); +extern int getpackedsize(struct attrlist *alp, boolean_t user64); +extern void getfinderinfo(znode_t *zp, cred_t *cr, finderinfo_t *fip); +extern uint32_t getuseraccess(znode_t *zp, vfs_context_t ctx); +extern void finderinfo_update(uint8_t *finderinfo, znode_t *zp); +extern int zpl_xattr_set_sa(struct vnode *vp, const char *name, + const void *value, size_t size, int flags, cred_t *cr); +extern int zpl_xattr_get_sa(struct vnode *vp, const char *name, void *value, + size_t size); +extern void zfs_zrele_async(znode_t *zp); + +/* + * OSX ACL Helper functions + * + * OSX uses 'guids' for the 'who' part of ACLs, and uses a 'well known' + * binary sequence to signify the special rules of "owner", "group" and + * "everybody". We translate between this "well-known" guid and ZFS' + * flags ACE_OWNER, ACE_GROUP and ACE_EVERYBODY. + * + */ +#define KAUTH_WKG_NOT 0 /* not a well-known GUID */ +#define KAUTH_WKG_OWNER 1 +#define KAUTH_WKG_GROUP 2 +#define KAUTH_WKG_NOBODY 3 +#define KAUTH_WKG_EVERYBODY 4 + +extern int kauth_wellknown_guid(guid_t *guid); +extern void aces_from_acl(ace_t *aces, int *nentries, struct kauth_acl *k_acl, + int *seen_type); +extern void nfsacl_set_wellknown(int wkg, guid_t *guid); +extern int zfs_addacl_trivial(znode_t *zp, ace_t *aces, int *nentries, + int seen_type); + +extern struct vnodeopv_desc zfs_dvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_fvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_symvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_xdvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_evnodeop_opv_desc; +extern struct vnodeopv_desc zfs_fifonodeop_opv_desc; +extern struct vnodeopv_desc zfs_ctldir_opv_desc; +extern int (**zfs_ctldirops)(void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VNOPS_H */ diff --git a/include/os/macos/zfs/sys/zfs_znode_impl.h b/include/os/macos/zfs/sys/zfs_znode_impl.h new file mode 100644 index 0000000000..fa53ac8d6e --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
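The KAUTH_WKG_* values in zfs_vnops.h above classify Apple's well-known "who" GUIDs; a minimal sketch of the translation to ZFS ACE who-flags might look like the following. The function name is hypothetical (it is not the patch's aces_from_acl()), and the ACE_* values are copied here from ZFS's sys/acl.h so the example stands alone.

#include <stdint.h>
#include <stdio.h>

/* Classification values as defined in zfs_vnops.h above */
#define KAUTH_WKG_NOT		0
#define KAUTH_WKG_OWNER		1
#define KAUTH_WKG_GROUP		2
#define KAUTH_WKG_NOBODY	3
#define KAUTH_WKG_EVERYBODY	4

/* ACE "who" flags, values as in ZFS sys/acl.h */
#define ACE_IDENTIFIER_GROUP	0x0040
#define ACE_OWNER		0x1000
#define ACE_GROUP		0x2000
#define ACE_EVERYONE		0x4000

/* Hypothetical helper: map a well-known GUID class to ACE who-flags */
static int
wkg_to_ace_flags(int wkg, uint16_t *flags)
{
	switch (wkg) {
	case KAUTH_WKG_OWNER:
		*flags = ACE_OWNER;
		return (0);
	case KAUTH_WKG_GROUP:
		*flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
		return (0);
	case KAUTH_WKG_EVERYBODY:
		*flags = ACE_EVERYONE;
		return (0);
	default:
		return (-1);	/* not well-known: resolve a numeric id instead */
	}
}

int
main(void)
{
	uint16_t flags;

	if (wkg_to_ace_flags(KAUTH_WKG_OWNER, &flags) == 0)
		printf("owner maps to ACE flags 0x%x\n", flags);
	return (0);
}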
+ * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _MACOS_ZFS_SYS_ZNODE_IMPL_H +#define _MACOS_ZFS_SYS_ZNODE_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_UIMMUTABLE 0x0000001000000000ull // OSX +#define ZFS_UAPPENDONLY 0x0000004000000000ull // OSX + +// #define ZFS_IMMUTABLE (ZFS_UIMMUTABLE | ZFS_SIMMUTABLE) +// #define ZFS_APPENDONLY (ZFS_UAPPENDONLY | ZFS_SAPPENDONLY) + +#define ZFS_TRACKED 0x0010000000000000ull +#define ZFS_COMPRESSED 0x0020000000000000ull + +#define ZFS_SIMMUTABLE 0x0040000000000000ull +#define ZFS_SAPPENDONLY 0x0080000000000000ull + +#define SA_ZPL_ADDTIME(z) z->z_attr_table[ZPL_ADDTIME] +#define SA_ZPL_DOCUMENTID(z) z->z_attr_table[ZPL_DOCUMENTID] + +#define ZGET_FLAG_UNLINKED (1<<0) /* Also lookup unlinked */ +#define ZGET_FLAG_ASYNC (1<<3) /* taskq the vnode_create call */ + +extern int zfs_zget_ext(zfsvfs_t *zfsvfs, uint64_t obj_num, + struct znode **zpp, int flags); + + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#define ZNODE_OS_FIELDS \ + struct zfsvfs *z_zfsvfs; \ + struct vnode *z_vnode; \ + uint64_t z_uid; \ + uint64_t z_gid; \ + uint64_t z_gen; \ + uint64_t z_atime[2]; \ + uint64_t z_links; \ + uint32_t z_vid; \ + uint32_t z_document_id; \ + uint64_t z_finder_parentid; \ + boolean_t z_finder_hardlink; \ + uint64_t z_write_gencount; \ + char z_name_cache[MAXPATHLEN]; \ + boolean_t z_skip_truncate_undo_decmpfs; \ + taskq_ent_t z_attach_taskq; \ + kcondvar_t z_attach_cv; \ + kmutex_t z_attach_lock; \ + hrtime_t z_snap_mount_time; \ + krwlock_t z_map_lock; + +#define ZFS_LINK_MAX UINT64_MAX + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern minor_t zfsdev_minor_alloc(void); + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define ZTOI(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)vnode_fsnode((VP))) +#define ITOZ(VP) ((znode_t *)vnode_fsnode((VP))) + +#define VTOM(VP) ((mount_t *)vnode_mount((VP))) + +/* These are not used so far, VN_HOLD returncode must be checked. */ +#define zhold(zp) VN_HOLD(ZTOV(zp)) +#define zrele(zp) VN_RELE(ZTOV(zp)) + +#define ZTOZSB(zp) ((zp)->z_zfsvfs) +#define ITOZSB(vp) ((zfsvfs_t *)vfs_fsprivate(vnode_mount(vp))) +#define ZTOTYPE(zp) (vnode_vtype(ZTOV(zp))) +#define ZTOGID(zp) ((zp)->z_gid) +#define ZTOUID(zp) ((zp)->z_uid) +#define ZTONLNK(zp) ((zp)->z_links) +#define Z_ISBLK(type) ((type) == VBLK) +#define Z_ISCHR(type) ((type) == VCHR) +#define Z_ISLNK(type) ((type) == VLNK) + +/* Called on entry to each ZFS inode and vfs operation. 
*/ +#define ZFS_ENTER_IFERROR(zfsvfs) \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) + +#define ZFS_ENTER_ERROR(zfsvfs, error) \ + do { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (error); \ + } \ + } while (0) + +#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) +#define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) + +/* Must be called before exiting the operation. */ +#define ZFS_EXIT(zfsvfs) \ + do { \ + rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ + } while (0) +#define ZPL_EXIT(zfsvfs) ZFS_EXIT(zfsvfs) + +/* Verifies the znode is valid. */ +#define ZFS_VERIFY_ZP_ERROR(zp, error) \ + do { \ + if ((zp)->z_sa_hdl == NULL) { \ + ZFS_EXIT(ZTOZSB(zp)); \ + return (error); \ + } \ + } while (0) + +#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) +#define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_MTX_SZ 64 +#define ZFS_OBJ_MTX_MAX (1024 * 1024) +#define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) + +extern unsigned int zfs_object_mutex_size; + +/* Encode ZFS stored time values from a struct timespec */ +#define ZFS_TIME_ENCODE(tp, stmp) \ + { \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ + } + +/* Decode ZFS stored time values to a struct timespec */ +#define ZFS_TIME_DECODE(tp, stmp) \ + { \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !vfs_isrdonly(zfsvfs->z_vfs)) \ + zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE); + +extern void zfs_tstamp_update_setup_ext(struct znode *, + uint_t, uint64_t [2], uint64_t [2], boolean_t); +extern void zfs_tstamp_update_setup(struct znode *, + uint_t, uint64_t [2], uint64_t [2]); +extern void zfs_znode_free(struct znode *); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, + char *buf); +extern uint32_t zfs_getbsdflags(struct znode *zp); +extern void zfs_setattr_generate_id(struct znode *, uint64_t, char *name); + +extern int zfs_setattr_set_documentid(struct znode *zp, + boolean_t update_flags); + +/* Legacy macOS uses fnv_32a hash for hostid. */ +#define FNV1_32A_INIT ((uint32_t)0x811c9dc5) +uint32_t fnv_32a_str(const char *str, uint32_t hval); + +void zfs_setbsdflags(struct znode *, uint32_t bsdflags); +uint32_t zfs_getbsdflags(struct znode *zp); + +#ifdef __cplusplus +} +#endif + +#endif /* _MACOS_SYS_FS_ZFS_ZNODE_H */ diff --git a/include/os/macos/zfs/sys/zpl.h b/include/os/macos/zfs/sys/zpl.h new file mode 100644 index 0000000000..5d391c6a96 --- /dev/null +++ b/include/os/macos/zfs/sys/zpl.h @@ -0,0 +1,27 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
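The ZFS_TIME_ENCODE/ZFS_TIME_DECODE macros in zfs_znode_impl.h above store a struct timespec as a pair of uint64_t SA values; a self-contained userland round trip of the same conversion looks like this (the helper names are illustrative only).

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <assert.h>

/* Same conversion as ZFS_TIME_ENCODE: seconds and nanoseconds as uint64_t */
static void
time_encode(const struct timespec *tp, uint64_t stmp[2])
{
	stmp[0] = (uint64_t)tp->tv_sec;
	stmp[1] = (uint64_t)tp->tv_nsec;
}

/* Same conversion as ZFS_TIME_DECODE */
static void
time_decode(struct timespec *tp, const uint64_t stmp[2])
{
	tp->tv_sec = (time_t)stmp[0];
	tp->tv_nsec = (long)stmp[1];
}

int
main(void)
{
	struct timespec now, back;
	uint64_t stamp[2];

	clock_gettime(CLOCK_REALTIME, &now);
	time_encode(&now, stamp);
	time_decode(&back, stamp);
	assert(back.tv_sec == now.tv_sec && back.tv_nsec == now.tv_nsec);
	printf("encoded %llu.%09llu\n",
	    (unsigned long long)stamp[0], (unsigned long long)stamp[1]);
	return (0);
}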
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SYS_ZPL_H +#define _SYS_ZPL_H + + + + +#endif // _SYS_ZPL_H diff --git a/include/os/macos/zfs/sys/zvolIO.h b/include/os/macos/zfs/sys/zvolIO.h new file mode 100644 index 0000000000..927840aa7b --- /dev/null +++ b/include/os/macos/zfs/sys/zvolIO.h @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2016 Jorgen Lundman + */ + +#ifndef ZVOLIO_H_INCLUDED +#define ZVOLIO_H_INCLUDED + +/* Linux polutes 'current' */ +#undef current + +#ifdef __cplusplus +#include + +extern "C" { +#endif /* __cplusplus */ + +#include +#include + +struct iomem { + IOMemoryDescriptor *buf; +}; + +uint64_t zvolIO_kit_read(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len); +uint64_t zvolIO_kit_write(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len); + +#ifdef __cplusplus +} /* extern "C" */ + +class net_lundman_zfs_zvol : public IOService +{ + OSDeclareDefaultStructors(net_lundman_zfs_zvol) + +private: + +public: + virtual bool init(OSDictionary* dictionary = NULL); + virtual void free(void); + virtual IOService* probe(IOService* provider, SInt32* score); + virtual bool start(IOService* provider); + virtual void stop(IOService* provider); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *arg); + virtual bool handleIsOpen(const IOService *client) const; + virtual void handleClose(IOService *client, + IOOptionBits options); + virtual bool isOpen(const IOService *forClient = 0) const; + +private: + OSSet *_openClients; +}; + +#include + +class net_lundman_zfs_zvol_device : public IOBlockStorageDevice +{ + OSDeclareDefaultStructors(net_lundman_zfs_zvol_device) + +private: + // IOService *m_provider; + zvol_state_t *zv; + +public: + virtual bool init(zvol_state_t *c_zv, + OSDictionary* properties = 0); + + virtual bool attach(IOService* provider); + virtual void detach(IOService* provider); + virtual IOReturn doEjectMedia(void); + virtual IOReturn doFormatMedia(UInt64 byteCapacity); + virtual UInt32 doGetFormatCapacities(UInt64 * capacities, + UInt32 capacitiesMaxCount) const; + + virtual IOReturn doLockUnlockMedia(bool doLock); + virtual IOReturn doSynchronizeCache(void); + virtual char *getVendorString(void); + virtual char *getProductString(void); + virtual char *getRevisionString(void); + virtual char *getAdditionalDeviceInfoString(void); + virtual IOReturn reportBlockSize(UInt64 *blockSize); + virtual IOReturn reportEjectability(bool *isEjectable); + virtual IOReturn reportLockability(bool 
*isLockable); + virtual IOReturn reportMaxValidBlock(UInt64 *maxBlock); + virtual IOReturn reportMediaState(bool *mediaPresent, + bool *changedState); + + virtual IOReturn reportPollRequirements(bool *pollRequired, + bool *pollIsExpensive); + + virtual IOReturn reportRemovability(bool *isRemovable); + virtual IOReturn reportWriteProtection(bool *isWriteProtected); + virtual IOReturn getWriteCacheState(bool *enabled); + virtual IOReturn setWriteCacheState(bool enabled); + virtual IOReturn doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + + virtual IOReturn doDiscard(UInt64 block, UInt64 nblks); + virtual IOReturn doUnmap(IOBlockStorageDeviceExtent *extents, + UInt32 extentsCount, UInt32 options); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *access); + + virtual void handleClose(IOService *client, + IOOptionBits options); + + virtual int getBSDName(void); + virtual int renameDevice(void); + virtual int offlineDevice(void); + virtual int onlineDevice(void); + virtual int refreshDevice(void); + + virtual void clearState(void); +}; +#endif /* __cplusplus */ + +#endif /* ZVOLIO_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/zvol_os.h b/include/os/macos/zfs/sys/zvol_os.h new file mode 100644 index 0000000000..a5e5d86aa8 --- /dev/null +++ b/include/os/macos/zfs/sys/zvol_os.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SYS_ZVOL_OS_h +#define _SYS_ZVOL_OS_h + +#ifdef __cplusplus +extern "C" { +#endif + +/* struct wrapper for IOKit class */ +typedef struct zvol_iokit zvol_iokit_t; +typedef struct zvol_state zvol_state_t; +struct iomem; + +struct zvol_state_os { + dev_t zvo_dev; /* device id */ + + zvol_iokit_t *zvo_iokitdev; /* IOKit device */ + uint64_t zvo_openflags; /* Remember flags used at open */ + char zvo_bsdname[MAXPATHLEN]; /* /dev/diskX */ +}; + +extern int zvol_os_ioctl(dev_t, unsigned long, caddr_t, + int isblk, cred_t *, int *rvalp); +extern int zvol_os_open_zv(zvol_state_t *, int, int, struct proc *p); +extern int zvol_os_open(dev_t dev, int flag, int otyp, struct proc *p); +extern int zvol_os_close_zv(zvol_state_t *, int, int, struct proc *p); +extern int zvol_os_close(dev_t dev, int flag, int otyp, struct proc *p); +extern int zvol_os_read(dev_t dev, struct uio *uio, int p); +extern int zvol_os_write(dev_t dev, struct uio *uio, int p); + +extern int zvol_os_read_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem); +extern int zvol_os_write_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem); +extern int zvol_os_unmap(zvol_state_t *zv, uint64_t off, uint64_t bytes); + +extern void zvol_os_strategy(struct buf *bp); +extern int zvol_os_get_volume_blocksize(dev_t dev); + +extern void zvol_os_lock_zv(zvol_state_t *zv); +extern void zvol_os_unlock_zv(zvol_state_t *zv); + +extern void *zvolRemoveDevice(zvol_iokit_t *iokitdev); +extern int zvolRemoveDeviceTerminate(void *iokitdev); +extern int zvolCreateNewDevice(zvol_state_t *zv); +extern int zvolRegisterDevice(zvol_state_t *zv); + +extern int zvolRenameDevice(zvol_state_t *zv); +extern int zvolSetVolsize(zvol_state_t *zv); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 7ed2470268..7a80733882 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -59,7 +59,7 @@ struct abd { union { struct abd_scatter { uint_t abd_offset; -#if defined(_KERNEL) && ( defined(__FreeBSD__) || defined(__APPLE__) ) +#if defined(_KERNEL) && (defined(__FreeBSD__) || defined(__APPLE__)) uint_t abd_chunk_size; void *abd_chunks[]; #else @@ -134,9 +134,9 @@ void abd_iter_unmap(struct abd_iter *); #if defined(__FreeBSD__) #define abd_enter_critical(flags) critical_enter() #define abd_exit_critical(flags) critical_exit() -#elif defined (__APPLE__) -#define abd_enter_critical(flags) (flags) = ml_set_interrupts_enabled(FALSE) -#define abd_exit_critical(flags) ml_set_interrupts_enabled((flags)) +#elif defined(__APPLE__) +#define abd_enter_critical(flags) (flags) = ml_set_interrupts_enabled(FALSE) +#define abd_exit_critical(flags) ml_set_interrupts_enabled((flags)) #else #define abd_enter_critical(flags) local_irq_save(flags) #define abd_exit_critical(flags) local_irq_restore(flags) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 4e50740506..c62d0e8dd5 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1251,7 +1251,7 @@ typedef enum zfs_ioc { /* * Core features - 81/128 numbers reserved. 
*/ -#if defined (__FreeBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__APPLE__) ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), diff --git a/include/sys/mntent.h b/include/sys/mntent.h index cb463ce292..be62c53976 100644 --- a/include/sys/mntent.h +++ b/include/sys/mntent.h @@ -84,8 +84,8 @@ #define MNTOPT_NOSETUID "nosetuid" /* Set uid not allowed */ #define MNTOPT_BROWSE "browse" /* browsable autofs mount */ #define MNTOPT_NOBROWSE "nobrowse" /* non-browsable autofs mount */ -#define MNTOPT_OWNERS "owners" /* VFS will not ignore ownership information on filesystem objects */ -#define MNTOPT_NOOWNERS "noowners" /* VFS will ignore ownership information on filesystem objects */ +#define MNTOPT_OWNERS "owners" /* use ownership */ +#define MNTOPT_NOOWNERS "noowners" /* ignore ownership */ #else #error "unknown OS" #endif diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index 1255a176d3..a41f0c0fe8 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -239,7 +239,7 @@ extern "C" { #define DEV_INSTANCE "instance" #define DEV_PROP_PREFIX "prop-" -#if defined (__linux__) || defined (__APPLE__) +#if defined(__linux__) || defined(__APPLE__) #define DEV_IDENTIFIER "devid" #define DEV_PATH "path" #define DEV_IS_PART "is_slice" diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h index dfac8ce4b6..b1517b9d3f 100644 --- a/include/sys/zfs_sa.h +++ b/include/sys/zfs_sa.h @@ -76,10 +76,11 @@ typedef enum zpl_attr { ZPL_DXATTR, ZPL_PROJID, - /* Apple defines a ADDEDTIME, which is the time the entry was placed in - * the containing directory. Ie, CRTIME and updated when moved into - * a different directory. This can be retrieved with getxattr "FinderInfo" - * or the getattrlist() syscall. + /* + * Apple defines a ADDEDTIME, which is the time the entry was placed + * in the containing directory. Ie, CRTIME and updated when moved + * into a different directory. This can be retrieved with getxattr + * "FinderInfo" or the getattrlist() syscall. 
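Since the comment above says the added time is visible to userland through getattrlist(), a hedged sketch of reading it follows; this assumes an SDK and macOS release that define ATTR_CMN_ADDEDTIME, and the reply-struct layout is the usual getattrlist convention (a length word followed by the requested attribute), so treat it as a sketch rather than tested code.

#include <sys/attr.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/* getattrlist reply: total length, then the single requested attribute */
struct added_reply {
	uint32_t	length;
	struct timespec	added;
} __attribute__((aligned(4), packed));

int
main(int argc, char **argv)
{
	struct attrlist al;
	struct added_reply reply;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return (1);
	}

	memset(&al, 0, sizeof (al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.commonattr = ATTR_CMN_ADDEDTIME;

	if (getattrlist(argv[1], &al, &reply, sizeof (reply), 0) != 0) {
		perror("getattrlist");
		return (1);
	}
	printf("added time: %ld\n", (long)reply.added.tv_sec);
	return (0);
}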
*/ ZPL_ADDTIME, ZPL_DOCUMENTID, diff --git a/lib/libefi/rdwr_efi_macos.c b/lib/libefi/rdwr_efi_macos.c index 9a1a64ebe4..c2ef128a8f 100644 --- a/lib/libefi/rdwr_efi_macos.c +++ b/lib/libefi/rdwr_efi_macos.c @@ -248,8 +248,8 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) dki_info->dki_partition = 0; } strlcpy(dki_info->dki_dname, - &pathbuf[5], - sizeof(dki_info->dki_dname)); + &pathbuf[5], + sizeof (dki_info->dki_dname)); } /* @@ -1663,7 +1663,7 @@ isDeviceMatchForKeyAndSubstr(char *device, CFStringRef key, CFStringRef substr, if ((error = setupDADiskSession(&ds, device)) == 0) { CFDictionaryRef descDict = NULL; - if((descDict = DADiskCopyDescription(ds.disk)) != NULL) { + if ((descDict = DADiskCopyDescription(ds.disk)) != NULL) { *isMatch = CFDictionaryValueIfPresentMatchesSubstring(descDict, key, substr); @@ -1709,5 +1709,5 @@ osx_device_isvirtual(char *device) isCoreStorageLV, isVirtualInterface); - return (isCoreStorageLV || isVirtualInterface); + return (isCoreStorageLV /* || isVirtualInterface*/); } diff --git a/lib/libspl/include/os/Makefile.am b/lib/libspl/include/os/Makefile.am index 7b362e02ad..22495a05b7 100644 --- a/lib/libspl/include/os/Makefile.am +++ b/lib/libspl/include/os/Makefile.am @@ -5,3 +5,7 @@ endif if BUILD_LINUX SUBDIRS = linux endif + +if BUILD_MACOS +SUBDIRS = macos +endif diff --git a/lib/libspl/include/os/macos/Makefile.am b/lib/libspl/include/os/macos/Makefile.am new file mode 100644 index 0000000000..1d3c559bed --- /dev/null +++ b/lib/libspl/include/os/macos/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = mach rpc sys diff --git a/lib/libspl/include/os/macos/dirent.h b/lib/libspl/include/os/macos/dirent.h new file mode 100644 index 0000000000..b7ffe3d89c --- /dev/null +++ b/lib/libspl/include/os/macos/dirent.h @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _LIBSPL_DIRENT_H +#define _LIBSPL_DIRENT_H + +#include_next + + +/* Handle Linux use of 64 names */ + +#define readdir64 readdir +#define dirent64 dirent + +#endif diff --git a/lib/libspl/include/os/macos/ia32/Makefile.am b/lib/libspl/include/os/macos/ia32/Makefile.am new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/libspl/include/os/macos/ia32/sys/Makefile.am b/lib/libspl/include/os/macos/ia32/sys/Makefile.am new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/libspl/include/os/macos/ia32/sys/asm_linkage.h b/lib/libspl/include/os/macos/ia32/sys/asm_linkage.h new file mode 100644 index 0000000000..0009705ad6 --- /dev/null +++ b/lib/libspl/include/os/macos/ia32/sys/asm_linkage.h @@ -0,0 +1,297 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_ASM_LINKAGE_H +#define _IA32_SYS_ASM_LINKAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +/* + * D16 and A16 are used to insert instructions prefixes; the + * macros help the assembler code be slightly more portable. + */ +#if !defined(__GNUC_AS__) +/* + * /usr/ccs/bin/as prefixes are parsed as separate instructions + */ +#define D16 data16; +#define A16 addr16; + +/* + * (There are some weird constructs in constant expressions) + */ +#define _CONST(const) [const] +#define _BITNOT(const) -1!_CONST(const) +#define _MUL(a, b) _CONST(a \* b) + +#else +/* + * Why not use the 'data16' and 'addr16' prefixes .. well, the + * assembler doesn't quite believe in real mode, and thus argues with + * us about what we're trying to do. + */ +#define D16 .byte 0x66; +#define A16 .byte 0x67; + +#define _CONST(const) (const) +#define _BITNOT(const) ~_CONST(const) +#define _MUL(a, b) _CONST(a * b) + +#endif + +/* + * C pointers are different sizes between i386 and amd64. + * These constants can be used to compute offsets into pointer arrays. + */ +#if defined(__amd64) +#define CLONGSHIFT 3 +#define CLONGSIZE 8 +#define CLONGMASK 7 +#elif defined(__i386) +#define CLONGSHIFT 2 +#define CLONGSIZE 4 +#define CLONGMASK 3 +#endif + +/* + * Since we know we're either ILP32 or LP64 .. 
+ */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 4, 0x90 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 +#define XMM_ALIGN_LOG 4, 0x90 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. These are similar to the ANSI-C + * #pragma weak name = _name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ + .weak sym; \ +/* CSTYLED */ \ +sym = _/**/sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: + + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. 
+ */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/os/macos/mach/Makefile.am b/lib/libspl/include/os/macos/mach/Makefile.am new file mode 100644 index 0000000000..89b0459882 --- /dev/null +++ b/lib/libspl/include/os/macos/mach/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/macos/mach/boolean.h diff --git a/lib/libspl/include/os/macos/mach/boolean.h b/lib/libspl/include/os/macos/mach/boolean.h new file mode 100644 index 0000000000..47c93a3151 --- /dev/null +++ b/lib/libspl/include/os/macos/mach/boolean.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* Deal with XNU's own boolean_t version */ + +#define boolean_t xnu_boolean_t +#include_next +#undef boolean_t diff --git a/lib/libspl/include/os/macos/mntent.h b/lib/libspl/include/os/macos/mntent.h new file mode 100644 index 0000000000..8183dda00b --- /dev/null +++ b/lib/libspl/include/os/macos/mntent.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + */ + +#ifndef _SYS_MNTENT_H +#define _SYS_MNTENT_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#define MNTTAB "/etc/mnttab" +#define VFSTAB "/etc/vfstab" +#define MNTMAXSTR 128 + +#define MNTTYPE_ZFS "zfs" /* ZFS file system */ +#define MNTTYPE_UFS "ufs" /* Unix file system */ +#define MNTTYPE_NFS "nfs" /* NFS file system */ +#define MNTTYPE_NFS3 "nfs3" /* NFS Version 3 file system */ +#define MNTTYPE_NFS4 "nfs4" /* NFS Version 4 file system */ +#define MNTTYPE_CACHEFS "cachefs" /* Cache File System */ +#define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ +#define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ +#define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ +#define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ +#define MNTTYPE_SWAP "swap" /* Swap file system */ +#define MNTTYPE_TMPFS "tmpfs" /* Tmp volatile file system */ +#define MNTTYPE_AUTOFS "autofs" /* Automounter ``file'' system */ +#define MNTTYPE_MNTFS "mntfs" /* In-kernel mnttab */ +#define MNTTYPE_DEV "dev" /* /dev file system */ +#define MNTTYPE_CTFS "ctfs" /* Contract file system */ +#define MNTTYPE_OBJFS "objfs" /* Kernel object file system */ +#define MNTTYPE_SHAREFS "sharefs" /* Kernel sharetab file system */ + + +#define MNTOPT_RO "ro" /* Read only */ +#define MNTOPT_RW "rw" /* Read/write */ +#define MNTOPT_RQ "rq" /* Read/write with quotas */ +#define MNTOPT_QUOTA "quota" /* Check quotas */ +#define MNTOPT_NOQUOTA "noquota" /* Don't check quotas */ +#define MNTOPT_ONERROR "onerror" /* action to taken on error */ +#define MNTOPT_SOFT "soft" /* Soft mount */ +#define MNTOPT_SEMISOFT "semisoft" /* partial soft, uncommited interface */ +#define MNTOPT_HARD "hard" /* Hard mount */ +#define MNTOPT_SUID "suid" /* Both setuid and devices allowed */ +#define MNTOPT_NOSUID "nosuid" /* Neither setuid nor devices allowed */ +#define MNTOPT_DEVICES "devices" /* Device-special allowed */ +#define MNTOPT_NODEVICES "nodevices" /* Device-special disallowed */ +#define MNTOPT_SETUID "setuid" /* Set uid allowed */ +#define MNTOPT_NOSETUID "nosetuid" /* Set uid not allowed */ +#define MNTOPT_GRPID "grpid" /* SysV-compatible gid on create */ +#define MNTOPT_REMOUNT "remount" /* Change mount options */ +#define MNTOPT_NOSUB "nosub" /* Disallow mounts on subdirs */ +#define MNTOPT_MULTI "multi" /* Do multi-component lookup */ +#define MNTOPT_INTR "intr" /* Allow NFS ops to be interrupted */ +#define MNTOPT_NOINTR "nointr" /* Don't allow interrupted ops */ +#define MNTOPT_PORT "port" /* NFS server IP port number */ +#define MNTOPT_SECURE "secure" /* Secure (AUTH_DES) mounting */ +#define MNTOPT_RSIZE "rsize" /* Max NFS read size (bytes) */ +#define MNTOPT_WSIZE "wsize" /* Max NFS write size (bytes) */ +#define MNTOPT_TIMEO "timeo" /* NFS timeout (1/10 sec) */ +#define MNTOPT_RETRANS "retrans" /* Max retransmissions (soft mnts) */ +#define MNTOPT_ACTIMEO "actimeo" /* Attr cache timeout (sec) */ +#define MNTOPT_ACREGMIN "acregmin" /* Min attr cache timeout (files) */ +#define MNTOPT_ACREGMAX "acregmax" /* Max attr cache timeout (files) */ +#define MNTOPT_ACDIRMIN "acdirmin" /* Min attr cache timeout (dirs) */ +#define MNTOPT_ACDIRMAX "acdirmax" /* Max attr cache timeout (dirs) */ +#define MNTOPT_NOAC "noac" /* Don't cache attributes at all */ +#define MNTOPT_NOCTO "nocto" /* No close-to-open consistency */ +#define MNTOPT_BG "bg" /* Do mount retries 
in background */ +#define MNTOPT_FG "fg" /* Do mount retries in foreground */ +#define MNTOPT_RETRY "retry" /* Number of mount retries */ +#define MNTOPT_DEV "dev" /* Device id of mounted fs */ +#define MNTOPT_POSIX "posix" /* Get static pathconf for mount */ +#define MNTOPT_MAP "map" /* Automount map */ +#define MNTOPT_DIRECT "direct" /* Automount direct map mount */ +#define MNTOPT_INDIRECT "indirect" /* Automount indirect map mount */ +#define MNTOPT_LLOCK "llock" /* Local locking (no lock manager) */ +#define MNTOPT_IGNORE "ignore" /* Ignore this entry */ +#define MNTOPT_VERS "vers" /* protocol version number indicator */ +#define MNTOPT_PROTO "proto" /* protocol network_id indicator */ +#define MNTOPT_SEC "sec" /* Security flavor indicator */ +#define MNTOPT_SYNCDIR "syncdir" /* Synchronous local directory ops */ +#define MNTOPT_NOSETSEC "nosec" /* Do no allow setting sec attrs */ +#define MNTOPT_NOPRINT "noprint" /* Do not print messages */ +#define MNTOPT_LARGEFILES "largefiles" /* allow large files */ +#define MNTOPT_NOLARGEFILES "nolargefiles" /* don't allow large files */ +#define MNTOPT_FORCEDIRECTIO "forcedirectio" /* Force DirectIO on all files */ +#define MNTOPT_NOFORCEDIRECTIO "noforcedirectio" /* No Force DirectIO */ +#define MNTOPT_DISABLEDIRECTIO "disabledirectio" /* Disable DirectIO ioctls */ +#define MNTOPT_PUBLIC "public" /* Use NFS public file handlee */ +#define MNTOPT_LOGGING "logging" /* enable logging */ +#define MNTOPT_NOLOGGING "nologging" /* disable logging */ +#define MNTOPT_ATIME "atime" /* update atime for files */ +#define MNTOPT_NOATIME "noatime" /* do not update atime for files */ +#define MNTOPT_GLOBAL "global" /* Cluster-wide global mount */ +#define MNTOPT_NOGLOBAL "noglobal" /* Mount local to single node */ +#define MNTOPT_DFRATIME "dfratime" /* Deferred access time updates */ +#define MNTOPT_NODFRATIME "nodfratime" /* No Deferred access time updates */ +#define MNTOPT_NBMAND "nbmand" /* allow non-blocking mandatory locks */ +#define MNTOPT_NONBMAND "nonbmand" /* deny non-blocking mandatory locks */ +#define MNTOPT_XATTR "xattr" /* enable extended attributes */ +#define MNTOPT_NOXATTR "noxattr" /* disable extended attributes */ +#define MNTOPT_EXEC "exec" /* enable executables */ +#define MNTOPT_NOEXEC "noexec" /* disable executables */ +#define MNTOPT_RESTRICT "restrict" /* restricted autofs mount */ +#define MNTOPT_BROWSE "browse" /* browsable autofs mount */ +#define MNTOPT_NOBROWSE "nobrowse" /* non-browsable autofs mount */ +/* VFS will not ignore ownership information on filesystem objects */ +#define MNTOPT_OWNERS "owners" +/* VFS will ignore ownership information on filesystem objects */ +#define MNTOPT_NOOWNERS "noowners" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MNTENT_H */ diff --git a/lib/libspl/include/os/macos/poll.h b/lib/libspl/include/os/macos/poll.h new file mode 100644 index 0000000000..2bb5203d00 --- /dev/null +++ b/lib/libspl/include/os/macos/poll.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_POLL_H +#define _LIBSPL_POLL_H + +#include_next + +#ifndef O_DIRECT +#define O_DIRECT 0 +#endif + +#endif diff --git a/lib/libspl/include/os/macos/rpc/Makefile.am b/lib/libspl/include/os/macos/rpc/Makefile.am new file mode 100644 index 0000000000..645ec772f9 --- /dev/null +++ b/lib/libspl/include/os/macos/rpc/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/macos/rpc/xdr.h diff --git a/lib/libspl/include/os/macos/rpc/xdr.h b/lib/libspl/include/os/macos/rpc/xdr.h new file mode 100644 index 0000000000..9fc13fefaf --- /dev/null +++ b/lib/libspl/include/os/macos/rpc/xdr.h @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef LIBSPL_MACOS_RPC_XDR_H +#define LIBSPL_MACOS_RPC_XDR_H + +#include +#include_next + +#endif /* LIBSPL_MACOS_RPC_XDR_H */ diff --git a/lib/libspl/include/os/macos/stdlib.h b/lib/libspl/include/os/macos/stdlib.h new file mode 100644 index 0000000000..2cb27f29c1 --- /dev/null +++ b/lib/libspl/include/os/macos/stdlib.h @@ -0,0 +1,28 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_OSX_STDLIB_H +#define _LIBSPL_OSX_STDLIB_H + +#include_next +#include + +#endif diff --git a/lib/libspl/include/os/macos/sys/Makefile.am b/lib/libspl/include/os/macos/sys/Makefile.am new file mode 100644 index 0000000000..fb063b875c --- /dev/null +++ b/lib/libspl/include/os/macos/sys/Makefile.am @@ -0,0 +1,17 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/macos/mach/boolean.h \ + $(top_srcdir)/lib/libspl/include/os/macos/rpc/xdr.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/byteorder.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/errno.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/file.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/kernel_types.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/mnttab.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/mount.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/param.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/stat.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/sysmacros.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/uio.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/vfs.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/xattr.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/zfs_context_os.h diff --git a/lib/libspl/include/os/macos/sys/byteorder.h b/lib/libspl/include/os/macos/sys/byteorder.h new file mode 100644 index 0000000000..dd578e4493 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/byteorder.h @@ -0,0 +1,279 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +#include +#include <_types.h> + +/* + * Define the order of 32-bit words in 64-bit words. + */ +#define _QUAD_HIGHWORD 1 +#define _QUAD_LOWWORD 0 + +/* + * Definitions for byte order, according to byte significance from low + * address to high. 
+ */ +#undef _LITTLE_ENDIAN +/* LSB first: i386, vax */ +#define _LITTLE_ENDIAN 1234 +/* LSB first in word, MSW first in long */ +#define _PDP_ENDIAN 3412 + +#define _BYTE_ORDER _LITTLE_ENDIAN + +/* + * Deprecated variants that don't have enough underscores to be useful in more + * strict namespaces. + */ +#if __BSD_VISIBLE +#define LITTLE_ENDIAN _LITTLE_ENDIAN +#define PDP_ENDIAN _PDP_ENDIAN +#define BYTE_ORDER _BYTE_ORDER +#endif + +#define __bswap16_gen(x) (__uint16_t)((x) << 8 | (x) >> 8) +#define __bswap32_gen(x) \ + (((__uint32_t)__bswap16((x) & 0xffff) << 16) | __bswap16((x) >> 16)) +#define __bswap64_gen(x) \ + (((__uint64_t)__bswap32((x) & 0xffffffff) << 32) | __bswap32((x) >> 32)) + +#ifdef __GNUCLIKE_BUILTIN_CONSTANT_P +#define __bswap16(x) \ + ((__uint16_t)(__builtin_constant_p(x) ? \ + __bswap16_gen((__uint16_t)(x)) : __bswap16_var(x))) +#define __bswap32(x) \ + (__builtin_constant_p(x) ? \ + __bswap32_gen((__uint32_t)(x)) : __bswap32_var(x)) +#define __bswap64(x) \ + (__builtin_constant_p(x) ? \ + __bswap64_gen((__uint64_t)(x)) : __bswap64_var(x)) +#else +/* XXX these are broken for use in static initializers. */ +#define __bswap16(x) __bswap16_var(x) +#define __bswap32(x) __bswap32_var(x) +#define __bswap64(x) __bswap64_var(x) +#endif + +/* These are defined as functions to avoid multiple evaluation of x. */ + +static __inline __uint16_t +__bswap16_var(__uint16_t _x) +{ + + return (__bswap16_gen(_x)); +} + +static __inline __uint32_t +__bswap32_var(__uint32_t _x) +{ + +#ifdef __GNUCLIKE_ASM + __asm("bswap %0" : "+r" (_x)); + return (_x); +#else + return (__bswap32_gen(_x)); +#endif +} +#define __htonl(x) __bswap32(x) +#define __htons(x) __bswap16(x) +#define __ntohl(x) __bswap32(x) +#define __ntohs(x) __bswap16(x) + +#include +#include + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ + +#if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) +/* big-endian */ +#if defined(_BIG_ENDIAN) && (defined(__amd64__) || defined(__amd64)) +#error "incompatible ENDIAN / ARCH combination" +#endif +#define ntohl(x) (x) +#define ntohs(x) (x) +#define htonl(x) (x) +#define htons(x) (x) + +#elif !defined(ntohl) /* little-endian */ + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef uint16_t in_port_t; +#endif + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef uint32_t in_addr_t; +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) +extern uint32_t htonl(uint32_t); +extern uint16_t htons(uint16_t); +extern uint32_t ntohl(uint32_t); +extern uint16_t ntohs(uint16_t); +#else +extern in_addr_t htonl(in_addr_t); +extern in_port_t htons(in_port_t); +extern in_addr_t ntohl(in_addr_t); +extern in_port_t ntohs(in_port_t); +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */ +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _BIG_ENDIAN +#define BE_8(x) 
BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +/* + * Macros to read unaligned values from a specific byte order to + * native byte order + */ + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#define BE_IN64(xa) \ + (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) + +#define LE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define LE_IN16(xa) \ + (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) + +#define LE_IN32(xa) \ + (((uint32_t)LE_IN16((uint8_t *)(xa) + 2) << 16) | LE_IN16(xa)) + +#define LE_IN64(xa) \ + (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) + +/* + * Macros to write unaligned values from native byte order to a specific byte + * order. + */ + +#define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define BE_OUT16(xa, yv) \ + BE_OUT8((uint8_t *)(xa) + 1, yv); \ + BE_OUT8((uint8_t *)(xa), (yv) >> 8); + +#define BE_OUT32(xa, yv) \ + BE_OUT16((uint8_t *)(xa) + 2, yv); \ + BE_OUT16((uint8_t *)(xa), (yv) >> 16); + +#define BE_OUT64(xa, yv) \ + BE_OUT32((uint8_t *)(xa) + 4, yv); \ + BE_OUT32((uint8_t *)(xa), (yv) >> 32); + +#define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define LE_OUT16(xa, yv) \ + LE_OUT8((uint8_t *)(xa), yv); \ + LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); + +#define LE_OUT32(xa, yv) \ + LE_OUT16((uint8_t *)(xa), yv); \ + LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); + +#define LE_OUT64(xa, yv) \ + LE_OUT32((uint8_t *)(xa), yv); \ + LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/os/macos/sys/errno.h b/lib/libspl/include/os/macos/sys/errno.h new file mode 100644 index 0000000000..af4846ebb9 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/errno.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
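A minimal sketch of the byte-order helpers above on a little-endian macOS host, assuming the libspl include directories are searched ahead of the system headers; BE_IN32() assembles a value byte by byte, so it is safe for unaligned, big-endian wire data:

#include <stdio.h>
#include <stdint.h>
#include <sys/byteorder.h>

int
main(void)
{
	/* four bytes as they arrive on the wire, big-endian 0x12345678 */
	uint8_t wire[4] = { 0x12, 0x34, 0x56, 0x78 };
	uint32_t host = BE_IN32(wire);

	(void) printf("%08x\n", host);			/* prints 12345678 */
	(void) printf("%08x\n", BSWAP_32(host));	/* prints 78563412 */
	return (0);
}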
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include_next + +#define EBADE EBADMACHO +#define ECKSUM EBADE +#define EFRAGS EIDRM +#define EREMOTEIO ENOLINK +#define ENOTACTIVE ENOPOLICY +#define ECHRNG EMULTIHOP + +#define ERESTART (-1) /* restart syscall */ diff --git a/lib/libspl/include/os/macos/sys/fcntl.h b/lib/libspl/include/os/macos/sys/fcntl.h new file mode 100644 index 0000000000..f0b03be99b --- /dev/null +++ b/lib/libspl/include/os/macos/sys/fcntl.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _LIBSPL_SYS_FCNTL_H +#define _LIBSPL_SYS_FCNTL_H + +#include_next + +#define O_LARGEFILE 0 +#define O_RSYNC 0 + +#ifndef O_DIRECT +#define O_DIRECT 0 +#endif + +#endif diff --git a/lib/libspl/include/os/macos/sys/file.h b/lib/libspl/include/os/macos/sys/file.h new file mode 100644 index 0000000000..94a33cbb33 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/file.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
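A small, hypothetical illustration of why the zero-valued flags above exist: Linux-flavoured open(2) call sites compile unchanged because the missing flags collapse to no-ops on macOS.

#include <fcntl.h>

/* hypothetical helper, not part of the patch */
static int
open_streaming(const char *path)
{
	/* O_LARGEFILE and O_DIRECT expand to 0 via the shims above, so
	 * on macOS this is simply open(path, O_RDONLY) */
	return (open(path, O_RDONLY | O_LARGEFILE | O_DIRECT));
}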
+ */ + +#ifndef _LIBSPL_SYS_FILE_H +#define _LIBSPL_SYS_FILE_H + +#include_next + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FOFFMAX 0 +#define FSYNC O_SYNC +#define FDSYNC O_DSYNC +#define FRSYNC O_RSYNC +#define FEXCL O_EXCL + +#define IO_DIRECT 0 + +#define FNODSYNC 0x10000 /* fsync pseudo flag */ +#define FNOFOLLOW 0x20000 /* don't follow symlinks */ +#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ + +#endif diff --git a/lib/libspl/include/os/macos/sys/kernel_types.h b/lib/libspl/include/os/macos/sys/kernel_types.h new file mode 100644 index 0000000000..5796351a20 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/kernel_types.h @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef LIBSPL_SYS_KERNEL_TYPES_H +#define LIBSPL_SYS_KERNEL_TYPES_H + +/* + * Unfortunately, XNU defines uio_t, proc_t and vnode_t differently to + * ZFS, so we need to hack around it. + */ + +#undef vnode_t +#undef uio_t +#define proc_t kernel_proc_t +#include_next +#define vnode_t struct vnode +#define uio_t struct uio +#undef proc_t + + +/* Other missing Linux types */ +typedef off_t loff_t; + +#endif diff --git a/lib/libspl/include/os/macos/sys/mnttab.h b/lib/libspl/include/os/macos/sys/mnttab.h new file mode 100644 index 0000000000..9ba5d08b21 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/mnttab.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +/* Copyright 2006 Ricardo Correia */ + +#ifndef _SYS_MNTTAB_H +#define _SYS_MNTTAB_H + +#include +#include +#include +#include + +#ifdef MNTTAB +#undef MNTTAB +#endif /* MNTTAB */ + +#include +#include +#define MNTTAB _PATH_DEVNULL +#define MS_NOMNTTAB 0x0 +#define MS_RDONLY 0x1 +#define umount2(p, f) unmount(p, f) +#define MNT_LINE_MAX 4096 + +#define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ +#define MNT_TOOMANY 2 /* too many fields in line */ +#define MNT_TOOFEW 3 /* too few fields in line */ + +struct mnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; + uint_t mnt_major; + uint_t mnt_minor; + uint32_t mnt_fssubtype; +}; + +#define extmnttab mnttab + +struct stat64; +struct statfs; + +extern DIR *fdopendir(int fd); +extern int openat64(int, const char *, int, ...); + +extern int getmntany(FILE *fd, struct mnttab *mgetp, struct mnttab *mrefp); +extern int getmntent(FILE *fp, struct mnttab *mp); +extern char *hasmntopt(struct mnttab *mnt, char *opt); +extern int getextmntent(const char *path, struct extmnttab *entry, + struct stat64 *statbuf); + +extern void statfs2mnttab(struct statfs *sfs, struct mnttab *mp); + +#ifndef AT_SYMLINK_NOFOLLOW +#define AT_SYMLINK_NOFOLLOW 0x100 +#endif + +extern int fstatat64(int, const char *, struct stat *, int); + +#endif diff --git a/lib/libspl/include/os/macos/sys/mount.h b/lib/libspl/include/os/macos/sys/mount.h new file mode 100644 index 0000000000..640159b544 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/mount.h @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _LIBSPL_SYS_MOUNT_H +#define _LIBSPL_SYS_MOUNT_H + +#undef _SYS_MOUNT_H_ + +#include +#include +#include +#include +#include + +/* Unfortunately, XNU has a different meaning for "vnode_t". */ +#undef vnode_t +#include_next +#define vnode_t struct vnode + +/* + * Some old glibc headers don't define BLKGETSIZE64 + * and we don't want to require the kernel headers + */ +#if !defined(BLKGETSIZE64) +#define BLKGETSIZE64 _IOR(0x12, 114, size_t) +#endif + +/* + * Some old glibc headers don't correctly define MS_DIRSYNC and + * instead use the enum name S_WRITE. When using these older + * headers define MS_DIRSYNC to be S_WRITE. + */ +#if !defined(MS_DIRSYNC) +#define MS_DIRSYNC S_WRITE +#endif + +/* + * Some old glibc headers don't correctly define MS_POSIXACL and + * instead leave it undefined. When using these older headers define + * MS_POSIXACL to the reserved value of (1<<16). 
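A sketch of how a caller walks the mount table through the emulation declared above; it assumes the libspl implementation further below (getmntany.c), where the FILE * argument is accepted for compatibility but never used:

#include <stdio.h>
#include <string.h>
#include <sys/mnttab.h>

static void
print_zfs_mounts(void)
{
	struct mnttab entry;

	/* getmntent() snapshots the mount table with getfsstat(2) and
	 * returns -1 once every entry has been handed out */
	while (getmntent(NULL, &entry) == 0) {
		if (strcmp(entry.mnt_fstype, "zfs") == 0)
			(void) printf("%s on %s (%s)\n", entry.mnt_special,
			    entry.mnt_mountp, entry.mnt_mntopts);
	}
}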
+ */ +#if !defined(MS_POSIXACL) +#define MS_POSIXACL (1<<16) +#endif + +#define MS_NOSUID MNT_NOSUID +#define MS_NOEXEC MNT_NOEXEC +#define MS_NODEV MNT_NODEV +#define S_WRITE 0 +#define MS_BIND 0 +#define MS_REMOUNT MNT_UPDATE +#define MS_SYNCHRONOUS MNT_SYNCHRONOUS + +#define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) +#define MS_OWNER (MS_NOSUID|MS_NODEV) +#define MS_GROUP (MS_NOSUID|MS_NODEV) +#define MS_COMMENT 0 + +/* + * Older glibc headers did not define all the available + * umount2(2) flags. Both MNT_FORCE and MNT_DETACH are supported in the + * kernel back to 2.4.11 so we define them correctly if they are missing. + */ +#ifdef MNT_FORCE +#define MS_FORCE MNT_FORCE +#else +#define MS_FORCE 0x00000001 +#endif /* MNT_FORCE */ + +#ifdef MNT_DETACH +#define MS_DETACH MNT_DETACH +#else +#define MS_DETACH 0x00000002 +#endif /* MNT_DETACH */ + +/* + * Overlay mount is default in Linux, but for solaris/zfs + * compatibility, MS_OVERLAY is defined to explicitly have the user + * provide a flag (-O) to mount over a non empty directory. + */ +#define MS_OVERLAY 0x00000004 + +/* + * MS_CRYPT indicates that encryption keys should be loaded if they are not + * already available. This is not defined in glibc, but it is never seen by + * the kernel so it will not cause any problems. + */ +#define MS_CRYPT 0x00000008 + +#endif /* _LIBSPL_SYS_MOUNT_H */ diff --git a/lib/libspl/include/os/macos/sys/param.h b/lib/libspl/include/os/macos/sys/param.h new file mode 100644 index 0000000000..a9815f10b5 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/param.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_PARAM_H +#define _LIBSPL_SYS_PARAM_H + +#include_next +#include + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, + * with smaller units (fragments) only in the last direct block. + * MAXBSIZE primarily determines the size of buffers in the buffer + * pool. It may be made larger without any effect on existing + * file systems; however making it smaller may make some file + * systems unmountable. + * + * Note that the blocked devices are assumed to have DEV_BSIZE + * "sectors" and that fragments must be some multiple of this size. 
+ */ +#define MAXNAMELEN 256 + +#define UID_NOBODY 60001 /* user ID no body */ +#define GID_NOBODY UID_NOBODY +#define UID_NOACCESS 60002 /* user ID no access */ + +#define MAXUID UINT32_MAX /* max user id */ +#define MAXPROJID MAXUID /* max project id */ + +#ifndef PAGESIZE +#define PAGESIZE (sysconf(_SC_PAGESIZE)) +#endif /* PAGESIZE */ + +extern int execvpe(const char *name, char * const argv[], char * const envp[]); + +struct zfs_handle; + +#endif diff --git a/lib/libspl/include/os/macos/sys/stat.h b/lib/libspl/include/os/macos/sys/stat.h new file mode 100644 index 0000000000..1c7858194d --- /dev/null +++ b/lib/libspl/include/os/macos/sys/stat.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBSPL_SYS_STAT_H +#define _LIBSPL_SYS_STAT_H + +#include_next + +#include +#include /* for BLKGETSIZE64 */ + +#define MAXOFFSET_T OFF_MAX + +#ifndef _KERNEL +#include +#endif + +static inline int +fstat_blk(int fd, struct stat *st) +{ + if (fstat(fd, st) == -1) + return (-1); + + /* In OS X we need to use ioctl to get the size of a block dev */ + if (st->st_mode & (S_IFBLK | S_IFCHR)) { + uint32_t blksize; + uint64_t blkcnt; + + if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) < 0) { + return (-1); + } + if (ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) < 0) { + return (-1); + } + + st->st_size = (off_t)((uint64_t)blksize * blkcnt); + } + + return (0); +} + + +/* + * Deal with Linux use of 64 for everything. + * OsX has moved past it, dropped all 32 versions, and + * standard form is 64 bit. + */ + +#define stat64 stat +#define lstat64 lstat +#define fstat64 fstat +#define fstat64_blk fstat_blk +#define statfs64 statfs + +#endif /* _LIBSPL_SYS_STAT_H */ diff --git a/lib/libspl/include/os/macos/sys/sysmacros.h b/lib/libspl/include/os/macos/sys/sysmacros.h new file mode 100644 index 0000000000..7480eb85a5 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/sysmacros.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
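A hypothetical caller of the fstat_blk() shim above, showing why it exists: a plain fstat(2) does not report a usable size for a macOS device node, so the shim derives one from the DKIOCGETBLOCKSIZE/DKIOCGETBLOCKCOUNT ioctls, as noted in its comment:

#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

/* hypothetical helper: size of a device such as "/dev/disk2" */
static long long
device_size(const char *dev)
{
	struct stat st;
	int fd = open(dev, O_RDONLY);

	if (fd == -1)
		return (-1);
	if (fstat_blk(fd, &st) == -1) {
		(void) close(fd);
		return (-1);
	}
	(void) close(fd);
	/* for block/character devices this is blksize * blkcnt */
	return ((long long)st.st_size);
}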
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_SYSMACROS_H +#define _LIBSPL_SYS_SYSMACROS_H + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define makedevice(maj, min) makedev(maj, min) +#define _sysconf(a) sysconf(a) + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + + +/* avoid any possibility of clashing with version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +#endif /* _LIBSPL_SYS_SYSMACROS_H */ diff --git a/lib/libspl/include/os/macos/sys/uio.h b/lib/libspl/include/os/macos/sys/uio.h new file mode 100644 index 0000000000..f646b3e5d5 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/uio.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
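A worked example of the power-of-two helpers above (illustrative only), using a 4096-byte alignment:

#include <assert.h>
#include <stdint.h>
#include <sys/sysmacros.h>

static void
p2_examples(void)
{
	uint64_t off = 5000;

	assert(P2ALIGN(off, 4096) == 4096);	/* round down to boundary */
	assert(P2ROUNDUP(off, 4096) == 8192);	/* round up to boundary */
	assert(P2PHASE(off, 4096) == 904);	/* offset within the block */
	assert(P2NPHASE(off, 4096) == 3192);	/* bytes to next boundary */
	assert(ISP2(4096) && !ISP2(5000));	/* power-of-two test */
}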
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _LIBSPL_SYS_UIO_H +#define _LIBSPL_SYS_UIO_H + +#include +#include_next + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* + * I/O parameter information. A uio structure describes the I/O which + * is to be performed by an operation. Typically the data movement will + * be performed by a routine such as uiomove(), which updates the uio + * structure to reflect what was done. + */ + +typedef struct iovec iovec_t; + + +/* + * I/O direction. + */ +// typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t; + +/* + * Segment flag values. + */ +typedef enum uio_seg { UIO_USERSPACE, UIO_SYSSPACE, UIO_USERISPACE } uio_seg_t; + + +struct uio { + struct iovec *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + off_t uio_offset; /* file offset */ + uio_seg_t uio_segflg; /* address space (kernel or user) */ + off_t uio_limit; /* u-limit (maximum byte offset) */ + ssize_t uio_resid; /* residual count */ + enum uio_rw uio_rw; + int uio_max_iovs; /* max iovecs this uio_t can hold */ + uint32_t uio_index; /* Current index */ +}; + + +uio_t *uio_create(int iovcount, off_t offset, int spacetype, int iodirection); +void uio_free(uio_t *uio); +int uio_addiov(uio_t *uio, user_addr_t baseaddr, user_size_t length); +int uio_isuserspace(uio_t *uio); +int uio_getiov(uio_t *uio, int index, user_addr_t *baseaddr, + user_size_t *length); +int uio_iovcnt(uio_t *uio); +off_t uio_offset(uio_t *uio); +void uio_update(uio_t *uio, user_size_t count); +uint64_t uio_resid(uio_t *uio); +user_addr_t uio_curriovbase(uio_t *uio); +user_size_t uio_curriovlen(uio_t *uio); +void uio_setoffset(uio_t *uio, off_t a_offset); +uio_t *uio_duplicate(uio_t *uio); +int uio_rw(uio_t *a_uio); +void uio_setrw(uio_t *a_uio, int a_value); + +int uiomove(void *, uint32_t, enum uio_rw, struct uio *); +int spllib_uiomove(const uint8_t *, uint32_t, struct uio *); +void uioskip(struct uio *, uint32_t); +int uiodup(struct uio *, struct uio *, iovec_t *, int); + +// xuio struct is not used in this platform, but we define it +// to allow compilation and easier patching +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t *xu_uio; + enum xuio_type xu_type; + union { + struct { + 
uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +static inline int uiocopy(const unsigned char *p, uint32_t n, + enum uio_rw rw, struct uio *uio, uint64_t *cbytes) +{ + int result; + struct uio *nuio = uio_duplicate(uio); + unsigned long long x = uio_resid(uio); + if (!nuio) + return (ENOMEM); + uio_setrw(nuio, rw); + result = spllib_uiomove(p, n, nuio); + *cbytes = (x - uio_resid(nuio)); + uio_free(nuio); + return (result); +} + +// Apple's uiomove puts the uio_rw in uio_create +#define uiomove(A, B, C, D) spllib_uiomove((A), (B), (D)) +#define uioskip(A, B) uio_update((A), (B)) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UIO_H */ diff --git a/lib/libspl/include/os/macos/sys/vfs.h b/lib/libspl/include/os/macos/sys/vfs.h new file mode 100644 index 0000000000..a2ffcc08d8 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/vfs.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef ZFS_SYS_VFS_H_ +#define ZFS_SYS_VFS_H_ + +#endif /* !ZFS_SYS_VFS_H_ */ diff --git a/lib/libspl/include/os/macos/sys/xattr.h b/lib/libspl/include/os/macos/sys/xattr.h new file mode 100644 index 0000000000..045f681b1e --- /dev/null +++ b/lib/libspl/include/os/macos/sys/xattr.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
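An illustrative sketch, not part of the patch, of how an unchanged Solaris-style caller behaves under the uiomove()/uioskip() macros above: the rw argument disappears during macro expansion, because on macOS the transfer direction was already fixed when the uio was created (see the comment above the macros).

#include <stdint.h>
#include <sys/uio.h>

/* hypothetical caller, written in the Solaris style */
static int
copy_block(const uint8_t *src, uint32_t len, struct uio *uio)
{
	/* expands to spllib_uiomove((src), (len), (uio)); the UIO_READ
	 * argument never reaches the compiler */
	return (uiomove(src, len, UIO_READ, uio));
}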
+ */ + +#ifndef _LIBSPL_SYS_XATTR_H +#define _LIBSPL_SYS_XATTR_H + +#include_next + +/* macOS has one more argument */ +#define setxattr(A, B, C, D, E) setxattr(A, B, C, D, E, 0) +#define getxattr(A, B, C, D, E) getxattr(A, B, C, D, E, 0) + +#endif diff --git a/lib/libspl/include/os/macos/sys/zfs_context_os.h b/lib/libspl/include/os/macos/sys/zfs_context_os.h new file mode 100644 index 0000000000..3324a1cf25 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/zfs_context_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef ZFS_CONTEXT_OS_H_ +#define ZFS_CONTEXT_OS_H_ + +#include + +#define ZFS_EXPORTS_PATH "/etc/exports" +#define MNTTYPE_ZFS_SUBTYPE ('Z'<<24|'F'<<16|'S'<<8) + +struct spa_iokit; +typedef struct spa_iokit spa_iokit_t; + +typedef off_t loff_t; + +struct zfs_handle; + +extern void zfs_rollback_os(struct zfs_handle *zhp); +extern void libzfs_macos_wrapfd(int *srcfd, boolean_t send); + +#endif diff --git a/lib/libspl/include/os/macos/time.h b/lib/libspl/include/os/macos/time.h new file mode 100644 index 0000000000..3b1717e6cc --- /dev/null +++ b/lib/libspl/include/os/macos/time.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_TIME_H +#define _LIBSPL_TIME_H + +#include_next + +/* Linux also has a timer_create() API we need to emulate. 
*/ + +/* + * OsX version can probably be implemented by using: + * dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, queue); + * dispatch_source_set_event_handler(timer1, ^{vector1(timer1);}); + * dispatch_source_set_cancel_handler(timer1 + * dispatch_time_t start = dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC); + * dispatch_source_set_timer(timer1, start, NSEC_PER_SEC / 5, 0); + */ + +typedef void *timer_t; + +struct itimerspec { + struct timespec it_interval; /* timer period */ + struct timespec it_value; /* timer expiration */ +}; + +struct sigevent; + +static inline int +timer_create(clockid_t clockid, struct sigevent *sevp, + timer_t *timerid) +{ + return (0); +} + +static inline int +timer_settime(timer_t id, int flags, + const struct itimerspec *its, struct itimerspec *remainvalue) +{ + return (0); +} + +static inline int +timer_delete(timer_t id) +{ + return (0); +} + +#endif diff --git a/lib/libspl/include/os/macos/unistd.h b/lib/libspl/include/os/macos/unistd.h new file mode 100644 index 0000000000..4a2814b9ec --- /dev/null +++ b/lib/libspl/include/os/macos/unistd.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_UNISTD_H +#define _LIBSPL_UNISTD_H + +#include_next +#include + +/* Handle Linux use of 64 names */ + +#define open64 open +#define pread64 pread +#define pwrite64 pwrite +#define ftruncate64 ftruncate +#define lseek64 lseek + + +static inline int +fdatasync(int fd) +{ + if (fcntl(fd, F_FULLFSYNC) == -1) + return (-1); + return (0); +} + +#endif diff --git a/lib/libspl/os/macos/getexecname.c b/lib/libspl/os/macos/getexecname.c new file mode 100644 index 0000000000..7e37958869 --- /dev/null +++ b/lib/libspl/os/macos/getexecname.c @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
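The timer_create()/timer_settime()/timer_delete() stubs above always report success without arming anything. A hedged sketch of the dispatch-based emulation the time.h comment suggests, assuming Grand Central Dispatch and clang blocks are available; the helper and parameter names here are illustrative:

#include <stdint.h>
#include <dispatch/dispatch.h>

/* illustrative only: arm a repeating timer that calls fire() every
 * interval_ns nanoseconds, along the lines the header comment outlines */
static dispatch_source_t
start_interval_timer(uint64_t interval_ns, void (*fire)(void))
{
	dispatch_queue_t q = dispatch_get_global_queue(
	    DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
	dispatch_source_t timer = dispatch_source_create(
	    DISPATCH_SOURCE_TYPE_TIMER, 0, 0, q);

	if (timer == NULL)
		return (NULL);

	dispatch_source_set_event_handler(timer, ^{ fire(); });
	dispatch_source_set_timer(timer,
	    dispatch_time(DISPATCH_TIME_NOW, interval_ns),
	    interval_ns, 0);
	dispatch_resume(timer);
	return (timer);
}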
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +const char * +getexecname(void) +{ + return (getprogname()); +} diff --git a/lib/libspl/os/macos/gethostid.c b/lib/libspl/os/macos/gethostid.c new file mode 100644 index 0000000000..bedea579ed --- /dev/null +++ b/lib/libspl/os/macos/gethostid.c @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, Jorgen Lundman + */ + +#include +#include +#include + +unsigned long +get_system_hostid(void) +{ + size_t len; + uint32_t myhostid = 0; + len = sizeof (myhostid); + sysctlbyname("kern.hostid", &myhostid, &len, NULL, 0); + return (myhostid); +} diff --git a/lib/libspl/os/macos/getmntany.c b/lib/libspl/os/macos/getmntany.c new file mode 100644 index 0000000000..f3fec9654e --- /dev/null +++ b/lib/libspl/os/macos/getmntany.c @@ -0,0 +1,462 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Ricardo Correia. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#include +#include +#include +#include /* for isspace() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIFF(xx) ((mrefp->xx != NULL) && \ + (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) + +static struct statfs *gsfs = NULL; +static int allfs = 0; +/* + * We will also query the extended filesystem capabilities API, to lookup + * other mount options, for example, XATTR. We can not use the MNTNOUSERXATTR + * option due to VFS rejecting with EACCESS. 
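For illustration, the capabilities probe the comment above describes, pulled out as a standalone helper; statfs2mnttab() further below performs the same query inline when building the mount-option string:

#include <string.h>
#include <unistd.h>
#include <sys/attr.h>

/* returns 1 if the mounted volume advertises native extended-attribute
 * support, 0 if it does not, -1 on error */
static int
volume_supports_xattr(const char *mntonname)
{
	struct attrlist al;
	struct {
		u_int32_t length;
		vol_capabilities_set_t caps;
	} __attribute__((aligned(4), packed)) buf;

	(void) memset(&al, 0, sizeof (al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.volattr = ATTR_VOL_INFO | ATTR_VOL_CAPABILITIES;

	if (getattrlist(mntonname, &al, &buf, sizeof (buf), 0) != 0)
		return (-1);

	return ((buf.caps[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_EXTENDED_ATTR) != 0);
}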
+ */ + +#include +typedef struct attrlist attrlist_t; + +struct attrBufS { + u_int32_t length; + vol_capabilities_set_t caps; +} __attribute__((aligned(4), packed)); + + +DIR * +fdopendir(int fd) +{ + char fullpath[MAXPATHLEN]; + + if (fcntl(fd, F_GETPATH, fullpath) < 0) { + perror("fcntl"); + return (NULL); + } + if (close(fd) < 0) { + return (NULL); + } + + return (opendir(fullpath)); +} + +static int +chdir_block_begin(int newroot_fd) +{ + int cwdfd, error; + + cwdfd = open(".", O_RDONLY | O_DIRECTORY); + if (cwdfd == -1) + return (-1); + + if (fchdir(newroot_fd) == -1) { + error = errno; + (void) close(cwdfd); + errno = error; + return (-1); + } + return (cwdfd); +} + +static void +chdir_block_end(int cwdfd) +{ + int error = errno; + (void) fchdir(cwdfd); + (void) close(cwdfd); + errno = error; +} + +int +openat64(int dirfd, const char *path, int flags, ...) +{ + int cwdfd, filefd; + + if ((cwdfd = chdir_block_begin(dirfd)) == -1) + return (-1); + + if ((flags & O_CREAT) != 0) { + va_list ap; + int mode; + + va_start(ap, flags); + mode = va_arg(ap, int); + va_end(ap); + + filefd = open(path, flags, mode); + } else + filefd = open(path, flags); + + chdir_block_end(cwdfd); + return (filefd); +} + +int +fstatat64(int dirfd, const char *path, struct stat *statbuf, int flag) +{ + int cwdfd, error; + + if ((cwdfd = chdir_block_begin(dirfd)) == -1) + return (-1); + + if (flag == AT_SYMLINK_NOFOLLOW) + error = lstat(path, statbuf); + else + error = stat(path, statbuf); + + chdir_block_end(cwdfd); + return (error); +} + + +static char * +mntopt(char **p) +{ + char *cp = *p; + char *retstr; + + while (*cp && isspace(*cp)) + cp++; + + retstr = cp; + while (*cp && *cp != ',') + cp++; + + if (*cp) { + *cp = '\0'; + cp++; + } + + *p = cp; + return (retstr); +} + +char * +hasmntopt(struct mnttab *mnt, char *opt) +{ + char tmpopts[256]; + char *f, *opts = tmpopts; + + if (mnt->mnt_mntopts == NULL) + return (NULL); + (void) strlcpy(opts, mnt->mnt_mntopts, 256); + f = mntopt(&opts); + for (; *f; f = mntopt(&opts)) { + if (strncmp(opt, f, strlen(opt)) == 0) + return (f - tmpopts + mnt->mnt_mntopts); + } + return (NULL); +} + +static void +optadd(char *mntopts, size_t size, const char *opt) +{ + + if (mntopts[0] != '\0') + strlcat(mntopts, ",", size); + strlcat(mntopts, opt, size); +} + + +#include +#include +#include +#include +#include + + +char * +MYCFStringCopyUTF8String(CFStringRef aString) +{ + if (aString == NULL) + return (NULL); + + CFIndex length = CFStringGetLength(aString); + CFIndex maxSize = + CFStringGetMaximumSizeForEncoding(length, + kCFStringEncodingUTF8); + char *buffer = (char *)malloc(maxSize); + if (CFStringGetCString(aString, buffer, maxSize, + kCFStringEncodingUTF8)) { + return (buffer); + } + return (NULL); +} + +/* + * Given "/dev/disk6" connect to IOkit and fetch the dataset + * name "BOOM/lower", and use it instead. + */ +void +expand_disk_to_zfs(char *devname, int len) +{ + char *result = NULL; + CFMutableDictionaryRef matchingDict; + io_service_t service; + CFStringRef cfstr; + char *device; + + if (strncmp(devname, "/dev/disk", 9) != 0) + return; + + device = &devname[5]; + + matchingDict = IOBSDNameMatching(kIOMasterPortDefault, 0, device); + if (NULL == matchingDict) + return; + + /* + * Fetch the object with the matching BSD node name. + * Note that there should only be one match, so + * IOServiceGetMatchingService is used instead of + * IOServiceGetMatchingServices to simplify the code. 
+ */ + service = IOServiceGetMatchingService(kIOMasterPortDefault, + matchingDict); + + if (IO_OBJECT_NULL == service) { + return; + } + + cfstr = IORegistryEntryCreateCFProperty(service, + CFSTR("ZFS Dataset"), kCFAllocatorDefault, 0); + if (cfstr) { + result = MYCFStringCopyUTF8String(cfstr); + CFRelease(cfstr); + } + + IOObjectRelease(service); + + if (result) { + strlcpy(devname, result, len); + free(result); + } +} + +void +statfs2mnttab(struct statfs *sfs, struct mnttab *mp) +{ + static char mntopts[MNTMAXSTR]; + long flags; + + mntopts[0] = '\0'; + + flags = sfs->f_flags; +#define OPTADD(opt) optadd(mntopts, sizeof (mntopts), (opt)) + if (flags & MNT_RDONLY) + OPTADD(MNTOPT_RO); + else + OPTADD(MNTOPT_RW); + if (flags & MNT_NOSUID) +#ifdef __FreeBSD__ + OPTADD(MNTOPT_NOSUID); +#elif defined(__APPLE__) + OPTADD(MNTOPT_NOSETUID); +#endif + else + OPTADD(MNTOPT_SETUID); + if (flags & MNT_UPDATE) + OPTADD(MNTOPT_REMOUNT); + if (flags & MNT_NOATIME) + OPTADD(MNTOPT_NOATIME); + else + OPTADD(MNTOPT_ATIME); + { + struct attrBufS attrBuf; + attrlist_t attrList; + + memset(&attrList, 0, sizeof (attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.volattr = ATTR_VOL_INFO|ATTR_VOL_CAPABILITIES; + + if (getattrlist(sfs->f_mntonname, &attrList, &attrBuf, + sizeof (attrBuf), 0) == 0) { + + if (attrBuf.caps[VOL_CAPABILITIES_INTERFACES] & + VOL_CAP_INT_EXTENDED_ATTR) { + OPTADD(MNTOPT_XATTR); + } else { + OPTADD(MNTOPT_NOXATTR); + } // If EXTENDED + } // if getattrlist + } + if (flags & MNT_NOEXEC) + OPTADD(MNTOPT_NOEXEC); + else + OPTADD(MNTOPT_EXEC); + if (flags & MNT_NODEV) + OPTADD(MNTOPT_NODEVICES); + else + OPTADD(MNTOPT_DEVICES); + if (flags & MNT_DONTBROWSE) + OPTADD(MNTOPT_NOBROWSE); + else + OPTADD(MNTOPT_BROWSE); + if (flags & MNT_IGNORE_OWNERSHIP) + OPTADD(MNTOPT_NOOWNERS); + else + OPTADD(MNTOPT_OWNERS); + +#undef OPTADD + + // If a disk is /dev/diskX, lets see if it has "zfs_dataset_name" + // set, and if so, use it instead, for mount matching. 
+ expand_disk_to_zfs(sfs->f_mntfromname, sizeof (sfs->f_mntfromname)); + + mp->mnt_special = sfs->f_mntfromname; + mp->mnt_mountp = sfs->f_mntonname; + mp->mnt_fstype = sfs->f_fstypename; + mp->mnt_mntopts = mntopts; + mp->mnt_fssubtype = sfs->f_fssubtype; + +} + +static int +statfs_init(void) +{ + struct statfs *sfs; + int error; + + if (gsfs != NULL) { + free(gsfs); + gsfs = NULL; + } + allfs = getfsstat(NULL, 0, MNT_NOWAIT); + if (allfs == -1) + goto fail; + gsfs = malloc(sizeof (gsfs[0]) * allfs * 2); + if (gsfs == NULL) + goto fail; + allfs = getfsstat(gsfs, (long)(sizeof (gsfs[0]) * allfs * 2), + MNT_NOWAIT); + if (allfs == -1) + goto fail; + sfs = realloc(gsfs, allfs * sizeof (gsfs[0])); + if (sfs != NULL) + gsfs = sfs; + return (0); +fail: + error = errno; + if (gsfs != NULL) + free(gsfs); + gsfs = NULL; + allfs = 0; + return (error); +} + +int +getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) +{ + int i, error; + + error = statfs_init(); + if (error != 0) + return (error); + + for (i = 0; i < allfs; i++) { + statfs2mnttab(&gsfs[i], mgetp); + if (mrefp->mnt_special != NULL && mgetp->mnt_special != NULL && + strcmp(mrefp->mnt_special, mgetp->mnt_special) != 0) { + continue; + } + if (mrefp->mnt_mountp != NULL && mgetp->mnt_mountp != NULL && + strcmp(mrefp->mnt_mountp, mgetp->mnt_mountp) != 0) { + continue; + } + if (mrefp->mnt_fstype != NULL && mgetp->mnt_fstype != NULL && + strcmp(mrefp->mnt_fstype, mgetp->mnt_fstype) != 0) { + continue; + } + return (0); + } + return (-1); +} + +int +getmntent(FILE *fp, struct mnttab *mp) +{ + static int index = -1; + int error = 0; + + if (index < 0) { + error = statfs_init(); + } + + if (error != 0) + return (error); + + index++; + + // If we have finished "reading" the mnttab, reset it to + // start from the beginning, and return EOF. + if (index >= allfs) { + index = -1; + return (-1); + } + + statfs2mnttab(&gsfs[index], mp); + return (0); +} + +int +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) +{ + struct statfs sfs; + + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); + return (-1); + } + + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } + + if (statfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + return (-1); + } + statfs2mnttab(&sfs, (struct mnttab *)entry); + return (0); +} diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 4a5d448af6..bc53c3979f 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -2718,15 +2718,19 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, #ifdef __APPLE__ /* - * On OSX by default we mount pools under /Volumes unless - * the dataset property mountpoint specifies otherwise. - * In addition to this, there is an undocumented environment - * variable __ZFS_MAIN_MOUNTPOINT_DIR, used mainly by the - * testing environment, as it expects "/" by default. + * On OSX by default we mount pools under /Volumes + * unless the dataset property mountpoint specifies + * otherwise. + * In addition to this, there is an undocumented + * environment variable __ZFS_MAIN_MOUNTPOINT_DIR, + * used mainly by the testing environment, as it + * expects "/" by default. 
*/ char *default_mountpoint; - default_mountpoint = getenv("__ZFS_MAIN_MOUNTPOINT_DIR"); - if (!default_mountpoint) default_mountpoint = "/Volumes/"; + default_mountpoint = + getenv("__ZFS_MAIN_MOUNTPOINT_DIR"); + if (!default_mountpoint) + default_mountpoint = "/Volumes/"; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", @@ -2734,8 +2738,8 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, source == NULL || - source[0] == '\0' ? default_mountpoint : "/", - relpath); + source[0] == '\0' ? default_mountpoint : + "/", relpath); #else diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 3de7d7d9cc..b6c7275db7 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1100,6 +1100,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) } if (!sdd->dryrun) { + +#if defined(__APPLE__) + /* Can't do IO on pipes, possibly wrap fd in domain socket */ + libzfs_macos_wrapfd(&sdd->outfd, B_TRUE); +#endif + /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. @@ -2483,6 +2489,11 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, if (flags->dryrun) return (0); +#if defined(__APPLE__) + /* Can't do IO on pipes, possibly wrap fd in domain socket */ + libzfs_macos_wrapfd(&fd, B_TRUE); +#endif + /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. @@ -2573,6 +2584,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_standard_error(hdl, errno, errbuf)); } } + return (err != 0); } @@ -4646,6 +4658,11 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } +#if defined(__APPLE__) + /* Can't do IO on pipes, possibly wrap fd in domain socket */ + libzfs_macos_wrapfd(&infd, B_FALSE); +#endif + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, diff --git a/lib/libzfs/os/macos/libzfs_mount_os.c b/lib/libzfs/os/macos/libzfs_mount_os.c new file mode 100644 index 0000000000..127a3cd422 --- /dev/null +++ b/lib/libzfs/os/macos/libzfs_mount_os.c @@ -0,0 +1,486 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright 2017 RackTop Systems. 
+ * Copyright (c) 2018 Datto Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" +#include +#include + + +/* + * The default OpenZFS icon. Compare against known values to see if it needs + * updating. Allowing users to set own. + * No file: copy icon + * correct size: do nothing + * other size: user custom icon, do nothing + */ + +/* icon name on root of a mount */ +#define MOUNT_POINT_CUSTOM_ICON ".VolumeIcon.icns" + +/* source icon name from inside zfs.kext bundle */ +#define CUSTOM_ICON_PATH \ + KERNEL_MODPREFIX "/zfs.kext/Contents/Resources/VolumeIcon.icns" + +#include + + +/* + * On OSX we can set the icon to an Open ZFS specific one, just to be extra + * shiny + */ +static void +zfs_mount_seticon(const char *mountpoint) +{ + /* For a root file system, add a volume icon. */ + ssize_t attrsize; + uint16_t finderinfo[16]; + struct stat sbuf; + char *path = NULL; + FILE *dstfp = NULL, *srcfp = NULL; + unsigned char buf[1024]; + unsigned int red; + + if (asprintf(&path, "%s/%s", mountpoint, MOUNT_POINT_CUSTOM_ICON) == -1) + return; + + /* If we can stat it, and it has a size, leave it be. */ + if ((stat(path, &sbuf) == 0 && sbuf.st_size > 0)) + goto out; + + /* Looks like we should copy the icon over */ + + /* check if we can read in the default ZFS icon */ + srcfp = fopen(CUSTOM_ICON_PATH, "r"); + + /* No source icon */ + if (!srcfp) + goto out; + + /* Open the output icon for writing */ + dstfp = fopen(path, "w"); + if (!dstfp) + goto out; + + /* Copy icon */ + while ((red = fread(buf, 1, sizeof (buf), srcfp)) > 0) + (void) fwrite(buf, 1, red, dstfp); + + /* We have copied it, set icon */ + attrsize = getxattr(mountpoint, XATTR_FINDERINFO_NAME, &finderinfo, + sizeof (finderinfo), 0); + if (attrsize != sizeof (finderinfo)) + (void) memset(&finderinfo, 0, sizeof (finderinfo)); + if ((finderinfo[4] & BE_16(0x0400)) == 0) { + finderinfo[4] |= BE_16(0x0400); + (void) setxattr(mountpoint, XATTR_FINDERINFO_NAME, &finderinfo, + sizeof (finderinfo), 0); + } + + /* Now tell Finder to update */ +#if 0 + int fd = -1; + strlcpy(template, mountpoint, sizeof (template)); + strlcat(template, "/tempXXXXXX", sizeof (template)); + if ((fd = mkstemp(template)) != -1) { + unlink(template); // Just delete it right away + close(fd); + } +#endif + +out: + if (dstfp != NULL) + fclose(dstfp); + if (srcfp != NULL) + fclose(srcfp); + if (path != NULL) + free(path); +} + +/* + * if (zmount(zhp, zfs_get_name(zhp), mountpoint, MS_OPTIONSTR | flags, + * MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) { + */ +int +do_mount(zfs_handle_t *zhp, const char *dir, char *optptr, int mflag) +{ + int rv; + const char *spec = zfs_get_name(zhp); + const char *fstype = MNTTYPE_ZFS; + struct zfs_mount_args mnt_args; + char *rpath = NULL; + zfs_cmd_t zc = { "\0" }; + int devdisk = ZFS_DEVDISK_POOLONLY; + int ispool = 0; // the pool dataset, that is + int optlen = 0; + + assert(spec != NULL); + assert(dir != NULL); + assert(fstype != NULL); + assert(mflag >= 0); + assert(strcmp(fstype, MNTTYPE_ZFS) == 0); + assert(dataptr == NULL); + assert(datalen == 0); + assert(optptr != NULL); + assert(optlen > 0); + + if (optptr != NULL) + optlen = strlen(optptr); + + /* + * Figure out if we want this mount as a /dev/diskX mount, if so + * ask kernel to create one for us, then use it to mount. 
+ */ + + // Use dataset name by default + mnt_args.fspec = spec; + + /* + * Lookup the dataset property devdisk, and depending on its + * setting, we need to create a /dev/diskX for the mount + */ + if (zhp) { + + /* If we are in zfs-tests, no devdisks */ + if (getenv("__ZFS_MAIN_MOUNTPOINT_DIR") != NULL) + devdisk = ZFS_DEVDISK_OFF; + else + devdisk = zfs_prop_get_int(zhp, ZFS_PROP_DEVDISK); + + if (zhp && zhp->zpool_hdl && + strcmp(zpool_get_name(zhp->zpool_hdl), + zfs_get_name(zhp)) == 0) + ispool = 1; + + if ((devdisk == ZFS_DEVDISK_ON) || + ((devdisk == ZFS_DEVDISK_POOLONLY) && + ispool)) { + (void) strlcpy(zc.zc_name, zhp->zfs_name, + sizeof (zc.zc_name)); + zc.zc_value[0] = 0; + + rv = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_PROXY_DATASET, + &zc); + +#ifdef DEBUG + if (rv) + fprintf(stderr, + "proxy dataset returns %d '%s'\n", + rv, zc.zc_value); +#endif + + // Mount using /dev/diskX, use temporary buffer to + // give it full name + if (rv == 0) { + snprintf(zc.zc_name, sizeof (zc.zc_name), + "/dev/%s", zc.zc_value); + mnt_args.fspec = zc.zc_name; + } + } + } + + // Some arguments need to be told to XNU + if (strstr(optptr, "remount") != NULL) + mflag |= MNT_UPDATE; + + mnt_args.mflag = mflag; + mnt_args.optptr = optptr; + mnt_args.optlen = optlen; + mnt_args.struct_size = sizeof (mnt_args); + + /* + * There is a bug in XNU where /var/tmp is resolved as + * "private/var/tmp" without the leading "/", and both mount(2) and + * diskutil mount avoid this by calling realpath() first. So we will + * do the same. + */ + rpath = realpath(dir, NULL); + +#ifdef ZFS_DEBUG + printf("%s calling mount with fstype %s, %s %s, fspec %s, mflag %d," + " optptr %s, optlen %d, devdisk %d, ispool %d\n", + __func__, fstype, (rpath ? "rpath" : "dir"), + (rpath ? rpath : dir), mnt_args.fspec, mflag, optptr, optlen, + devdisk, ispool); +#endif + rv = mount(fstype, rpath ? rpath : dir, mflag, &mnt_args); + + if (rpath) free(rpath); + + /* Check if we need to create/update icon */ + if (rv == 0) + zfs_mount_seticon(dir); + + return (rv); +} + +int +do_unmount_impl(const char *mntpt, int flags) +{ + char force_opt[] = "force"; + char *argv[7] = { + "/usr/sbin/diskutil", + "unmount", + NULL, NULL, NULL, NULL }; + int rc, count = 2; + + if (flags & MS_FORCE) { + argv[count] = force_opt; + count++; + } + + argv[count] = (char *)mntpt; + rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); + + /* + * There is a bug, where we can not unmount, with the error + * already unmounted, even though it wasn't. But it is easy + * to work around by calling 'umount'. Until a real fix is done... + * re-test this: 202004/lundman + */ + if (rc != 0) { + char *argv[7] = { + "/sbin/umount", + NULL, NULL, NULL, NULL }; + int rc, count = 1; + + fprintf(stderr, "Fallback umount called\r\n"); + if (flags & MS_FORCE) { + argv[count] = "-f"; + count++; + } + argv[count] = (char *)mntpt; + rc = libzfs_run_process(argv[0], argv, + STDOUT_VERBOSE|STDERR_VERBOSE); + } + + return (rc ? EINVAL : 0); +} + + +void unmount_snapshots(libzfs_handle_t *hdl, const char *mntpt, int flags); + +int +do_unmount(libzfs_handle_t *hdl, const char *mntpt, int flags) +{ + /* + * On OSX, the kernel can not unmount all snapshots for us, as XNU + * rejects the unmount before it reaches ZFS. But we can easily handle + * unmounting snapshots from userland. + */ + unmount_snapshots(hdl, mntpt, flags); + + return (do_unmount_impl(mntpt, flags)); +} + +/* + * Given "/Volumes/BOOM" look for any lower mounts with ".zfs/snapshot/" + * in them - issue unmount. 
+ */ +void +unmount_snapshots(libzfs_handle_t *hdl, const char *mntpt, int flags) +{ + struct mnttab entry; + int len = strlen(mntpt); + + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + /* Starts with our mountpoint ? */ + if (strncmp(mntpt, entry.mnt_mountp, len) == 0) { + /* The next part is "/.zfs/snapshot/" ? */ + if (strncmp("/.zfs/snapshot/", &entry.mnt_mountp[len], + 15) == 0) { + /* Unmount it */ + do_unmount_impl(entry.mnt_mountp, MS_FORCE); + } + } + } +} + +int +zfs_mount_delegation_check(void) +{ + return ((geteuid() != 0) ? EACCES : 0); +} + +static char * +zfs_snapshot_mountpoint(zfs_handle_t *zhp) +{ + char *dataset_name, *snapshot_mountpoint, *parent_mountpoint; + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_handle_t *parent; + char *r; + + dataset_name = zfs_strdup(hdl, zhp->zfs_name); + if (dataset_name == NULL) { + (void) fprintf(stderr, gettext("not enough memory")); + return (NULL); + } + + r = strrchr(dataset_name, '@'); + + if (r == NULL) { + (void) fprintf(stderr, gettext("snapshot '%s' " + "has no '@'\n"), zhp->zfs_name); + free(dataset_name); + return (NULL); + } + + r[0] = 0; + + /* Open the dataset */ + if ((parent = zfs_open(hdl, dataset_name, + ZFS_TYPE_FILESYSTEM)) == NULL) { + (void) fprintf(stderr, + gettext("unable to open parent dataset '%s'\n"), + dataset_name); + free(dataset_name); + return (NULL); + } + + if (!zfs_is_mounted(parent, &parent_mountpoint)) { + (void) fprintf(stderr, + gettext("parent dataset '%s' must be mounted\n"), + dataset_name); + free(dataset_name); + zfs_close(parent); + return (NULL); + } + + zfs_close(parent); + + snapshot_mountpoint = + zfs_asprintf(hdl, "%s/.zfs/snapshot/%s/", + parent_mountpoint, &r[1]); + + free(dataset_name); + free(parent_mountpoint); + + return (snapshot_mountpoint); +} + +/* + * Mount a snapshot; called from "zfs mount dataset@snapshot". + * Given "dataset@snapshot" construct mountpoint path of the + * style "/mountpoint/dataset/.zfs/snapshot/$name/". Ensure + * parent "dataset" is mounted, then issue mount for snapshot. 
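+ *
+ * A hedged example of the path construction (dataset and mountpoint
+ * names are made up): for "tank/data@snap1", with "tank/data" mounted
+ * at "/Volumes/data", the snapshot is mounted read-only at
+ *
+ *   /Volumes/data/.zfs/snapshot/snap1/
+ *
+ * via zfs_mount_at(zhp, options, MS_RDONLY | flags, mountpoint).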
+ */ +int +zfs_snapshot_mount(zfs_handle_t *zhp, const char *options, + int flags) +{ + int ret = 0; + char *mountpoint; + + /* + * The automounting will kick in, and zed mounts it - so + * we temporarily disable it + */ + uint64_t automount = 0; + uint64_t saved_automount = 0; + size_t len = sizeof (automount); + size_t slen = sizeof (saved_automount); + + /* Remember what the user has it set to */ + sysctlbyname("kstat.zfs.darwin.tunable.zfs_auto_snapshot", + &saved_automount, &slen, NULL, 0); + + /* Disable automounting */ + sysctlbyname("kstat.zfs.darwin.tunable.zfs_auto_snapshot", + NULL, NULL, &automount, len); + + if (zfs_is_mounted(zhp, NULL)) { + return (EBUSY); + } + + mountpoint = zfs_snapshot_mountpoint(zhp); + if (mountpoint == NULL) + return (EINVAL); + + ret = zfs_mount_at(zhp, options, MS_RDONLY | flags, + mountpoint); + + /* If zed is running, it can mount it before us */ + if (ret == -1 && errno == EINVAL) + ret = 0; + + if (ret == 0) { + (void) fprintf(stderr, + gettext("ZFS: snapshot mountpoint '%s'\n"), + mountpoint); + } + + free(mountpoint); + + /* Restore automount setting */ + sysctlbyname("kstat.zfs.darwin.tunable.zfs_auto_snapshot", + NULL, NULL, &saved_automount, len); + + return (ret); +} + +int +zfs_snapshot_unmount(zfs_handle_t *zhp, int flags) +{ + int ret = 0; + char *mountpoint; + + if (!zfs_is_mounted(zhp, NULL)) { + return (ENOENT); + } + + mountpoint = zfs_snapshot_mountpoint(zhp); + if (mountpoint == NULL) + return (EINVAL); + + ret = zfs_unmount(zhp, mountpoint, flags); + + free(mountpoint); + + return (ret); +} diff --git a/lib/libzfs/os/macos/libzfs_pool_os.c b/lib/libzfs/os/macos/libzfs_pool_os.c new file mode 100644 index 0000000000..f7685e2f01 --- /dev/null +++ b/lib/libzfs/os/macos/libzfs_pool_os.c @@ -0,0 +1,345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2018, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +/* + * If the device has being dynamically expanded then we need to relabel + * the disk to use the new unallocated space. 
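+ *
+ * This path is typically reached from an explicit expand request,
+ * e.g. "zpool online -e <pool> <device>" (invocation shown only as an
+ * illustration); the EFI label is rewritten with efi_use_whole_disk()
+ * so the newly available capacity can be used.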
+ */ +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + int fd, error; + + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, msg)); + } + + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + * + * Also, we don't call efi_rescan() - that would just return EBUSY. + * The module will do it for us in vdev_disk_open(). + */ + error = efi_use_whole_disk(fd); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); +// (void) ioctl(fd, BLKFLSBUF); + + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), path); + return (zfs_error(hdl, EZFS_NOCAP, msg)); + } + return (0); +} + +/* + * Read the EFI label from the config, if a label does not exist then + * pass back the error to the caller. If the caller has passed a non-NULL + * diskaddr argument then we set it to the starting address of the EFI + * partition. + */ +static int +read_efi_label(nvlist_t *config, diskaddr_t *sb) +{ + char *path; + int fd; + char diskname[MAXPATHLEN]; + int err = -1; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) + return (err); + + (void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT, + strrchr(path, '/')); + if ((fd = open(diskname, O_RDONLY|O_DIRECT)) >= 0) { + struct dk_gpt *vtoc; + + if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { + if (sb != NULL) + *sb = vtoc->efi_parts[0].p_start; + efi_free(vtoc); + } + (void) close(fd); + } + return (err); +} + +/* + * determine where a partition starts on a disk in the current + * configuration + */ +static diskaddr_t +find_start_block(nvlist_t *config) +{ + nvlist_t **child; + uint_t c, children; + diskaddr_t sb = MAXOFFSET_T; + uint64_t wholedisk; + + if (nvlist_lookup_nvlist_array(config, + ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) != 0 || !wholedisk) { + return (MAXOFFSET_T); + } + if (read_efi_label(config, &sb) < 0) + sb = MAXOFFSET_T; + return (sb); + } + + for (c = 0; c < children; c++) { + sb = find_start_block(child[c]); + if (sb != MAXOFFSET_T) { + return (sb); + } + } + return (MAXOFFSET_T); +} + +static int +zpool_label_disk_check(char *path) +{ + struct dk_gpt *vtoc; + int fd, err; + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (errno); + + if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { + (void) close(fd); + return (err); + } + + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + return (EIDRM); + } + + efi_free(vtoc); + (void) close(fd); + return (0); +} + +/* + * Generate a unique partition name for the ZFS member. Partitions must + * have unique names to ensure udev will be able to create symlinks under + * /dev/disk/by-partlabel/ for all pool members. The partition names are + * of the form -. 
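+ *
+ * With the format string used below, a generated name looks like
+ * (value illustrative only):
+ *
+ *   zfs-0123456789abcdef
+ *
+ * i.e. a fixed "zfs-" prefix followed by a random 64-bit id in hex.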
+ */ +static void +zpool_label_name(char *label_name, int label_size) +{ + uint64_t id = 0; + int fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd >= 0) { + if (read(fd, &id, sizeof (id)) != sizeof (id)) + id = 0; + + close(fd); + } + + if (id == 0) + id = (((uint64_t)rand()) << 32) | (uint64_t)rand(); + + snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id); +} + +/* + * Label an individual disk. The name provided is the short name, + * stripped of any leading /dev path. + */ +int +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) +{ + char path[MAXPATHLEN]; + struct dk_gpt *vtoc; + int rval, fd; + size_t resv = EFI_MIN_RESV_SIZE; + uint64_t slice_size; + diskaddr_t start_block; + char errbuf[1024]; + + /* prepare an error message just in case */ + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); + + if (zhp) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + + if (zhp->zpool_start_block == 0) + start_block = find_start_block(nvroot); + else + start_block = zhp->zpool_start_block; + zhp->zpool_start_block = start_block; + } else { + /* new pool */ + start_block = NEW_START_BLOCK; + } + + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + + if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { + /* + * This shouldn't happen. We've long since verified that this + * is a valid device. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); + } + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { + /* + * The only way this can fail is if we run out of memory, or we + * were unable to read the disk's capacity + */ + if (errno == ENOMEM) + (void) no_memory(hdl); + + (void) close(fd); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': unable to read disk capacity"), path); + + return (zfs_error(hdl, EZFS_NOCAP, errbuf)); + } + + slice_size = vtoc->efi_last_u_lba + 1; + slice_size -= EFI_MIN_RESV_SIZE; + if (start_block == MAXOFFSET_T) + start_block = NEW_START_BLOCK; + slice_size -= start_block; + slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); + + vtoc->efi_parts[0].p_start = start_block; + vtoc->efi_parts[0].p_size = slice_size; + + /* + * Why we use V_USR: V_BACKUP confuses users, and is considered + * disposable by some EFI utilities (since EFI doesn't have a backup + * slice). V_UNASSIGNED is supposed to be used only for zero size + * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, + * etc. were all pretty specific. V_USR is as close to reality as we + * can get, in the absence of V_OTHER. + */ + vtoc->efi_parts[0].p_tag = V_USR; + zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN); + + vtoc->efi_parts[8].p_start = slice_size + start_block; + vtoc->efi_parts[8].p_size = resv; + vtoc->efi_parts[8].p_tag = V_RESERVED; + + rval = efi_write(fd, vtoc); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); +// (void) ioctl(fd, BLKFLSBUF); + + if (rval == 0) + rval = efi_rescan(fd); + + /* + * Some block drivers (like pcata) may not support EFI GPT labels. + * Print out a helpful error message directing the user to manually + * label the disk and give a specific slice. 
+ */ + if (rval != 0) { + (void) close(fd); + efi_free(vtoc); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using " + "parted(8) and then provide a specific slice: %d"), rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + (void) close(fd); + efi_free(vtoc); + + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + (void) zfs_append_partition(path, MAXPATHLEN); + + /* Wait for udev to signal that the device has settled. */ + rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " + "detect device partitions on '%s': %d"), path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + /* We can't be too paranoid. Read the label back and verify it. */ + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + rval = zpool_label_disk_check(path); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written " + "EFI label on '%s' is damaged. Ensure\nthis device " + "is not in use, and is functioning properly: %d"), + path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + return (0); +} diff --git a/lib/libzfs/os/macos/libzfs_util_os.c b/lib/libzfs/os/macos/libzfs_util_os.c new file mode 100644 index 0000000000..cd732786ad --- /dev/null +++ b/lib/libzfs/os/macos/libzfs_util_os.c @@ -0,0 +1,575 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libzfs_impl.h" +#include "zfs_prop.h" +#include +#include + +#define ZDIFF_SHARESDIR "/.zfs/shares/" + + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + return (zfs_ioctl_fd(hdl->libzfs_fd, request, zc)); +} + +const char * +libzfs_error_init(int error) +{ + switch (error) { + case ENXIO: + return (dgettext(TEXT_DOMAIN, "The ZFS modules are not " + "loaded.\nTry running '/sbin/kextload zfs.kext' as root " + "to load them.")); + case ENOENT: + return (dgettext(TEXT_DOMAIN, "/dev/zfs and /proc/self/mounts " + "are required.\nTry running 'udevadm trigger' and 'mount " + "-t proc proc /proc' as root.")); + case ENOEXEC: + return (dgettext(TEXT_DOMAIN, "The ZFS modules cannot be " + "auto-loaded.\nTry running '/sbin/kextload zfs.kext' as " + "root to manually load them.")); + case EACCES: + return (dgettext(TEXT_DOMAIN, "Permission denied the " + "ZFS utilities must be run as root.")); + default: + return (dgettext(TEXT_DOMAIN, "Failed to initialize the " + "libzfs library.")); + } +} + +static int +libzfs_module_loaded(const char *module) +{ + const char path_prefix[] = "/dev/"; + char path[256]; + + memcpy(path, path_prefix, sizeof (path_prefix) - 1); + strcpy(path + sizeof (path_prefix) - 1, module); + + return (access(path, F_OK) == 0); +} + +/* + * Verify the required ZFS_DEV device is available and optionally attempt + * to load the ZFS modules. Under normal circumstances the modules + * should already have been loaded by some external mechanism. + * + * Environment variables: + * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. + * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV + */ +static int +libzfs_load_module_impl(const char *module) +{ + char *argv[4] = {"/sbin/kextload", (char *)module, (char *)0}; + char *load_str, *timeout_str; + long timeout = 10; /* seconds */ + long busy_timeout = 10; /* milliseconds */ + int load = 0, fd; + hrtime_t start; + + /* Optionally request module loading */ + if (!libzfs_module_loaded(module)) { + load_str = getenv("ZFS_MODULE_LOADING"); + if (load_str) { + if (!strncasecmp(load_str, "YES", strlen("YES")) || + !strncasecmp(load_str, "ON", strlen("ON"))) + load = 1; + else + load = 0; + } + + if (load) { + if (libzfs_run_process("/sbin/kextload", argv, 0)) + return (ENOEXEC); + } + + if (!libzfs_module_loaded(module)) + return (ENXIO); + } + + /* + * Device creation by udev is asynchronous and waiting may be + * required. Busy wait for 10ms and then fall back to polling every + * 10ms for the allowed timeout (default 10s, max 10m). This is + * done to optimize for the common case where the device is + * immediately available and to avoid penalizing the possible + * case where udev is slow or unable to create the device. 
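+ *
+ * For instance (hedged; only the variable name below is taken from
+ * this code), a slow system could extend the wait for the ZFS_DEV
+ * node with:
+ *
+ *   ZFS_MODULE_TIMEOUT=300 zpool list
+ *
+ * which polls for up to 300 seconds instead of the default 10.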
+ */ + timeout_str = getenv("ZFS_MODULE_TIMEOUT"); + if (timeout_str) { + timeout = strtol(timeout_str, NULL, 0); + timeout = MAX(MIN(timeout, (10 * 60)), 0); /* 0 <= N <= 600 */ + } + + start = gethrtime(); + do { + fd = open(ZFS_DEV, O_RDWR); + if (fd >= 0) { + (void) close(fd); + return (0); + } else if (errno != ENOENT) { + return (errno); + } else if (NSEC2MSEC(gethrtime() - start) < busy_timeout) { + sched_yield(); + } else { + usleep(10 * MILLISEC); + } + } while (NSEC2MSEC(gethrtime() - start) < (timeout * MILLISEC)); + + return (ENOENT); +} + +int +libzfs_load_module(void) +{ + return (libzfs_load_module_impl(ZFS_DRIVER)); +} + +int +find_shares_object(differ_info_t *di) +{ + char fullpath[MAXPATHLEN]; + struct stat64 sb = { 0 }; + + (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); + (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); + + if (stat64(fullpath, &sb) != 0) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); + return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); + } + + di->shares = (uint64_t)sb.st_ino; + return (0); +} + +/* + * Fill given version buffer with zfs kernel version read from ZFS_SYSFS_DIR + * Returns 0 on success, and -1 on error (with errno set) + */ +int +zfs_version_kernel(char *version, int len) +{ + size_t rlen = len; + + if (sysctlbyname("zfs.kext_version", + version, &rlen, NULL, 0) == -1) + return (-1); + + return (0); +} + +static int +execvPe(const char *name, const char *path, char * const *argv, + char * const *envp) +{ + const char **memp; + size_t cnt, lp, ln; + int eacces, save_errno; + char *cur, buf[MAXPATHLEN]; + const char *p, *bp; + struct stat sb; + + eacces = 0; + + /* If it's an absolute or relative path name, it's easy. */ + if (strchr(name, '/')) { + bp = name; + cur = NULL; + goto retry; + } + bp = buf; + + /* If it's an empty path name, fail in the usual POSIX way. */ + if (*name == '\0') { + errno = ENOENT; + return (-1); + } + + cur = alloca(strlen(path) + 1); + if (cur == NULL) { + errno = ENOMEM; + return (-1); + } + strcpy(cur, path); + while ((p = strsep(&cur, ":")) != NULL) { + /* + * It's a SHELL path -- double, leading and trailing colons + * mean the current directory. + */ + if (*p == '\0') { + p = "."; + lp = 1; + } else + lp = strlen(p); + ln = strlen(name); + + /* + * If the path is too long complain. This is a possible + * security issue; given a way to make the path too long + * the user may execute the wrong program. + */ + if (lp + ln + 2 > sizeof (buf)) { + (void) write(STDERR_FILENO, "execvP: ", 8); + (void) write(STDERR_FILENO, p, lp); + (void) write(STDERR_FILENO, ": path too long\n", + 16); + continue; + } + bcopy(p, buf, lp); + buf[lp] = '/'; + bcopy(name, buf + lp + 1, ln); + buf[lp + ln + 1] = '\0'; + +retry: + (void) execve(bp, argv, envp); + switch (errno) { + case E2BIG: + goto done; + case ELOOP: + case ENAMETOOLONG: + case ENOENT: + break; + case ENOEXEC: + for (cnt = 0; argv[cnt]; ++cnt) + ; + memp = alloca((cnt + 2) * sizeof (char *)); + if (memp == NULL) { + goto done; + } + memp[0] = "sh"; + memp[1] = bp; + bcopy(argv + 1, memp + 2, cnt * sizeof (char *)); + execve(_PATH_BSHELL, __DECONST(char **, memp), + envp); + goto done; + case ENOMEM: + goto done; + case ENOTDIR: + break; + case ETXTBSY: + /* + * We used to retry here, but sh(1) doesn't. + */ + goto done; + default: + /* + * EACCES may be for an inaccessible directory or + * a non-executable file. Call stat() to decide + * which. 
This also handles ambiguities for EFAULT + * and EIO, and undocumented errors like ESTALE. + * We hope that the race for a stat() is unimportant. + */ + save_errno = errno; + if (stat(bp, &sb) != 0) + break; + if (save_errno == EACCES) { + eacces = 1; + continue; + } + errno = save_errno; + goto done; + } + } + if (eacces) + errno = EACCES; + else + errno = ENOENT; +done: + return (-1); +} + +int +execvpe(const char *name, char * const argv[], char * const envp[]) +{ + const char *path; + + /* Get the path we're searching. */ + if ((path = getenv("PATH")) == NULL) + path = _PATH_DEFPATH; + + return (execvPe(name, path, argv, envp)); +} + +#include +#include +#include + +extern void libzfs_refresh_finder(char *); + +/* + * To tell Finder to refresh is relatively easy from Obj-C, but as this + * would be the only function to use Obj-C (and only .m), the following code: + * void libzfs_refresh_finder(char *mountpoint) + * { + * [[NSWorkspace sharedWorkspace] noteFileSystemChanged:[NSString + * stringWithUTF8String:mountpoint]]; + * } + * Has been converted to C to keep autoconf simpler. If in future we have + * more Obj-C source files, then we should re-address this. + */ +void +libzfs_refresh_finder(char *path) +{ + Class NSWorkspace = objc_getClass("NSWorkspace"); + Class NSString = objc_getClass("NSString"); + SEL stringWithUTF8String = sel_registerName("stringWithUTF8String:"); + SEL sharedWorkspace = sel_registerName("sharedWorkspace"); + SEL noteFileSystemChanged = sel_registerName("noteFileSystemChanged:"); + id ns_path = ((id(*)(Class, SEL, char *))objc_msgSend)(NSString, + stringWithUTF8String, path); + id workspace = ((id(*)(Class, SEL))objc_msgSend)(NSWorkspace, + sharedWorkspace); + ((id(*)(id, SEL, id))objc_msgSend)(workspace, noteFileSystemChanged, + ns_path); +} + +void +zfs_rollback_os(zfs_handle_t *zhp) +{ + char sourceloc[ZFS_MAX_DATASET_NAME_LEN]; + char mountpoint[ZFS_MAXPROPLEN]; + zprop_source_t sourcetype; + + if (zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type, + B_FALSE)) { + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + mountpoint, sizeof (mountpoint), + &sourcetype, sourceloc, sizeof (sourceloc), B_FALSE) == 0) + libzfs_refresh_finder(mountpoint); + } +} + +struct pipe2file { + int from; + int to; +}; +typedef struct pipe2file pipe2file_t; + +#include + +static void * +pipe_io_relay(void *arg) +{ + pipe2file_t *p2f = (pipe2file_t *)arg; + int readfd, writefd; + unsigned char *buffer; + unsigned char space[1024]; + int size = 1024 * 1024; + int red, sent; + uint64_t total = 0; + + readfd = p2f->from; + writefd = p2f->to; + free(p2f); + p2f = NULL; + + buffer = malloc(size); + if (buffer == NULL) { + buffer = space; + size = sizeof (space); + } + + fprintf(stderr, "%s: thread up: read(%d) write(%d)\r\n", __func__, + readfd, writefd); + + for (;;) { + + red = read(readfd, buffer, size); + // fprintf(stderr, "%s: read(%d): %d (errno %d)\r\n", __func__, + // readfd, red, errno); + if (red == 0) + break; + if (red < 0 && errno != EWOULDBLOCK) + break; + sent = write(writefd, buffer, red); + // fprintf(stderr, "%s: write(%d): %d (errno %d)\r\n", __func__, + // writefd, sent, errno); + if (sent < 0) + break; + total += red; + } + + /* + * It seems unlikely that this code is ever reached, as the process + * calls exit() when done, and this thread is terminated. 
+ */ + + fprintf(stderr, "loop exit\r\n"); + + close(readfd); + close(writefd); + + if (buffer != space) + free(buffer); + + fprintf(stderr, "%s: thread done: %llu bytes\r\n", __func__, total); + return (NULL); +} + +/* + * XNU only lets us do IO on vnodes, not pipes, so create a Unix + * Domain socket, open it to get a vnode for the kernel, and spawn + * thread to relay IO. + */ +void +libzfs_macos_wrapfd(int *srcfd, boolean_t send) +{ + char template[100]; + int readfd = -1; + int writefd = -1; + int error; + struct stat sb; + pipe2file_t *p2f = NULL; + + fprintf(stderr, "%s: checking if we need pipe wrap\r\n", __func__); + + // Check if it is a pipe + error = fstat(*srcfd, &sb); + + if (error != 0) + return; + + if (!S_ISFIFO(sb.st_mode)) + return; + + p2f = (pipe2file_t *)malloc(sizeof (pipe2file_t)); + if (p2f == NULL) + return; + + fprintf(stderr, "%s: is pipe: work on fd %d\r\n", __func__, *srcfd); + + snprintf(template, sizeof (template), "/tmp/.zfs.pipe.XXXXXX"); + + mktemp(template); + + mkfifo(template, 0600); + + readfd = open(template, O_RDONLY | O_NONBLOCK); + + fprintf(stderr, "%s: readfd %d (%d)\r\n", __func__, readfd, error); + + writefd = open(template, O_WRONLY | O_NONBLOCK); + + fprintf(stderr, "%s: writefd %d (%d)\r\n", __func__, writefd, error); + + // set it to delete + unlink(template); + + // Check delayed so unlink() is always called. + if (readfd < 0) + goto out; + if (writefd < 0) + goto out; + + /* Open needs NONBLOCK, so switch back to BLOCK */ + int flags; + flags = fcntl(readfd, F_GETFL); + flags &= ~O_NONBLOCK; + fcntl(readfd, F_SETFL, flags); + flags = fcntl(writefd, F_GETFL); + flags &= ~O_NONBLOCK; + fcntl(writefd, F_SETFL, flags); + + // create IO thread + + // Send, kernel was to be given *srcfd - to write to. + // Instead we give it writefd. + // thread then uses read(readfd) -> write(*srcfd) + if (send) { + p2f->from = readfd; + p2f->to = *srcfd; + } else { + p2f->from = *srcfd; + p2f->to = writefd; + } + fprintf(stderr, "%s: spawning thread\r\n", __func__); + + // pthread kills all threads on exit, and pipe_io_relay may not + // have fully completed. + error = fork(); + if (error == 0) { + + // Close the fd we don't need + if (send) + close(writefd); + else + close(readfd); + + setsid(); + pipe_io_relay(p2f); + _exit(0); + } + + if (error < 0) + goto out; + + // Return open(file) fd to kernel only after all error cases + if (send) { + *srcfd = writefd; + close(readfd); + } else { + *srcfd = readfd; + close(writefd); + } + return; + +out: + if (p2f != NULL) + free(p2f); + + if (readfd >= 0) + close(readfd); + + if (writefd >= 0) + close(writefd); +} + +void +libzfs_set_pipe_max(int infd) +{ + /* macOS automatically resizes */ +} diff --git a/lib/libzutil/os/macos/zutil_compat.c b/lib/libzutil/os/macos/zutil_compat.c new file mode 100644 index 0000000000..b46263ac27 --- /dev/null +++ b/lib/libzutil/os/macos/zutil_compat.c @@ -0,0 +1,94 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include +#include +#include +#include + +static int +zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) +{ + int ret; + void *zc_c; + unsigned long ncmd; + zfs_iocparm_t zp; + + switch (cflag) { + case ZFS_CMD_COMPAT_NONE: + ncmd = _IOWR('Z', request, zfs_iocparm_t); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof (zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_ZOF; + zp.zfs_ioc_error = 0; + + ret = ioctl(fd, ncmd, &zp); + + /* + * If ioctl worked, get actual rc from kernel, which goes + * into errno, and return -1 if not-zero. + */ + if (ret == 0) { + errno = zp.zfs_ioc_error; + if (zp.zfs_ioc_error != 0) + ret = -1; + } + return (ret); + + default: + abort(); + return (EINVAL); + } + + /* Pass-through ioctl, rarely used if at all */ + + ret = ioctl(fd, ncmd, zc_c); + ASSERT0(ret); + + zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); + free(zc_c); + + return (ret); +} + +/* + * This is the macOS version of ioctl(). Because the XNU kernel + * handles copyin() and copyout(), we must return success from the + * ioctl() handler (or it will not copyout() for userland), + * and instead embed the error return value in the zc structure. + */ +int +zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +{ + size_t oldsize; + int ret, cflag = ZFS_CMD_COMPAT_NONE; + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, request, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} diff --git a/lib/libzutil/os/macos/zutil_device_path_os.c b/lib/libzutil/os/macos/zutil_device_path_os.c new file mode 100644 index 0000000000..97284c204e --- /dev/null +++ b/lib/libzutil/os/macos/zutil_device_path_os.c @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * We don't strip/append partitions on FreeBSD. + */ + +/* + * Note: The caller must free the returned string. 
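+ *
+ * Illustrative only: zfs_strip_partition("disk2s1") is expected to
+ * return a newly allocated "disk2" (the string is cut at the last
+ * 's' of the slice suffix), which the caller must free().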
+ */ +char * +zfs_strip_partition(char *dev) +{ + unsigned int disk, slice; + char *partless; + + partless = strdup(dev); + + /* Ends with "diskNsP" - where 'N' and 'P' are integers - strip sP */ + if (sscanf(partless, "disk%us%u", &disk, &slice) == 2) { + char *r; + r = strrchr(partless, 's'); + if (r != NULL) + *r = 0; + } + + return (partless); +} + +int +zfs_append_partition(char *path, size_t max_len) +{ + int len = strlen(path); + + if (strncmp(path, "/private/var/run/disk/by-id", 27) == 0) { + return (len); + } else if (strncmp(path, "/private/var/run/disk/by-path", 29) == 0) { + if (path[len - 1] == '0' && + path[len - 2] == ':') + path[len - 1] = '1'; + else + return (-1); /* should have ended with ":0" */ + + } else if (strncmp(path, "/private/var/run/disk/by-serial", 31) == 0) { + if (len + 2 >= max_len) + return (-1); + + (void) strcat(path, ":1"); + len += 2; + + } else { + + if (len + 2 >= max_len) + return (-1); + + if (isdigit(path[len-1])) { + (void) strcat(path, "s1"); + len += 2; + } else { + (void) strcat(path, "1"); + len += 1; + } + } + + return (len); +} + +/* + * Strip the path from a device name. + * On FreeBSD we only want to remove "/dev/" from the beginning of + * paths if present. + */ +char * +zfs_strip_path(char *path) +{ + char *r; + r = strrchr(path, '/'); + if (r == NULL) + return (r); + return (&r[1]); +} + +char * +zfs_get_underlying_path(const char *dev_name) +{ + + if (dev_name == NULL) + return (NULL); + + return (realpath(dev_name, NULL)); +} + +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + struct dk_gpt *label; + int fd; + + if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) + return (B_FALSE); + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { + (void) close(fd); + return (B_FALSE); + } + + efi_free(label); + (void) close(fd); + + return (B_TRUE); +} + +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when libudev determines it has been initialized, all of + * the device links have been verified to exist, and it has been allowed to + * settle. At this point the device the device can be accessed reliably. + * Depending on the complexity of the udev rules this process could take + * several seconds. + */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} + +/* ARGSUSED */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} + +/* + * Return B_TRUE if device is a device mapper or multipath device. + * Return B_FALSE if not. + */ +boolean_t +zfs_dev_is_dm(const char *dev_name) +{ + return (B_FALSE); +} diff --git a/lib/libzutil/os/macos/zutil_import_os.c b/lib/libzutil/os/macos/zutil_import_os.c new file mode 100644 index 0000000000..0382ae17f4 --- /dev/null +++ b/lib/libzutil/os/macos/zutil_import_os.c @@ -0,0 +1,483 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * Pool import support functions. + * + * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since + * these commands are expected to run in the global zone, we can assume + * that the devices are all readable when called. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zutil_import.h" + +#ifdef HAVE_LIBUDEV +#include +#include +#endif + +/* We allow /dev/ to be search in DEBUG build */ +#ifdef DEBUG +#define DEFAULT_IMPORT_PATH_SIZE 4 +#else +#define DEFAULT_IMPORT_PATH_SIZE 3 +#endif + +#define DEV_BYID_PATH "/private/var/run/disk/by-id" + +static char * +zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { + "/private/var/run/disk/by-id", + "/private/var/run/disk/by-path", + "/private/var/run/disk/by-serial", +#ifdef DEBUG + "/dev" /* Only with DEBUG build */ +#endif +}; + +static boolean_t +is_watchdog_dev(char *dev) +{ + /* For 'watchdog' dev */ + if (strcmp(dev, "watchdog") == 0) + return (B_TRUE); + + /* For 'watchdog */ + if (strstr(dev, "watchdog") == dev && isdigit(dev[8])) + return (B_TRUE); + + return (B_FALSE); +} + +int +zfs_dev_flush(int fd) +{ +// return (ioctl(fd, BLKFLSBUF)); + return (0); +} + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + libpc_handle_t *hdl = rn->rn_hdl; + struct stat64 statbuf; + nvlist_t *config; + char *bname, *dupname; + uint64_t vdev_guid = 0; + int error; + int num_labels = 0; + int fd; + + /* + * Skip devices with well known prefixes there can be side effects + * when opening devices which need to be avoided. + * + * hpet - High Precision Event Timer + * watchdog - Watchdog must be closed in a special way. 
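+ *
+ * For example (names illustrative), basenames such as "hpet",
+ * "watchdog" and "watchdog0" are rejected here without ever being
+ * opened.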
+ */ + dupname = zutil_strdup(hdl, rn->rn_name); + bname = basename(dupname); + error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname)); + free(dupname); + if (error) + return; + + /* + * Ignore failed stats. We only want regular files and block devices. + */ + if (stat64(rn->rn_name, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) + return; + + fd = open(rn->rn_name, O_RDONLY); + if ((fd < 0) && (errno == EINVAL)) + fd = open(rn->rn_name, O_RDONLY); + if ((fd < 0) && (errno == EACCES)) + hdl->lpc_open_access_error = B_TRUE; + if (fd < 0) + return; + + /* + * This file is too small to hold a zpool + */ + if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } + + error = zpool_read_label(fd, &config, &num_labels); + if (error != 0) { + (void) close(fd); + return; + } + + if (num_labels == 0) { + (void) close(fd); + nvlist_free(config); + return; + } + + /* + * Check that the vdev is for the expected guid. Additional entries + * are speculatively added based on the paths stored in the labels. + * Entries with valid paths but incorrect guids must be removed. + */ + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { + (void) close(fd); + nvlist_free(config); + return; + } + + (void) close(fd); + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* + * Add additional entries for paths described by this label. + */ + if (rn->rn_labelpaths) { + char *path = NULL; + char *devid = NULL; + char *env = NULL; + rdsk_node_t *slice; + avl_index_t where; + int timeout; + int error; + + if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) + return; + + env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); + if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || + timeout < 0) { + timeout = DISK_LABEL_WAIT; + } + + /* + * Allow devlinks to stabilize so all paths are available. 
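+ *
+ * The wait defaults to DISK_LABEL_WAIT, but can be overridden
+ * (example value arbitrary) with:
+ *
+ *   ZPOOL_IMPORT_UDEV_TIMEOUT_MS=5000 zpool import
+ *
+ * as parsed from the environment just above.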
+ */ + zpool_label_disk_wait(rn->rn_name, timeout); + + if (path != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_1; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + + if (devid != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + error = asprintf(&slice->rn_name, "%s%s", + DEV_BYID_PATH, devid); + if (error == -1) { + free(slice); + return; + } + + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_2; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + } +} + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = DEFAULT_IMPORT_PATH_SIZE; + return ((const char * const *)zpool_default_import_path); +} + +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + int i, dirs; + struct dirent *dp; + char path[MAXPATHLEN]; + char *end, **dir; + size_t pathleft; + avl_index_t where; + rdsk_node_t *slice; + int error = 0; + + dir = zpool_default_import_path; + dirs = DEFAULT_IMPORT_PATH_SIZE; + + /* + * Go through and read the label configuration information from every + * possible device, organizing the information according to pool GUID + * and toplevel GUID. + */ + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); + + for (i = 0; i < dirs; i++) { + char rdsk[MAXPATHLEN]; + int dfd; + DIR *dirp; + + /* use realpath to normalize the path */ + if (realpath(dir[i], path) == 0) { + + /* it is safe to skip missing search paths */ + if (errno == ENOENT) + continue; + + return (EPERM); + } + end = &path[strlen(path)]; + *end++ = '/'; + *end = 0; + pathleft = &path[sizeof (path)] - end; + + (void) strlcpy(rdsk, path, sizeof (rdsk)); + + if ((dfd = open(rdsk, O_RDONLY)) < 0 || + (dirp = fdopendir(dfd)) == NULL) { + if (dfd >= 0) + (void) close(dfd); + return (ENOENT); + } + + while ((dp = readdir(dirp)) != NULL) { + const char *name = dp->d_name; + if (name[0] == '.' && + (name[1] == 0 || (name[1] == '.' 
&& name[2] == 0))) + continue; + + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + + error = asprintf(&slice->rn_name, "%s%s", + path, name); + if (error == -1) { + free(slice); + return (ENOMEM); + } + + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_FALSE; + slice->rn_order = IMPORT_ORDER_SCAN_OFFSET + i; + + pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + + (void) closedir(dirp); + } + + return (0); +} + +/* + * Linux persistent device strings for vdev labels + * + * based on libudev for consistency with libudev disk add/remove events + */ + +typedef struct vdev_dev_strs { + char vds_devid[128]; + char vds_devphys[128]; +} vdev_dev_strs_t; + +/* ARGSUSED */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +/* ARGSUSED */ +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +/* + * Encode the persistent devices strings + * used for the vdev disk label + */ +static int +encode_device_strings(const char *path, vdev_dev_strs_t *ds, + boolean_t wholedisk) +{ + return (ENOENT); +} + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * single device node example: + * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' + * + * multipath device node example: + * devid: 'dm-uuid-mpath-35000c5006304de3f' + * + * We also store the enclosure sysfs path for turning on enclosure LEDs + * (if applicable): + * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + vdev_dev_strs_t vds; + char *env, *type, *path; + uint64_t wholedisk = 0; + + /* + * For the benefit of legacy ZFS implementations, allow + * for opting out of devid strings in the vdev label. + * + * example use: + * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer + * + * explanation: + * Older ZFS on Linux implementations had issues when attempting to + * display pool config VDEV names if a "devid" NVP value is present + * in the pool's config. + * + * For example, a pool that originated on illumos platform would + * have a devid value in the config and "zpool status" would fail + * when listing the config. + * + * A pool can be stripped of any "devid" values on import or + * prevented from adding them on zpool create|add by setting + * ZFS_VDEV_DEVID_OPT_OUT. 
+ */ + env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + return; + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || + strcmp(type, VDEV_TYPE_DISK) != 0) { + return; + } + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + + /* + * Update device string values in the config nvlist. + */ + if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); + if (vds.vds_devphys[0] != '\0') { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vds.vds_devphys); + } + + } else { + /* Clear out any stale entries. */ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + } +} diff --git a/module/icp/asm-x86_64/os/macos/aes/aes_aesni.S b/module/icp/asm-x86_64/os/macos/aes/aes_aesni.S new file mode 100644 index 0000000000..9be7f63151 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/aes/aes_aesni.S @@ -0,0 +1,855 @@ +/* + * ==================================================================== + * Written by Intel Corporation for the OpenSSL project to add support + * for Intel AES-NI instructions. Rights for redistribution and usage + * in source and binary forms are granted according to the OpenSSL + * license. + * + * Author: Huang Ying + * Vinodh Gopal + * Kahraman Akdemir + * + * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) + * instructions that are going to be introduced in the next generation + * of Intel processor, as of 2009. These instructions enable fast and + * secure data encryption and decryption, using the Advanced Encryption + * Standard (AES), defined by FIPS Publication number 197. The + * architecture introduces six instructions that offer full hardware + * support for AES. Four of them support high performance data + * encryption and decryption, and the other two instructions support + * the AES key expansion procedure. + * ==================================================================== + */ + +/* + * ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. 
For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as files aes-intel.S and eng_aesni_asm.pl, in + * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by + * Huang Ying of Intel to the openssl-dev mailing list under the subject + * of "Add support to Intel AES-NI instruction set for x86_64 platform". + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Renamed functions, reordered parameters, and changed return value + * to match OpenSolaris: + * + * OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return values for above are non-zero on error, 0 on success. + * + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * typedef struct aes_key_st { + * unsigned int rd_key[4 *(AES_MAXNR + 1)]; + * int rounds; + * unsigned int pad[3]; + * } AES_KEY; + * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules + * (ks32) instead of 64-bit (ks64). + * Number of rounds (aka round count) is at offset 240 of AES_KEY. 
+ * + * OpenSolaris OS interface (#ifdefs removed for readability): + * int rijndael_key_setup_dec_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * int rijndael_key_setup_enc_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * Return values for above are 0 on error, number of rounds on success. + * + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; + * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; + * + * typedef union { + * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; + * } aes_ks_t; + * typedef struct aes_key { + * aes_ks_t encr_ks, decr_ks; + * long double align128; + * int flags, nr, type; + * } aes_key_t; + * + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. + * + * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. + * ==================================================================== + * Mac OS X modifications + * 1. Removed CR0.TS / STTS / CLTS since the XNU kernel can apparently use floating point + * registers without this. + * + * ==================================================================== + */ + +#if defined(lint) || defined(__lint) + +#include + +/* ARGSUSED */ +void +aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} +/* ARGSUSED */ +int +rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} +/* ARGSUSED */ +int +rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} + + +#else /* lint */ + +#define _ASM +#include + +#if defined(_KERNEL) && !defined(__APPLE__) + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. + * This also occurs with the STTS macro, but we dont care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. + */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + +#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $(XMM_SIZE * 2), %rsp; \ + movaps %xmm0, 16(%rsp); \ + movaps %xmm1, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + /* + * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack, + * otherwise set CR0_TS. 
+ */ +#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm1; \ + movaps 16(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $(XMM_SIZE * 7), %rsp; \ + movaps %xmm0, 96(%rsp); \ + movaps %xmm1, 80(%rsp); \ + movaps %xmm2, 64(%rsp); \ + movaps %xmm3, 48(%rsp); \ + movaps %xmm4, 32(%rsp); \ + movaps %xmm5, 16(%rsp); \ + movaps %xmm6, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack, + * otherwise set CR0_TS. + */ +#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm6; \ + movaps 16(%rsp), %xmm5; \ + movaps 32(%rsp), %xmm4; \ + movaps 48(%rsp), %xmm3; \ + movaps 64(%rsp), %xmm2; \ + movaps 80(%rsp), %xmm1; \ + movaps 96(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) +#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) +#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) +#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) +#endif /* _KERNEL */ + + +/* + * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), + * _key_expansion_256a(), _key_expansion_256b() + * + * Helper functions called by rijndael_key_setup_inc_intel(). + * Also used indirectly by rijndael_key_setup_dec_intel(). + * + * Input: + * %xmm0 User-provided cipher key + * %xmm1 Round constant + * Output: + * (%rcx) AES key + */ + +.align 4, 0x90 +_key_expansion_128: +_key_expansion_256a: + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + movups %xmm0, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_128) + SET_SIZE(_key_expansion_256a) + +.align 4, 0x90 +_key_expansion_192a: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movups %xmm2, %xmm5 + movups %xmm2, %xmm6 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movups %xmm0, %xmm1 + shufps $0b01000100, %xmm0, %xmm6 + movups %xmm6, (%rcx) + shufps $0b01001110, %xmm2, %xmm1 + movups %xmm1, 0x10(%rcx) + add $0x20, %rcx + ret + SET_SIZE(_key_expansion_192a) + +.align 4, 0x90 +_key_expansion_192b: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movups %xmm2, %xmm5 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movups %xmm0, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_192b) + +.align 4, 0x90 +_key_expansion_256b: + pshufd $0b10101010, %xmm1, %xmm1 + shufps $0b00010000, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + shufps $0b10001100, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + pxor %xmm1, %xmm2 + movups %xmm2, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_256b) + + +/* + * rijndael_key_setup_enc_intel() + * Expand the cipher key into the encryption key schedule. 
+ * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. + * + * Original Intel OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. + */ + +#ifdef OPENSSL_INTERFACE +#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key +#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key + +#define USERCIPHERKEY rdi /* P1, 64 bits */ +#define KEYSIZE32 esi /* P2, 32 bits */ +#define KEYSIZE64 rsi /* P2, 64 bits */ +#define AESKEY rdx /* P3, 64 bits */ + +#else /* OpenSolaris Interface */ +#define AESKEY rdi /* P1, 64 bits */ +#define USERCIPHERKEY rsi /* P2, 64 bits */ +#define KEYSIZE32 edx /* P3, 32 bits */ +#define KEYSIZE64 rdx /* P3, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define ROUNDS32 KEYSIZE32 /* temp */ +#define ROUNDS64 KEYSIZE64 /* temp */ +#define ENDAESKEY USERCIPHERKEY /* temp */ + +ENTRY_NP(rijndael_key_setup_enc_intel) +rijndael_key_setup_enc_intel_local: + CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10) + + // NULL pointer sanity check + test %USERCIPHERKEY, %USERCIPHERKEY + jz .Lenc_key_invalid_param + test %AESKEY, %AESKEY + jz .Lenc_key_invalid_param + + movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) + movups %xmm0, (%AESKEY) + lea 0x10(%AESKEY), %rcx // key addr + pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x + + cmp $256, %KEYSIZE32 + jnz .Lenc_key192 + + // AES 256: 14 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $14, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 +#endif /* OPENSSL_INTERFACE */ + + movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) + movups %xmm2, (%rcx) + add $0x10, %rcx + + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x1, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x2, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x4, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x8, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x10, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x20, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* Open Solaris Interface */ + mov $14, %rax // return # rounds = 14 +#endif + ret + +.align 4 +.Lenc_key192: + cmp $192, %KEYSIZE32 + jnz .Lenc_key128 + + // AES 192: 12 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $12, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // 
key.rounds = 12 +#endif /* OPENSSL_INTERFACE */ + + movq 0x10(%USERCIPHERKEY), %xmm2 // other user key + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $12, %rax // return # rounds = 12 +#endif + ret + +.align 4 +.Lenc_key128: + cmp $128, %KEYSIZE32 + jnz .Lenc_key_invalid_key_bits + + // AES 128: 10 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $10, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 +#endif /* OPENSSL_INTERFACE */ + + aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $10, %rax // return # rounds = 10 +#endif + ret + +.Lenc_key_invalid_param: +#ifdef OPENSSL_INTERFACE + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) + mov $-1, %rax // user key or AES key pointer is NULL + ret +#else + /* FALLTHROUGH */ +#endif /* OPENSSL_INTERFACE */ + +.Lenc_key_invalid_key_bits: + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + mov $-2, %rax // keysize is invalid +#else /* Open Solaris Interface */ + xor %rax, %rax // a key pointer is NULL or invalid keysize +#endif /* OPENSSL_INTERFACE */ + + ret + SET_SIZE(rijndael_key_setup_enc_intel) + + +/* + * rijndael_key_setup_dec_intel() + * Expand the cipher key into the decryption key schedule. + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. 
+ * P1->P2, P2->P3, P3->P1 + * + * Original Intel OpenSSL interface: + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. + */ +ENTRY_NP(rijndael_key_setup_dec_intel) + // Generate round keys used for encryption + call rijndael_key_setup_enc_intel_local + test %rax, %rax +#ifdef OPENSSL_INTERFACE + jnz .Ldec_key_exit // Failed if returned non-0 +#else /* OpenSolaris Interface */ + jz .Ldec_key_exit // Failed if returned 0 +#endif /* OPENSSL_INTERFACE */ + + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + /* + * Convert round keys used for encryption + * to a form usable for decryption + */ +#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ + mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) + // (already set for OpenSSL) +#endif + + lea 0x10(%AESKEY), %rcx // key addr + shl $4, %ROUNDS32 + add %AESKEY, %ROUNDS64 + mov %ROUNDS64, %ENDAESKEY + +.align 4 +.Ldec_key_reorder_loop: + movups (%AESKEY), %xmm0 + movups (%ROUNDS64), %xmm1 + movups %xmm0, (%ROUNDS64) + movups %xmm1, (%AESKEY) + lea 0x10(%AESKEY), %AESKEY + lea -0x10(%ROUNDS64), %ROUNDS64 + cmp %AESKEY, %ROUNDS64 + ja .Ldec_key_reorder_loop + +.align 4 +.Ldec_key_inv_loop: + movups (%rcx), %xmm0 + // Convert an encryption round key to a form usable for decryption + // with the "AES Inverse Mix Columns" instruction + aesimc %xmm0, %xmm1 + movups %xmm1, (%rcx) + lea 0x10(%rcx), %rcx + cmp %ENDAESKEY, %rcx + jnz .Ldec_key_inv_loop + + SET_TS_OR_POP_XMM0_XMM1(%r10) + +.Ldec_key_exit: + // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error + // OpenSSL: rax = 0 for OK, or non-zero for error + ret + SET_SIZE(rijndael_key_setup_dec_intel) + + +/* + * aes_encrypt_intel() + * Encrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]) + * + * Original Intel OpenSSL Interface: + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key) + */ + +#ifdef OPENSSL_INTERFACE +#define aes_encrypt_intel intel_AES_encrypt +#define aes_decrypt_intel intel_AES_decrypt + +#define INP rdi /* P1, 64 bits */ +#define OUTP rsi /* P2, 64 bits */ +#define KEYP rdx /* P3, 64 bits */ + +/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ +#define NROUNDS32 ecx /* temporary, 32 bits */ +#define NROUNDS cl /* temporary, 8 bits */ + +#else /* OpenSolaris Interface */ +#define KEYP rdi /* P1, 64 bits */ +#define NROUNDS esi /* P2, 32 bits */ +#define INP rdx /* P3, 64 bits */ +#define OUTP rcx /* P4, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define STATE xmm0 /* temporary, 128 bits */ +#define KEY xmm1 /* temporary, 128 bits */ + +ENTRY_NP(aes_encrypt_intel) + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + movups (%INP), %STATE // input + movups (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Lenc128 + lea 0x20(%KEYP), %KEYP + je .Lenc192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movups -0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x50(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc192: + // AES 192 and 256 + movups -0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x30(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc128: + // AES 128, 192, and 256 + movups -0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movups (%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x30(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x50(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x70(%KEYP), %KEY + aesenclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + SET_TS_OR_POP_XMM0_XMM1(%r10) + ret + SET_SIZE(aes_encrypt_intel) + + +/* + * aes_decrypt_intel() + * Decrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original Intel OpenSSL Interface: + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + */ +ENTRY_NP(aes_decrypt_intel) + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + movups (%INP), %STATE // input + movups (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Ldec128 + lea 0x20(%KEYP), %KEYP + je .Ldec192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movups -0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x50(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec192: + // AES 192 and 256 + movups -0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x30(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec128: + // AES 128, 192, and 256 + movups -0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movups (%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x30(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x50(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x70(%KEYP), %KEY + aesdeclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + SET_TS_OR_POP_XMM0_XMM1(%r10) + ret + SET_SIZE(aes_decrypt_intel) + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/os/macos/aes/aes_amd64.S b/module/icp/asm-x86_64/os/macos/aes/aes_amd64.S new file mode 100644 index 0000000000..cdd9a861be --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/aes/aes_amd64.S @@ -0,0 +1,900 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue 20/12/2007 + * + * I am grateful to Dag Arne Osvik for many discussions of the techniques that + * can be used to optimise AES assembler code on AMD64/EM64T architectures. + * Some of the techniques used in this implementation are the result of + * suggestions made by him for which I am most grateful. + * + * An AES implementation for AMD64 processors using the YASM assembler. This + * implementation provides only encryption, decryption and hence requires key + * scheduling support in C. 
It uses 8k bytes of tables but its encryption and + * decryption performance is very close to that obtained using large tables. + * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, + * which are as follows: + * ms windows gnu/linux/opensolaris os + * + * in_blk rcx rdi + * out_blk rdx rsi + * context (cx) r8 rdx + * + * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 + * registers rdi - on both + * + * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 + * registers - rdi on both + * + * The convention used here is that for gnu/linux/opensolaris os. + * + * This code provides the standard AES block size (128 bits, 16 bytes) and the + * three standard AES key sizes (128, 192 and 256 bits). It has the same call + * interface as my C implementation. It uses the Microsoft C AMD64 calling + * conventions in which the three parameters are placed in rcx, rdx and r8 + * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. + * + * OpenSolaris Note: + * Modified to use GNU/Linux/Solaris calling conventions. + * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. + * + * AES_RETURN aes_encrypt(const unsigned char in_blk[], + * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt(const unsigned char in_blk[], + * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_encrypt_key(const unsigned char key[], + * const aes_encrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt_key(const unsigned char key[], + * const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_encrypt_key(const unsigned char key[], + * unsigned int len, const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt_key(const unsigned char key[], + * unsigned int len, const aes_decrypt_ctx cx[1])/ + * + * where is 128, 102 or 256. In the last two calls the length can be in + * either bits or bytes. + * + * Comment in/out the following lines to obtain the desired subroutines. These + * selections MUST match those in the C header file aesopt.h + */ +#define AES_REV_DKS /* define if key decryption schedule is reversed */ + +#define LAST_ROUND_TABLES /* define for the faster version using extra tables */ + +/* + * The encryption key schedule has the following in memory layout where N is the + * number of rounds (10, 12 or 14): + * + * lo: | input key (round 0) | / each round is four 32-bit words + * | encryption round 1 | + * | encryption round 2 | + * .... + * | encryption round N-1 | + * hi: | encryption round N | + * + * The decryption key schedule is normally set up so that it has the same + * layout as above by actually reversing the order of the encryption key + * schedule in memory (this happens when AES_REV_DKS is set): + * + * lo: | decryption round 0 | = | encryption round N | + * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] + * .... .... 
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] + * hi: | decryption round N | = | input key (round 0) | + * + * with rounds except the first and last modified using inv_mix_column() + * But if AES_REV_DKS is NOT set the order of keys is left as it is for + * encryption so that it has to be accessed in reverse when used for + * decryption (although the inverse mix column modifications are done) + * + * lo: | decryption round 0 | = | input key (round 0) | + * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] + * .... .... + * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] + * hi: | decryption round N | = | encryption round N | + * + * This layout is faster when the assembler key scheduling provided here + * is used. + * + * End of user defines + */ + +/* + * --------------------------------------------------------------------------- + * OpenSolaris OS modifications + * + * This source originates from Brian Gladman file aes_amd64.asm + * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip + * with these changes: + * + * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and + * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, + * AES_128, AES_192, AES_256, AES_VAR ifdefs. + * + * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define + * + * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef + * + * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax + * (operands reversed, literals prefixed with "$", registers prefixed with "%", + * and "[register+offset]", addressing changed to "offset(register)", + * parenthesis in constant expressions "()" changed to square brackets "[]", + * "." removed from local (numeric) labels, and other changes. + * Examples: + * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax + * mov rax,(4*20h) mov $[4*0x20],%rax + * mov rax,[ebx+20h] mov 0x20(%ebx),%rax + * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax + * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax + * + * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 6. Renamed functions and reordered parameters to match OpenSolaris: + * Original Gladman interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, + * and a union type, inf., containing inf.l, a uint32_t and + * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is + * used and contains the key schedule length * 16 where key schedule length is + * 10, 12, or 14 bytes. + * + * OpenSolaris OS interface: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ + * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. 
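+ *
+ * Worked example of the Nr handling below: for a 128-bit key Nr is 10, so
+ * the entry code computes Nr * 16 = 160 ("shl $4, %esi") and matches it
+ * against the 10*16 / 12*16 / 14*16 comparisons to select the round count.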
+ */ + +#if defined(lint) || defined(__lint) + +#include +/* ARGSUSED */ +void +aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} + + +#else + +#define _ASM +#include + +#define KS_LENGTH 60 + +#define raxd eax +#define rdxd edx +#define rcxd ecx +#define rbxd ebx +#define rsid esi +#define rdid edi + +#define raxb al +#define rdxb dl +#define rcxb cl +#define rbxb bl +#define rsib sil +#define rdib dil + +// finite field multiplies by {02}, {04} and {08} + +#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) + +// finite field multiplies required in table generation + +#define f3(x) ((f2(x)) ^ (x)) +#define f9(x) ((f8(x)) ^ (x)) +#define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) +#define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) +#define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) + +// macros for expanding S-box data + +#define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) +#define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) +#define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 + +#define enc_vals(x) \ + .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ + .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ + .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ + .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ + .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ + .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ + .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ + .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ + .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ + .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ + .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ + .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ + .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ + .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ + .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ + .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ + .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ + .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ + .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ + .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ + .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ + .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ + .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ + .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ + .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ + .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ + .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ + .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ + .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ + .byte 
x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ + .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ + .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) + +#define dec_vals(x) \ + .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ + .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ + .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ + .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ + .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ + .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ + .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ + .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ + .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ + .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ + .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ + .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ + .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ + .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ + .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ + .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ + .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ + .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ + .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ + .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ + .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ + .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ + .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ + .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ + .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ + .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ + .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ + .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ + .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ + .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ + .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ + .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) + +#define tptr %rbp /* table pointer */ +#define kptr %r8 /* key schedule pointer */ +#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ +#define fk_ref(x, y) -16*x+fofs+4*y(kptr) + +#ifdef AES_REV_DKS +#define rofs 128 +#define ik_ref(x, y) -16*x+rofs+4*y(kptr) + +#else +#define rofs -128 +#define ik_ref(x, y) 16*x+rofs+4*y(kptr) +#endif /* AES_REV_DKS */ + +#define tab_0(x) (tptr,x,8) +#define tab_1(x) 3(tptr,x,8) +#define tab_2(x) 2(tptr,x,8) +#define tab_3(x) 1(tptr,x,8) +#define tab_f(x) 1(tptr,x,8) +#define tab_i(x) 7(tptr,x,8) + +#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor 
tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + add $2048, tptr; \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1 + +#else + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p1 + +#endif /* LAST_ROUND_TABLES */ + +#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + 
mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + add $2048, tptr; \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3 + +#else + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %eax; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ebx; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; 
\ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p3 + +#endif /* LAST_ROUND_TABLES */ + +/* + * OpenSolaris OS: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ + .align 6, 0x90 +enc_tab: + enc_vals(u8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + enc_vals(w8) +#endif + + + ENTRY_NP(aes_encrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // OpenSolaris OS interface + sub $(4*8), %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea enc_tab(%rip), tptr + sub $fofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + + xor fofs(kptr), %eax + xor fofs+4(kptr), %ebx + xor fofs+8(kptr), %ecx + xor fofs+12(kptr), %edx + + lea (kptr,%rsi), kptr + // Jump based on byte key length * 16: + cmp $(10*16), %esi + je 3f + cmp $(12*16), %esi + je 2f + cmp $(14*16), %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal forward rounds +1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) + fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $(4*8), %rsp + ret + + SET_SIZE(aes_encrypt_amd64) + +/* + * OpenSolaris OS: + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ + .align 6, 0x90 +dec_tab: + dec_vals(v8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + dec_vals(w8) +#endif + + + ENTRY_NP(aes_decrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // 
OpenSolaris OS interface + sub $(4*8), %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea dec_tab(%rip), tptr + sub $rofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + +#ifdef AES_REV_DKS + mov kptr, %rdi + lea (kptr,%rsi), kptr +#else + lea (kptr,%rsi), %rdi +#endif + + xor rofs(%rdi), %eax + xor rofs+4(%rdi), %ebx + xor rofs+8(%rdi), %ecx + xor rofs+12(%rdi), %edx + + // Jump based on byte key length * 16: + cmp $(10*16), %esi + je 3f + cmp $(12*16), %esi + je 2f + cmp $(14*16), %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal inverse rounds +1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) + il_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $(4*8), %rsp + ret + + SET_SIZE(aes_decrypt_amd64) +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S new file mode 100644 index 0000000000..20f4d14c78 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009 Intel Corporation + * All Rights Reserved. + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains an accelerated + * Galois Field Multiplication implementation. + * + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, + * carry-less multiplication. 
More information about PCLMULQDQ can be + * found at: + * http://software.intel.com/en-us/articles/ + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as file galois_hash_asm.c from + * Intel Corporation dated September 21, 2009. + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function + * definition for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Removed code to perform hashing. This is already done with C macro + * GHASH in gcm.c. For better performance, this removed code should be + * reintegrated in the future to replace the C GHASH macro. + * + * 5. Added code to byte swap 16-byte input and output. + * + * 6. Folded in comments from the original C source with embedded assembly + * (SB_w_shift_xor.c) + * + * 7. Renamed function and reordered parameters to match OpenSolaris: + * Intel interface: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * OpenSolaris OS interface: + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) + +#include + +/* ARGSUSED */ +void +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { +} + +#else /* lint */ + +#define _ASM +#include + +#if defined(_KERNEL) && !defined(__APPLE__) + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. + * This also occurs with the STTS macro, but we dont care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. + */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $(XMM_SIZE * 11), %rsp; \ + movaps %xmm0, 160(%rsp); \ + movaps %xmm1, 144(%rsp); \ + movaps %xmm2, 128(%rsp); \ + movaps %xmm3, 112(%rsp); \ + movaps %xmm4, 96(%rsp); \ + movaps %xmm5, 80(%rsp); \ + movaps %xmm6, 64(%rsp); \ + movaps %xmm7, 48(%rsp); \ + movaps %xmm8, 32(%rsp); \ + movaps %xmm9, 16(%rsp); \ + movaps %xmm10, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack, + * otherwise set CR0_TS. 
+ */ +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm10; \ + movaps 16(%rsp), %xmm9; \ + movaps 32(%rsp), %xmm8; \ + movaps 48(%rsp), %xmm7; \ + movaps 64(%rsp), %xmm6; \ + movaps 80(%rsp), %xmm5; \ + movaps 96(%rsp), %xmm4; \ + movaps 112(%rsp), %xmm3; \ + movaps 128(%rsp), %xmm2; \ + movaps 144(%rsp), %xmm1; \ + movaps 160(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) +#endif /* _KERNEL */ + +/* + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction + */ + +// static uint8_t byte_swap16_mask[] = { +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; +.text +.align XMM_ALIGN_LOG +.Lbyte_swap16_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + + + +/* + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on P1 and P2 and place the result in P3. + * + * Byte swap the input and the output. + * + * Note: x_in, y, and res all point to a block of 20-byte numbers + * (an array of two 64-bit integers). + * + * Note2: For kernel code, caller is responsible for ensuring + * kpreempt_disable() has been called. This is because %xmm registers are + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, + * respectively, if TS is set on entry. Otherwise, if TS is not set, + * save and restore %xmm registers on the stack. + * + * Note3: Original Intel definition: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * + * Note4: Register/parameter mapping: + * Intel: + * Parameter 1: %rcx (copied to %xmm0) hk or x_in + * Parameter 2: %rdx (copied to %xmm1) s or y + * Parameter 3: %rdi (result) d or res + * OpenSolaris: + * Parameter 1: %rdi (copied to %xmm0) x_in + * Parameter 2: %rsi (copied to %xmm1) y + * Parameter 3: %rdx (result) res + */ + +ENTRY_NP(gcm_mul_pclmulqdq) + CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10) + + // + // Copy Parameters + // + movdqu (%rdi), %xmm0 // P1 + movdqu (%rsi), %xmm1 // P2 + + // + // Byte swap 16-byte input + // + lea .Lbyte_swap16_mask(%rip), %rax + movups (%rax), %xmm10 + pshufb %xmm10, %xmm0 + pshufb %xmm10, %xmm1 + + + // + // Multiply with the hash key + // + movdqu %xmm0, %xmm3 + pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 + + movdqu %xmm0, %xmm4 + pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 + + movdqu %xmm0, %xmm5 + pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 + movdqu %xmm0, %xmm6 + pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 + + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 + + movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 + psrldq $8, %xmm4 // shift by xmm4 64 bits to the right + pslldq $8, %xmm5 // shift by xmm5 64 bits to the left + pxor %xmm5, %xmm3 + pxor %xmm4, %xmm6 // Register pair holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. 
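+	// The 256-bit product is held in the register pair %xmm6:%xmm3
+	// (high:low).  Each 32-bit lane is shifted left by one (pslld $1),
+	// the bit shifted out of every lane (psrld $31) is moved up one
+	// lane (pslldq $4) and OR'd back in, and the bit carried out of the
+	// top lane of %xmm3 (isolated with psrldq $12) is OR'd into the
+	// low lane of %xmm6.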
+ movdqu %xmm3, %xmm7 + movdqu %xmm6, %xmm8 + pslld $1, %xmm3 + pslld $1, %xmm6 + psrld $31, %xmm7 + psrld $31, %xmm8 + movdqu %xmm7, %xmm9 + pslldq $4, %xmm8 + pslldq $4, %xmm7 + psrldq $12, %xmm9 + por %xmm7, %xmm3 + por %xmm8, %xmm6 + por %xmm9, %xmm6 + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + movdqu %xmm3, %xmm7 + movdqu %xmm3, %xmm8 + movdqu %xmm3, %xmm9 + pslld $31, %xmm7 // packed right shift shifting << 31 + pslld $30, %xmm8 // packed right shift shifting << 30 + pslld $25, %xmm9 // packed right shift shifting << 25 + pxor %xmm8, %xmm7 // xor the shifted versions + pxor %xmm9, %xmm7 + movdqu %xmm7, %xmm8 + pslldq $12, %xmm7 + psrldq $4, %xmm8 + pxor %xmm7, %xmm3 // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. + movdqu %xmm3, %xmm2 + movdqu %xmm3, %xmm4 // packed left shifting >> 1 + movdqu %xmm3, %xmm5 + psrld $1, %xmm2 + psrld $2, %xmm4 // packed left shifting >> 2 + psrld $7, %xmm5 // packed left shifting >> 7 + pxor %xmm4, %xmm2 // xor the shifted versions + pxor %xmm5, %xmm2 + pxor %xmm8, %xmm2 + pxor %xmm2, %xmm3 + pxor %xmm3, %xmm6 // the result is in xmm6 + + // + // Byte swap 16-byte result + // + pshufb %xmm10, %xmm6 // %xmm10 has the swap mask + + // + // Store the result + // + movdqu %xmm6, (%rdx) // P3 + + + // + // Cleanup and Return + // + SET_TS_OR_POP_XMM_REGISTERS(%r10) + ret + SET_SIZE(gcm_mul_pclmulqdq) + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S new file mode 100644 index 0000000000..cb923784a7 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S @@ -0,0 +1,1353 @@ +/* + * !/usr/bin/env perl + * + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. The module is, however, dual licensed under OpenSSL and + * CRYPTOGAMS licenses depending on where you obtain it. For further + * details see http://www.openssl.org/~appro/cryptogams/. + * ==================================================================== + * + * sha1_block procedure for x86_64. + * + * It was brought to my attention that on EM64T compiler-generated code + * was far behind 32-bit assembler implementation. This is unlike on + * Opteron where compiler-generated code was only 15% behind 32-bit + * assembler, which originally made it hard to motivate the effort. + * There was suggestion to mechanically translate 32-bit code, but I + * dismissed it, reasoning that x86_64 offers enough register bank + * capacity to fully utilize SHA-1 parallelism. Therefore this fresh + * implementation:-) However! While 64-bit code does performs better + * on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, + * x86_64 does offer larger *addressable* bank, but out-of-order core + * reaches for even more registers through dynamic aliasing, and EM64T + * core must have managed to run-time optimize even 32-bit code just as + * good as 64-bit one. Performance improvement is summarized in the + * following table: + * + * gcc 3.4 32-bit asm cycles/byte + * Opteron +45% +20% 6.8 + * Xeon P4 +65% +0% 9.9 + * Core2 +60% +10% 7.0 + * + * + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. 
+ * + * This source originates from OpenSSL file sha1-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). + * + */ + +/* + * This file was generated by a perl script (sha1-x86_64.pl). The comments from + * the original file have been pasted above. + */ + +#if defined(lint) || defined(__lint) +#include +#include + + +/* ARGSUSED */ +void +sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks) +{ +} + +#else +#define _ASM +#include +ENTRY_NP(sha1_block_data_order) + push %rbx + push %rbp + push %r12 + mov %rsp,%rax + mov %rdi,%r8 # reassigned argument + sub $72,%rsp + mov %rsi,%r9 # reassigned argument + and $-64,%rsp + mov %rdx,%r10 # reassigned argument + mov %rax,64(%rsp) + + mov 0(%r8),%edx + mov 4(%r8),%esi + mov 8(%r8),%edi + mov 12(%r8),%ebp + mov 16(%r8),%r11d +.align 4 +.Lloop: + mov 0(%r9),%eax + bswap %eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 4(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,4(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 8(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,8(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 12(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,12(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 16(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,16(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 20(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,20(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 24(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,24(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 28(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,28(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 32(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,32(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 36(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,36(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 40(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,40(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + 
lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 44(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,44(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 48(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,48(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 52(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,52(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 56(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,56(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 60(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,60(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%esi + xor 32(%rsp),%eax + and %r11d,%ebx + add %esi,%edi + xor 52(%rsp),%eax + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edx + xor 36(%rsp),%eax + and %ebp,%ebx + add %edx,%esi + xor 56(%rsp),%eax + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea 0x5a827999(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + and %edi,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + and %esi,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea 0x5a827999(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + and %edx,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + 
rol $5,%r12d + xor 0(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,52(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,56(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,60(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 0(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 32(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 52(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,0(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 4(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 36(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 56(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,4(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 8(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 40(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 60(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,8(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 12(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 44(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 0(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,12(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 16(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 48(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 4(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 20(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 
28(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 52(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 8(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 24(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 32(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 56(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 12(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 28(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 36(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 60(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 16(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 32(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 40(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 0(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 20(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,32(%rsp) + lea -0x70e44324(%eax,%edx),%esi + mov 36(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 44(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 4(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 24(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,36(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 40(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 48(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 8(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 28(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,40(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 44(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 52(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 12(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 32(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,44(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 48(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 56(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 16(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 36(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,48(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 52(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 60(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 20(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 40(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,52(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 56(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 0(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 24(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 44(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,56(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 60(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 4(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 28(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 48(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,60(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 0(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 8(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 32(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 52(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi 
+ mov %eax,0(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 4(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 12(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 36(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 56(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,4(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 8(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 16(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 40(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 60(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,8(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 12(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 20(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 44(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 0(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,12(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 16(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 24(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 48(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 4(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,16(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 20(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 28(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 52(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 8(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,20(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 24(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 32(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 56(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 12(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,24(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 28(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 36(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 60(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 16(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,28(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 32(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 40(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 0(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 20(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,32(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 36(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 44(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 4(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 24(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,36(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 40(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 48(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 8(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 28(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,40(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 44(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 52(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 12(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 32(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov 
%eax,44(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 48(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 56(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 16(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 36(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,48(%rsp) + add %ebx,%edx + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 52(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 60(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 40(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,52(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 56(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 0(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 44(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,56(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 60(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 4(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 28(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 48(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,60(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 32(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 52(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 36(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 56(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 0(%rsp),%eax + xor 
%r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + lea -0x359d3e2a(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + lea -0x359d3e2a(%eax,%ebp),%r11d + mov %esi,%ebx + mov %r12d,%ebp + xor %edx,%ebx + rol $5,%ebp + xor %edi,%ebx + add %ebp,%r11d + rol $30,%edx + add %ebx,%r11d + // Update and save state information in SHA-1 context + add 0(%r8),%r11d + add 4(%r8),%r12d + add 8(%r8),%edx + add 12(%r8),%esi + add 16(%r8),%edi + mov %r11d,0(%r8) + mov %r12d,4(%r8) + mov %edx,8(%r8) + mov %esi,12(%r8) + mov %edi,16(%r8) + + xchg %r11d,%edx # mov %r11d,%edx + xchg %r12d,%esi # mov %r12d,%esi + xchg %r11d,%edi # mov %edx,%edi + xchg %r12d,%ebp # mov %esi,%ebp + # mov %edi,%r11d + lea 64(%r9),%r9 + sub $1,%r10 + jnz .Lloop + mov 64(%rsp),%rsp + pop %r12 + pop %rbp + pop %rbx + ret +SET_SIZE(sha1_block_data_order) + +.data +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " + +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S b/module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S new file mode 100644 index 0000000000..0b0f3444fa --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S @@ -0,0 +1,2058 @@ + +/* + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. 
On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. All I had to do is to get one flavor right, + * the other one passed the test right away:-) + * + * sha256_block runs in ~1005 cycles on Opteron, which gives you + * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock + * frequency in GHz. sha512_block runs in ~1275 cycles, which results + * in 128*1000/1275=100MBps per GHz. Is there room for improvement? + * Well, if you compare it to IA-64 implementation, which maintains + * X[16] in register bank[!], tends to 4 instructions per CPU clock + * cycle and runs in 1003 cycles, 1275 is very good result for 3-way + * issue Opteron pipeline and X[16] maintained in memory. So that *if* + * there is a way to improve it, *then* the only way would be to try to + * offload X[16] updates to SSE unit, but that would require "deeper" + * loop unroll, which in turn would naturally cause size blow-up, not + * to mention increased complexity! And once again, only *if* it's + * actually possible to noticeably improve overall ILP, instruction + * level parallelism, on a given CPU implementation in this case. + * + * Special note on Intel EM64T. While Opteron CPU exhibits perfect + * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], + * [currently available] EM64T CPUs apparently are far from it. On the + * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit + * sha256_block:-( This is presumably because 64-bit shifts/rotates + * apparently are not atomic instructions, but implemented in microcode. + */ + +/* + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha512-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). Replaced the .picmeup macro with assembler code. + * + * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", + * at the beginning of SHA2_CTX (the next field is 8-byte aligned). + */ + +/* + * This file was generated by a perl script (sha512-x86_64.pl) that were + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above. 
+ */ + +#if defined(lint) || defined(__lint) +#include +#include + +/* ARGSUSED */ +void +SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include + +ENTRY_NP(SHA256TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*4+4*8,%rsp + lea (%rsi,%rdx,4),%rdx # inp+num*16*4 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*4+3*8(%rsp) # save copy of %rsp + + //.picmeup %rbp + // The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + // the address of the "next" instruction into the target register + // (%rbp). This generates these 2 instructions: + lea .Llea(%rip),%rbp + //nop // .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K256-.(%rbp),%rbp + + mov 4*0(%rdi),%eax + mov 4*1(%rdi),%ebx + mov 4*2(%rdi),%ecx + mov 4*3(%rdi),%edx + mov 4*4(%rdi),%r8d + mov 4*5(%rdi),%r9d + mov 4*6(%rdi),%r10d + mov 4*7(%rdi),%r11d + jmp .Lloop + +.align 4, 0x90 +.Lloop: + xor %rdi,%rdi + mov 4*0(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*1(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*2(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # 
a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*3(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*4(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*5(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*6(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # 
Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*7(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 4*8(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*9(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*10(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*11(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + 
mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*12(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*13(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*14(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*15(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d 
# Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 4, 0x90 +.Lrounds_16_xx: + mov 4(%rsp),%r13d + mov 56(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 36(%rsp),%r12d + + add 0(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 8(%rsp),%r13d + mov 60(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 40(%rsp),%r12d + + add 4(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 12(%rsp),%r13d + mov 0(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 44(%rsp),%r12d + + add 8(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov 
%r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 16(%rsp),%r13d + mov 4(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 48(%rsp),%r12d + + add 12(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 20(%rsp),%r13d + mov 8(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 52(%rsp),%r12d + + add 16(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 24(%rsp),%r13d + mov 12(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 56(%rsp),%r12d + + add 20(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + 
xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 28(%rsp),%r13d + mov 16(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 60(%rsp),%r12d + + add 24(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 32(%rsp),%r13d + mov 20(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 0(%rsp),%r12d + + add 28(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 36(%rsp),%r13d + mov 24(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 4(%rsp),%r12d + + add 32(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # 
Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 40(%rsp),%r13d + mov 28(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 8(%rsp),%r12d + + add 36(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 44(%rsp),%r13d + mov 32(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 12(%rsp),%r12d + + add 40(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 48(%rsp),%r13d + mov 36(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 16(%rsp),%r12d + + add 44(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) 
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 52(%rsp),%r13d + mov 40(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 20(%rsp),%r12d + + add 48(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 56(%rsp),%r13d + mov 44(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 24(%rsp),%r12d + + add 52(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 60(%rsp),%r13d + mov 48(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 28(%rsp),%r12d + + add 56(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # 
Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 0(%rsp),%r13d + mov 52(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 32(%rsp),%r12d + + add 60(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + cmp $64,%rdi + jb .Lrounds_16_xx + + mov 16*4+0*8(%rsp),%rdi + lea 16*4(%rsi),%rsi + + add 4*0(%rdi),%eax + add 4*1(%rdi),%ebx + add 4*2(%rdi),%ecx + add 4*3(%rdi),%edx + add 4*4(%rdi),%r8d + add 4*5(%rdi),%r9d + add 4*6(%rdi),%r10d + add 4*7(%rdi),%r11d + + cmp 16*4+2*8(%rsp),%rsi + + mov %eax,4*0(%rdi) + mov %ebx,4*1(%rdi) + mov %ecx,4*2(%rdi) + mov %edx,4*3(%rdi) + mov %r8d,4*4(%rdi) + mov %r9d,4*5(%rdi) + mov %r10d,4*6(%rdi) + mov %r11d,4*7(%rdi) + jb .Lloop + + mov 16*4+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA256TransformBlocks) + +.align 6, 0x90 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +#endif /* !lint && !__lint */ diff --git a/module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S b/module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S new file mode 100644 index 0000000000..1b51f9d5b4 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S @@ -0,0 +1,2082 @@ +/* + * 
==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. All I had to do is to get one flavor right, + * the other one passed the test right away:-) + * + * sha256_block runs in ~1005 cycles on Opteron, which gives you + * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock + * frequency in GHz. sha512_block runs in ~1275 cycles, which results + * in 128*1000/1275=100MBps per GHz. Is there room for improvement? + * Well, if you compare it to IA-64 implementation, which maintains + * X[16] in register bank[!], tends to 4 instructions per CPU clock + * cycle and runs in 1003 cycles, 1275 is very good result for 3-way + * issue Opteron pipeline and X[16] maintained in memory. So that *if* + * there is a way to improve it, *then* the only way would be to try to + * offload X[16] updates to SSE unit, but that would require "deeper" + * loop unroll, which in turn would naturally cause size blow-up, not + * to mention increased complexity! And once again, only *if* it's + * actually possible to noticeably improve overall ILP, instruction + * level parallelism, on a given CPU implementation in this case. + * + * Special note on Intel EM64T. While Opteron CPU exhibits perfect + * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], + * [currently available] EM64T CPUs apparently are far from it. On the + * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit + * sha256_block:-( This is presumably because 64-bit shifts/rotates + * apparently are not atomic instructions, but implemented in microcode. + */ + +/* + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha512-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). Replaced the .picmeup macro with assembler code. + * + * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", + * at the beginning of SHA2_CTX (the next field is 8-byte aligned). + */ + +/* + * This file was generated by a perl script (sha512-x86_64.pl) that were + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above. 
+ */ + + +#if defined(lint) || defined(__lint) +#include +#include + +/* ARGSUSED */ +void +SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include + +ENTRY_NP(SHA512TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*8+4*8,%rsp + lea (%rsi,%rdx,8),%rdx # inp+num*16*8 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*8+3*8(%rsp) # save copy of %rsp + + //.picmeup %rbp + // The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + // the address of the "next" instruction into the target register + // (%rbp). This generates these 2 instructions: + lea .Llea(%rip),%rbp + //nop // .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K512-.(%rbp),%rbp + + mov 8*0(%rdi),%rax + mov 8*1(%rdi),%rbx + mov 8*2(%rdi),%rcx + mov 8*3(%rdi),%rdx + mov 8*4(%rdi),%r8 + mov 8*5(%rdi),%r9 + mov 8*6(%rdi),%r10 + mov 8*7(%rdi),%r11 + jmp .Lloop + +.align 4, 0x90 +.Lloop: + xor %rdi,%rdi + mov 8*0(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*1(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*2(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + 
or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*3(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*4(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*5(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*6(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*7(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor 
%r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 8*8(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*9(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*10(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*11(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and 
%r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*12(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*13(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*14(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*15(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 4, 0x90 +.Lrounds_16_xx: + mov 8(%rsp),%r13 + mov 112(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + 
+ xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 72(%rsp),%r12 + + add 0(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 16(%rsp),%r13 + mov 120(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 80(%rsp),%r12 + + add 8(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 24(%rsp),%r13 + mov 0(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 88(%rsp),%r12 + + add 16(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 32(%rsp),%r13 + mov 8(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 
96(%rsp),%r12 + + add 24(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 40(%rsp),%r13 + mov 16(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 104(%rsp),%r12 + + add 32(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 48(%rsp),%r13 + mov 24(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 112(%rsp),%r12 + + add 40(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 56(%rsp),%r13 + mov 32(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 120(%rsp),%r12 + + add 48(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + 
xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 64(%rsp),%r13 + mov 40(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 0(%rsp),%r12 + + add 56(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 72(%rsp),%r13 + mov 48(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 8(%rsp),%r12 + + add 64(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 80(%rsp),%r13 + mov 56(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 16(%rsp),%r12 + + add 72(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 
+ ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 88(%rsp),%r13 + mov 64(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 24(%rsp),%r12 + + add 80(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 96(%rsp),%r13 + mov 72(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 32(%rsp),%r12 + + add 88(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 104(%rsp),%r13 + mov 80(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 40(%rsp),%r12 + + add 96(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b 
+ add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 112(%rsp),%r13 + mov 88(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 48(%rsp),%r12 + + add 104(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 120(%rsp),%r13 + mov 96(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 56(%rsp),%r12 + + add 112(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 0(%rsp),%r13 + mov 104(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 64(%rsp),%r12 + + add 120(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + cmp $80,%rdi + jb .Lrounds_16_xx + + mov 16*8+0*8(%rsp),%rdi + lea 
16*8(%rsi),%rsi + + add 8*0(%rdi),%rax + add 8*1(%rdi),%rbx + add 8*2(%rdi),%rcx + add 8*3(%rdi),%rdx + add 8*4(%rdi),%r8 + add 8*5(%rdi),%r9 + add 8*6(%rdi),%r10 + add 8*7(%rdi),%r11 + + cmp 16*8+2*8(%rsp),%rsi + + mov %rax,8*0(%rdi) + mov %rbx,8*1(%rdi) + mov %rcx,8*2(%rdi) + mov %rdx,8*3(%rdi) + mov %r8,8*4(%rdi) + mov %r9,8*5(%rdi) + mov %r10,8*6(%rdi) + mov %r11,8*7(%rdi) + jb .Lloop + + mov 16*8+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA512TransformBlocks) + +.align 6, 0x90 +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +#endif /* !lint && !__lint */ diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index fec8171813..a1594d34db 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -1198,7 +1198,8 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); -int EMPTY_TASKQ(taskq_t *tq) +int +EMPTY_TASKQ(taskq_t *tq) { #ifdef _KERNEL return (tq->tq_lowest_id == tq->tq_next_id); diff --git a/module/os/macos/.gitignore b/module/os/macos/.gitignore new file mode 100644 index 0000000000..14140f47af --- /dev/null +++ b/module/os/macos/.gitignore @@ -0,0 +1 @@ +*.in diff --git a/module/os/macos/Makefile.am b/module/os/macos/Makefile.am new file mode 100644 index 0000000000..5a729e5ad2 --- /dev/null +++ b/module/os/macos/Makefile.am @@ -0,0 +1,6 @@ +# Makefile used only by macOS. Should define no dependencies for +# other platforms. 
+ +if BUILD_MACOS +SUBDIRS=kernel spl zfs +endif diff --git a/module/os/macos/README.md b/module/os/macos/README.md new file mode 100644 index 0000000000..45c21cf4a0 --- /dev/null +++ b/module/os/macos/README.md @@ -0,0 +1,8 @@ + +OpenZFS on OS X, the [macOS](https://openzfsonosx.org) port of [Open ZFS](https://openzfs.org) + +Please use the [OpenZFSOnOsX](https://github.com/openzfsonosx/openzfs) +repository for support, troubleshooting, and using GitHub issues. + +For more compiling information please visit the +[wiki](https://openzfsonosx.org/wiki/Install#Initial_installation_from_source) diff --git a/module/os/macos/kernel/.gitignore b/module/os/macos/kernel/.gitignore new file mode 100644 index 0000000000..f0d4d14070 --- /dev/null +++ b/module/os/macos/kernel/.gitignore @@ -0,0 +1,5 @@ +allsymbols +kernelexports +kernelexports_32 +kernelexports_64 +kextsymboltool diff --git a/module/os/macos/kernel/Info.plist b/module/os/macos/kernel/Info.plist new file mode 100644 index 0000000000..5dd4c6b416 --- /dev/null +++ b/module/os/macos/kernel/Info.plist @@ -0,0 +1,34 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleExecutable + KernelExports + CFBundleGetInfoString + Mach Kernel Pseudoextension, Apple Computer Inc, 12.5.0 + CFBundleIdentifier + net.lundman.kernel.dependencies + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + Mach Kernel Pseudoextension + CFBundlePackageType + KEXT + CFBundleShortVersionString + 12.5.0 + CFBundleSignature + ???? + CFBundleVersion + 12.5.0 + OSBundleCompatibleVersion + 8.0.0d0 + OSKernelResource + + OSBundleAllowUserLoad + + OSBundleRequired + Root + + diff --git a/module/os/macos/kernel/Makefile.am b/module/os/macos/kernel/Makefile.am new file mode 100644 index 0000000000..499bc3ef3a --- /dev/null +++ b/module/os/macos/kernel/Makefile.am @@ -0,0 +1,25 @@ + +AUTOMAKE_OPTIONS = subdir-objects + +noinst_PROGRAMS = kextsymboltool + +kextsymboltool_SOURCES = \ + kextsymboltool.c + +kextsymboltool_CPPFLAGS = +kextsymboltool_CFLAGS = +kextsymboltool_LDFLAGS = -lstdc++ + +kernelexports: zfs.exports | kextsymboltool + ./kextsymboltool -arch x86_64 -import allsymbols -export zfs.exports -output kernelexports_64 + ./kextsymboltool -arch i386 -import allsymbols -export zfs.exports -output kernelexports_32 + lipo -create kernelexports_32 kernelexports_64 -output kernelexports + +clean: + rm -f kernelexports kernelexports_32 kernelexports_64 allsymbols + rm -f kextsymboltool.o kextsymboltool + +allsymbols: + $(NM) -gj $(MACH_KERNEL) > allsymbols + +all:kextsymboltool allsymbols kernelexports diff --git a/module/os/macos/kernel/README.txt b/module/os/macos/kernel/README.txt new file mode 100644 index 0000000000..104f668bbe --- /dev/null +++ b/module/os/macos/kernel/README.txt @@ -0,0 +1,10 @@ + +Not all symbols are exported by default in OS X, and we have to do +a little magic to get around that. + +This uses the OpenSource kextsymbol.c utility, and a dump of all the +symbols in the kernel, to produce a link helper kext. + +We most likely need to make it better to handle kernel versions +a little more flexibly. + diff --git a/module/os/macos/kernel/kextsymboltool.c b/module/os/macos/kernel/kextsymboltool.c new file mode 100644 index 0000000000..19ffb2f306 --- /dev/null +++ b/module/os/macos/kernel/kextsymboltool.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#pragma mark Typedefs, Enums, Constants +/********************************************************************* +* Typedefs, Enums, Constants +*********************************************************************/ +typedef enum { + kErrorNone = 0, + kError, + kErrorFileAccess, + kErrorDiskFull, + kErrorDuplicate +} ToolError; + +#pragma mark Function Protos +/********************************************************************* +* Function Protos +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize); + +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length); + +extern char* __cxa_demangle (const char* mangled_name, + char* buf, + size_t* n, + int* status); + +#pragma mark Functions +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length) +{ + ToolError err; + + if (length != (size_t)write(fd, data, length)) + err = kErrorDiskFull; + else + err = kErrorNone; + + if (kErrorNone != err) + perror("couldn't write output"); + + return( err ); +} + +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) +{ + ToolError err = kErrorFileAccess; + int fd; + struct stat stat_buf; + + *objAddr = 0; + *objSize = 0; + + do + { + if((fd = open(path, O_RDONLY)) == -1) + continue; + + if(fstat(fd, &stat_buf) == -1) + continue; + + if (0 == (stat_buf.st_mode & S_IFREG)) + continue; + + /* Don't try to map an empty file, it fails now due to conformance + * stuff (PR 4611502). 
+ */ + if (0 == stat_buf.st_size) { + err = kErrorNone; + continue; + } + + *objSize = stat_buf.st_size; + + *objAddr = (vm_offset_t)mmap(NULL /* address */, *objSize, + PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE /* flags */, + fd, 0 /* offset */); + + if ((void *)*objAddr == MAP_FAILED) { + *objAddr = 0; + *objSize = 0; + continue; + } + + err = kErrorNone; + + } while( false ); + + if (-1 != fd) + { + close(fd); + } + if (kErrorNone != err) + { + fprintf(stderr, "couldn't read %s: %s\n", path, strerror(errno)); + } + + return( err ); +} + + +enum { kExported = 0x00000001, kObsolete = 0x00000002 }; + +struct symbol { + char * name; + unsigned int name_len; + char * indirect; + unsigned int indirect_len; + unsigned int flags; + struct symbol * list; + unsigned int list_count; +}; + +static bool issymchar( char c ) +{ + return ((c > ' ') && (c <= '~') && (c != ':') && (c != '#')); +} + +static bool iswhitespace( char c ) +{ + return ((c == ' ') || (c == '\t')); +} + +/* + * Function for qsort for comparing symbol list names. + */ +static int +qsort_cmp(const void * _left, const void * _right) +{ + struct symbol * left = (struct symbol *) _left; + struct symbol * right = (struct symbol *) _right; + + return (strcmp(left->name, right->name)); +} + +/* + * Function for bsearch for finding a symbol name. + */ + +static int +bsearch_cmp( const void * _key, const void * _cmp) +{ + char * key = (char *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strcmp(key, cmp->name)); +} + +struct bsearch_key +{ + char * name; + unsigned int name_len; +}; + +static int +bsearch_cmp_prefix( const void * _key, const void * _cmp) +{ + struct bsearch_key * key = (struct bsearch_key *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strncmp(key->name, cmp->name, key->name_len)); +} + +static uint32_t +count_symbols(char * file, vm_size_t file_size) +{ + uint32_t nsyms = 0; + char * scan; + char * eol; + char * next; + + for (scan = file; true; scan = next) { + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. */ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + nsyms++; + } + + return nsyms; +} + +static uint32_t +store_symbols(char * file, vm_size_t file_size, struct symbol * symbols, uint32_t idx, uint32_t max_symbols) +{ + char * scan; + char * line; + char * eol; + char * next; + + uint32_t strtabsize; + + strtabsize = 0; + + for (scan = file, line = file; true; scan = next, line = next) { + + char * name = NULL; + char * name_term = NULL; + unsigned int name_len = 0; + char * indirect = NULL; + char * indirect_term = NULL; + unsigned int indirect_len = 0; + char * option = NULL; + char * option_term = NULL; + unsigned int option_len = 0; + char optionstr[256]; + boolean_t obsolete = 0; + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + *eol = '\0'; + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. 
*/ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + + name = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + name_term = scan; + + /* Stored length must include the terminating nul char. + */ + name_len = name_term - name + 1; + + /* Now look for an indirect. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (*scan == ':') { + scan++; + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (issymchar(*scan)) { + indirect = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + indirect_term = scan; + + /* Stored length must include the terminating nul char. + */ + indirect_len = indirect_term - indirect + 1; + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } else if (*scan != '\0' && *scan != '-') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } + + /* Look for options. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + + if (*scan == '-') { + scan++; + + if (isalpha(*scan)) { + option = scan; + + /* Find the end of the option. + */ + while ((*scan != '\0') && isalpha(*scan)) { + scan++; + } + + /* Note char past end of option. + */ + option_term = scan; + option_len = option_term - option; + + if (option_len >= sizeof(optionstr)) { + fprintf(stderr, "option too long in symbol line: %s\n", line); + exit(1); + } + memcpy(optionstr, option, option_len); + optionstr[option_len] = '\0'; + + /* Find the option. + */ + if (!strncmp(optionstr, "obsolete", option_len)) { + obsolete = TRUE; + } + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + + } + + } + + if(idx >= max_symbols) { + fprintf(stderr, "symbol[%d/%d] overflow: %s\n", idx, max_symbols, line); + exit(1); + } + + *name_term = '\0'; + if (indirect_term) { + *indirect_term = '\0'; + } + + symbols[idx].name = name; + symbols[idx].name_len = name_len; + symbols[idx].indirect = indirect; + symbols[idx].indirect_len = indirect_len; + symbols[idx].flags = (obsolete) ? 
kObsolete : 0; + + strtabsize += symbols[idx].name_len + symbols[idx].indirect_len; + idx++; + } + + return strtabsize; +} + +/********************************************************************* +*********************************************************************/ +int main(int argc, char * argv[]) +{ + ToolError err; + int i, fd; + const char * output_name = NULL; + uint32_t zero = 0, num_files = 0; + uint32_t filenum; + uint32_t strx, strtabsize, strtabpad; + struct symbol * import_symbols; + struct symbol * export_symbols; + uint32_t num_import_syms, num_export_syms; + uint32_t result_count, num_removed_syms; + uint32_t import_idx, export_idx; + const NXArchInfo * host_arch; + const NXArchInfo * target_arch; + boolean_t require_imports = true; + boolean_t diff = false; + + + struct file { + vm_offset_t mapped; + vm_size_t mapped_size; + uint32_t nsyms; + boolean_t import; + const char * path; + }; + struct file files[64]; + + host_arch = NXGetLocalArchInfo(); + target_arch = host_arch; + + for( i = 1; i < argc; i += 2) + { + boolean_t import; + + if (!strcmp("-sect", argv[i])) + { + require_imports = false; + i--; + continue; + } + if (!strcmp("-diff", argv[i])) + { + require_imports = false; + diff = true; + i--; + continue; + } + + if (i == (argc - 1)) + { + fprintf(stderr, "bad arguments: %s\n", argv[i]); + exit(1); + } + + if (!strcmp("-arch", argv[i])) + { + target_arch = NXGetArchInfoFromName(argv[i + 1]); + if (!target_arch) + { + fprintf(stderr, "unknown architecture name: %s\n", argv[i+1]); + exit(1); + } + continue; + } + if (!strcmp("-output", argv[i])) + { + output_name = argv[i+1]; + continue; + } + + if (!strcmp("-import", argv[i])) + import = true; + else if (!strcmp("-export", argv[i])) + import = false; + else + { + fprintf(stderr, "unknown option: %s\n", argv[i]); + exit(1); + } + + err = readFile(argv[i+1], &files[num_files].mapped, &files[num_files].mapped_size); + if (kErrorNone != err) + exit(1); + + if (files[num_files].mapped && files[num_files].mapped_size) + { + files[num_files].import = import; + files[num_files].path = argv[i+1]; + num_files++; + } + } + + if (!output_name) + { + fprintf(stderr, "no output file\n"); + exit(1); + } + + num_import_syms = 0; + num_export_syms = 0; + for (filenum = 0; filenum < num_files; filenum++) + { + files[filenum].nsyms = count_symbols((char *) files[filenum].mapped, files[filenum].mapped_size); + if (files[filenum].import) + num_import_syms += files[filenum].nsyms; + else + num_export_syms += files[filenum].nsyms; + } + if (!num_export_syms) + { + fprintf(stderr, "no export names\n"); + exit(1); + } + + import_symbols = calloc(num_import_syms, sizeof(struct symbol)); + export_symbols = calloc(num_export_syms, sizeof(struct symbol)); + + import_idx = 0; + export_idx = 0; + + for (filenum = 0; filenum < num_files; filenum++) + { + if (files[filenum].import) + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + import_symbols, import_idx, num_import_syms); + import_idx += files[filenum].nsyms; + } + else + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + export_symbols, export_idx, num_export_syms); + export_idx += files[filenum].nsyms; + } + if (false && !files[filenum].nsyms) + { + fprintf(stderr, "warning: file %s contains no names\n", files[filenum].path); + } + } + + + qsort(import_symbols, num_import_syms, sizeof(struct symbol), &qsort_cmp); + qsort(export_symbols, num_export_syms, sizeof(struct symbol), &qsort_cmp); + + result_count = 0; + num_removed_syms 
= 0; + strtabsize = 4; + if (num_import_syms) + { + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + struct symbol * result; + char * name; + size_t len; + boolean_t wild; + + name = export_symbols[export_idx].indirect; + len = export_symbols[export_idx].indirect_len; + if (!name) + { + name = export_symbols[export_idx].name; + len = export_symbols[export_idx].name_len; + } + wild = ((len > 2) && ('*' == name[len-=2])); + if (wild) + { + struct bsearch_key key; + key.name = name; + key.name_len = len; + result = bsearch(&key, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp_prefix); + + if (result) + { + struct symbol * first; + struct symbol * last; + + strtabsize += (result->name_len + result->indirect_len); + + first = result; + while (--first >= &import_symbols[0]) + { + if (bsearch_cmp_prefix(&key, first)) + break; + strtabsize += (first->name_len + first->indirect_len); + } + first++; + + last = result; + while (++last < (&import_symbols[0] + num_import_syms)) + { + if (bsearch_cmp_prefix(&key, last)) + break; + strtabsize += (last->name_len + last->indirect_len); + } + result_count += last - first; + result = first; + export_symbols[export_idx].list = first; + export_symbols[export_idx].list_count = last - first; + export_symbols[export_idx].flags |= kExported; + } + } + else + result = bsearch(name, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp); + + if (!result && require_imports) + { + int status; + char * demangled_result = + __cxa_demangle(export_symbols[export_idx].name + 1, NULL, NULL, &status); + fprintf(stderr, "exported name not in import list: %s\n", + demangled_result ? demangled_result : export_symbols[export_idx].name); +// fprintf(stderr, " : %s\n", export_symbols[export_idx].name); + if (demangled_result) { + free(demangled_result); + } + num_removed_syms++; + } + if (diff) + { + if (!result) + result = &export_symbols[export_idx]; + else + result = NULL; + } + if (result && !wild) + { + export_symbols[export_idx].flags |= kExported; + strtabsize += (export_symbols[export_idx].name_len + export_symbols[export_idx].indirect_len); + result_count++; + export_symbols[export_idx].list = &export_symbols[export_idx]; + export_symbols[export_idx].list_count = 1; + } + } + } + strtabpad = (strtabsize + 3) & ~3; + + if (require_imports && num_removed_syms) + { + err = kError; + goto finish; + } + + fd = open(output_name, O_WRONLY|O_CREAT|O_TRUNC, 0755); + if (-1 == fd) + { + perror("couldn't write output"); + err = kErrorFileAccess; + goto finish; + } + + struct symtab_command symcmd; + struct uuid_command uuidcmd; + + symcmd.cmd = LC_SYMTAB; + symcmd.cmdsize = sizeof(symcmd); + symcmd.symoff = sizeof(symcmd) + sizeof(uuidcmd); + symcmd.nsyms = result_count; + symcmd.strsize = strtabpad; + + uuidcmd.cmd = LC_UUID; + uuidcmd.cmdsize = sizeof(uuidcmd); + uuid_generate(uuidcmd.uuid); + + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct mach_header_64 hdr; + hdr.magic = MH_MAGIC_64; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist_64) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header_64(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + else + { + struct mach_header hdr; + hdr.magic = 
MH_MAGIC; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = (target_arch->cputype == CPU_TYPE_I386) ? MH_OBJECT : MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + + if (kErrorNone != err) + goto finish; + + if (target_arch->byteorder != host_arch->byteorder) { + swap_symtab_command(&symcmd, target_arch->byteorder); + swap_uuid_command(&uuidcmd, target_arch->byteorder); + } + err = writeFile(fd, &symcmd, sizeof(symcmd)); + if (kErrorNone != err) + goto finish; + err = writeFile(fd, &uuidcmd, sizeof(uuidcmd)); + if (kErrorNone != err) + goto finish; + + strx = 4; + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + if (!(kExported & export_symbols[export_idx].flags)) + continue; + + if (export_idx + && export_symbols[export_idx - 1].name + && !strcmp(export_symbols[export_idx - 1].name, export_symbols[export_idx].name)) + { + fprintf(stderr, "duplicate export: %s\n", export_symbols[export_idx - 1].name); + err = kErrorDuplicate; + goto finish; + } + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + + if (export_symbols[export_idx].list != &export_symbols[export_idx]) + { + printf("wild: %s, %s\n", export_symbols[export_idx].name, + export_symbols[export_idx].list[import_idx].name); + } + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct nlist_64 nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist_64(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + else + { + struct nlist nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + } + + if (kErrorNone != err) + goto finish; + } + + strx = sizeof(uint32_t); + err = writeFile(fd, &zero, strx); + if (kErrorNone != err) + goto finish; + + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].name, + export_symbols[export_idx].list[import_idx].name_len); + if (kErrorNone 
!= err) + goto finish; + if (export_symbols[export_idx].list[import_idx].indirect) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].indirect, + export_symbols[export_idx].list[import_idx].indirect_len); + if (kErrorNone != err) + goto finish; + } + } + } + + err = writeFile(fd, &zero, strtabpad - strtabsize); + if (kErrorNone != err) + goto finish; + + close(fd); + + +finish: + for (filenum = 0; filenum < num_files; filenum++) { + // unmap file + if (files[filenum].mapped_size) + { + munmap((caddr_t)files[filenum].mapped, files[filenum].mapped_size); + files[filenum].mapped = 0; + files[filenum].mapped_size = 0; + } + + } + + if (kErrorNone != err) + { + if (output_name) + unlink(output_name); + exit(1); + } + else + exit(0); + return(0); +} + diff --git a/module/os/macos/kernel/version.plist b/module/os/macos/kernel/version.plist new file mode 100644 index 0000000000..93dfa2a162 --- /dev/null +++ b/module/os/macos/kernel/version.plist @@ -0,0 +1,16 @@ + + + + + BuildVersion + 1 + CFBundleShortVersionString + 12.0.0 + CFBundleVersion + 12.0.0 + ProjectName + xnu + SourceVersion + 2050007009000000 + + diff --git a/module/os/macos/kernel/zfs.exports b/module/os/macos/kernel/zfs.exports new file mode 100644 index 0000000000..82752554d0 --- /dev/null +++ b/module/os/macos/kernel/zfs.exports @@ -0,0 +1,32 @@ +_cpu_number +_fp_lookup +_fd_rdwr +_hostname +_kernel_memory_allocate +_virtual_space_start +_virtual_space_end +_vm_page_free_wanted +_vm_page_free_count +_vm_page_free_min +_vm_page_speculative_count +_VFS_ROOT +_vm_pool_low +_fp_drop +_fp_drop_written +_fo_read +_fo_write +_system_inshutdown +_cache_purgevfs +_vfs_context_kernel +_build_path +_kvtophys +__mh_execute_header +_gLoadedKextSummaries +_VNOP_LOOKUP +_vnode_notify +_vfs_get_notify_attributes +_kauth_cred_getgroups +_rootvnode +_cpuid_info +_vnode_iocount +_kx_qsort diff --git a/module/os/macos/spl/Makefile.am b/module/os/macos/spl/Makefile.am new file mode 100644 index 0000000000..4bdcf4aaec --- /dev/null +++ b/module/os/macos/spl/Makefile.am @@ -0,0 +1,59 @@ + +# Anyone remember why we made this a library? +libspl_la_CPPFLAGS= \ + -Wall \ + -nostdinc \ + -mkernel \ + -fno-builtin-printf \ + -D_KERNEL \ + -DKERNEL \ + -DKERNEL_PRIVATE \ + -DDRIVER_PRIVATE \ + -DAPPLE \ + -DNeXT \ + -I$(top_srcdir)/include/os/macos/spl \ + -I$(top_srcdir)/include \ + -I@KERNEL_HEADERS@/Headers \ + -I@KERNEL_HEADERS@/PrivateHeaders + +libspl_la_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ + +libspl_la_LDFLAGS= \ + -Xlinker \ + -kext \ + -nostdlib \ + -lkmodc++ \ + -lkmod \ + -lcc_kext + +libspl_la_LIBS = -lnone + +# If we don't set this to nothing, it adds "-lz -liconv" +LIBS = + +noinst_LTLIBRARIES = libspl.la + +libspl_la_SOURCES = \ + spl-atomic.c \ + spl-condvar.c \ + spl-cred.c \ + spl-ddi.c \ + spl-err.c \ + spl-kmem.c \ + spl-kstat.c \ + spl-list.c \ + spl-mutex.c \ + spl-osx.c \ + spl-policy.c \ + spl-proc.c \ + spl-processor.c \ + spl-proc_list.c \ + spl-rwlock.c \ + spl-seg_kmem.c \ + spl-taskq.c \ + spl-thread.c \ + spl-time.c \ + spl-tsd.c \ + spl-vmem.c \ + spl-vnode.c \ + spl-xdr.c diff --git a/module/os/macos/spl/README.md b/module/os/macos/spl/README.md new file mode 100644 index 0000000000..cc8dbf288e --- /dev/null +++ b/module/os/macos/spl/README.md @@ -0,0 +1,14 @@ +The Solaris Porting Layer, SPL, is a macOS kernel module which provides a +compatibility layer used by the macOS port of Open ZFS. + +# Installation + +The latest version of the SPL is maintained as part of this repository. 
+Only when building ZFS version 1.9.4 or earlier must an external SPL release +be used. These releases can be found at: + + * Version 1.9.4: https://github.com/openzfsonosx/spl/tree/spl-1.9.4-release + +# Release + +The SPL is released under a CDDL license. diff --git a/module/os/macos/spl/spl-atomic.c b/module/os/macos/spl/spl-atomic.c new file mode 100644 index 0000000000..973f462fdf --- /dev/null +++ b/module/os/macos/spl/spl-atomic.c @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Solaris Porting Layer (SPL) Atomic Implementation. + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include + + +#include +#include +#include +#include + +void * +atomic_cas_ptr(volatile void *target, void *cmp, void *new) +{ +#ifdef __LP64__ + return (void *)__sync_val_compare_and_swap((uint64_t *)target, + (uint64_t)cmp, (uint64_t)new); +#else + return (void *)__sync_val_compare_and_swap((uint32_t *)target, cmp, + new); +#endif +} diff --git a/module/os/macos/spl/spl-condvar.c b/module/os/macos/spl/spl-condvar.c new file mode 100644 index 0000000000..e8618161b3 --- /dev/null +++ b/module/os/macos/spl/spl-condvar.c @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include + +/* + * cv_timedwait() is similar to cv_wait() except that it additionally expects + * a timeout value specified in ticks. When woken by cv_signal() or + * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is + * returned. + * + * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks + * interruptibly and can be woken by a signal (EINTR, ERESTART). When + * this occurs 0 is returned. + * + * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() + * and cv_timedwait_sig() which should be used when waiting for outstanding + * IO to complete. 
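As a concrete illustration of the return-value contract described above, a hypothetical consumer that waits for a flag with a one-second deadline might look like the sketch below. The names work_lock, work_cv and work_ready are invented for this example, and it assumes the usual ddi_get_lbolt()/SEC_TO_TICK() helpers; it is not part of this change.

    /* Illustrative only: wait up to one second for work_ready. */
    clock_t deadline = ddi_get_lbolt() + SEC_TO_TICK(1);
    int rc = 1;

    mutex_enter(&work_lock);
    while (!work_ready && rc != -1)
        rc = cv_timedwait(&work_cv, &work_lock, deadline);
    mutex_exit(&work_lock);
    /* rc == -1: the deadline passed; rc == 1: woken by cv_signal()/cv_broadcast(). */

The same deadline-loop pattern applies to the IO variants, cv_timedwait_io() and cv_timedwait_sig_io().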
They are responsible for updating the iowait accounting + * when this is supported by the platform. + * + * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution + * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout + * to be specified as a hrtime_t allowing for timeouts of less than a tick. + * + * N.B. The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. + */ + +#ifdef SPL_DEBUG_MUTEX +void spl_wdlist_settime(void *mpleak, uint64_t value); +#endif + +void +spl_cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) +{ +} + +void +spl_cv_destroy(kcondvar_t *cvp) +{ +} + +void +spl_cv_signal(kcondvar_t *cvp) +{ + wakeup_one((caddr_t)cvp); +} + +void +spl_cv_broadcast(kcondvar_t *cvp) +{ + wakeup((caddr_t)cvp); +} + + +/* + * Block on the indicated condition variable and + * release the associated mutex while blocked. + */ +int +spl_cv_wait(kcondvar_t *cvp, kmutex_t *mp, int flags, const char *msg) +{ + int result; + + if (msg != NULL && msg[0] == '&') + ++msg; /* skip over '&' prefixes */ + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + mp->m_owner = NULL; + result = msleep(cvp, (lck_mtx_t *)&mp->m_lock, flags, msg, 0); + mp->m_owner = current_thread(); +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + /* + * 1 - condvar got cv_signal()/cv_broadcast() + * 0 - received signal (kill -signal) + */ + return (result == EINTR ? 0 : 1); +} + +/* + * Same as cv_wait except the thread will unblock at 'tim' + * (an absolute time) if it hasn't already unblocked. + * + * Returns the amount of time left from the original 'tim' value + * when it was unblocked. + */ +int +spl_cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flags, + const char *msg) +{ + struct timespec ts; + int result; + + if (msg != NULL && msg[0] == '&') + ++msg; /* skip over '&' prefixes */ + + clock_t timenow = zfs_lbolt(); + + /* Already expired? */ + if (timenow >= tim) + return (-1); + + tim -= timenow; + + ts.tv_sec = (tim / hz); + ts.tv_nsec = (tim % hz) * NSEC_PER_SEC / hz; + + /* Both sec and nsec zero is a blocking call in XNU. (Not poll) */ + if (ts.tv_sec == 0 && ts.tv_nsec == 0) + ts.tv_nsec = 1000; + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + + mp->m_owner = NULL; + result = msleep(cvp, (lck_mtx_t *)&mp->m_lock, flags, msg, &ts); + + /* msleep grabs the mutex, even if timeout/signal */ + mp->m_owner = current_thread(); + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + switch (result) { + + case EINTR: /* Signal */ + case ERESTART: + return (0); + + case EWOULDBLOCK: /* Timeout */ + return (-1); + } + + return (1); +} + + +/* + * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. + */ +int +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + struct timespec ts; + int result; + + if (res > 1) { + /* + * Align expiration to the specified resolution. + */ + if (flag & CALLOUT_FLAG_ROUNDUP) + tim += res - 1; + tim = (tim / res) * res; + } + + if ((flag & CALLOUT_FLAG_ABSOLUTE)) { + hrtime_t timenow = gethrtime(); + + /* Already expired? 
*/ + if (timenow >= tim) + return (-1); + + tim -= timenow; + } + + ts.tv_sec = NSEC2SEC(tim); + ts.tv_nsec = tim - SEC2NSEC(ts.tv_sec); + + /* Both sec and nsec set to zero is a blocking call in XNU. */ + if (ts.tv_sec == 0 && ts.tv_nsec == 0) + ts.tv_nsec = 1000; + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + + mp->m_owner = NULL; + result = msleep(cvp, (lck_mtx_t *)&mp->m_lock, + flag, "cv_timedwait_hires", &ts); + mp->m_owner = current_thread(); +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + switch (result) { + + case EINTR: /* Signal */ + case ERESTART: + return (0); + + case EWOULDBLOCK: /* Timeout */ + return (-1); + } + + return (1); +} diff --git a/module/os/macos/spl/spl-cred.c b/module/os/macos/spl/spl-cred.c new file mode 100644 index 0000000000..e7d9da0360 --- /dev/null +++ b/module/os/macos/spl/spl-cred.c @@ -0,0 +1,166 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include + +/* Return the effective user id */ +uid_t +crgetuid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getuid((kauth_cred_t)cr)); +} + +/* Return the real user id */ +uid_t +crgetruid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getruid((kauth_cred_t)cr)); +} + +/* Return the saved user id */ +uid_t +crgetsuid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getsvuid((kauth_cred_t)cr)); +} + +/* Return the filesystem user id */ +uid_t +crgetfsuid(const cred_t *cr) +{ + if (!cr) + return (0); + return (-1); +} + +/* Return the effective group id */ +gid_t +crgetgid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getgid((kauth_cred_t)cr)); +} + +/* Return the real group id */ +gid_t +crgetrgid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getrgid((kauth_cred_t)cr)); +} + +/* Return the saved group id */ +gid_t +crgetsgid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getsvgid((kauth_cred_t)cr)); +} + +/* Return the filesystem group id */ +gid_t +crgetfsgid(const cred_t *cr) +{ + return (-1); +} + + +extern int kauth_cred_getgroups(kauth_cred_t _cred, gid_t *_groups, + int *_groupcount); +/* + * Unfortunately, to get the count of groups, we have to call XNU which + * memcpy's them over. No real clean way to get around that, but at least + * these calls are done sparingly. 
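The group-list helpers defined just below differ from illumos in one important way, restated in the comment above crgetgroups(): the returned array is freshly allocated and must be released with crgetgroupsfree(). A minimal, hypothetical usage sketch, assuming cr is a valid cred_t pointer (not part of this change):

    gid_t *groups = crgetgroups(cr);
    int ngroups = crgetngroups(cr);

    if (groups != NULL) {
        /* ... inspect up to ngroups entries ... */
        crgetgroupsfree(groups);    /* required on this platform */
    }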
+ */ +int +crgetngroups(const cred_t *cr) +{ + gid_t gids[NGROUPS]; + int count = NGROUPS; + int ret; + + ret = kauth_cred_getgroups((kauth_cred_t)cr, gids, &count); + + if (!ret) + return (count); + + return (0); +} + + +/* + * We always allocate NGROUPs here, since we don't know how many there will + * be until after the call. Unlike IllumOS, the ptr returned is allocated + * and must be returned by a call to crgetgroupsfree(). + */ +gid_t * +crgetgroups(const cred_t *cr) +{ + gid_t *gids; + int count = NGROUPS; + + gids = kmem_zalloc(sizeof (gid_t) * count, KM_SLEEP); + if (!gids) + return (NULL); + + kauth_cred_getgroups((kauth_cred_t)cr, gids, &count); + + return (gids); +} + +void +crgetgroupsfree(gid_t *gids) +{ + if (!gids) + return; + kmem_free(gids, sizeof (gid_t) * NGROUPS); +} + +/* + * Return true if "cr" belongs in group "gid". + */ +int +spl_cred_ismember_gid(cred_t *cr, gid_t gid) +{ + int ret = 0; // Is not member. + kauth_cred_ismember_gid((kauth_cred_t)cr, gid, &ret); + if (ret == 1) + return (TRUE); + return (FALSE); +} diff --git a/module/os/macos/spl/spl-ddi.c b/module/os/macos/spl/spl-ddi.c new file mode 100644 index 0000000000..0f74af6e9f --- /dev/null +++ b/module/os/macos/spl/spl-ddi.c @@ -0,0 +1,383 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include + + +/* + * Allocate a set of pointers to 'n_items' objects of size 'size' + * bytes. Each pointer is initialized to nil. + * + * The 'size' and 'n_items' values are stashed in the opaque + * handle returned to the caller. + * + * This implementation interprets 'set of pointers' to mean 'array + * of pointers' but note that nothing in the interface definition + * precludes an implementation that uses, for example, a linked list. + * However there should be a small efficiency gain from using an array + * at lookup time. + * + * NOTE As an optimization, we make our growable array allocations in + * powers of two (bytes), since that's how much kmem_alloc (currently) + * gives us anyway. It should save us some free/realloc's .. + * + * As a further optimization, we make the growable array start out + * with MIN_N_ITEMS in it. 
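The life cycle of this interface, as used by a hypothetical driver, is sketched below; my_state_t and my_instance are placeholders for this example and do not appear in the patch.

    static void *statep;                    /* opaque handle, starts NULL */

    /* module load */
    ddi_soft_state_init(&statep, sizeof (my_state_t), 1);

    /* per-instance attach */
    if (ddi_soft_state_zalloc(statep, my_instance) == DDI_SUCCESS) {
        my_state_t *sp = ddi_get_soft_state(statep, my_instance);
        /* ... use sp ... */
    }

    /* per-instance detach */
    ddi_soft_state_free(statep, my_instance);

    /* module unload */
    ddi_soft_state_fini(&statep);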
+ */ + + +int +ddi_soft_state_init(void **state_p, size_t size, size_t n_items) +{ + struct i_ddi_soft_state *ss; + + if (state_p == NULL || *state_p != NULL || size == 0) + return (EINVAL); + + ss = kmem_zalloc(sizeof (*ss), KM_SLEEP); + mutex_init(&ss->lock, NULL, MUTEX_DRIVER, NULL); + ss->size = size; + + if (n_items < MIN_N_ITEMS) + ss->n_items = MIN_N_ITEMS; + else { + int bitlog; + + if ((bitlog = ddi_fls(n_items)) == ddi_ffs(n_items)) + bitlog--; + ss->n_items = 1 << bitlog; + } + + ASSERT(ss->n_items >= n_items); + + ss->array = kmem_zalloc(ss->n_items * sizeof (void *), KM_SLEEP); + + *state_p = ss; + + return (0); +} + + +/* + * Allocate a state structure of size 'size' to be associated + * with item 'item'. + * + * In this implementation, the array is extended to + * allow the requested offset, if needed. + */ +int +ddi_soft_state_zalloc(void *state, int item) +{ + struct i_ddi_soft_state *ss; + void **array; + void *new_element; + + if ((ss = state) == NULL || item < 0) + return (DDI_FAILURE); + + mutex_enter(&ss->lock); + if (ss->size == 0) { + mutex_exit(&ss->lock); + cmn_err(CE_WARN, "ddi_soft_state_zalloc: bad handle"); + return (DDI_FAILURE); + } + + array = ss->array; /* NULL if ss->n_items == 0 */ + ASSERT(ss->n_items != 0 && array != NULL); + + /* + * refuse to tread on an existing element + */ + if (item < ss->n_items && array[item] != NULL) { + mutex_exit(&ss->lock); + return (DDI_FAILURE); + } + + /* + * Allocate a new element to plug in + */ + new_element = kmem_zalloc(ss->size, KM_SLEEP); + + /* + * Check if the array is big enough, if not, grow it. + */ + if (item >= ss->n_items) { + void **new_array; + size_t new_n_items; + struct i_ddi_soft_state *dirty; + + /* + * Allocate a new array of the right length, copy + * all the old pointers to the new array, then + * if it exists at all, put the old array on the + * dirty list. + * + * Note that we can't kmem_free() the old array. + * + * Why -- well the 'get' operation is 'mutex-free', so we + * can't easily catch a suspended thread that is just about + * to dereference the array we just grew out of. So we + * cons up a header and put it on a list of 'dirty' + * pointer arrays. (Dirty in the sense that there may + * be suspended threads somewhere that are in the middle + * of referencing them). Fortunately, we -can- garbage + * collect it all at ddi_soft_state_fini time. + */ + new_n_items = ss->n_items; + while (new_n_items < (1 + item)) + new_n_items <<= 1; /* double array size .. */ + + ASSERT(new_n_items >= (1 + item)); /* sanity check! */ + + new_array = kmem_zalloc(new_n_items * sizeof (void *), + KM_SLEEP); + /* + * Copy the pointers into the new array + */ + bcopy(array, new_array, ss->n_items * sizeof (void *)); + + /* + * Save the old array on the dirty list + */ + dirty = kmem_zalloc(sizeof (*dirty), KM_SLEEP); + dirty->array = ss->array; + dirty->n_items = ss->n_items; + dirty->next = ss->next; + ss->next = dirty; + + ss->array = (array = new_array); + ss->n_items = new_n_items; + } + + ASSERT(array != NULL && item < ss->n_items && array[item] == NULL); + + array[item] = new_element; + + mutex_exit(&ss->lock); + return (DDI_SUCCESS); +} + + +/* + * Fetch a pointer to the allocated soft state structure. + * + * This is designed to be cheap. + * + * There's an argument that there should be more checking for + * nil pointers and out of bounds on the array.. but we do a lot + * of that in the alloc/free routines. + * + * An array has the convenience that we don't need to lock read-access + * to it c.f. 
a linked list. However our "expanding array" strategy + * means that we should hold a readers lock on the i_ddi_soft_state + * structure. + * + * However, from a performance viewpoint, we need to do it without + * any locks at all -- this also makes it a leaf routine. The algorithm + * is 'lock-free' because we only discard the pointer arrays at + * ddi_soft_state_fini() time. + */ +void * +ddi_get_soft_state(void *state, int item) +{ + struct i_ddi_soft_state *ss = state; + + ASSERT(ss != NULL && item >= 0); + + if (item < ss->n_items && ss->array != NULL) + return (ss->array[item]); + return (NULL); +} + +/* + * Free the state structure corresponding to 'item.' Freeing an + * element that has either gone or was never allocated is not + * considered an error. Note that we free the state structure, but + * we don't shrink our pointer array, or discard 'dirty' arrays, + * since even a few pointers don't really waste too much memory. + * + * Passing an item number that is out of bounds, or a null pointer will + * provoke an error message. + */ +void +ddi_soft_state_free(void *state, int item) +{ + struct i_ddi_soft_state *ss; + void **array; + void *element; + static char msg[] = "ddi_soft_state_free:"; + + if ((ss = state) == NULL) { + cmn_err(CE_WARN, "%s null handle", + msg); + return; + } + + element = NULL; + + mutex_enter(&ss->lock); + + if ((array = ss->array) == NULL || ss->size == 0) { + cmn_err(CE_WARN, "%s bad handle", + msg); + } else if (item < 0 || item >= ss->n_items) { + cmn_err(CE_WARN, "%s item %d not in range [0..%lu]", + msg, item, ss->n_items - 1); + } else if (array[item] != NULL) { + element = array[item]; + array[item] = NULL; + } + + mutex_exit(&ss->lock); + + if (element) + kmem_free(element, ss->size); +} + + +/* + * Free the entire set of pointers, and any + * soft state structures contained therein. + * + * Note that we don't grab the ss->lock mutex, even though + * we're inspecting the various fields of the data structure. + * + * There is an implicit assumption that this routine will + * never run concurrently with any of the above on this + * particular state structure i.e. by the time the driver + * calls this routine, there should be no other threads + * running in the driver. + */ +void +ddi_soft_state_fini(void **state_p) +{ + struct i_ddi_soft_state *ss, *dirty; + int item; + static char msg[] = "ddi_soft_state_fini:"; + + if (state_p == NULL || (ss = *state_p) == NULL) + return; + + if (ss->size == 0) { + cmn_err(CE_WARN, "%s bad handle", + msg); + return; + } + + if (ss->n_items > 0) { + for (item = 0; item < ss->n_items; item++) + ddi_soft_state_free(ss, item); + kmem_free(ss->array, ss->n_items * sizeof (void *)); + } + + /* + * Now delete any dirty arrays from previous 'grow' operations + */ + for (dirty = ss->next; dirty; dirty = ss->next) { + ss->next = dirty->next; + kmem_free(dirty->array, dirty->n_items * sizeof (void *)); + kmem_free(dirty, sizeof (*dirty)); + } + + mutex_destroy(&ss->lock); + kmem_free(ss, sizeof (*ss)); + + *state_p = NULL; +} + +int +ddi_create_minor_node(dev_info_t *dip, char *name, int spec_type, + minor_t minor_num, char *node_type, int flag) +{ + dev_t dev; + int error = 0; + char *r, *dup; + + dev = makedev(flag, minor_num); + dip->dev = dev; + + /* + * http://lists.apple.com/archives/darwin-kernel/2007/Nov/msg00038.html + * + * devfs_make_name() has an off-by-one error when using directories + * and it appears Apple does not want to fix it. 
+ * + * We then change "/" to "_" and create more Apple-like /dev names + * + */ + MALLOC(dup, char *, strlen(name)+1, M_TEMP, M_WAITOK); + if (dup == NULL) + return (ENOMEM); + bcopy(name, dup, strlen(name)); + dup[strlen(name)] = '\0'; + + for (r = dup; + (r = strchr(r, '/')); + *r = '_') + /* empty */; + + dip->devc = NULL; + dip->devb = NULL; + + if (spec_type == S_IFCHR) + dip->devc = devfs_make_node(dev, DEVFS_CHAR, + UID_ROOT, GID_OPERATOR, + 0600, "rdisk_%s", dup); + else + dip->devb = devfs_make_node(dev, DEVFS_BLOCK, + UID_ROOT, GID_OPERATOR, + 0600, "disk_%s", dup); + FREE(dup, M_TEMP); + + return (error); +} + +void +ddi_remove_minor_node(dev_info_t *dip, char *name) +{ + if (dip->devc) { + devfs_remove(dip->devc); + dip->devc = NULL; + } + if (dip->devb) { + devfs_remove(dip->devb); + dip->devb = NULL; + } +} + +int +strspn(const char *string, + register char *charset) +{ + register const char *p, *q; + + for (q = string; *q != '\0'; ++q) { + for (p = charset; *p != '\0' && *p != *q; ++p) + ; + if (*p == '\0') + break; + } + return (q-string); +} diff --git a/module/os/macos/spl/spl-debug.c b/module/os/macos/spl/spl-debug.c new file mode 100644 index 0000000000..28ec1612d4 --- /dev/null +++ b/module/os/macos/spl/spl-debug.c @@ -0,0 +1,10 @@ +#include + + + +/* Debug log support enabled */ +__attribute__((noinline)) int assfail(const char *str, const char *file, + unsigned int line) __attribute__((optnone)) +{ + return (1); /* Must return true for ASSERT macro */ +} diff --git a/module/os/macos/spl/spl-err.c b/module/os/macos/spl/spl-err.c new file mode 100644 index 0000000000..455bf2c8b9 --- /dev/null +++ b/module/os/macos/spl/spl-err.c @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + printf("%s", msg); + break; + case CE_NOTE: + printf("SPL: Notice: %s\n", msg); + break; + case CE_WARN: + printf("SPL: Warning: %s\n", msg); + break; + case CE_PANIC: + PANIC("%s", msg); + break; + } +} /* vcmn_err() */ + +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ + + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) 
+{ + char msg[MAXMSGLEN]; + va_list ap; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + printf("%s", msg); + panic("%s", msg); + + /* Unreachable */ + return (1); +} diff --git a/module/os/macos/spl/spl-kmem.c b/module/os/macos/spl/spl-kmem.c new file mode 100644 index 0000000000..a5120946a1 --- /dev/null +++ b/module/os/macos/spl/spl-kmem.c @@ -0,0 +1,6825 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * Copyright (C) 2014 Brendon Humphrey + * Copyright (C) 2017 Sean Doran + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// =============================================================== +// Options +// =============================================================== +// #define PRINT_CACHE_STATS 1 + +// =============================================================== +// OS Interface +// =============================================================== + +// This variable is a count of the number of threads +// blocked waiting for memory pages to become free. +// We are using wake indications on this event as a +// indication of paging activity, and therefore as a +// proxy to the machine experiencing memory pressure. +// +// xnu vm variables + +// 0 by default smd +extern volatile unsigned int vm_page_free_wanted; + +// 3500 kern.vm_page_free_min, rarely changes +extern unsigned int vm_page_free_min; + +// will tend to vm_page_free_min smd +extern volatile unsigned int vm_page_free_count; + +#define SMALL_PRESSURE_INCURSION_PAGES (vm_page_free_min >> 5) + +static kcondvar_t spl_free_thread_cv; +static kmutex_t spl_free_thread_lock; +static boolean_t spl_free_thread_exit; +static volatile _Atomic int64_t spl_free; +int64_t spl_free_delta_ema; + +static volatile _Atomic int64_t spl_free_manual_pressure = 0; +static volatile _Atomic boolean_t spl_free_fast_pressure = FALSE; +static _Atomic bool spl_free_maybe_reap_flag = false; +static _Atomic uint64_t spl_free_last_pressure = 0; + +// Start and end address of kernel memory +extern vm_offset_t virtual_space_start; +extern vm_offset_t virtual_space_end; + +// Can be polled to determine if the VM is experiecing +// a shortage of free pages. +extern int vm_pool_low(void); + +// Which CPU are we executing on? 
+extern int cpu_number(void); + +// Invoke the kernel debugger +extern void Debugger(const char *message); + +// Read from /dev/random +void read_random(void *buffer, uint_t numbytes); + +// =============================================================== +// Non Illumos Variables +// =============================================================== + +// Flag to cause tasks and threads to terminate as +// the kmem module is preparing to unload. +static int shutting_down = 0; + +// Amount of RAM in machine +uint64_t physmem = 0; + +// Size in bytes of the memory allocated in seg_kmem +extern uint64_t segkmem_total_mem_allocated; + +// Number of active threads +extern uint64_t zfs_threads; +extern uint64_t zfs_active_mutex; +extern uint64_t zfs_active_rwlock; + +extern uint64_t total_memory; +extern uint64_t real_total_memory; + +#define MULT 1 + +static const char *KMEM_VA_PREFIX = "kmem_va"; +static const char *KMEM_MAGAZINE_PREFIX = "kmem_magazine_"; + +// =============================================================== +// Illumos Variables +// =============================================================== + +struct kmem_cache_kstat { + kstat_named_t kmc_buf_size; + kstat_named_t kmc_align; + kstat_named_t kmc_chunk_size; + kstat_named_t kmc_slab_size; + kstat_named_t kmc_alloc; + kstat_named_t kmc_alloc_fail; + kstat_named_t kmc_free; + kstat_named_t kmc_depot_alloc; + kstat_named_t kmc_depot_free; + kstat_named_t kmc_depot_contention; + kstat_named_t kmc_slab_alloc; + kstat_named_t kmc_slab_free; + kstat_named_t kmc_buf_constructed; + kstat_named_t kmc_buf_avail; + kstat_named_t kmc_buf_inuse; + kstat_named_t kmc_buf_total; + kstat_named_t kmc_buf_max; + kstat_named_t kmc_slab_create; + kstat_named_t kmc_slab_destroy; + kstat_named_t kmc_vmem_source; + kstat_named_t kmc_hash_size; + kstat_named_t kmc_hash_lookup_depth; + kstat_named_t kmc_hash_rescale; + kstat_named_t kmc_full_magazines; + kstat_named_t kmc_empty_magazines; + kstat_named_t kmc_magazine_size; + kstat_named_t kmc_reap; /* number of kmem_cache_reap() calls */ + kstat_named_t kmc_defrag; /* attempts to defrag all partial slabs */ + kstat_named_t kmc_scan; /* attempts to defrag one partial slab */ + kstat_named_t kmc_move_callbacks; /* sum of yes, no, later, dn, dk */ + kstat_named_t kmc_move_yes; + kstat_named_t kmc_move_no; + kstat_named_t kmc_move_later; + kstat_named_t kmc_move_dont_need; + kstat_named_t kmc_move_dont_know; /* obj unrecognized by client ... */ + kstat_named_t kmc_move_hunt_found; /* ... 
but found in mag layer */ + kstat_named_t kmc_move_slabs_freed; /* slabs freed by consolidator */ + kstat_named_t kmc_move_reclaimable; /* buffers, if consolidator ran */ + kstat_named_t kmc_no_vba_success; + kstat_named_t kmc_no_vba_fail; + kstat_named_t kmc_arc_no_grow_set; + kstat_named_t kmc_arc_no_grow; +} kmem_cache_kstat = { + { "buf_size", KSTAT_DATA_UINT64 }, + { "align", KSTAT_DATA_UINT64 }, + { "chunk_size", KSTAT_DATA_UINT64 }, + { "slab_size", KSTAT_DATA_UINT64 }, + { "alloc", KSTAT_DATA_UINT64 }, + { "alloc_fail", KSTAT_DATA_UINT64 }, + { "free", KSTAT_DATA_UINT64 }, + { "depot_alloc", KSTAT_DATA_UINT64 }, + { "depot_free", KSTAT_DATA_UINT64 }, + { "depot_contention", KSTAT_DATA_UINT64 }, + { "slab_alloc", KSTAT_DATA_UINT64 }, + { "slab_free", KSTAT_DATA_UINT64 }, + { "buf_constructed", KSTAT_DATA_UINT64 }, + { "buf_avail", KSTAT_DATA_UINT64 }, + { "buf_inuse", KSTAT_DATA_UINT64 }, + { "buf_total", KSTAT_DATA_UINT64 }, + { "buf_max", KSTAT_DATA_UINT64 }, + { "slab_create", KSTAT_DATA_UINT64 }, + { "slab_destroy", KSTAT_DATA_UINT64 }, + { "vmem_source", KSTAT_DATA_UINT64 }, + { "hash_size", KSTAT_DATA_UINT64 }, + { "hash_lookup_depth", KSTAT_DATA_UINT64 }, + { "hash_rescale", KSTAT_DATA_UINT64 }, + { "full_magazines", KSTAT_DATA_UINT64 }, + { "empty_magazines", KSTAT_DATA_UINT64 }, + { "magazine_size", KSTAT_DATA_UINT64 }, + { "reap", KSTAT_DATA_UINT64 }, + { "defrag", KSTAT_DATA_UINT64 }, + { "scan", KSTAT_DATA_UINT64 }, + { "move_callbacks", KSTAT_DATA_UINT64 }, + { "move_yes", KSTAT_DATA_UINT64 }, + { "move_no", KSTAT_DATA_UINT64 }, + { "move_later", KSTAT_DATA_UINT64 }, + { "move_dont_need", KSTAT_DATA_UINT64 }, + { "move_dont_know", KSTAT_DATA_UINT64 }, + { "move_hunt_found", KSTAT_DATA_UINT64 }, + { "move_slabs_freed", KSTAT_DATA_UINT64 }, + { "move_reclaimable", KSTAT_DATA_UINT64 }, + { "no_vba_success", KSTAT_DATA_UINT64 }, + { "no_vba_fail", KSTAT_DATA_UINT64 }, + { "arc_no_grow_set", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, +}; + +static kmutex_t kmem_cache_kstat_lock; + +/* + * The default set of caches to back kmem_alloc(). + * These sizes should be reevaluated periodically. + * + * We want allocations that are multiples of the coherency granularity + * (64 bytes) to be satisfied from a cache which is a multiple of 64 + * bytes, so that it will be 64-byte aligned. For all multiples of 64, + * the next 1 greater than or equal to it must be a + * multiple of 64. + * + * We split the table into two sections: size <= 4k and size > 4k. This + * saves a lot of space and cache footprint in our cache tables. 
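For orientation only, a sketch of how these tables are typically consumed by the illumos-style kmem_alloc() fast path; the actual lookup code appears later in this file and may differ in detail.

    /* size <= KMEM_MAXBUF: one table slot per 8-byte (KMEM_ALIGN) step */
    index = (size - 1) >> KMEM_ALIGN_SHIFT;
    cp = kmem_alloc_table[index];       /* smallest cache >= size */

    /* larger sizes: one slot per 4 KB (KMEM_BIG_MULTIPLE) step */
    index = (size - 1) >> KMEM_BIG_SHIFT;
    cp = kmem_big_alloc_table[index];

So, for example, a 100-byte request indexes slot (100 - 1) >> 3 = 12 and is satisfied by the 112-byte cache.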
+ */ +static const int kmem_alloc_sizes[] = { + 1 * 8, + 2 * 8, + 3 * 8, + 4 * 8, 5 * 8, 6 * 8, 7 * 8, + 4 * 16, 5 * 16, 6 * 16, 7 * 16, + 4 * 32, 5 * 32, 6 * 32, 7 * 32, + 4 * 64, 5 * 64, 6 * 64, 7 * 64, + 4 * 128, 9*64, 5 * 128, 6 * 128, 13*64, 7 * 128, + P2ALIGN(8192 / 8, 64), + P2ALIGN(8192 / 7, 64), + P2ALIGN(8192 / 6, 64), + P2ALIGN(8192 / 5, 64), + P2ALIGN(8192 / 4, 64), + P2ALIGN(8192 / 3, 64), + P2ALIGN(8192 / 2, 64), +}; + +static const int kmem_big_alloc_sizes[] = { + 2 * 4096, 3 * 4096, + 2 * 8192, 3 * 8192, + 4 * 8192, 5 * 8192, 6 * 8192, 7 * 8192, + 8 * 8192, 9 * 8192, 10 * 8192, 11 * 8192, + 12 * 8192, 13 * 8192, 14 * 8192, 15 * 8192, + 16 * 8192 +}; + +#define KMEM_MAXBUF 4096 +#define KMEM_BIG_MAXBUF_32BIT 32768 +#define KMEM_BIG_MAXBUF 131072 + +#define KMEM_BIG_MULTIPLE 4096 /* big_alloc_sizes must be a multiple */ +#define KMEM_BIG_SHIFT 12 /* lg(KMEM_BIG_MULTIPLE) */ + +static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT]; +static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT]; + +#define KMEM_ALLOC_TABLE_MAX (KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) +static size_t kmem_big_alloc_table_max = 0; /* # of filled elements */ + +static kmem_magtype_t kmem_magtype[] = { + { 1, 8, 3200, 65536 }, + { 3, 16, 256, 32768 }, + { 7, 32, 64, 16384 }, + { 15, 64, 0, 8192 }, + { 31, 64, 0, 4096 }, + { 47, 64, 0, 2048 }, + { 63, 64, 0, 1024 }, + { 95, 64, 0, 512 }, + { 143, 64, 0, 0 }, +}; + +static uint32_t kmem_reaping; +static uint32_t kmem_reaping_idspace; + +/* + * kmem tunables + */ +static struct timespec kmem_reap_interval = {15, 0}; +int kmem_depot_contention = 3; /* max failed tryenters per real interval */ +pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */ +int kmem_panic = 1; /* whether to panic on error */ +int kmem_logging = 0; /* kmem_log_enter() override */ +uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */ +size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ +size_t kmem_content_log_size; /* content log size [2% of memory] */ +size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ +size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ +size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ +size_t kmem_lite_maxalign = 8192; /* maximum buffer alignment for KMF_LITE */ +int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ +size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ +size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ + +size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ + +/* + * Be aware that KMF_AUDIT does not release memory, and you will eventually + * grind to a halt. But it is useful to enable if you can trigger a memory + * fault, and wish to see the calling stack. 
+ */ +#ifdef DEBUG +// can be 0 or KMF_LITE +// or KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS +// with or without KMF_AUDIT +int kmem_flags = KMF_DEADBEEF | KMF_REDZONE | KMF_LITE; +#else +int kmem_flags = 0; +#endif +int kmem_ready; + +static kmem_cache_t *kmem_slab_cache; +static kmem_cache_t *kmem_bufctl_cache; +static kmem_cache_t *kmem_bufctl_audit_cache; + +static kmutex_t kmem_cache_lock; /* inter-cache linkage only */ +static list_t kmem_caches; +extern vmem_t *heap_arena; +static taskq_t *kmem_taskq; +static kmutex_t kmem_flags_lock; +static vmem_t *kmem_metadata_arena; +static vmem_t *kmem_msb_arena; /* arena for metadata caches */ +static vmem_t *kmem_cache_arena; +static vmem_t *kmem_hash_arena; +static vmem_t *kmem_log_arena; +static vmem_t *kmem_oversize_arena; +static vmem_t *kmem_va_arena; +static vmem_t *kmem_default_arena; +static vmem_t *kmem_firewall_arena; + +/* + * kmem slab consolidator thresholds (tunables) + */ +size_t kmem_frag_minslabs = 101; /* minimum total slabs */ +size_t kmem_frag_numer = 1; /* free buffers (numerator) */ +size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */ +/* + * Maximum number of slabs from which to move buffers during a single + * maintenance interval while the system is not low on memory. + */ +size_t kmem_reclaim_max_slabs = 4; // smd 1 +/* + * Number of slabs to scan backwards from the end of the partial slab list + * when searching for buffers to relocate. + */ +size_t kmem_reclaim_scan_range = 48; // smd 12 + +/* consolidator knobs */ +static boolean_t kmem_move_noreap; +static boolean_t kmem_move_blocked; +static boolean_t kmem_move_fulltilt; +static boolean_t kmem_move_any_partial; + +#ifdef DEBUG +/* + * kmem consolidator debug tunables: + * Ensure code coverage by occasionally running the consolidator even when the + * caches are not fragmented (they may never be). These intervals are mean time + * in cache maintenance intervals (kmem_cache_update). 
+ */ +uint32_t kmem_mtb_move = 20; /* defrag 1 slab (~5min) */ +uint32_t kmem_mtb_reap = 240; /* defrag all slabs (~1hrs) */ +uint32_t kmem_mtb_reap_count = 0; +#endif /* DEBUG */ + +static kmem_cache_t *kmem_defrag_cache; +static kmem_cache_t *kmem_move_cache; +static taskq_t *kmem_move_taskq; + +static void kmem_cache_scan(kmem_cache_t *); +static void kmem_cache_defrag(kmem_cache_t *); +static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *); + + +kmem_log_header_t *kmem_transaction_log; +kmem_log_header_t *kmem_content_log; +kmem_log_header_t *kmem_failure_log; +kmem_log_header_t *kmem_slab_log; + +static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ + +#define KMEM_BUFTAG_LITE_ENTER(bt, count, caller) \ +if ((count) > 0) { \ +pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \ +pc_t *_e; \ +/* memmove() the old entries down one notch */ \ +for (_e = &_s[(count) - 1]; _e > _s; _e--) \ +*_e = *(_e - 1); \ +*_s = (uintptr_t)(caller); \ +} + +#define KMERR_MODIFIED 0 /* buffer modified while on freelist */ +#define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */ +#define KMERR_DUPFREE 2 /* freed a buffer twice */ +#define KMERR_BADADDR 3 /* freed a bad (unallocated) address */ +#define KMERR_BADBUFTAG 4 /* buftag corrupted */ +#define KMERR_BADBUFCTL 5 /* bufctl corrupted */ +#define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */ +#define KMERR_BADSIZE 7 /* alloc size != free size */ +#define KMERR_BADBASE 8 /* buffer base address wrong */ + +struct { + hrtime_t kmp_timestamp; /* timestamp of panic */ + int kmp_error; /* type of kmem error */ + void *kmp_buffer; /* buffer that induced panic */ + void *kmp_realbuf; /* real start address for buffer */ + kmem_cache_t *kmp_cache; /* buffer's cache according to client */ + kmem_cache_t *kmp_realcache; /* actual cache containing buffer */ + kmem_slab_t *kmp_slab; /* slab accoring to kmem_findslab() */ + kmem_bufctl_t *kmp_bufctl; /* bufctl */ +} kmem_panic_info; + +extern uint64_t stat_osif_malloc_success; +extern uint64_t stat_osif_malloc_bytes; +extern uint64_t stat_osif_free; +extern uint64_t stat_osif_free_bytes; + +extern uint64_t spl_bucket_non_pow2_allocs; + +// stats for spl_root_allocator(); +extern uint64_t spl_root_allocator_calls; +extern uint64_t spl_root_allocator_large_bytes_asked; +extern uint64_t spl_root_allocator_small_bytes_asked; +extern uint64_t spl_root_allocator_minalloc_bytes_asked; +extern uint64_t spl_root_allocator_extra_pass; +extern uint64_t spl_root_allocator_recovered; +extern uint64_t spl_root_allocator_recovered_bytes; + +extern uint64_t spl_vmem_unconditional_allocs; +extern uint64_t spl_vmem_unconditional_alloc_bytes; +extern uint64_t spl_vmem_conditional_allocs; +extern uint64_t spl_vmem_conditional_alloc_bytes; +extern uint64_t spl_vmem_conditional_alloc_deny; +extern uint64_t spl_vmem_conditional_alloc_deny_bytes; + +extern uint64_t spl_xat_success; +extern uint64_t spl_xat_late_success; +extern uint64_t spl_xat_late_success_nosleep; +extern uint64_t spl_xat_pressured; +extern uint64_t spl_xat_bailed; +extern uint64_t spl_xat_bailed_contended; +extern uint64_t spl_xat_lastalloc; +extern uint64_t spl_xat_lastfree; +extern uint64_t spl_xat_forced; +extern uint64_t spl_xat_sleep; +extern uint64_t spl_xat_late_deny; +extern uint64_t spl_xat_no_waiters; +extern uint64_t spl_xft_wait; + +extern uint64_t spl_vba_parent_memory_appeared; +extern uint64_t spl_vba_parent_memory_blocked; +extern uint64_t spl_vba_hiprio_blocked; +extern uint64_t spl_vba_cv_timeout; +extern 
uint64_t spl_vba_loop_timeout; +extern uint64_t spl_vba_cv_timeout_blocked; +extern uint64_t spl_vba_loop_timeout_blocked; +extern uint64_t spl_vba_sleep; +extern uint64_t spl_vba_loop_entries; + +extern uint64_t spl_bucket_tunable_large_span; +extern uint64_t spl_bucket_tunable_small_span; +extern void spl_set_bucket_tunable_large_span(uint64_t); +extern void spl_set_bucket_tunable_small_span(uint64_t); + +extern _Atomic uint64_t spl_arc_no_grow_bits; +extern uint64_t spl_arc_no_grow_count; + +extern uint64_t spl_frag_max_walk; +extern uint64_t spl_frag_walked_out; +extern uint64_t spl_frag_walk_cnt; + +uint64_t spl_buckets_mem_free = 0; +uint64_t spl_arc_reclaim_avoided = 0; + +uint64_t kmem_free_to_slab_when_fragmented = 0; + +typedef struct spl_stats { + kstat_named_t spl_os_alloc; + kstat_named_t spl_active_threads; + kstat_named_t spl_active_mutex; + kstat_named_t spl_active_rwlock; + kstat_named_t spl_active_tsd; + kstat_named_t spl_free_wake_count; + kstat_named_t spl_spl_free; + kstat_named_t spl_spl_free_manual_pressure; + kstat_named_t spl_spl_free_fast_pressure; + kstat_named_t spl_spl_free_delta_ema; + kstat_named_t spl_spl_free_negative_count; + kstat_named_t spl_osif_malloc_success; + kstat_named_t spl_osif_malloc_bytes; + kstat_named_t spl_osif_free; + kstat_named_t spl_osif_free_bytes; + kstat_named_t spl_bucket_non_pow2_allocs; + + kstat_named_t spl_vmem_unconditional_allocs; + kstat_named_t spl_vmem_unconditional_alloc_bytes; + kstat_named_t spl_vmem_conditional_allocs; + kstat_named_t spl_vmem_conditional_alloc_bytes; + kstat_named_t spl_vmem_conditional_alloc_deny; + kstat_named_t spl_vmem_conditional_alloc_deny_bytes; + + kstat_named_t spl_xat_success; + kstat_named_t spl_xat_late_success; + kstat_named_t spl_xat_late_success_nosleep; + kstat_named_t spl_xat_pressured; + kstat_named_t spl_xat_bailed; + kstat_named_t spl_xat_bailed_contended; + kstat_named_t spl_xat_lastalloc; + kstat_named_t spl_xat_lastfree; + kstat_named_t spl_xat_forced; + kstat_named_t spl_xat_sleep; + kstat_named_t spl_xat_late_deny; + kstat_named_t spl_xat_no_waiters; + kstat_named_t spl_xft_wait; + + kstat_named_t spl_vba_parent_memory_appeared; + kstat_named_t spl_vba_parent_memory_blocked; + kstat_named_t spl_vba_hiprio_blocked; + kstat_named_t spl_vba_cv_timeout; + kstat_named_t spl_vba_loop_timeout; + kstat_named_t spl_vba_cv_timeout_blocked; + kstat_named_t spl_vba_loop_timeout_blocked; + kstat_named_t spl_vba_sleep; + kstat_named_t spl_vba_loop_entries; + + kstat_named_t spl_bucket_tunable_large_span; + kstat_named_t spl_bucket_tunable_small_span; + + kstat_named_t spl_buckets_mem_free; + kstat_named_t spl_arc_no_grow_bits; + kstat_named_t spl_arc_no_grow_count; + kstat_named_t spl_frag_max_walk; + kstat_named_t spl_frag_walked_out; + kstat_named_t spl_frag_walk_cnt; + kstat_named_t spl_arc_reclaim_avoided; + + kstat_named_t kmem_free_to_slab_when_fragmented; +} spl_stats_t; + +static spl_stats_t spl_stats = { + {"os_mem_alloc", KSTAT_DATA_UINT64}, + {"active_threads", KSTAT_DATA_UINT64}, + {"active_mutex", KSTAT_DATA_UINT64}, + {"active_rwlock", KSTAT_DATA_UINT64}, + {"active_tsd", KSTAT_DATA_UINT64}, + {"spl_free_wake_count", KSTAT_DATA_UINT64}, + {"spl_spl_free", KSTAT_DATA_INT64}, + {"spl_spl_free_manual_pressure", KSTAT_DATA_UINT64}, + {"spl_spl_free_fast_pressure", KSTAT_DATA_UINT64}, + {"spl_spl_free_delta_ema", KSTAT_DATA_UINT64}, + {"spl_spl_free_negative_count", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_success", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_bytes", KSTAT_DATA_UINT64}, 
+ {"spl_osif_free", KSTAT_DATA_UINT64}, + {"spl_osif_free_bytes", KSTAT_DATA_UINT64}, + {"spl_bucket_non_pow2_allocs", KSTAT_DATA_UINT64}, + + {"vmem_unconditional_allocs", KSTAT_DATA_UINT64}, + {"vmem_unconditional_alloc_bytes", KSTAT_DATA_UINT64}, + {"vmem_conditional_allocs", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_bytes", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_deny", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_deny_bytes", KSTAT_DATA_UINT64}, + + {"spl_xat_success", KSTAT_DATA_UINT64}, + {"spl_xat_late_success", KSTAT_DATA_UINT64}, + {"spl_xat_late_success_nosleep", KSTAT_DATA_UINT64}, + {"spl_xat_pressured", KSTAT_DATA_UINT64}, + {"spl_xat_bailed", KSTAT_DATA_UINT64}, + {"spl_xat_bailed_contended", KSTAT_DATA_UINT64}, + {"spl_xat_lastalloc", KSTAT_DATA_UINT64}, + {"spl_xat_lastfree", KSTAT_DATA_UINT64}, + {"spl_xat_forced", KSTAT_DATA_UINT64}, + {"spl_xat_sleep", KSTAT_DATA_UINT64}, + {"spl_xat_late_deny", KSTAT_DATA_UINT64}, + {"spl_xat_no_waiters", KSTAT_DATA_UINT64}, + {"spl_xft_wait", KSTAT_DATA_UINT64}, + + {"spl_vba_parent_memory_appeared", KSTAT_DATA_UINT64}, + {"spl_vba_parent_memory_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_hiprio_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_cv_timeout", KSTAT_DATA_UINT64}, + {"spl_vba_loop_timeout", KSTAT_DATA_UINT64}, + {"spl_vba_cv_timeout_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_loop_timeout_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_sleep", KSTAT_DATA_UINT64}, + {"spl_vba_loop_entries", KSTAT_DATA_UINT64}, + + {"spl_tunable_large_span", KSTAT_DATA_UINT64}, + {"spl_tunable_small_span", KSTAT_DATA_UINT64}, + + {"spl_buckets_mem_free", KSTAT_DATA_UINT64}, + {"spl_arc_no_grow_bits", KSTAT_DATA_UINT64}, + {"spl_arc_no_grow_count", KSTAT_DATA_UINT64}, + + {"spl_vmem_frag_max_walk", KSTAT_DATA_UINT64}, + {"spl_vmem_frag_walked_out", KSTAT_DATA_UINT64}, + {"spl_vmem_frag_walk_cnt", KSTAT_DATA_UINT64}, + {"spl_arc_reclaim_avoided", KSTAT_DATA_UINT64}, + + {"kmem_free_to_slab_when_fragmented", KSTAT_DATA_UINT64}, +}; + +static kstat_t *spl_ksp = 0; + +// Stub out caller() +caddr_t +caller() +{ + return ((caddr_t)(0)); +} + +void * +calloc(size_t n, size_t s) +{ + return (zfs_kmem_zalloc(n * s, KM_NOSLEEP)); +} + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ +(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +/* + * Get bytes from the /dev/random generator. Returns 0 + * on success. Returns EAGAIN if there is insufficient entropy. + */ +int +random_get_bytes(uint8_t *ptr, size_t len) +{ + read_random(ptr, len); + return (0); +} + +/* + * BGH - Missing from OSX? + * + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. 
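For example, a hypothetical caller (the buffer and its contents are invented for this sketch):

    char id[16];

    (void) strlcpy(id, "2zfs-pool.0", sizeof (id));
    strident_canon(id, sizeof (id));
    /* id is now "_zfs_pool_0": the leading digit and the punctuation become '_' */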
+ */ +void +strident_canon(char *s, size_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +int +strident_valid(const char *id) +{ + int c = *id++; + + if (!IS_ALPHA(c) && c != '_') + return (0); + while ((c = *id++) != 0) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + return (0); + } + return (1); +} + +static void +copy_pattern(uint64_t pattern, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf = buf_arg; + + while (buf < bufend) + *buf++ = pattern; +} + +static void * +verify_pattern(uint64_t pattern, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) + if (*buf != pattern) + return (buf); + return (NULL); +} + +static void * +verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) { + if (*buf != old) { + copy_pattern(old, buf_arg, + (char *)buf - (char *)buf_arg); + return (buf); + } + *buf = new; + } + + return (NULL); +} + +static void +kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) +{ + kmem_cache_t *cp; + + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) + if (tq != NULL) + (void) taskq_dispatch(tq, (task_func_t *)func, cp, + tqflag); + else + func(cp); + mutex_exit(&kmem_cache_lock); +} + +static void +kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) +{ + kmem_cache_t *cp; + + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (!(cp->cache_cflags & KMC_IDENTIFIER)) + continue; + if (tq != NULL) + (void) taskq_dispatch(tq, (task_func_t *)func, cp, + tqflag); + else + func(cp); + } + mutex_exit(&kmem_cache_lock); +} + +/* + * Debugging support. Given a buffer address, find its slab. 
+ */ +static kmem_slab_t * +kmem_findslab(kmem_cache_t *cp, void *buf) +{ + kmem_slab_t *sp; + + mutex_enter(&cp->cache_lock); + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + mutex_exit(&cp->cache_lock); + + return (NULL); +} + +static void +kmem_error(int error, kmem_cache_t *cparg, void *bufarg) +{ + kmem_buftag_t *btp = NULL; + kmem_bufctl_t *bcp = NULL; + kmem_cache_t *cp = cparg; + kmem_slab_t *sp; + uint64_t *off; + void *buf = bufarg; + + kmem_logging = 0; /* stop logging when a bad thing happens */ + + kmem_panic_info.kmp_timestamp = gethrtime(); + + sp = kmem_findslab(cp, buf); + if (sp == NULL) { + for (cp = list_tail(&kmem_caches); cp != NULL; + cp = list_prev(&kmem_caches, cp)) { + if ((sp = kmem_findslab(cp, buf)) != NULL) + break; + } + } + + if (sp == NULL) { + cp = NULL; + error = KMERR_BADADDR; + } else { + if (cp != cparg) + error = KMERR_BADCACHE; + else + buf = (char *)bufarg - + ((uintptr_t)bufarg - + (uintptr_t)sp->slab_base) % cp->cache_chunksize; + if (buf != bufarg) + error = KMERR_BADBASE; + if (cp->cache_flags & KMF_BUFTAG) + btp = KMEM_BUFTAG(cp, buf); + if (cp->cache_flags & KMF_HASH) { + mutex_enter(&cp->cache_lock); + for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next) + if (bcp->bc_addr == buf) + break; + mutex_exit(&cp->cache_lock); + if (bcp == NULL && btp != NULL) + bcp = btp->bt_bufctl; + if (kmem_findslab(cp->cache_bufctl_cache, bcp) == + NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) || + bcp->bc_addr != buf) { + error = KMERR_BADBUFCTL; + bcp = NULL; + } + } + } + + kmem_panic_info.kmp_error = error; + kmem_panic_info.kmp_buffer = bufarg; + kmem_panic_info.kmp_realbuf = buf; + kmem_panic_info.kmp_cache = cparg; + kmem_panic_info.kmp_realcache = cp; + kmem_panic_info.kmp_slab = sp; + kmem_panic_info.kmp_bufctl = bcp; + + printf("SPL: kernel memory allocator: "); + + switch (error) { + + case KMERR_MODIFIED: + printf("buffer modified after being freed\n"); + off = verify_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify); + if (off == NULL) /* shouldn't happen */ + off = buf; + printf("SPL: modification occurred at offset 0x%lx " + "(0x%llx replaced by 0x%llx)\n", + (uintptr_t)off - (uintptr_t)buf, + (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off); + break; + + case KMERR_REDZONE: + printf("redzone violation: write past end of buffer\n"); + break; + + case KMERR_BADADDR: + printf("invalid free: buffer not in cache\n"); + break; + + case KMERR_DUPFREE: + printf("duplicate free: buffer freed twice\n"); + break; + + case KMERR_BADBUFTAG: + printf("boundary tag corrupted\n"); + printf("SPL: bcp ^ bxstat = %lx, should be %lx\n", + (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat, + KMEM_BUFTAG_FREE); + break; + + case KMERR_BADBUFCTL: + printf("bufctl corrupted\n"); + break; + + case KMERR_BADCACHE: + printf("buffer freed to wrong cache\n"); + printf("SPL: buffer was allocated from %s,\n", + cp->cache_name); + printf("SPL: caller attempting free to %s.\n", + cparg->cache_name); + break; + + case KMERR_BADSIZE: + printf("bad free: free size (%u) != alloc size (%u)\n", + KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), + KMEM_SIZE_DECODE(((uint32_t *)btp)[1])); + break; + + case KMERR_BADBASE: + printf("bad free: free 
address (%p) != alloc address" + " (%p)\n", bufarg, buf); + break; + } + + printf("SPL: buffer=%p bufctl=%p cache: %s\n", + bufarg, (void *)bcp, cparg->cache_name); + + if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) && + error != KMERR_BADBUFCTL) { + int d; + timestruc_t ts = {0, 0}; + kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp; + + hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts); + printf("SPL: previous transaction on buffer %p:\n", buf); + printf("SPL: thread=%p time=T-%ld.%09ld slab=%p cache: %s\n", + (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec, + (void *)sp, cp->cache_name); + for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) { + print_symbol(bcap->bc_stack[d]); + } + } + + if (kmem_panic > 0) { + extern void IODelay(unsigned microseconds); // lh_cpu[max_ncpus]; + int i; + + /* + * Make sure that lhp->lh_cpu[] is nicely aligned + * to prevent false sharing of cache lines. + */ + lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN); + lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0, + NULL, NULL, VM_SLEEP); + bzero(lhp, lhsize); + + mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL); + lhp->lh_nchunks = nchunks; + lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE); + lhp->lh_base = vmem_alloc(kmem_log_arena, + lhp->lh_chunksize * nchunks, VM_SLEEP); + lhp->lh_free = vmem_alloc(kmem_log_arena, + nchunks * sizeof (int), VM_SLEEP); + bzero(lhp->lh_base, lhp->lh_chunksize * nchunks); + + for (i = 0; i < max_ncpus; i++) { + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL); + clhp->clh_chunk = i; + } + + for (i = max_ncpus; i < nchunks; i++) + lhp->lh_free[i] = i; + + lhp->lh_head = max_ncpus; + lhp->lh_tail = 0; + + return (lhp); +} + + +static void +kmem_log_fini(kmem_log_header_t *lhp) +{ + int nchunks = 4 * max_ncpus; + size_t lhsize = (size_t)&((kmem_log_header_t *)0)->lh_cpu[max_ncpus]; + int i; + + + + for (i = 0; i < max_ncpus; i++) { + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + mutex_destroy(&clhp->clh_lock); + } + + vmem_free(kmem_log_arena, lhp->lh_free, nchunks * sizeof (int)); + + vmem_free(kmem_log_arena, lhp->lh_base, lhp->lh_chunksize * nchunks); + + mutex_destroy(&lhp->lh_lock); + + lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN); + vmem_xfree(kmem_log_arena, lhp, lhsize); +} + + +static void * +kmem_log_enter(kmem_log_header_t *lhp, void *data, size_t size) +{ + void *logspace; + + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[cpu_number()]; + + // if (lhp == NULL || kmem_logging == 0 || panicstr) + if (lhp == NULL || kmem_logging == 0) + return (NULL); + + mutex_enter(&clhp->clh_lock); + clhp->clh_hits++; + if (size > clhp->clh_avail) { + mutex_enter(&lhp->lh_lock); + lhp->lh_hits++; + lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk; + lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks; + clhp->clh_chunk = lhp->lh_free[lhp->lh_head]; + lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks; + clhp->clh_current = lhp->lh_base + + clhp->clh_chunk * lhp->lh_chunksize; + clhp->clh_avail = lhp->lh_chunksize; + if (size > lhp->lh_chunksize) + size = lhp->lh_chunksize; + mutex_exit(&lhp->lh_lock); + } + logspace = clhp->clh_current; + clhp->clh_current += size; + clhp->clh_avail -= size; + bcopy(data, logspace, size); + mutex_exit(&clhp->clh_lock); + return (logspace); +} + +#define KMEM_AUDIT(lp, cp, bcp) \ +{ \ +kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp); \ +_bcp->bc_timestamp = gethrtime(); \ +_bcp->bc_thread = spl_current_thread(); \ +_bcp->bc_depth = 
getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH); \ +_bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp)); \ +} + +static void +kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp, + kmem_slab_t *sp, void *addr) +{ + kmem_bufctl_audit_t bca; + + bzero(&bca, sizeof (kmem_bufctl_audit_t)); + bca.bc_addr = addr; + bca.bc_slab = sp; + KMEM_AUDIT(lp, cp, &bca); +} + +/* + * Create a new slab for cache cp. + */ +static kmem_slab_t * +kmem_slab_create(kmem_cache_t *cp, int kmflag) +{ + size_t slabsize = cp->cache_slabsize; + size_t chunksize = cp->cache_chunksize; + int cache_flags = cp->cache_flags; + size_t color, chunks; + char *buf, *slab; + kmem_slab_t *sp; + kmem_bufctl_t *bcp; + vmem_t *vmp = cp->cache_arena; + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + color = cp->cache_color + cp->cache_align; + if (color > cp->cache_maxcolor) + color = cp->cache_mincolor; + cp->cache_color = color; + + slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS); + + if (slab == NULL) + goto vmem_alloc_failure; + + ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0); + + /* + * Reverify what was already checked in kmem_cache_set_move(), since the + * consolidator depends (for correctness) on slabs being initialized + * with the 0xbaddcafe memory pattern (setting a low order bit usable by + * clients to distinguish uninitialized memory from known objects). + */ + ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH)); + if (!(cp->cache_cflags & KMC_NOTOUCH)) + copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize); + + if (cache_flags & KMF_HASH) { + if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL) + goto slab_alloc_failure; + chunks = (slabsize - color) / chunksize; + } else { + sp = KMEM_SLAB(cp, slab); + chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize; + } + + sp->slab_cache = cp; + sp->slab_head = NULL; + sp->slab_refcnt = 0; + sp->slab_base = buf = slab + color; + sp->slab_chunks = chunks; + sp->slab_stuck_offset = (uint32_t)-1; + sp->slab_later_count = 0; + sp->slab_flags = 0; + sp->slab_create_time = gethrtime(); + + ASSERT(chunks > 0); + while (chunks-- != 0) { + if (cache_flags & KMF_HASH) { + bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag); + if (bcp == NULL) + goto bufctl_alloc_failure; + if (cache_flags & KMF_AUDIT) { + kmem_bufctl_audit_t *bcap = + (kmem_bufctl_audit_t *)bcp; + bzero(bcap, sizeof (kmem_bufctl_audit_t)); + bcap->bc_cache = cp; + } + bcp->bc_addr = buf; + bcp->bc_slab = sp; + } else { + bcp = KMEM_BUFCTL(cp, buf); + } + if (cache_flags & KMF_BUFTAG) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + btp->bt_redzone = KMEM_REDZONE_PATTERN; + btp->bt_bufctl = bcp; + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + if (cache_flags & KMF_DEADBEEF) { + copy_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify); + } + } + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + buf += chunksize; + } + + kmem_log_event(kmem_slab_log, cp, sp, slab); + + return (sp); + +bufctl_alloc_failure: + + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + kmem_cache_free(cp->cache_bufctl_cache, bcp); + } + kmem_cache_free(kmem_slab_cache, sp); + +slab_alloc_failure: + + vmem_free(vmp, slab, slabsize); + +vmem_alloc_failure: + + if (0 == (kmflag & KM_NO_VBA)) { + kmem_log_event(kmem_failure_log, cp, NULL, NULL); + atomic_inc_64(&cp->cache_alloc_fail); + } + + return (NULL); +} + +/* + * Destroy a slab. 
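
Before moving on to slab destruction, the carving done in kmem_slab_create() above can be shown in isolation: the usable slab space is the slab size minus the colour offset, and caches that are not hashed also give up room for the slab header embedded at the end of the slab. The following is a minimal userland sketch with invented names, not code from this patch:

#include <stdio.h>
#include <stddef.h>

/* hypothetical stand-in for kmem_slab_t; only its size matters here */
struct toy_slab { void *ts_cache; void *ts_head; size_t ts_refcnt; };

/*
 * Mirrors the two branches in kmem_slab_create(): hashed caches keep the
 * slab header off-slab, small caches embed it at the end of the slab.
 */
static size_t
toy_slab_chunks(size_t slabsize, size_t chunksize, size_t color, int hashed)
{
	if (hashed)
		return ((slabsize - color) / chunksize);
	return ((slabsize - sizeof (struct toy_slab) - color) / chunksize);
}

int
main(void)
{
	/* a 4K slab of 2K chunks; the embedded header costs one chunk here */
	printf("hashed:   %zu chunks\n", toy_slab_chunks(4096, 2048, 0, 1));
	printf("embedded: %zu chunks\n", toy_slab_chunks(4096, 2048, 0, 0));
	return (0);
}
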
+ */ +static void +kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp) +{ + vmem_t *vmp = cp->cache_arena; + void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_refcnt == 0); + + if (cp->cache_flags & KMF_HASH) { + kmem_bufctl_t *bcp; + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + kmem_cache_free(cp->cache_bufctl_cache, bcp); + } + kmem_cache_free(kmem_slab_cache, sp); + } + kpreempt(KPREEMPT_SYNC); + vmem_free(vmp, slab, cp->cache_slabsize); +} + +static void * +kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill) +{ + kmem_bufctl_t *bcp, **hash_bucket; + void *buf; + boolean_t new_slab = (sp->slab_refcnt == 0); + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + /* + * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we + * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the + * slab is newly created. + */ + ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) && + (sp == avl_first(&cp->cache_partial_slabs)))); + ASSERT(sp->slab_cache == cp); + + cp->cache_slab_alloc++; + cp->cache_bufslab--; + sp->slab_refcnt++; + + bcp = sp->slab_head; + sp->slab_head = bcp->bc_next; + + if (cp->cache_flags & KMF_HASH) { + /* + * Add buffer to allocated-address hash table. + */ + buf = bcp->bc_addr; + hash_bucket = KMEM_HASH(cp, buf); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + } else { + buf = KMEM_BUF(cp, bcp); + } + + ASSERT(KMEM_SLAB_MEMBER(sp, buf)); + + if (sp->slab_head == NULL) { + ASSERT(KMEM_SLAB_IS_ALL_USED(sp)); + if (new_slab) { + ASSERT(sp->slab_chunks == 1); + } else { + ASSERT(sp->slab_chunks > 1); /* the slab was partial */ + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; /* clear history */ + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + } + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + return (buf); + } + + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + /* + * Peek to see if the magazine layer is enabled before + * we prefill. We're not holding the cpu cache lock, + * so the peek could be wrong, but there's no harm in it. + */ + if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) && + (KMEM_CPU_CACHE(cp)->cc_magsize != 0)) { + kmem_slab_prefill(cp, sp); + return (buf); + } + + if (new_slab) { + avl_add(&cp->cache_partial_slabs, sp); + return (buf); + } + + /* + * The slab is now more allocated than it was, so the + * order remains unchanged. + */ + ASSERT(!avl_update(&cp->cache_partial_slabs, sp)); + return (buf); +} + +/* + * Allocate a raw (unconstructed) buffer from cp's slab layer. + */ +static void * +kmem_slab_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_slab_t *sp; + void *buf; + boolean_t test_destructor; + + mutex_enter(&cp->cache_lock); + test_destructor = (cp->cache_slab_alloc == 0); + sp = avl_first(&cp->cache_partial_slabs); + if (sp == NULL) { + ASSERT(cp->cache_bufslab == 0); + + /* + * The freelist is empty. Create a new slab. 
+ */ + mutex_exit(&cp->cache_lock); + if ((sp = kmem_slab_create(cp, kmflag)) == NULL) { + return (NULL); + } + mutex_enter(&cp->cache_lock); + cp->cache_slab_create++; + if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax) + cp->cache_bufmax = cp->cache_buftotal; + cp->cache_bufslab += sp->slab_chunks; + } + + buf = kmem_slab_alloc_impl(cp, sp, B_TRUE); + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); + mutex_exit(&cp->cache_lock); + + if (test_destructor && cp->cache_destructor != NULL) { + copy_pattern(KMEM_UNINITIALIZED_PATTERN, buf, + cp->cache_bufsize); + if (cp->cache_flags & KMF_DEADBEEF) { + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + } + } + + return (buf); +} + +static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *); + +/* + * Free a raw (unconstructed) buffer to cp's slab layer. + */ +static void +kmem_slab_free(kmem_cache_t *cp, void *buf) +{ + kmem_slab_t *sp; + kmem_bufctl_t *bcp, **prev_bcpp; + + ASSERT(buf != NULL); + + mutex_enter(&cp->cache_lock); + cp->cache_slab_free++; + + if (cp->cache_flags & KMF_HASH) { + /* + * Look up buffer in allocated-address hash table. + */ + prev_bcpp = KMEM_HASH(cp, buf); + while ((bcp = *prev_bcpp) != NULL) { + if (bcp->bc_addr == buf) { + *prev_bcpp = bcp->bc_next; + sp = bcp->bc_slab; + break; + } + cp->cache_lookup_depth++; + prev_bcpp = &bcp->bc_next; + } + } else { + bcp = KMEM_BUFCTL(cp, buf); + sp = KMEM_SLAB(cp, buf); + } + + if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + kmem_error(KMERR_BADADDR, cp, buf); + return; + } + + if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) { + /* + * If this is the buffer that prevented the consolidator from + * clearing the slab, we can reset the slab flags now that the + * buffer is freed. (It makes sense to do this in + * kmem_cache_free(), where the client gives up ownership of the + * buffer, but on the hot path the test is too expensive.) + */ + kmem_slab_move_yes(cp, sp, buf); + } + + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { + if (cp->cache_flags & KMF_CONTENTS) + ((kmem_bufctl_audit_t *)bcp)->bc_contents = + kmem_log_enter(kmem_content_log, buf, + cp->cache_contents); + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + + cp->cache_bufslab++; + ASSERT(sp->slab_refcnt >= 1); + + if (--sp->slab_refcnt == 0) { + /* + * There are no outstanding allocations from this slab, + * so we can reclaim the memory. + */ + if (sp->slab_chunks == 1) { + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + } else { + avl_remove(&cp->cache_partial_slabs, sp); + } + + cp->cache_buftotal -= sp->slab_chunks; + cp->cache_bufslab -= sp->slab_chunks; + /* + * Defer releasing the slab to the virtual memory subsystem + * while there is a pending move callback, since we guarantee + * that buffers passed to the move callback have only been + * touched by kmem or by the client itself. Since the memory + * patterns baddcafe (uninitialized) and deadbeef (freed) both + * set at least one of the two lowest order bits, the client can + * test those bits in the move callback to determine whether or + * not it knows about the buffer (assuming that the client also + * sets one of those low order bits whenever it frees a buffer). 
+ */ + if (cp->cache_defrag == NULL || + (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) && + !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) { + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + } else { + list_t *deadlist = + &cp->cache_defrag->kmd_deadlist; + /* + * Slabs are inserted at both ends of the + * deadlist to distinguish between slabs + * freed while move callbacks are pending + * (list head) and a slab freed while the + * lock is dropped in kmem_move_buffers() + * (list tail) so that in both cases + * slab_destroy() is called from the + * right context. + */ + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + } else { + list_insert_head(deadlist, sp); + } + cp->cache_defrag->kmd_deadcount++; + mutex_exit(&cp->cache_lock); + } + return; + } + + if (bcp->bc_next == NULL) { + /* Transition the slab from completely allocated to partial. */ + ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1)); + ASSERT(sp->slab_chunks > 1); + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + avl_add(&cp->cache_partial_slabs, sp); + } else { + (void) avl_update_gt(&cp->cache_partial_slabs, sp); + } + + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); + mutex_exit(&cp->cache_lock); +} + +/* + * Return -1 if kmem_error, 1 if constructor fails, 0 if successful. + */ +static int +kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct, + caddr_t caller) +{ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl; + uint32_t mtbf; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) { + kmem_error(KMERR_BADBUFTAG, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC; + + if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) { + kmem_error(KMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + if (!construct && (cp->cache_flags & KMF_LITE)) { + if (*(uint64_t *)buf != KMEM_FREE_PATTERN) { + kmem_error(KMERR_MODIFIED, cp, buf); + return (-1); + } + if (cp->cache_constructor != NULL) + *(uint64_t *)buf = btp->bt_redzone; + else + *(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN; + } else { + construct = 1; + if (verify_and_copy_pattern(KMEM_FREE_PATTERN, + KMEM_UNINITIALIZED_PATTERN, buf, + cp->cache_verify)) { + kmem_error(KMERR_MODIFIED, cp, buf); + return (-1); + } + } + } + btp->bt_redzone = KMEM_REDZONE_PATTERN; + + if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 && + gethrtime() % mtbf == 0 && + (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) { + kmem_log_event(kmem_failure_log, cp, NULL, NULL); + if (!construct && cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + } else { + mtbf = 0; + } + + if (mtbf || (construct && cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) { + atomic_inc_64(&cp->cache_alloc_fail); + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + if (cp->cache_flags & KMF_DEADBEEF) + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + kmem_slab_free(cp, buf); + return (1); + } + + if (cp->cache_flags & KMF_AUDIT) { + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + if ((cp->cache_flags & KMF_LITE) && + !(cp->cache_cflags & KMC_KMEM_ALLOC)) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller); + 
} + + return (0); +} + +static int +kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller) +{ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl; + kmem_slab_t *sp; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) { + if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) { + kmem_error(KMERR_DUPFREE, cp, buf); + return (-1); + } + sp = kmem_findslab(cp, buf); + if (sp == NULL || sp->slab_cache != cp) + kmem_error(KMERR_BADADDR, cp, buf); + else + kmem_error(KMERR_REDZONE, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + + if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) { + kmem_error(KMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (btp->bt_redzone != KMEM_REDZONE_PATTERN) { + kmem_error(KMERR_REDZONE, cp, buf); + return (-1); + } + + if (cp->cache_flags & KMF_AUDIT) { + if (cp->cache_flags & KMF_CONTENTS) + bcp->bc_contents = kmem_log_enter(kmem_content_log, + buf, cp->cache_contents); + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + if ((cp->cache_flags & KMF_LITE) && + !(cp->cache_cflags & KMC_KMEM_ALLOC)) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + if (cp->cache_flags & KMF_LITE) + btp->bt_redzone = *(uint64_t *)buf; + else if (cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + } + + return (0); +} + +/* + * Free each object in magazine mp to cp's slab layer, and free mp itself. + */ +static void +kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds) +{ + int round; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + + for (round = 0; round < nrounds; round++) { + void *buf = mp->mag_round[round]; + + if (cp->cache_flags & KMF_DEADBEEF) { + if (verify_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify) != NULL) { + kmem_error(KMERR_MODIFIED, cp, buf); + continue; + } + if ((cp->cache_flags & KMF_LITE) && + cp->cache_destructor != NULL) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } + } else if (cp->cache_destructor != NULL) { + cp->cache_destructor(buf, cp->cache_private); + } + + kmem_slab_free(cp, buf); + kpreempt(KPREEMPT_SYNC); + } + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + kmem_cache_free(cp->cache_magtype->mt_cache, mp); +} + +/* + * Allocate a magazine from the depot. + */ +static kmem_magazine_t * +kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp) +{ + kmem_magazine_t *mp; + + /* + * If we can't get the depot lock without contention, + * update our contention count. We use the depot + * contention rate to determine whether we need to + * increase the magazine size for better scalability. + */ + if (!mutex_tryenter(&cp->cache_depot_lock)) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_depot_contention++; + } + + if ((mp = mlp->ml_list) != NULL) { + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + mlp->ml_list = mp->mag_next; + if (--mlp->ml_total < mlp->ml_min) + mlp->ml_min = mlp->ml_total; + mlp->ml_alloc++; + } + + mutex_exit(&cp->cache_depot_lock); + + return (mp); +} + +/* + * Free a magazine to the depot. 
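
The depot lock in kmem_depot_alloc() above is taken with a trylock first so that every acquisition that would have blocked can be counted; that contention rate is what later drives magazine resizing. A minimal pthread rendering of the same pattern, with invented names and no claim about the kernel mutex API:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_depot {
	pthread_mutex_t td_lock;
	uint64_t td_contention;	/* bumped only when the trylock fails */
} toy_depot_t;

static void
toy_depot_enter(toy_depot_t *td)
{
	if (pthread_mutex_trylock(&td->td_lock) != 0) {
		/* contended: record it, then wait like everyone else */
		pthread_mutex_lock(&td->td_lock);
		td->td_contention++;
	}
}

int
main(void)
{
	toy_depot_t td = { PTHREAD_MUTEX_INITIALIZER, 0 };

	toy_depot_enter(&td);		/* uncontended in this toy run */
	pthread_mutex_unlock(&td.td_lock);
	printf("contention events: %llu\n",
	    (unsigned long long)td.td_contention);
	return (0);
}
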
+ */ +static void +kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp) +{ + mutex_enter(&cp->cache_depot_lock); + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + mp->mag_next = mlp->ml_list; + mlp->ml_list = mp; + mlp->ml_total++; + mutex_exit(&cp->cache_depot_lock); +} + +/* + * Update the working set statistics for cp's depot. + */ +static void +kmem_depot_ws_update(kmem_cache_t *cp) +{ + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit = cp->cache_full.ml_min; + cp->cache_full.ml_min = cp->cache_full.ml_total; + cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min; + cp->cache_empty.ml_min = cp->cache_empty.ml_total; + mutex_exit(&cp->cache_depot_lock); +} + +/* + * Set the working set statistics for cp's depot to zero. (Everything is + * eligible for reaping.) + */ +void +kmem_depot_ws_zero(kmem_cache_t *cp) +{ + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit = cp->cache_full.ml_total; + cp->cache_full.ml_min = cp->cache_full.ml_total; + cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_total; + cp->cache_empty.ml_min = cp->cache_empty.ml_total; + mutex_exit(&cp->cache_depot_lock); +} + +/* + * The number of bytes to reap before we call kpreempt(); set to 64MB here. + * A small value (1MB) causes us to preempt reaping up to hundreds of times + * per second, while a large value (1GB) causes this to have virtually no + * effect. + */ +size_t kmem_reap_preempt_bytes = 64 * 1024 * 1024; + + +/* + * Reap all magazines that have fallen out of the depot's working set. + */ +static void +kmem_depot_ws_reap(kmem_cache_t *cp) +{ + size_t bytes = 0; + long reap; + kmem_magazine_t *mp; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + while (reap-- && + (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) { + kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize); + bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize; + if (bytes > kmem_reap_preempt_bytes) { + kpreempt(KPREEMPT_SYNC); + bytes = 0; + } + } + + reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min); + while (reap-- && + (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL) { + kmem_magazine_destroy(cp, mp, 0); + bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize; + if (bytes > kmem_reap_preempt_bytes) { + kpreempt(KPREEMPT_SYNC); + bytes = 0; + } + } +} + +static void +kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds) +{ + ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) || + (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize)); + ASSERT(ccp->cc_magsize > 0); + + ccp->cc_ploaded = ccp->cc_loaded; + ccp->cc_prounds = ccp->cc_rounds; + ccp->cc_loaded = mp; + ccp->cc_rounds = rounds; +} + +/* + * Intercept kmem alloc/free calls during crash dump in order to avoid + * changing kmem state while memory is being saved to the dump device. + * Otherwise, ::kmem_verify will report "corrupt buffers". Note that + * there are no locks because only one CPU calls kmem during a crash + * dump. To enable this feature, first create the associated vmem + * arena with VMC_DUMPSAFE. 
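
The reap bound used by kmem_depot_ws_reap() above falls out of the two fields updated each interval: ml_min is the depot's low-water mark for the current interval and ml_reaplimit is the previous interval's low-water mark, so MIN(reaplimit, min) counts magazines that sat unused across both. A small walk-through with invented types:

#include <stdio.h>

#define TOY_MIN(a, b)	((a) < (b) ? (a) : (b))

typedef struct toy_maglist {
	long ml_total;		/* magazines currently on the list */
	long ml_min;		/* low-water mark this interval */
	long ml_reaplimit;	/* low-water mark of the previous interval */
} toy_maglist_t;

/* same bookkeeping as kmem_depot_ws_update() */
static void
toy_ws_update(toy_maglist_t *ml)
{
	ml->ml_reaplimit = ml->ml_min;
	ml->ml_min = ml->ml_total;
}

int
main(void)
{
	toy_maglist_t full = { 10, 10, 10 };

	toy_ws_update(&full);	/* interval 1: list never dipped below 10 */
	full.ml_total = 7;	/* three magazines were taken during ... */
	full.ml_min = 7;	/* ... interval 2, as kmem_depot_alloc() would */
	toy_ws_update(&full);

	/* magazines unused across both intervals are eligible for reaping */
	printf("reap up to %ld magazines\n",
	    TOY_MIN(full.ml_reaplimit, full.ml_min));
	return (0);
}
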
+ */ +static void *kmem_dump_start; /* start of pre-reserved heap */ +static void *kmem_dump_end; /* end of heap area */ +static void *kmem_dump_curr; /* current free heap pointer */ +static size_t kmem_dump_size; /* size of heap area */ + +/* append to each buf created in the pre-reserved heap */ +typedef struct kmem_dumpctl { + void *kdc_next; /* cache dump free list linkage */ +} kmem_dumpctl_t; + +#define KMEM_DUMPCTL(cp, buf) \ +((kmem_dumpctl_t *)P2ROUNDUP((uintptr_t)(buf) + (cp)->cache_bufsize, \ +sizeof (void *))) + +/* Keep some simple stats. */ +#define KMEM_DUMP_LOGS (100) + +typedef struct kmem_dump_log { + kmem_cache_t *kdl_cache; + uint_t kdl_allocs; /* # of dump allocations */ + uint_t kdl_frees; /* # of dump frees */ + uint_t kdl_alloc_fails; /* # of allocation failures */ + uint_t kdl_free_nondump; /* # of non-dump frees */ + uint_t kdl_unsafe; /* cache was used, but unsafe */ +} kmem_dump_log_t; + +static kmem_dump_log_t *kmem_dump_log; +static int kmem_dump_log_idx; + +#define KDI_LOG(cp, stat) { \ +kmem_dump_log_t *kdl; \ +if ((kdl = (kmem_dump_log_t *)((cp)->cache_dumplog)) != NULL) { \ +kdl->stat++; \ +} else if (kmem_dump_log_idx < KMEM_DUMP_LOGS) { \ +kdl = &kmem_dump_log[kmem_dump_log_idx++]; \ +kdl->stat++; \ +kdl->kdl_cache = (cp); \ +(cp)->cache_dumplog = kdl; \ +} \ +} + +/* set non zero for full report */ +uint_t kmem_dump_verbose = 0; + +/* stats for overize heap */ +uint_t kmem_dump_oversize_allocs = 0; +uint_t kmem_dump_oversize_max = 0; + +static void +kmem_dumppr(char **pp, char *e, const char *format, ...) +{ + char *p = *pp; + + if (p < e) { + int n; + va_list ap; + + va_start(ap, format); + n = vsnprintf(p, e - p, format, ap); + va_end(ap); + *pp = p + n; + } +} + +/* + * Called when dumpadm(1M) configures dump parameters. + */ +void +kmem_dump_init(size_t size) +{ + if (kmem_dump_start != NULL) + zfs_kmem_free(kmem_dump_start, kmem_dump_size); + + if (kmem_dump_log == NULL) + kmem_dump_log = + (kmem_dump_log_t *)zfs_kmem_zalloc( + KMEM_DUMP_LOGS * sizeof (kmem_dump_log_t), KM_SLEEP); + + kmem_dump_start = zfs_kmem_alloc(size, KM_SLEEP); + + if (kmem_dump_start != NULL) { + kmem_dump_size = size; + kmem_dump_curr = kmem_dump_start; + kmem_dump_end = (void *)((char *)kmem_dump_start + size); + copy_pattern(KMEM_UNINITIALIZED_PATTERN, kmem_dump_start, size); + } else { + kmem_dump_size = 0; + kmem_dump_curr = NULL; + kmem_dump_end = NULL; + } +} + +/* + * Set flag for each kmem_cache_t if is safe to use alternate dump + * memory. Called just before panic crash dump starts. Set the flag + * for the calling CPU. 
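
The reserved dump heap above is managed as a simple bump allocator: each buffer is followed by a small control record (kmem_dumpctl_t) through which freed buffers are chained for reuse. The sketch below models that layout in userland; the names and sizes are invented, and the real code additionally honours the cache alignment and avoids page crossings.

#include <stdio.h>
#include <stdint.h>

#define HEAP_SIZE	4096
#define BUFSIZE		100	/* pretend cache_bufsize */

/* trailer appended after each buffer, like kmem_dumpctl_t */
typedef struct toy_dumpctl {
	void *kdc_next;
} toy_dumpctl_t;

static char heap[HEAP_SIZE];
static char *curr = heap;
static void *freelist;		/* constructed buffers ready for reuse */

#define ROUNDUP(x, a)	(((x) + (a) - 1) & ~((uintptr_t)(a) - 1))
#define DUMPCTL(buf) \
	((toy_dumpctl_t *)ROUNDUP((uintptr_t)(buf) + BUFSIZE, sizeof (void *)))

static void *
toy_dump_alloc(void)
{
	char *buf, *end;

	if (freelist != NULL) {		/* reuse a previously freed buffer */
		buf = freelist;
		freelist = DUMPCTL(buf)->kdc_next;
		return (buf);
	}
	buf = curr;
	end = (char *)DUMPCTL(buf) + sizeof (toy_dumpctl_t);
	if (end > heap + HEAP_SIZE)
		return (NULL);		/* reserved area exhausted */
	curr = end;
	return (buf);
}

static void
toy_dump_free(void *buf)
{
	DUMPCTL(buf)->kdc_next = freelist;
	freelist = buf;
}

int
main(void)
{
	void *a = toy_dump_alloc();

	toy_dump_free(a);
	printf("reused: %s\n", toy_dump_alloc() == a ? "yes" : "no");
	return (0);
}
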
+ */ +void +kmem_dump_begin(void) +{ + if (kmem_dump_start != NULL) { + kmem_cache_t *cp; + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + if (cp->cache_arena->vm_cflags & VMC_DUMPSAFE) { + cp->cache_flags |= KMF_DUMPDIVERT; + ccp->cc_flags |= KMF_DUMPDIVERT; + ccp->cc_dump_rounds = ccp->cc_rounds; + ccp->cc_dump_prounds = ccp->cc_prounds; + ccp->cc_rounds = ccp->cc_prounds = -1; + } else { + cp->cache_flags |= KMF_DUMPUNSAFE; + ccp->cc_flags |= KMF_DUMPUNSAFE; + } + } + } +} + +/* + * finished dump intercept + * print any warnings on the console + * return verbose information to dumpsys() in the given buffer + */ +size_t +kmem_dump_finish(char *buf, size_t size) +{ + int kdi_idx; + int kdi_end = kmem_dump_log_idx; + int percent = 0; + int header = 0; + int warn = 0; + size_t used; + kmem_cache_t *cp; + kmem_dump_log_t *kdl; + char *e = buf + size; + char *p = buf; + + if (kmem_dump_size == 0 || kmem_dump_verbose == 0) + return (0); + + used = (char *)kmem_dump_curr - (char *)kmem_dump_start; + percent = (used * 100) / kmem_dump_size; + + kmem_dumppr(&p, e, "%% heap used,%d\n", percent); + kmem_dumppr(&p, e, "used bytes,%ld\n", used); + kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size); + kmem_dumppr(&p, e, "Oversize allocs,%d\n", + kmem_dump_oversize_allocs); + kmem_dumppr(&p, e, "Oversize max size,%ld\n", + kmem_dump_oversize_max); + + for (kdi_idx = 0; kdi_idx < kdi_end; kdi_idx++) { + kdl = &kmem_dump_log[kdi_idx]; + cp = kdl->kdl_cache; + if (cp == NULL) + break; + if (kdl->kdl_alloc_fails) + ++warn; + if (header == 0) { + kmem_dumppr(&p, e, + "Cache Name,Allocs,Frees,Alloc Fails," + "Nondump Frees,Unsafe Allocs/Frees\n"); + header = 1; + } + kmem_dumppr(&p, e, "%s,%d,%d,%d,%d,%d\n", + cp->cache_name, kdl->kdl_allocs, kdl->kdl_frees, + kdl->kdl_alloc_fails, kdl->kdl_free_nondump, + kdl->kdl_unsafe); + } + + /* return buffer size used */ + if (p < e) + bzero(p, e - p); + return (p - buf); +} + +/* + * Allocate a constructed object from alternate dump memory. + */ +void * +kmem_cache_alloc_dump(kmem_cache_t *cp, int kmflag) +{ + void *buf; + void *curr; + char *bufend; + + /* return a constructed object */ + if ((buf = cp->cache_dumpfreelist) != NULL) { + cp->cache_dumpfreelist = KMEM_DUMPCTL(cp, buf)->kdc_next; + KDI_LOG(cp, kdl_allocs); + return (buf); + } + + /* create a new constructed object */ + curr = kmem_dump_curr; + buf = (void *)P2ROUNDUP((uintptr_t)curr, cp->cache_align); + bufend = (char *)KMEM_DUMPCTL(cp, buf) + sizeof (kmem_dumpctl_t); + + /* hat layer objects cannot cross a page boundary */ + if (cp->cache_align < PAGESIZE) { + char *page = (char *)P2ROUNDUP((uintptr_t)buf, PAGESIZE); + if (bufend > page) { + bufend += page - (char *)buf; + buf = (void *)page; + } + } + + /* fall back to normal alloc if reserved area is used up */ + if (bufend > (char *)kmem_dump_end) { + kmem_dump_curr = kmem_dump_end; + KDI_LOG(cp, kdl_alloc_fails); + return (NULL); + } + + /* + * Must advance curr pointer before calling a constructor that + * may also allocate memory. 
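
The rule stated in the comment above, advance the bump pointer before running the constructor and roll it back only if it has not moved, is what makes the dump allocator safe against constructors that themselves allocate. A compact model of just that decision (names invented):

#include <stdio.h>

static char heap[1024];
static char *curr = heap;

/* returns 0 on success, -1 if the constructor failed */
static int
alloc_with_ctor(int size, int (*ctor)(void))
{
	char *old = curr;
	char *end = curr + size;

	curr = end;		/* advance first: ctor may allocate too */
	if (ctor() != 0) {
		/* roll back only if no nested allocation moved curr */
		if (curr == end)
			curr = old;
		return (-1);
	}
	return (0);
}

static int failing_ctor(void) { return (-1); }

int
main(void)
{
	(void) alloc_with_ctor(64, failing_ctor);
	printf("heap offset after rollback: %ld\n", (long)(curr - heap));
	return (0);
}
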
+ */ + kmem_dump_curr = bufend; + + /* run constructor */ + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) + != 0) { +#ifdef DEBUG + printf("name='%s' cache=0x%p: kmem cache constructor failed\n", + cp->cache_name, (void *)cp); +#endif + /* reset curr pointer iff no allocs were done */ + if (kmem_dump_curr == bufend) + kmem_dump_curr = curr; + + /* fall back to normal alloc if the constructor fails */ + KDI_LOG(cp, kdl_alloc_fails); + return (NULL); + } + + KDI_LOG(cp, kdl_allocs); + return (buf); +} + +/* + * Free a constructed object in alternate dump memory. + */ +int +kmem_cache_free_dump(kmem_cache_t *cp, void *buf) +{ + /* save constructed buffers for next time */ + if ((char *)buf >= (char *)kmem_dump_start && + (char *)buf < (char *)kmem_dump_end) { + KMEM_DUMPCTL(cp, buf)->kdc_next = cp->cache_dumpfreelist; + cp->cache_dumpfreelist = buf; + KDI_LOG(cp, kdl_frees); + return (0); + } + + /* count all non-dump buf frees */ + KDI_LOG(cp, kdl_free_nondump); + + /* just drop buffers that were allocated before dump started */ + if (kmem_dump_curr < kmem_dump_end) + return (0); + + /* fall back to normal free if reserved area is used up */ + return (1); +} + +/* + * Allocate a constructed object from cache cp. + */ +void * +kmem_cache_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + kmem_magazine_t *fmp; + void *buf; + mutex_enter(&ccp->cc_lock); + for (;;) { + /* + * If there's an object available in the current CPU's + * loaded magazine, just take it and return. + */ + if (ccp->cc_rounds > 0) { + buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds]; + ccp->cc_alloc++; + mutex_exit(&ccp->cc_lock); + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & + KMF_DUMPDIVERT)); + KDI_LOG(cp, kdl_unsafe); + } + if ((ccp->cc_flags & KMF_BUFTAG) && + kmem_cache_alloc_debug(cp, buf, kmflag, 0, + caller()) != 0) { + if (kmflag & KM_NOSLEEP) + return (NULL); + mutex_enter(&ccp->cc_lock); + continue; + } + } + return (buf); + } + + /* + * The loaded magazine is empty. If the previously loaded + * magazine was full, exchange them and try again. + */ + if (ccp->cc_prounds > 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + /* + * Return an alternate buffer at dump time to preserve + * the heap. + */ + if (ccp->cc_flags & (KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else { + if ((buf = kmem_cache_alloc_dump(cp, kmflag)) != + NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + break; /* fall back to slab layer */ + } + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) + break; + + /* + * Try to get a full magazine from the depot. + */ + fmp = kmem_depot_alloc(cp, &cp->cache_full); + if (fmp != NULL) { + if (ccp->cc_ploaded != NULL) + kmem_depot_free(cp, &cp->cache_empty, + ccp->cc_ploaded); + kmem_cpu_reload(ccp, fmp, ccp->cc_magsize); + continue; + } + + /* + * There are no full magazines in the depot, + * so fall through to the slab layer. + */ + break; + } + mutex_exit(&ccp->cc_lock); + + /* + * We couldn't allocate a constructed object from the magazine layer, + * so get a raw buffer from the slab layer and apply its constructor. 
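
The loop above is the classic magazine fast path: pop from the loaded magazine, otherwise swap in the previous magazine if it still holds rounds, otherwise reload from the depot, and only then fall back to the slab layer. Stripped of locking and the debug hooks, the control flow looks like the sketch below (invented names; the real reload also returns the previous magazine to the depot):

#include <stdio.h>

#define MAGSIZE	3

typedef struct toy_mag {
	int rounds;		/* number of objects in the magazine */
	int round[MAGSIZE];
} toy_mag_t;

typedef struct toy_cpu_cache {
	toy_mag_t *loaded;
	toy_mag_t *prev;
} toy_cpu_cache_t;

/* returns an object id, or -1 to signal "fall through to the slab layer" */
static int
toy_cache_alloc(toy_cpu_cache_t *ccp, toy_mag_t *(*depot_full)(void))
{
	for (;;) {
		if (ccp->loaded->rounds > 0)
			return (ccp->loaded->round[--ccp->loaded->rounds]);

		if (ccp->prev->rounds > 0) {	/* exchange and retry */
			toy_mag_t *t = ccp->loaded;
			ccp->loaded = ccp->prev;
			ccp->prev = t;
			continue;
		}

		toy_mag_t *full = depot_full();	/* try the depot */
		if (full != NULL) {
			ccp->prev = ccp->loaded;
			ccp->loaded = full;
			continue;
		}
		return (-1);			/* slab layer's problem now */
	}
}

static toy_mag_t *no_full_magazines(void) { return (NULL); }

int
main(void)
{
	toy_mag_t a = { 1, { 42 } }, b = { 0, { 0 } };
	toy_cpu_cache_t cc = { &a, &b };

	printf("%d\n", toy_cache_alloc(&cc, no_full_magazines));	/* 42 */
	printf("%d\n", toy_cache_alloc(&cc, no_full_magazines));	/* -1 */
	return (0);
}
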
+ */ + buf = kmem_slab_alloc(cp, kmflag); + + if (buf == NULL) + return (NULL); + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. + */ + int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller()); + if (rc != 0) { + if (kmflag & KM_NOSLEEP) + return (NULL); + /* + * kmem_cache_alloc_debug() detected corruption + * but didn't panic (kmem_panic <= 0). We should not be + * here because the constructor failed (indicated by a + * return code of 1). Try again. + */ + ASSERT(rc == -1); + return (kmem_cache_alloc(cp, kmflag)); + } + return (buf); + } + + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) { + atomic_inc_64(&cp->cache_alloc_fail); + kmem_slab_free(cp, buf); + return (NULL); + } + + return (buf); +} + +/* + * The freed argument tells whether or not kmem_cache_free_debug() has already + * been called so that we can avoid the duplicate free error. For example, a + * buffer on a magazine has already been freed by the client but is still + * constructed. + */ +static void +kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed) +{ + if (!freed && (cp->cache_flags & KMF_BUFTAG)) + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + + /* + * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not, + * kmem_cache_free_debug() will have already applied the destructor. + */ + if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF && + cp->cache_destructor != NULL) { + if (cp->cache_flags & KMF_DEADBEEF) { /* KMF_LITE implied */ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } else { + cp->cache_destructor(buf, cp->cache_private); + } + } + + kmem_slab_free(cp, buf); +} + +/* + * Used when there's no room to free a buffer to the per-CPU cache. + * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the + * caller should try freeing to the per-CPU cache again. + * Note that we don't directly install the magazine in the cpu cache, + * since its state may have changed wildly while the lock was dropped. + */ +static int +kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp) +{ + kmem_magazine_t *emp; + kmem_magtype_t *mtp; + + ASSERT(MUTEX_HELD(&ccp->cc_lock)); + ASSERT(((uint_t)ccp->cc_rounds == ccp->cc_magsize || + ((uint_t)ccp->cc_rounds == -1)) && + ((uint_t)ccp->cc_prounds == ccp->cc_magsize || + ((uint_t)ccp->cc_prounds == -1))); + + emp = kmem_depot_alloc(cp, &cp->cache_empty); + if (emp != NULL) { + if (ccp->cc_ploaded != NULL) + kmem_depot_free(cp, &cp->cache_full, + ccp->cc_ploaded); + kmem_cpu_reload(ccp, emp, 0); + return (1); + } + /* + * There are no empty magazines in the depot, + * so try to allocate a new one. We must drop all locks + * across kmem_cache_alloc() because lower layers may + * attempt to allocate from this cache. + */ + mtp = cp->cache_magtype; + mutex_exit(&ccp->cc_lock); + emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP); + mutex_enter(&ccp->cc_lock); + + if (emp != NULL) { + /* + * We successfully allocated an empty magazine. + * However, we had to drop ccp->cc_lock to do it, + * so the cache's magazine size may have changed. + * If so, free the magazine and try again. 
+ */ + if (ccp->cc_magsize != mtp->mt_magsize) { + mutex_exit(&ccp->cc_lock); + kmem_cache_free(mtp->mt_cache, emp); + mutex_enter(&ccp->cc_lock); + return (1); + } + + /* + * We got a magazine of the right size. Add it to + * the depot and try the whole dance again. + */ + kmem_depot_free(cp, &cp->cache_empty, emp); + return (1); + } + + /* + * We couldn't allocate an empty magazine, + * so fall through to the slab layer. + */ + return (0); +} + +/* + * If the cache's parent arena is a leaf arena (i.e., it imports all its memory) + * then we can consider it fragmented if either there is 1 GiB free in the arena + * or one eighth of the arena is free. + * + * This is useful in kmem_cache_free{_debug} to determine whether to free to the + * slab layer if the loaded magazine is full. + */ +static inline boolean_t +kmem_cache_parent_arena_fragmented(kmem_cache_t *cp) +{ + const vmem_kstat_t *kp = &cp->cache_arena->vm_kstat; + const int64_t vk_import = kp->vk_mem_import.value.ui64; + const int64_t vk_inuse = kp->vk_mem_inuse.value.ui64; + const int64_t vk_total = kp->vk_mem_total.value.ui64; + + if (vk_import == vk_total && vk_inuse < vk_total) { + const int64_t vk_free = vk_total - vk_inuse; + const int64_t highthresh = 1024LL*1024LL*1024LL; + // we are fragmented if we have 1GiB free + if (vk_free >= highthresh) + return (B_TRUE); + // we are fragmented if at least 1/8 of the + // total arena space is free + if (vk_free > 0 && vk_total > 0) { + const int64_t eighth_total = vk_total / 8; + if (vk_free >= eighth_total) + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * Free a constructed object to cache cp. + */ +void +kmem_cache_free(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != spl_current_thread() || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + return; + } + if (ccp->cc_flags & KMF_BUFTAG) { + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + } + } + + mutex_enter(&ccp->cc_lock); + /* + * Any changes to this logic should be reflected in kmem_slab_prefill() + */ + for (;;) { + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and return. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf; + ccp->cc_free++; + mutex_exit(&ccp->cc_lock); + return; + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) { + break; + } + + /* + * The magazine layer is on, but the loaded magazine is now + * full (of allocatable constructed elements). + * + * If the cache's arena is badly fragmented, break out now; + * this frees to the slab layer. + * + * Note: this is not reflected in kmem_slab_prefill() which + * deals with a freshly allocated slab. + */ + + if (kmem_free_to_slab_when_fragmented == 1 && + kmem_cache_parent_arena_fragmented(cp)) + break; + + /* + * The loaded magazine is full. If the previously loaded + * magazine was empty, exchange them and try again. 
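
The fragmentation test used just above to decide whether a free should bypass the magazines reduces, for an arena that has imported all of its memory, to two thresholds: at least 1 GiB free, or at least one eighth of the arena free. Restated as a standalone predicate (editor's sketch; the kernel version reads these figures from the vmem kstats):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool
arena_fragmented(int64_t imported, int64_t inuse, int64_t total)
{
	if (imported != total || inuse >= total)
		return (false);		/* not a leaf arena, or nothing free */

	int64_t nfree = total - inuse;

	if (nfree >= 1024LL * 1024LL * 1024LL)	/* 1 GiB or more free */
		return (true);
	return (nfree >= total / 8);	/* an eighth or more of the arena free */
}

int
main(void)
{
	int64_t gib = 1024LL * 1024LL * 1024LL;

	printf("%d\n", arena_fragmented(8 * gib, 7 * gib, 8 * gib));	/* 1 */
	printf("%d\n", arena_fragmented(8 * gib, 8 * gib - 4096, 8 * gib)); /* 0 */
	return (0);
}
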
+ */ + if (ccp->cc_prounds == 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + if (!kmem_cpucache_magazine_alloc(ccp, cp)) { + /* + * We couldn't free our constructed object to the + * magazine layer, so apply its destructor and free it + * to the slab layer. + */ + break; + } + } + mutex_exit(&ccp->cc_lock); + kpreempt(KPREEMPT_SYNC); + kmem_slab_free_constructed(cp, buf, B_TRUE); +} + +/* + * Free a constructed object to cache cp. + * Do not free to the magazine layer. + * This is essentially just kmem_cache_free() without + * the for(;;) loop or the ccp critical section. + */ +void +kmem_cache_free_to_slab(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != spl_current_thread() || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + return; + } + if (ccp->cc_flags & KMF_BUFTAG) { + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + } + } + + /* omitted the for(;;) loop from kmem_cache_free */ + /* also do not take ccp mutex */ + + kmem_slab_free_constructed(cp, buf, B_TRUE); +} + +static void +kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + kmem_bufctl_t *next, *head; + size_t nbufs; + + /* + * Completely allocate the newly created slab and put the pre-allocated + * buffers in magazines. Any of the buffers that cannot be put in + * magazines must be returned to the slab. + */ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(cp->cache_constructor == NULL); + ASSERT(sp->slab_cache == cp); + ASSERT(sp->slab_refcnt == 1); + ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt); + ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL); + + head = sp->slab_head; + nbufs = (sp->slab_chunks - sp->slab_refcnt); + sp->slab_head = NULL; + sp->slab_refcnt += nbufs; + cp->cache_bufslab -= nbufs; + cp->cache_slab_alloc += nbufs; + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + mutex_exit(&cp->cache_lock); + mutex_enter(&ccp->cc_lock); + + while (head != NULL) { + void *buf = KMEM_BUF(cp, head); + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and + * continue. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = + buf; + ccp->cc_free++; + nbufs--; + head = head->bc_next; + continue; + } + + /* + * The loaded magazine is full. If the previously + * loaded magazine was empty, exchange them and try + * again. + */ + if (ccp->cc_prounds == 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, + ccp->cc_prounds); + continue; + } + + /* + * If the magazine layer is disabled, break out now. 
+ */ + + if (ccp->cc_magsize == 0) { + break; + } + + if (!kmem_cpucache_magazine_alloc(ccp, cp)) + break; + } + mutex_exit(&ccp->cc_lock); + if (nbufs != 0) { + ASSERT(head != NULL); + + /* + * If there was a failure, return remaining objects to + * the slab + */ + while (head != NULL) { + ASSERT(nbufs != 0); + next = head->bc_next; + head->bc_next = NULL; + kmem_slab_free(cp, KMEM_BUF(cp, head)); + head = next; + nbufs--; + } + } + ASSERT(head == NULL); + ASSERT(nbufs == 0); + mutex_enter(&cp->cache_lock); +} + +void * +zfs_kmem_zalloc(size_t size, int kmflag) +{ + size_t index; + void *buf; + + if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) { + kmem_cache_t *cp = kmem_alloc_table[index]; + buf = kmem_cache_alloc(cp, kmflag); + if (buf != NULL) { + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size); + + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, + kmem_lite_count, caller()); + } + } + bzero(buf, size); + } + } else { + buf = zfs_kmem_alloc(size, kmflag); + if (buf != NULL) + bzero(buf, size); + } + return (buf); +} + +void * +zfs_kmem_alloc(size_t size, int kmflag) +{ + size_t index; + kmem_cache_t *cp; + void *buf; + + if (size == 0) + return (KMEM_ZERO_SIZE_PTR); + + if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) { + cp = kmem_alloc_table[index]; + /* fall through to kmem_cache_alloc() */ + + } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) < + kmem_big_alloc_table_max) { + cp = kmem_big_alloc_table[index]; + /* fall through to kmem_cache_alloc() */ + + } else { + + buf = vmem_alloc(kmem_oversize_arena, size, + kmflag & KM_VMFLAGS); + if (buf == NULL) + kmem_log_event(kmem_failure_log, NULL, NULL, + (void *)size); + else if (KMEM_DUMP(kmem_slab_cache)) { + /* stats for dump intercept */ + kmem_dump_oversize_allocs++; + if (size > kmem_dump_oversize_max) + kmem_dump_oversize_max = size; + } + return (buf); + } + + buf = kmem_cache_alloc(cp, kmflag); + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size); + + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller()); + } + } + return (buf); +} + +void +zfs_kmem_free(void *buf, size_t size) +{ + size_t index; + kmem_cache_t *cp; + + if (size == 0 || buf == KMEM_ZERO_SIZE_PTR || buf == NULL) + return; + + if ((index = (size - 1) >> KMEM_ALIGN_SHIFT) < KMEM_ALLOC_TABLE_MAX) { + cp = kmem_alloc_table[index]; + /* fall through to kmem_cache_free() */ + + } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) < + kmem_big_alloc_table_max) { + cp = kmem_big_alloc_table[index]; + /* fall through to kmem_cache_free() */ + + } else { + vmem_free(kmem_oversize_arena, buf, size); + return; + } + + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + uint32_t *ip = (uint32_t *)btp; + if (ip[1] != KMEM_SIZE_ENCODE(size)) { + if (*(uint64_t *)buf == KMEM_FREE_PATTERN) { + kmem_error(KMERR_DUPFREE, cp, buf); + return; + } + if (KMEM_SIZE_VALID(ip[1])) { + ip[0] = KMEM_SIZE_ENCODE(size); + kmem_error(KMERR_BADSIZE, cp, buf); + } else { + kmem_error(KMERR_REDZONE, cp, buf); + } + return; + } + if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) { + kmem_error(KMERR_REDZONE, cp, buf); + return; + } + 
btp->bt_redzone = KMEM_REDZONE_PATTERN; + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, + caller()); + } + } + kmem_cache_free(cp, buf); +} + +/* + * Try to allocate at least `size' bytes of memory without sleeping or + * panicking. Return actual allocated size in `asize'. If allocation failed, + * try final allocation with sleep or panic allowed. + */ +void * +kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag) +{ + void *p; + + *asize = P2ROUNDUP(size, KMEM_ALIGN); + do { + p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC); + if (p != NULL) + return (p); + *asize += KMEM_ALIGN; + } while (*asize <= PAGESIZE); + + *asize = P2ROUNDUP(size, KMEM_ALIGN); + return (zfs_kmem_alloc(*asize, kmflag)); +} + +/* + * Reclaim all unused memory from a cache. + */ +static void +kmem_cache_reap(kmem_cache_t *cp) +{ + ASSERT(taskq_member(kmem_taskq, curthread)); + + cp->cache_reap++; + + /* + * Ask the cache's owner to free some memory if possible. + * The idea is to handle things like the inode cache, which + * typically sits on a bunch of memory that it doesn't truly + * *need*. Reclaim policy is entirely up to the owner; this + * callback is just an advisory plea for help. + */ + if (cp->cache_reclaim != NULL) { + long delta; + + /* + * Reclaimed memory should be reapable (not included in the + * depot's working set). + */ + delta = cp->cache_full.ml_total; + cp->cache_reclaim(cp->cache_private); + delta = cp->cache_full.ml_total - delta; + if (delta > 0) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit += delta; + cp->cache_full.ml_min += delta; + mutex_exit(&cp->cache_depot_lock); + } + } + + kmem_depot_ws_reap(cp); + + if (cp->cache_defrag != NULL && !kmem_move_noreap) { + kmem_cache_defrag(cp); + } +} + +static void +kmem_reap_timeout(void *flag_arg) +{ + uint32_t *flag = (uint32_t *)flag_arg; + + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + *flag = 0; +} + +static void +kmem_reap_done(void *flag) +{ + (void) bsd_timeout(kmem_reap_timeout, flag, &kmem_reap_interval); +} + +static void +kmem_reap_start(void *flag) +{ + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + + if (flag == &kmem_reaping) { + kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP); + /* + * if we have segkp under heap, reap segkp cache. + */ + } + else + kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP); + + /* + * We use taskq_dispatch() to schedule a timeout to clear + * the flag so that kmem_reap() becomes self-throttling: + * we won't reap again until the current reap completes *and* + * at least kmem_reap_interval ticks have elapsed. + */ + if (!taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP)) + kmem_reap_done(flag); +} + +static void +kmem_reap_common(void *flag_arg) +{ + uint32_t *flag = (uint32_t *)flag_arg; + + + if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL || + atomic_cas_32(flag, 0, 1) != 0) + return; + + /* + * It may not be kosher to do memory allocation when a reap is called + * is called (for example, if vmem_populate() is in the call chain). + * So we start the reap going with a TQ_NOALLOC dispatch. If the + * dispatch fails, we reset the flag, and the next reap will try again. + */ + if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) + *flag = 0; +} + +/* + * Reclaim all unused memory from all caches. Called from the VM system + * when memory gets tight. 
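
kmem_reap_common() above keeps reaping self-throttling with nothing more than a per-flavour flag: an atomic compare-and-swap claims the flag, the reap is dispatched, and a timeout later clears the flag so the next reap can start only after both the work and the interval have passed. A minimal single-threaded model of the gating (hypothetical names, with C11 atomics standing in for atomic_cas_32):

#include <stdio.h>
#include <stdatomic.h>

static atomic_uint reaping;	/* 0 = idle, 1 = reap in flight */

/* would be scheduled kmem_reap_interval ticks after the reap finishes */
static void
reap_timeout(void)
{
	atomic_store(&reaping, 0);
}

static int
try_reap(void)
{
	unsigned expected = 0;

	/* same gate as atomic_cas_32(flag, 0, 1) != 0 in kmem_reap_common() */
	if (!atomic_compare_exchange_strong(&reaping, &expected, 1))
		return (0);	/* a reap is already pending */
	/* ... dispatch the actual reap here ... */
	return (1);
}

int
main(void)
{
	printf("%d %d\n", try_reap(), try_reap());	/* 1 0 */
	reap_timeout();
	printf("%d\n", try_reap());			/* 1 */
	return (0);
}
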
+ */ +void +kmem_reap(void) +{ + kmem_reap_common(&kmem_reaping); +} + +/* + * Reclaim all unused memory from identifier arenas, called when a vmem + * arena not back by memory is exhausted. Since reaping memory-backed caches + * cannot help with identifier exhaustion, we avoid both a large amount of + * work and unwanted side-effects from reclaim callbacks. + */ +void +kmem_reap_idspace(void) +{ + kmem_reap_common(&kmem_reaping_idspace); +} + +/* + * Purge all magazines from a cache and set its magazine limit to zero. + * All calls are serialized by the kmem_taskq lock, except for the final + * call from kmem_cache_destroy(). + */ +static void +kmem_cache_magazine_purge(kmem_cache_t *cp) +{ + kmem_cpu_cache_t *ccp; + kmem_magazine_t *mp, *pmp; + int rounds, prounds, cpu_seqid; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + mp = ccp->cc_loaded; + pmp = ccp->cc_ploaded; + rounds = ccp->cc_rounds; + prounds = ccp->cc_prounds; + ccp->cc_loaded = NULL; + ccp->cc_ploaded = NULL; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + ccp->cc_magsize = 0; + mutex_exit(&ccp->cc_lock); + + if (mp) + kmem_magazine_destroy(cp, mp, rounds); + + if (pmp) + kmem_magazine_destroy(cp, pmp, prounds); + } + + kmem_depot_ws_zero(cp); + kmem_depot_ws_reap(cp); +} + +/* + * Enable per-cpu magazines on a cache. + */ +static void +kmem_cache_magazine_enable(kmem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & KMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_enter(&ccp->cc_lock); + ccp->cc_magsize = cp->cache_magtype->mt_magsize; + mutex_exit(&ccp->cc_lock); + } + +} + +static void +kmem_cache_magazine_disable(kmem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & KMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_enter(&ccp->cc_lock); + ccp->cc_magsize = 0; + mutex_exit(&ccp->cc_lock); + } + +} + +/* + * Allow our caller to determine if there are running reaps. + * + * This call is very conservative and may return B_TRUE even when + * reaping activity isn't active. If it returns B_FALSE, then reaping + * activity is definitely inactive. + */ +boolean_t +kmem_cache_reap_active(void) +{ + return (!taskq_empty(kmem_taskq)); +} + +/* + * Reap (almost) everything right now. + */ +void +kmem_cache_reap_now(kmem_cache_t *cp) +{ + ASSERT(list_link_active(&cp->cache_link)); + + kmem_depot_ws_zero(cp); + + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP); +} + +/* + * Recompute a cache's magazine size. The trade-off is that larger magazines + * provide a higher transfer rate with the depot, while smaller magazines + * reduce memory consumption. Magazine resizing is an expensive operation; + * it should not be done frequently. + * + * Changes to the magazine size are serialized by the kmem_taskq lock. + * + * Note: at present this only grows the magazine size. It might be useful + * to allow shrinkage too. 
+ */ +static void +kmem_cache_magazine_resize(kmem_cache_t *cp) +{ + kmem_magtype_t *mtp = cp->cache_magtype; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + if (cp->cache_chunksize < mtp->mt_maxbuf) { + kmem_cache_magazine_purge(cp); + mutex_enter(&cp->cache_depot_lock); + cp->cache_magtype = ++mtp; + cp->cache_depot_contention_prev = + cp->cache_depot_contention + INT_MAX; + mutex_exit(&cp->cache_depot_lock); + kmem_cache_magazine_enable(cp); + } +} + +/* + * Rescale a cache's hash table, so that the table size is roughly the + * cache size. We want the average lookup time to be extremely small. + */ +static void +kmem_hash_rescale(kmem_cache_t *cp) +{ + kmem_bufctl_t **old_table, **new_table, *bcp; + size_t old_size, new_size, h; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + new_size = MAX(KMEM_HASH_INITIAL, + 1 << (highbit(3 * cp->cache_buftotal + 4) - 2)); + old_size = cp->cache_hash_mask + 1; + + if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) + return; + + new_table = vmem_alloc(kmem_hash_arena, new_size * sizeof (void *), + VM_NOSLEEP); + if (new_table == NULL) + return; + bzero(new_table, new_size * sizeof (void *)); + + mutex_enter(&cp->cache_lock); + + old_size = cp->cache_hash_mask + 1; + old_table = cp->cache_hash_table; + + cp->cache_hash_mask = new_size - 1; + cp->cache_hash_table = new_table; + cp->cache_rescale++; + + for (h = 0; h < old_size; h++) { + bcp = old_table[h]; + while (bcp != NULL) { + void *addr = bcp->bc_addr; + kmem_bufctl_t *next_bcp = bcp->bc_next; + kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + bcp = next_bcp; + } + } + + mutex_exit(&cp->cache_lock); + + vmem_free(kmem_hash_arena, old_table, old_size * sizeof (void *)); +} + +/* + * Perform periodic maintenance on a cache: hash rescaling, depot working-set + * update, magazine resizing, and slab consolidation. + */ +static void +kmem_cache_update(kmem_cache_t *cp) +{ + int need_hash_rescale = 0; + int need_magazine_resize = 0; + + /* + * If the cache has become much larger or smaller than its hash table, + * fire off a request to rescale the hash table. + */ + mutex_enter(&cp->cache_lock); + + if ((cp->cache_flags & KMF_HASH) && + (cp->cache_buftotal > (cp->cache_hash_mask << 1) || + (cp->cache_buftotal < (cp->cache_hash_mask >> 1) && + cp->cache_hash_mask > KMEM_HASH_INITIAL))) + need_hash_rescale = 1; + + mutex_exit(&cp->cache_lock); + + /* + * Update the depot working set statistics. + */ + kmem_depot_ws_update(cp); + + /* + * If there's a lot of contention in the depot, + * increase the magazine size. 
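
The sizing rule in kmem_hash_rescale() above picks a power-of-two table on the order of the number of buffers in the cache (via the highbit expression) and skips the resize while the current size stays within a factor of two of that target. The arithmetic can be checked on its own; the sketch below supplies its own highbit(), defined as the 1-based index of the highest set bit as on illumos, and a stand-in value for KMEM_HASH_INITIAL:

#include <stdio.h>
#include <stdint.h>

#define HASH_INITIAL	64		/* stand-in for KMEM_HASH_INITIAL */
#define MAX(a, b)	((a) > (b) ? (a) : (b))

/* highbit(): 1-based index of the highest set bit, 0 for 0 */
static int
highbit(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t buftotal[] = { 10, 1000, 100000 };

	for (int i = 0; i < 3; i++) {
		uint64_t n = buftotal[i];
		uint64_t new_size = MAX(HASH_INITIAL,
		    1ULL << (highbit(3 * n + 4) - 2));

		printf("%8llu buffers -> %llu hash buckets\n",
		    (unsigned long long)n, (unsigned long long)new_size);
	}
	return (0);
}
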
+ */ + mutex_enter(&cp->cache_depot_lock); + + if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf && + (int)(cp->cache_depot_contention - + cp->cache_depot_contention_prev) > kmem_depot_contention) + need_magazine_resize = 1; + + cp->cache_depot_contention_prev = cp->cache_depot_contention; + + mutex_exit(&cp->cache_depot_lock); + + if (need_hash_rescale) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_hash_rescale, cp, TQ_NOSLEEP); + + if (need_magazine_resize) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_magazine_resize, + cp, TQ_NOSLEEP); + + // smd : the following if is only true for the dnode cache + if (cp->cache_defrag != NULL) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_scan, cp, TQ_NOSLEEP); + +#ifdef DEBUG + else { + // for every other cache, duplicate some of the logic from + // kmem_cache_scan() below + // run reap occasionally even if there is plenty of memory + uint16_t debug_rand; + + (void) random_get_bytes((uint8_t *)&debug_rand, 2); + if (!kmem_move_noreap && + ((debug_rand % kmem_mtb_reap) == 0)) { + /* + * no mutex above, so no need to give it up as + * in kmem_cache_scan() + */ + } + } +#endif + +} + +static void kmem_update(void *); + +static void +kmem_update_timeout(void *dummy) +{ + (void) bsd_timeout(kmem_update, dummy, &kmem_reap_interval); +} + +static void +kmem_update(void *dummy) +{ + kmem_cache_applyall(kmem_cache_update, NULL, TQ_NOSLEEP); + + /* + * We use taskq_dispatch() to reschedule the timeout so that + * kmem_update() becomes self-throttling: it won't schedule + * new tasks until all previous tasks have completed. + */ + if (!taskq_dispatch(kmem_taskq, kmem_update_timeout, dummy, TQ_NOSLEEP)) + kmem_update_timeout(NULL); + +} + +static int +kmem_cache_kstat_update(kstat_t *ksp, int rw) +{ + struct kmem_cache_kstat *kmcp = &kmem_cache_kstat; + kmem_cache_t *cp = ksp->ks_private; + uint64_t cpu_buf_avail; + uint64_t buf_avail = 0; + int cpu_seqid; + long reap; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&cp->cache_lock); + + kmcp->kmc_alloc_fail.value.ui64 = cp->cache_alloc_fail; + kmcp->kmc_alloc.value.ui64 = cp->cache_slab_alloc; + kmcp->kmc_free.value.ui64 = cp->cache_slab_free; + kmcp->kmc_slab_alloc.value.ui64 = cp->cache_slab_alloc; + kmcp->kmc_slab_free.value.ui64 = cp->cache_slab_free; + kmcp->kmc_no_vba_success.value.ui64 = cp->no_vba_success; + kmcp->kmc_no_vba_fail.value.ui64 = cp->no_vba_fail; + kmcp->kmc_arc_no_grow_set.value.ui64 = cp->arc_no_grow_set; + kmcp->kmc_arc_no_grow.value.ui64 = cp->arc_no_grow; + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + + cpu_buf_avail = 0; + if (ccp->cc_rounds > 0) + cpu_buf_avail += ccp->cc_rounds; + if (ccp->cc_prounds > 0) + cpu_buf_avail += ccp->cc_prounds; + + kmcp->kmc_alloc.value.ui64 += ccp->cc_alloc; + kmcp->kmc_free.value.ui64 += ccp->cc_free; + buf_avail += cpu_buf_avail; + + mutex_exit(&ccp->cc_lock); + } + + mutex_enter(&cp->cache_depot_lock); + + kmcp->kmc_depot_alloc.value.ui64 = cp->cache_full.ml_alloc; + kmcp->kmc_depot_free.value.ui64 = cp->cache_empty.ml_alloc; + kmcp->kmc_depot_contention.value.ui64 = cp->cache_depot_contention; + kmcp->kmc_full_magazines.value.ui64 = cp->cache_full.ml_total; + kmcp->kmc_empty_magazines.value.ui64 = cp->cache_empty.ml_total; + kmcp->kmc_magazine_size.value.ui64 = + (cp->cache_flags & KMF_NOMAGAZINE) ? 
+ 0 : cp->cache_magtype->mt_magsize; + + kmcp->kmc_alloc.value.ui64 += cp->cache_full.ml_alloc; + kmcp->kmc_free.value.ui64 += cp->cache_empty.ml_alloc; + buf_avail += cp->cache_full.ml_total * cp->cache_magtype->mt_magsize; + + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + + mutex_exit(&cp->cache_depot_lock); + + kmcp->kmc_buf_size.value.ui64 = cp->cache_bufsize; + kmcp->kmc_align.value.ui64 = cp->cache_align; + kmcp->kmc_chunk_size.value.ui64 = cp->cache_chunksize; + kmcp->kmc_slab_size.value.ui64 = cp->cache_slabsize; + kmcp->kmc_buf_constructed.value.ui64 = buf_avail; + buf_avail += cp->cache_bufslab; + kmcp->kmc_buf_avail.value.ui64 = buf_avail; + kmcp->kmc_buf_inuse.value.ui64 = cp->cache_buftotal - buf_avail; + kmcp->kmc_buf_total.value.ui64 = cp->cache_buftotal; + kmcp->kmc_buf_max.value.ui64 = cp->cache_bufmax; + kmcp->kmc_slab_create.value.ui64 = cp->cache_slab_create; + kmcp->kmc_slab_destroy.value.ui64 = cp->cache_slab_destroy; + kmcp->kmc_hash_size.value.ui64 = (cp->cache_flags & KMF_HASH) ? + cp->cache_hash_mask + 1 : 0; + kmcp->kmc_hash_lookup_depth.value.ui64 = cp->cache_lookup_depth; + kmcp->kmc_hash_rescale.value.ui64 = cp->cache_rescale; + kmcp->kmc_vmem_source.value.ui64 = cp->cache_arena->vm_id; + kmcp->kmc_reap.value.ui64 = cp->cache_reap; + + if (cp->cache_defrag == NULL) { + kmcp->kmc_move_callbacks.value.ui64 = 0; + kmcp->kmc_move_yes.value.ui64 = 0; + kmcp->kmc_move_no.value.ui64 = 0; + kmcp->kmc_move_later.value.ui64 = 0; + kmcp->kmc_move_dont_need.value.ui64 = 0; + kmcp->kmc_move_dont_know.value.ui64 = 0; + kmcp->kmc_move_hunt_found.value.ui64 = 0; + kmcp->kmc_move_slabs_freed.value.ui64 = 0; + kmcp->kmc_defrag.value.ui64 = 0; + kmcp->kmc_scan.value.ui64 = 0; + kmcp->kmc_move_reclaimable.value.ui64 = 0; + } else { + int64_t reclaimable; + + kmem_defrag_t *kd = cp->cache_defrag; + kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks; + kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes; + kmcp->kmc_move_no.value.ui64 = kd->kmd_no; + kmcp->kmc_move_later.value.ui64 = kd->kmd_later; + kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; + kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; + kmcp->kmc_move_hunt_found.value.ui64 = 0; + kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed; + kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags; + kmcp->kmc_scan.value.ui64 = kd->kmd_scans; + + reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1); + reclaimable = MAX(reclaimable, 0); + reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + kmcp->kmc_move_reclaimable.value.ui64 = reclaimable; + } + + mutex_exit(&cp->cache_lock); + return (0); +} + +/* + * Return a named statistic about a particular cache. + * This shouldn't be called very often, so it's currently designed for + * simplicity (leverages existing kstat support) rather than efficiency. 
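
The kmc_move_reclaimable statistic computed above estimates how many buffers a reap plus a defrag pass could give back: the slab layer's free buffers beyond one worst-case partial slab, plus every round held in full magazines that have already aged out of the depot's working set. As a standalone helper (invented names):

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

/*
 * bufslab:   unallocated buffers currently held by the slab layer
 * maxchunks: buffers per slab
 * reap:      full magazines eligible for reaping
 * magsize:   rounds per magazine
 */
static int64_t
reclaimable_bufs(int64_t bufslab, int64_t maxchunks, int64_t reap,
    int64_t magsize)
{
	int64_t r = MAX(bufslab - (maxchunks - 1), 0);

	return (r + reap * magsize);
}

int
main(void)
{
	/* 500 free buffers, 64-chunk slabs, 3 reapable 16-round magazines */
	printf("%lld\n", (long long)reclaimable_bufs(500, 64, 3, 16));
	return (0);
}
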
+ */
+uint64_t
+kmem_cache_stat(kmem_cache_t *cp, char *name)
+{
+ int i;
+ kstat_t *ksp = cp->cache_kstat;
+ kstat_named_t *knp = (kstat_named_t *)&kmem_cache_kstat;
+ uint64_t value = 0;
+
+ if (ksp != NULL) {
+ mutex_enter(&kmem_cache_kstat_lock);
+ (void) kmem_cache_kstat_update(ksp, KSTAT_READ);
+ for (i = 0; i < ksp->ks_ndata; i++) {
+ if (strcmp(knp[i].name, name) == 0) {
+ value = knp[i].value.ui64;
+ break;
+ }
+ }
+ mutex_exit(&kmem_cache_kstat_lock);
+ }
+ return (value);
+}
+
+// TRUE if we have more than a critical minimum of memory
+// used in arc_memory_throttle; if FALSE, we throttle
+static inline bool
+spl_minimal_physmem_p_logic()
+{
+ // do we have enough memory to avoid throttling?
+ if (vm_page_free_wanted > 0)
+ return (false);
+ if (vm_page_free_count < (vm_page_free_min + 512))
+ // 512 pages above 3500 (normal vm_page_free_min)
+ // 2MiB above 13 MiB
+ return (false);
+ return (true);
+}
+
+int32_t
+spl_minimal_physmem_p(void)
+{
+
+ // arc will throttle if we are paging, otherwise
+ // we want a small bit of pressure here so that we can compete
+ // a little with the xnu buffer cache
+
+ return (spl_free > -1024LL);
+}
+
+/*
+ * Return the maximum amount of memory that is (in theory) allocatable
+ * from the heap. This may be used as an estimate only since there
+ * is no guarantee this space will still be available when an allocation
+ * request is made, nor that the space may be allocated in one big request
+ * due to kernel heap fragmentation.
+ */
+size_t
+kmem_maxavail(void)
+{
+#ifndef APPLE
+ // spgcnt_t pmem = availrmem - tune.t_minarmem;
+ // spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE));
+ //
+ // return ((size_t)ptob(MAX(MIN(pmem, vmem), 0)));
+#endif
+ return (physmem * PAGE_SIZE);
+}
+
+/*
+ * Indicate whether memory-intensive kmem debugging is enabled.
+ */
+int
+kmem_debugging(void)
+{
+ return (kmem_flags & (KMF_AUDIT | KMF_REDZONE));
+}
+
+/* binning function, sorts finely at the two extremes */
+#define KMEM_PARTIAL_SLAB_WEIGHT(sp, binshift) \
+((((sp)->slab_refcnt <= (binshift)) || \
+(((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift))) \
+? -(sp)->slab_refcnt \
+: -((binshift) + ((sp)->slab_refcnt >> (binshift))))
+
+/*
+ * Minimizing the number of partial slabs on the freelist minimizes
+ * fragmentation (the ratio of unused buffers held by the slab layer). There are
+ * two ways to get a slab off of the freelist: 1) free all the buffers on the
+ * slab, and 2) allocate all the buffers on the slab. It follows that we want
+ * the most-used slabs at the front of the list where they have the best chance
+ * of being completely allocated, and the least-used slabs at a safe distance
+ * from the front to improve the odds that the few remaining buffers will all be
+ * freed before another allocation can tie up the slab. For that reason a slab
+ * with a higher slab_refcnt sorts less than a slab with a lower
+ * slab_refcnt.
+ *
+ * However, if a slab has at least one buffer that is deemed unfreeable, we
+ * would rather have that slab at the front of the list regardless of
+ * slab_refcnt, since even one unfreeable buffer makes the entire slab
+ * unfreeable. If the client returns KMEM_CBRC_NO in response to a cache_move()
+ * callback, the slab is marked unfreeable for as long as it remains on the
+ * freelist.
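+ *
+ * For example, with slab_chunks = 64 (so cache_partial_binshift = 4)
+ * the macro above weighs a slab with slab_refcnt = 62 at -62 (nearly
+ * full), one with slab_refcnt = 30 at -(4 + (30 >> 4)) = -5, and one
+ * with slab_refcnt = 2 at -2 (nearly empty); lower weights sort first,
+ * so the most heavily used slabs end up at the front of the freelist.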
+ */ +static int +kmem_partial_slab_cmp(const void *pp0, const void *pp1) +{ + const kmem_cache_t *cp; + const kmem_slab_t *s0 = pp0; + const kmem_slab_t *s1 = pp1; + int w0, w1; + size_t binshift; + + ASSERT(KMEM_SLAB_IS_PARTIAL(s0)); + ASSERT(KMEM_SLAB_IS_PARTIAL(s1)); + ASSERT(s0->slab_cache == s1->slab_cache); + cp = s1->slab_cache; + ASSERT(MUTEX_HELD((struct kmutex *)&cp->cache_lock)); + binshift = cp->cache_partial_binshift; + + /* weight of first slab */ + w0 = KMEM_PARTIAL_SLAB_WEIGHT(s0, binshift); + if (s0->slab_flags & KMEM_SLAB_NOMOVE) { + w0 -= cp->cache_maxchunks; + } + + /* weight of second slab */ + w1 = KMEM_PARTIAL_SLAB_WEIGHT(s1, binshift); + if (s1->slab_flags & KMEM_SLAB_NOMOVE) { + w1 -= cp->cache_maxchunks; + } + + if (w0 < w1) + return (-1); + if (w0 > w1) + return (1); + + // compare slab age if available + hrtime_t c0 = s0->slab_create_time, c1 = s1->slab_create_time; + if (c0 != 0 && c1 != 0 && c0 != c1) { + // higher time is newer; newer sorts before older + if (c0 < c1) // c0 is older than c1 + return (1); // so c0 sorts after c1 + if (c0 > c1) + return (-1); + } + + /* compare pointer values */ + if ((uintptr_t)s0 < (uintptr_t)s1) + return (-1); + if ((uintptr_t)s0 > (uintptr_t)s1) + return (1); + + return (0); +} + +/* + * It must be valid to call the destructor (if any) on a newly created object. + * That is, the constructor (if any) must leave the object in a valid state for + * the destructor. + */ +kmem_cache_t * +kmem_cache_create( + char *name, /* descriptive name for this cache */ + size_t bufsize, /* size of the objects it manages */ + size_t align, /* required object alignment */ + int (*constructor)(void *, void *, int), /* object constructor */ + void (*destructor)(void *, void *), /* object destructor */ + void (*reclaim)(void *), /* memory reclaim callback */ + void *private, /* pass-thru arg for constr/destr/reclaim */ + vmem_t *vmp, /* vmem source for slab allocation */ + int cflags) /* cache creation flags */ +{ + int cpu_seqid; + size_t chunksize; + kmem_cache_t *cp; + kmem_magtype_t *mtp; + size_t csize = KMEM_CACHE_SIZE(max_ncpus); + +#ifdef DEBUG + /* + * Cache names should conform to the rules for valid C identifiers + */ + if (!strident_valid(name)) { + cmn_err(CE_CONT, + "kmem_cache_create: '%s' is an invalid cache name\n" + "cache names must conform to the rules for " + "C identifiers\n", name); + } +#endif /* DEBUG */ + + if (vmp == NULL) + vmp = kmem_default_arena; + + /* + * If this kmem cache has an identifier vmem arena as its source, mark + * it such to allow kmem_reap_idspace(). + */ + ASSERT(!(cflags & KMC_IDENTIFIER)); /* consumer should not set this */ + if (vmp->vm_cflags & VMC_IDENTIFIER) + cflags |= KMC_IDENTIFIER; + + /* + * Get a kmem_cache structure. We arrange that cp->cache_cpu[] + * is aligned on a KMEM_CPU_CACHE_SIZE boundary to prevent + * false sharing of per-CPU data. + */ + cp = vmem_xalloc(kmem_cache_arena, csize, + KMEM_CPU_CACHE_SIZE, + P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), + 0, NULL, NULL, VM_SLEEP); + bzero(cp, csize); + list_link_init(&cp->cache_link); + + if (align == 0) + align = KMEM_ALIGN; + + /* + * If we're not at least KMEM_ALIGN aligned, we can't use free + * memory to hold bufctl information (because we can't safely + * perform word loads and stores on it). 
+ */ + if (align < KMEM_ALIGN) + cflags |= KMC_NOTOUCH; + + if ((align & (align - 1)) != 0 || align > vmp->vm_quantum) + panic("kmem_cache_create: bad alignment %lu", align); + + mutex_enter(&kmem_flags_lock); + if (kmem_flags & KMF_RANDOMIZE) + kmem_flags = (((kmem_flags | ~KMF_RANDOM) + 1) & KMF_RANDOM) | + KMF_RANDOMIZE; + cp->cache_flags = (kmem_flags | cflags) & KMF_DEBUG; + mutex_exit(&kmem_flags_lock); + + /* + * Make sure all the various flags are reasonable. + */ + ASSERT(!(cflags & KMC_NOHASH) || !(cflags & KMC_NOTOUCH)); + + if (cp->cache_flags & KMF_LITE) { + if (bufsize >= kmem_lite_minsize && + align <= kmem_lite_maxalign && + P2PHASE(bufsize, kmem_lite_maxalign) != 0) { + cp->cache_flags |= KMF_BUFTAG; + cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL); + } else { + cp->cache_flags &= ~KMF_DEBUG; + } + } + + if (cp->cache_flags & KMF_DEADBEEF) + cp->cache_flags |= KMF_REDZONE; + + if ((cflags & KMC_QCACHE) && (cp->cache_flags & KMF_AUDIT)) + cp->cache_flags |= KMF_NOMAGAZINE; + + if (cflags & KMC_NODEBUG) + cp->cache_flags &= ~KMF_DEBUG; + + if (cflags & KMC_NOTOUCH) + cp->cache_flags &= ~KMF_TOUCH; + + if (cflags & KMC_PREFILL) + cp->cache_flags |= KMF_PREFILL; + + if (cflags & KMC_NOHASH) + cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL); + + if (cflags & KMC_NOMAGAZINE) + cp->cache_flags |= KMF_NOMAGAZINE; + + if ((cp->cache_flags & KMF_AUDIT) && !(cflags & KMC_NOTOUCH)) + cp->cache_flags |= KMF_REDZONE; + + if (!(cp->cache_flags & KMF_AUDIT)) + cp->cache_flags &= ~KMF_CONTENTS; + + if ((cp->cache_flags & KMF_BUFTAG) && bufsize >= kmem_minfirewall && + !(cp->cache_flags & KMF_LITE) && !(cflags & KMC_NOHASH)) + cp->cache_flags |= KMF_FIREWALL; + + if (vmp != kmem_default_arena || kmem_firewall_arena == NULL) + cp->cache_flags &= ~KMF_FIREWALL; + + if (cp->cache_flags & KMF_FIREWALL) { + cp->cache_flags &= ~KMF_BUFTAG; + cp->cache_flags |= KMF_NOMAGAZINE; + ASSERT(vmp == kmem_default_arena); + vmp = kmem_firewall_arena; + } + + /* + * Set cache properties. + */ + (void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN); + strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN + 1); + cp->cache_bufsize = bufsize; + cp->cache_align = align; + cp->cache_constructor = constructor; + cp->cache_destructor = destructor; + cp->cache_reclaim = reclaim; + cp->cache_private = private; + cp->cache_arena = vmp; + cp->cache_cflags = cflags; + + /* + * Determine the chunk size. + */ + chunksize = bufsize; + + if (align >= KMEM_ALIGN) { + chunksize = P2ROUNDUP(chunksize, KMEM_ALIGN); + cp->cache_bufctl = chunksize - KMEM_ALIGN; + } + + if (cp->cache_flags & KMF_BUFTAG) { + cp->cache_bufctl = chunksize; + cp->cache_buftag = chunksize; + if (cp->cache_flags & KMF_LITE) + chunksize += KMEM_BUFTAG_LITE_SIZE(kmem_lite_count); + else + chunksize += sizeof (kmem_buftag_t); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + cp->cache_verify = MIN(cp->cache_buftag, kmem_maxverify); + if (cp->cache_flags & KMF_LITE) + cp->cache_verify = sizeof (uint64_t); + } + + cp->cache_contents = MIN(cp->cache_bufctl, kmem_content_maxsave); + + cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align); + + /* + * Now that we know the chunk size, determine the optimal slab size. 
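+ *
+ * Three cases follow: caches backed by kmem_firewall_arena get one
+ * chunk per slab; KMC_NOHASH caches, and small chunks in caches that
+ * are neither KMC_NOTOUCH nor audited, keep slab and bufctl metadata
+ * on the slab itself; all other caches pick the candidate slab size
+ * that wastes the least space per chunk (at most KMEM_VOID_FRACTION
+ * chunks per slab) and track buffers through an external hash table
+ * (KMF_HASH).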
+ */ + + size_t vquantum = vmp->vm_quantum; + + if ((cflags & KMC_ARENA_SLAB) == KMC_ARENA_SLAB) { + VERIFY3U((vmp->vm_cflags & VMC_NO_QCACHE), ==, VMC_NO_QCACHE); + VERIFY3U(vmp->vm_min_import, >, 0); + VERIFY3U(vmp->vm_min_import, >=, (2 * vmp->vm_quantum)); + VERIFY(ISP2(vmp->vm_min_import)); + vquantum = vmp->vm_min_import >> 1; + } + + if (vmp == kmem_firewall_arena) { + cp->cache_slabsize = P2ROUNDUP(chunksize, vquantum); + cp->cache_mincolor = cp->cache_slabsize - chunksize; + cp->cache_maxcolor = cp->cache_mincolor; + cp->cache_flags |= KMF_HASH; + ASSERT(!(cp->cache_flags & KMF_BUFTAG)); + } else if ((cflags & KMC_NOHASH) || (!(cflags & KMC_NOTOUCH) && + !(cp->cache_flags & KMF_AUDIT) && + chunksize < vquantum / + KMEM_VOID_FRACTION)) { + cp->cache_slabsize = vquantum; + cp->cache_mincolor = 0; + cp->cache_maxcolor = + (cp->cache_slabsize - sizeof (kmem_slab_t)) % chunksize; + ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize); + ASSERT(!(cp->cache_flags & KMF_AUDIT)); + } else { + size_t chunks, bestfit, waste, slabsize; + size_t minwaste = LONG_MAX; + + for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) { + slabsize = P2ROUNDUP(chunksize * chunks, + vquantum); + chunks = slabsize / chunksize; + waste = (slabsize % chunksize) / chunks; + if (waste < minwaste) { + minwaste = waste; + bestfit = slabsize; + } + } + if (cflags & KMC_QCACHE) + bestfit = VMEM_QCACHE_SLABSIZE(vmp->vm_qcache_max); + cp->cache_slabsize = bestfit; + cp->cache_mincolor = 0; + cp->cache_maxcolor = bestfit % chunksize; + cp->cache_flags |= KMF_HASH; + } + + cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize); + cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1; + + /* + * Disallowing prefill when either the DEBUG or HASH flag is set or when + * there is a constructor avoids some tricky issues with debug setup + * that may be revisited later. We cannot allow prefill in a + * metadata cache because of potential recursion. + */ + if (vmp == kmem_msb_arena || + cp->cache_flags & (KMF_HASH | KMF_BUFTAG) || + cp->cache_constructor != NULL) + cp->cache_flags &= ~KMF_PREFILL; + + if (cp->cache_flags & KMF_HASH) { + ASSERT(!(cflags & KMC_NOHASH)); + cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ? + kmem_bufctl_audit_cache : kmem_bufctl_cache; + } + + if (cp->cache_maxcolor >= vquantum) + cp->cache_maxcolor = vquantum - 1; + + cp->cache_color = cp->cache_mincolor; + + /* + * Initialize the rest of the slab layer. + */ + mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&cp->cache_partial_slabs, kmem_partial_slab_cmp, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse partial slab AVL linkage for complete slab list linkage */ + list_create(&cp->cache_complete_slabs, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + + if (cp->cache_flags & KMF_HASH) { + cp->cache_hash_table = vmem_alloc(kmem_hash_arena, + KMEM_HASH_INITIAL * sizeof (void *), + VM_SLEEP); + bzero(cp->cache_hash_table, + KMEM_HASH_INITIAL * sizeof (void *)); + cp->cache_hash_mask = KMEM_HASH_INITIAL - 1; + cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1; + } + + /* + * Initialize the depot. + */ + mutex_init(&cp->cache_depot_lock, NULL, MUTEX_DEFAULT, NULL); + + for (mtp = kmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) + continue; + + cp->cache_magtype = mtp; + + /* + * Initialize the CPU layer. 
+ */ + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_init(&ccp->cc_lock, NULL, MUTEX_DEFAULT, NULL); // XNU + ccp->cc_flags = cp->cache_flags; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + } + + /* + * Create the cache's kstats. + */ + if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name, + "kmem_cache", KSTAT_TYPE_NAMED, + sizeof (kmem_cache_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + cp->cache_kstat->ks_data = &kmem_cache_kstat; + cp->cache_kstat->ks_update = kmem_cache_kstat_update; + cp->cache_kstat->ks_private = cp; + cp->cache_kstat->ks_lock = &kmem_cache_kstat_lock; + kstat_install(cp->cache_kstat); + } + + /* + * Add the cache to the global list. This makes it visible + * to kmem_update(), so the cache must be ready for business. + */ + mutex_enter(&kmem_cache_lock); + list_insert_tail(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + + if (kmem_ready) + kmem_cache_magazine_enable(cp); + + return (cp); +} + +static int +kmem_move_cmp(const void *buf, const void *p) +{ + const kmem_move_t *kmm = p; + uintptr_t v1 = (uintptr_t)buf; + uintptr_t v2 = (uintptr_t)kmm->kmm_from_buf; + return (v1 < v2 ? -1 : (v1 > v2 ? 1 : 0)); +} + +static void +kmem_reset_reclaim_threshold(kmem_defrag_t *kmd) +{ + kmd->kmd_reclaim_numer = 1; +} + +/* + * Initially, when choosing candidate slabs for buffers to move, we want to be + * very selective and take only slabs that are less than + * (1 / KMEM_VOID_FRACTION) allocated. If we have difficulty finding candidate + * slabs, then we raise the allocation ceiling incrementally. The reclaim + * threshold is reset to (1 / KMEM_VOID_FRACTION) as soon as the cache is no + * longer fragmented. + */ +static void +kmem_adjust_reclaim_threshold(kmem_defrag_t *kmd, int direction) +{ + if (direction > 0) { + /* make it easier to find a candidate slab */ + if (kmd->kmd_reclaim_numer < (KMEM_VOID_FRACTION - 1)) { + kmd->kmd_reclaim_numer++; + } + } else { + /* be more selective */ + if (kmd->kmd_reclaim_numer > 1) { + kmd->kmd_reclaim_numer--; + } + } +} + +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (cache->cache_buftotal); +} + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->cache_bufsize); +} + +void +kmem_cache_set_move(kmem_cache_t *cp, + kmem_cbrc_t (*move)(void *, void *, size_t, void *)) +{ + kmem_defrag_t *defrag; + + ASSERT(move != NULL); + /* + * The consolidator does not support NOTOUCH caches because kmem cannot + * initialize their slabs with the 0xbaddcafe memory pattern, which sets + * a low order bit usable by clients to distinguish uninitialized memory + * from known objects (see kmem_slab_create). + */ + ASSERT(!(cp->cache_cflags & KMC_NOTOUCH)); + ASSERT(!(cp->cache_cflags & KMC_IDENTIFIER)); + + /* + * We should not be holding anyone's cache lock when calling + * kmem_cache_alloc(), so allocate in all cases before acquiring the + * lock. 
+ */ + defrag = kmem_cache_alloc(kmem_defrag_cache, KM_SLEEP); + + mutex_enter(&cp->cache_lock); + + if (KMEM_IS_MOVABLE(cp)) { + if (cp->cache_move == NULL) { + ASSERT(cp->cache_slab_alloc == 0); + + cp->cache_defrag = defrag; + defrag = NULL; /* nothing to free */ + bzero(cp->cache_defrag, sizeof (kmem_defrag_t)); + avl_create(&cp->cache_defrag->kmd_moves_pending, + kmem_move_cmp, sizeof (kmem_move_t), + offsetof(kmem_move_t, kmm_entry)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse the slab's AVL linkage for deadlist linkage */ + list_create(&cp->cache_defrag->kmd_deadlist, + sizeof (kmem_slab_t), + offsetof(kmem_slab_t, slab_link)); + kmem_reset_reclaim_threshold(cp->cache_defrag); + } + cp->cache_move = move; + } + + mutex_exit(&cp->cache_lock); + + if (defrag != NULL) { + kmem_cache_free(kmem_defrag_cache, defrag); /* unused */ + } +} + +void +kmem_qcache_destroy() +{ + kmem_cache_t *cp; + kmem_cache_t *cache_to_destroy = NULL; + + do { + cache_to_destroy = NULL; + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (cp->cache_cflags & KMC_QCACHE) { + cache_to_destroy = cp; + break; + } + } + mutex_exit(&kmem_cache_lock); + + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + } while (cache_to_destroy); +} + +void +kmem_cache_destroy(kmem_cache_t *cp) +{ + int cpu_seqid; + + /* + * Remove the cache from the global cache list so that no one else + * can schedule tasks on its behalf, wait for any pending tasks to + * complete, purge the cache, and then destroy it. + */ + mutex_enter(&kmem_cache_lock); + list_remove(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + + if (kmem_taskq != NULL) + taskq_wait(kmem_taskq); + + if (kmem_move_taskq != NULL && cp->cache_defrag != NULL) + taskq_wait(kmem_move_taskq); + + kmem_cache_magazine_purge(cp); + + mutex_enter(&cp->cache_lock); + + if (cp->cache_buftotal != 0) + cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty", + cp->cache_name, (void *)cp); + if (cp->cache_defrag != NULL) { + avl_destroy(&cp->cache_defrag->kmd_moves_pending); + list_destroy(&cp->cache_defrag->kmd_deadlist); + kmem_cache_free(kmem_defrag_cache, cp->cache_defrag); + cp->cache_defrag = NULL; + } + /* + * The cache is now dead. There should be no further activity. We + * enforce this by setting land mines in the constructor, destructor, + * reclaim, and move routines that induce a kernel text fault if + * invoked. 
+ */
+ cp->cache_constructor = (int (*)(void *, void *, int))1;
+ cp->cache_destructor = (void (*)(void *, void *))2;
+ cp->cache_reclaim = (void (*)(void *))3;
+ cp->cache_move = (kmem_cbrc_t (*)(void *, void *, size_t, void *))4;
+ mutex_exit(&cp->cache_lock);
+
+ kstat_delete(cp->cache_kstat);
+
+ if (cp->cache_hash_table != NULL)
+ vmem_free(kmem_hash_arena, cp->cache_hash_table,
+ (cp->cache_hash_mask + 1) * sizeof (void *));
+
+ for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++)
+ mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
+
+ mutex_destroy(&cp->cache_depot_lock);
+ mutex_destroy(&cp->cache_lock);
+
+ vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus));
+}
+
+static void
+kmem_alloc_caches_create(const int *array, size_t count,
+ kmem_cache_t **alloc_table, size_t maxbuf,
+ uint_t shift)
+{
+ char name[KMEM_CACHE_NAMELEN + 1];
+ size_t table_unit = (1 << shift); /* range of one alloc_table entry */
+ size_t size = table_unit;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ size_t cache_size = array[i];
+ size_t align = KMEM_ALIGN;
+ kmem_cache_t *cp;
+
+ /* if the table has an entry for maxbuf, we're done */
+ if (size > maxbuf)
+ break;
+
+ /* cache size must be a multiple of the table unit */
+ ASSERT(P2PHASE(cache_size, table_unit) == 0);
+
+ /*
+ * If they allocate a multiple of the coherency granularity,
+ * they get a coherency-granularity-aligned address.
+ */
+ if (IS_P2ALIGNED(cache_size, 64))
+ align = 64;
+ if (IS_P2ALIGNED(cache_size, PAGESIZE))
+ align = PAGESIZE;
+ (void) snprintf(name, sizeof (name),
+ "kmem_alloc_%lu", cache_size);
+ cp = kmem_cache_create(name, cache_size, align,
+ NULL, NULL, NULL, NULL, NULL, KMC_KMEM_ALLOC | KMF_HASH);
+
+ while (size <= cache_size) {
+ alloc_table[(size - 1) >> shift] = cp;
+ size += table_unit;
+ }
+ }
+
+ ASSERT(size > maxbuf); /* i.e. maxbuf <= max(cache_size) */
+}
+
+static void
+kmem_alloc_caches_destroy()
+{
+ kmem_cache_t *cache_to_destroy = NULL;
+ kmem_cache_t *cp = NULL;
+
+ do {
+ cache_to_destroy = NULL;
+
+ // Locate the first cache that has the KMC_KMEM_ALLOC flag.
+ mutex_enter(&kmem_cache_lock);
+
+ for (cp = list_head(&kmem_caches); cp != NULL;
+ cp = list_next(&kmem_caches, cp)) {
+ if (cp->cache_cflags & KMC_KMEM_ALLOC) {
+ cache_to_destroy = cp;
+ break;
+ }
+ }
+
+ mutex_exit(&kmem_cache_lock);
+
+ // Destroy the cache
+ if (cache_to_destroy) {
+ kmem_cache_destroy(cache_to_destroy);
+ }
+
+ } while (cache_to_destroy);
+}
+
+static void
+kmem_destroy_cache_by_name(const char *substr)
+{
+ kmem_cache_t *cache_to_destroy = NULL;
+ kmem_cache_t *cp = NULL;
+
+ do {
+ cache_to_destroy = NULL;
+
+ // Locate the first cache whose name contains the given substring.
+ mutex_enter(&kmem_cache_lock); + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (kmem_strstr(cp->cache_name, substr)) { + cache_to_destroy = cp; + break; + } + } + + mutex_exit(&kmem_cache_lock); + + // Destroy the cache + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + + } while (cache_to_destroy); +} + +static void +kmem_cache_init(int pass, int use_large_pages) +{ + int i; + size_t maxbuf; + kmem_magtype_t *mtp; + + for (i = 0; i < sizeof (kmem_magtype) / sizeof (*mtp); i++) { + char name[KMEM_CACHE_NAMELEN + 1]; + + mtp = &kmem_magtype[i]; + (void) snprintf(name, KMEM_CACHE_NAMELEN, "%s%d", + KMEM_MAGAZINE_PREFIX, + mtp->mt_magsize); + mtp->mt_cache = kmem_cache_create( + name, + (mtp->mt_magsize + 1) * sizeof (void *), + mtp->mt_align, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + } + + kmem_slab_cache = kmem_cache_create("kmem_slab_cache", + sizeof (kmem_slab_t), 0, NULL, NULL, + NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache", + sizeof (kmem_bufctl_t), 0, + NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + kmem_bufctl_audit_cache = kmem_cache_create("kmem_bufctl_audit_cache", + sizeof (kmem_bufctl_audit_t), + 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + if (pass == 2) { + kmem_va_arena = vmem_create(KMEM_VA_PREFIX, + NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, + 2 * PAGESIZE, VM_SLEEP); + + kmem_default_arena = vmem_create("kmem_default", + NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, kmem_va_arena, + 0, VMC_DUMPSAFE | VM_SLEEP); + + /* Figure out what our maximum cache size is */ + maxbuf = kmem_max_cached; + if (maxbuf <= KMEM_MAXBUF) { + maxbuf = 0; + kmem_max_cached = KMEM_MAXBUF; + } else { + size_t size = 0; + size_t max = + sizeof (kmem_big_alloc_sizes) / sizeof (int); + /* + * Round maxbuf up to an existing cache size. If maxbuf + * is larger than the largest cache, we truncate it to + * the largest cache's size. + */ + for (i = 0; i < max; i++) { + size = kmem_big_alloc_sizes[i]; + if (maxbuf <= size) + break; + } + kmem_max_cached = maxbuf = size; + } + + /* + * The big alloc table may not be completely overwritten, so + * we clear out any stale cache pointers from the first pass. + */ + bzero(kmem_big_alloc_table, sizeof (kmem_big_alloc_table)); + } else { + /* + * During the first pass, the kmem_alloc_* caches + * are treated as metadata. 
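+ * (kmem_default_arena is simply aliased to kmem_msb_arena and the
+ * largest cached size is capped at KMEM_BIG_MAXBUF_32BIT.)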
+ */ + kmem_default_arena = kmem_msb_arena; + maxbuf = KMEM_BIG_MAXBUF_32BIT; + } + + /* + * Set up the default caches to back kmem_alloc() + */ + kmem_alloc_caches_create( + kmem_alloc_sizes, sizeof (kmem_alloc_sizes) / sizeof (int), + kmem_alloc_table, KMEM_MAXBUF, KMEM_ALIGN_SHIFT); + + kmem_alloc_caches_create( + kmem_big_alloc_sizes, sizeof (kmem_big_alloc_sizes) / sizeof (int), + kmem_big_alloc_table, maxbuf, KMEM_BIG_SHIFT); + + kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT; +} + +struct free_slab { + vmem_t *vmp; + size_t slabsize; + void *slab; + list_node_t next; +}; + +static list_t freelist; + + +void +kmem_cache_build_slablist(kmem_cache_t *cp) +{ + int cpu_seqid; + + vmem_t *vmp = cp->cache_arena; + kmem_slab_t *sp; + struct free_slab *fs; + + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + + MALLOC(fs, struct free_slab *, sizeof (struct free_slab), + M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = cp->cache_slabsize; + fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, + vmp->vm_quantum); + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); + } + + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { + + MALLOC(fs, struct free_slab *, sizeof (struct free_slab), + M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = cp->cache_slabsize; + fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, + vmp->vm_quantum); + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); + } + + + kstat_delete(cp->cache_kstat); + + if (cp->cache_hash_table != NULL) + vmem_free(kmem_hash_arena, cp->cache_hash_table, + (cp->cache_hash_mask + 1) * sizeof (void *)); + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) + mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock); + + mutex_destroy(&cp->cache_depot_lock); + mutex_destroy(&cp->cache_lock); + + vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); +} + + +static void +kmem_cache_fini() +{ + kmem_cache_t *cp; + int i; + struct free_slab *fs; + + list_create(&freelist, sizeof (struct free_slab), + offsetof(struct free_slab, next)); + + mutex_enter(&kmem_cache_lock); + + while ((cp = list_head(&kmem_caches))) { + list_remove(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + kmem_cache_build_slablist(cp); + mutex_enter(&kmem_cache_lock); + } + + mutex_exit(&kmem_cache_lock); + + i = 0; + while ((fs = list_head(&freelist))) { + i++; + list_remove(&freelist, fs); + vmem_free(fs->vmp, fs->slab, fs->slabsize); + FREE(fs, M_TEMP); + + } + printf("SPL: Released %u slabs\n", i); + list_destroy(&freelist); +} + + +// this is intended to substitute for kmem_avail() in arc.c +int64_t +spl_free_wrapper(void) +{ + return (spl_free); +} + +// this is intended to substitute for kmem_avail() in arc.c +// when arc_reclaim_thread() calls spl_free_set_pressure(0); +int64_t +spl_free_manual_pressure_wrapper(void) +{ + return (spl_free_manual_pressure); +} + +uint64_t +spl_free_last_pressure_wrapper(void) +{ + return (spl_free_last_pressure); +} + +int64_t +spl_free_set_and_wait_pressure(int64_t new_p, boolean_t fast, + clock_t check_interval) +{ + + int64_t snapshot_pressure = 0; + + if (new_p <= 0) + return (0); + + spl_free_fast_pressure = fast; + + if (spl_free_manual_pressure >= 0) + spl_free_manual_pressure += new_p; + else + spl_free_manual_pressure = new_p; + + // wait for another thread to reset pressure + const uint64_t start = zfs_lbolt(); + const uint64_t end_by = start + (hz*60); + const 
uint64_t double_at = start + (hz/2); + const uint64_t double_again_at = start + hz; + bool doubled = false, doubled_again = false; + uint64_t now; + + spl_free_last_pressure = start; + + for (; spl_free_manual_pressure != 0; ) { + // has another thread set spl_free_manual_pressure? + if (spl_free_manual_pressure < new_p) + spl_free_manual_pressure = new_p; + snapshot_pressure = spl_free_manual_pressure; + mutex_enter(&spl_free_thread_lock); + cv_timedwait_hires(&spl_free_thread_cv, + &spl_free_thread_lock, check_interval, 0, 0); + mutex_exit(&spl_free_thread_lock); + now = zfs_lbolt(); + if (now > end_by) { + printf("%s: ERROR: timed out after one minute!\n", + __func__); + break; + } else if (now > double_again_at && !doubled_again) { + doubled_again = true; + new_p *= 2; + } else if (now > double_at) { + doubled = true; + new_p *= 2; + } + } + return (snapshot_pressure); +} + +// routinely called by arc_reclaim_thread() with new_p == 0 +void +spl_free_set_pressure(int64_t new_p) +{ + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + if (new_p == 0) { + spl_free_fast_pressure = FALSE; + // wake up both spl_free_thread() to recalculate spl_free + // and any spl_free_set_and_wait_pressure() threads + cv_broadcast(&spl_free_thread_cv); + } + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_pressure_both(int64_t new_p, boolean_t fast) +{ + spl_free_fast_pressure = fast; + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +void spl_free_maybe_reap(void); + +void +spl_free_set_emergency_pressure(int64_t new_p) +{ + spl_free_fast_pressure = TRUE; + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + spl_free_maybe_reap(); + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_emergency_pressure_additive(int64_t new_p) +{ + spl_free_fast_pressure = TRUE; + spl_free_manual_pressure += new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_pressure_additive(int64_t new_p) +{ + spl_free_manual_pressure += new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +boolean_t +spl_free_fast_pressure_wrapper() +{ + return (spl_free_fast_pressure); +} + +void +spl_free_set_fast_pressure(boolean_t state) +{ + spl_free_fast_pressure = state; + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_reap_caches(void) +{ + // note: this may take some time + static hrtime_t last_reap = 0; + const hrtime_t reap_after = SEC2NSEC(60); + const hrtime_t curtime = gethrtime(); + + if (curtime - last_reap < reap_after) + return; + + vmem_qcache_reap(zio_arena_parent); + kmem_reap(); + vmem_qcache_reap(kmem_va_arena); +} + +void +spl_free_maybe_reap(void) +{ + static _Atomic uint64_t last_reap = 0; + const uint64_t lockout_time = 60 * hz; + + uint64_t now = zfs_lbolt(); + if (now > last_reap + lockout_time) { + last_reap = now; + spl_free_maybe_reap_flag = true; + } +} + +boolean_t +spl_maybe_send_large_pressure(uint64_t now, uint64_t minutes, boolean_t full) +{ + static volatile _Atomic uint64_t spl_last_large_pressure = 0; + const uint64_t interval_ticks = minutes * 60ULL * (uint64_t)hz; + + if (spl_last_large_pressure + interval_ticks > now) + return (false); + + spl_last_large_pressure = now; + + const int64_t sixteenth_physmem = (int64_t)real_total_memory / 16LL; + const int64_t sixtyfourth_physmem = sixteenth_physmem / 4LL; + int64_t howmuch = sixteenth_physmem; + + if (full == false) + howmuch = 
sixtyfourth_physmem; + + + dprintf("SPL: %s: %lld bytes at time %llu\n", + __func__, howmuch, now); + + spl_free_set_emergency_pressure(howmuch); + + return (true); +} + +static void +spl_free_thread() +{ + callb_cpr_t cpr; + uint64_t last_update = zfs_lbolt(); + int64_t last_spl_free; + double ema_new = 0; + double ema_old = 0; + double alpha; + + CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); + + spl_free = (int64_t)PAGESIZE * + (int64_t)(vm_page_free_count - vm_page_free_min); + + mutex_enter(&spl_free_thread_lock); + + dprintf("SPL: beginning spl_free_thread() loop, spl_free == %lld\n", + spl_free); + + uint64_t recent_lowmem = 0; + uint64_t last_disequilibrium = 0; + + while (!spl_free_thread_exit) { + mutex_exit(&spl_free_thread_lock); + boolean_t lowmem = false; + boolean_t emergency_lowmem = false; + int64_t base; + int64_t new_spl_free = 0LL; + + spl_stats.spl_free_wake_count.value.ui64++; + + if (spl_free_maybe_reap_flag == true) { + spl_free_maybe_reap_flag = false; + spl_free_reap_caches(); + } + + uint64_t time_now = zfs_lbolt(); + uint64_t time_now_seconds = 0; + if (time_now > hz) + time_now_seconds = time_now / hz; + + last_spl_free = spl_free; + + new_spl_free = 0LL; + + /* + * if there is pressure that has not yet reached + * arc_reclaim_thread() then start with a negative + * new_spl_free + */ + if (spl_free_manual_pressure > 0) { + int64_t old_pressure = spl_free_manual_pressure; + new_spl_free -= old_pressure * 2LL; + lowmem = true; + if (spl_free_fast_pressure) { + emergency_lowmem = true; + new_spl_free -= old_pressure * 4LL; + } + } + + /* + * can we allocate at least a 64 MiB segment + * from spl_heap_arena? this probes the reserve + * and also the largest imported spans, which + * vmem_alloc can fragment if needed. + */ + boolean_t reserve_low = false; + extern vmem_t *spl_heap_arena; + const uint64_t sixtyfour = 64ULL*1024ULL*1024ULL; + const uint64_t rvallones = (sixtyfour << 1ULL) - 1ULL; + const uint64_t rvmask = ~rvallones; + uint64_t rvfreebits = spl_heap_arena->vm_freemap; + + if ((rvfreebits & rvmask) == 0) { + reserve_low = true; + } else { + new_spl_free += (int64_t)sixtyfour; + } + + // do we have lots of memory in the spl_heap_arena ? + + boolean_t early_lots_free = false; + const uint64_t onetwentyeight = 128ULL*1024ULL*1024ULL; + const uint64_t sixteen = 16ULL*1024ULL*1024ULL; + if (!reserve_low) { + early_lots_free = true; + } else if (vmem_size_semi_atomic(spl_heap_arena, + VMEM_FREE) > onetwentyeight) { + early_lots_free = true; + new_spl_free += (int64_t)sixteen; + } + + // do we have lots of memory in the bucket_arenas ? 
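+ // (512 MiB of free bucket memory counts as early_lots_free; 1 GiB
+ // additionally clears reserve_low; each threshold credits 16 MiB
+ // to new_spl_free.)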
+ + extern int64_t vmem_buckets_size(int); // non-locking + int64_t buckets_free = vmem_buckets_size(VMEM_FREE); + if ((uint64_t)buckets_free != spl_buckets_mem_free) + spl_buckets_mem_free = (uint64_t)buckets_free; + + if (buckets_free >= 512LL*1024LL*1024LL) { + early_lots_free = true; + new_spl_free += (int64_t)sixteen; + } + if (buckets_free >= 1024LL*1024LL*1024LL) { + reserve_low = false; + new_spl_free += (int64_t)sixteen; + } + + /* + * if we have neither alloced or freed in + * several minutes, then we do not need to + * shrink back if there is a momentary transient + * memory spike (i.e., one that lasts less than a second) + */ + boolean_t memory_equilibrium = false; + const uint64_t five_minutes = 300ULL; + const uint64_t one_minute = 60ULL; + uint64_t last_xat_alloc_seconds = spl_xat_lastalloc; + uint64_t last_xat_free_seconds = spl_xat_lastfree; + + if (last_xat_alloc_seconds + five_minutes > time_now_seconds && + last_xat_free_seconds + five_minutes > time_now_seconds) { + if (last_disequilibrium + one_minute > + time_now_seconds) { + memory_equilibrium = true; + last_disequilibrium = 0; + } + } else { + last_disequilibrium = time_now_seconds; + } + + boolean_t just_alloced = false; + if (last_xat_alloc_seconds + 1 > time_now_seconds) + just_alloced = true; + + /* + * this is a sign of a period of time of low system + * memory, however XNU's generation of this variable + * is not very predictable, but generally it should be + * taken seriously when it's positive (it is often falsely 0) + */ + if ((vm_page_free_wanted > 0 && reserve_low && + !early_lots_free && !memory_equilibrium && + !just_alloced) || vm_page_free_wanted >= 1024) { + int64_t bminus = (int64_t)vm_page_free_wanted * + (int64_t)PAGESIZE * -16LL; + if (bminus > -16LL*1024LL*1024LL) + bminus = -16LL*1024LL*1024LL; + new_spl_free += bminus; + lowmem = true; + emergency_lowmem = true; + // atomic swaps to set these variables used in arc.c + int64_t previous_highest_pressure = 0; + int64_t new_p = -bminus; + previous_highest_pressure = spl_free_manual_pressure; + if (new_p > previous_highest_pressure || new_p <= 0) { + boolean_t fast = FALSE; + if (vm_page_free_wanted > vm_page_free_min / 8) + fast = TRUE; + spl_free_set_pressure_both(-16LL * new_spl_free, + fast); + } + last_disequilibrium = time_now_seconds; + } else if (vm_page_free_wanted > 0) { + int64_t bytes_wanted = (int64_t)vm_page_free_wanted * + (int64_t)PAGESIZE; + new_spl_free -= bytes_wanted; + if (reserve_low && !early_lots_free) { + lowmem = true; + if (recent_lowmem == 0) { + recent_lowmem = time_now; + } + if (!memory_equilibrium) { + last_disequilibrium = time_now_seconds; + } + } + } + + /* + * these variables are reliably maintained by XNU + * if vm_page_free_count > vm_page_free_min, then XNU + * is scanning pages and we may want to try to free some memory + */ + int64_t above_min_free_pages = (int64_t)vm_page_free_count - + (int64_t)vm_page_free_min; + int64_t above_min_free_bytes = (int64_t)PAGESIZE * + above_min_free_pages; + + /* + * vm_page_free_min normally 3500, page free target + * normally 4000 but not exported so we are not scanning + * if we are 500 pages above vm_page_free_min. 
even if
+ we're scanning we may have plenty of space in the
+ reserve arena, in which case we should not react too strongly
+ */
+
+ if (above_min_free_bytes < (int64_t)PAGESIZE * 500LL &&
+ reserve_low && !early_lots_free && !memory_equilibrium) {
+ // trigger a reap below
+ lowmem = true;
+ }
+
+ extern volatile unsigned int vm_page_speculative_count;
+ if ((above_min_free_bytes < 0LL && reserve_low &&
+ !early_lots_free && !memory_equilibrium && !just_alloced) ||
+ above_min_free_bytes <= -4LL*1024LL*1024LL) {
+ int64_t new_p = -1LL * above_min_free_bytes;
+ boolean_t fast = FALSE;
+ emergency_lowmem = true;
+ lowmem = true;
+ recent_lowmem = time_now;
+ last_disequilibrium = time_now_seconds;
+ int64_t spec_bytes = (int64_t)vm_page_speculative_count
+ * (int64_t)PAGESIZE;
+ if (vm_page_free_wanted > 0 || new_p > spec_bytes) {
+ // force a stronger reaction from ARC if we are
+ // also low on speculative pages (xnu prefetched
+ // file blocks with no clients yet)
+ fast = TRUE;
+ }
+ spl_free_set_pressure_both(new_p, fast);
+ } else if (above_min_free_bytes < 0LL && !early_lots_free) {
+ lowmem = true;
+ if (recent_lowmem == 0)
+ recent_lowmem = time_now;
+ if (!memory_equilibrium)
+ last_disequilibrium = time_now_seconds;
+ }
+
+ new_spl_free += above_min_free_bytes;
+
+ /*
+ * If we have already detected a memory shortage
+ * and we have not reaped in a while (a short while
+ * for emergency_lowmem), then do a kmem_reap() now.
+ * See http://comments.gmane.org/gmane.os.illumos.devel/22552
+ * (notably Richard Elling's "A kernel module can call
+ * kmem_reap() whenever it wishes and some modules,
+ * like zfs, do so.") If we reap, stop processing spl_free
+ * on this pass, to let the reaps (and arc, if pressure
+ * has been set above) do their job for a few milliseconds.
+ */
+ if (emergency_lowmem || lowmem) {
+ static uint64_t last_reap = 0;
+ uint64_t now = time_now;
+ uint64_t elapsed = 60*hz;
+ if (emergency_lowmem)
+ elapsed = 15*hz; // min.freq. kmem_reap_interval
+ if (now - last_reap > elapsed) {
+ last_reap = now;
+ /*
+ * spl_free_reap_caches() calls functions
+ * that will acquire locks and can take a while
+ * so set spl_free to a small positive value
+ * to stop arc shrinking too much during this
+ * period when we expect to be freeing up
+ * arc-usable memory, but low enough that
+ * arc_no_grow likely will be set.
+ */
+ const int64_t two_spamax = 32LL * 1024LL *
+ 1024LL;
+ if (spl_free < two_spamax)
+ spl_free = two_spamax; // atomic!
+ spl_free_reap_caches();
+ // we do not have any lock now, so we can jump
+ // to just before the thread-suspending code
+ goto justwait;
+ }
+ }
+
+ /*
+ * a number of exceptions to reverse the lowmem
+ * / emergency_lowmem states if we have recently reaped.
+ * we also take the strong reaction sting out of
+ * the set pressure by turning off spl_free_fast_pressure,
+ * since that automatically provokes an arc shrink
+ * and arc reap.
+ */ + + if (!reserve_low || early_lots_free || memory_equilibrium || + just_alloced) { + lowmem = false; + emergency_lowmem = false; + spl_free_fast_pressure = FALSE; + } + + if (vm_page_speculative_count > 0) { + /* + * speculative memory can be squeezed a bit; it is + * file blocks that have been prefetched by xnu but + * are not (yet) in use by any consumer + */ + if (vm_page_speculative_count / 4 + vm_page_free_count > + vm_page_free_min) { + emergency_lowmem = false; + spl_free_fast_pressure = FALSE; + } + if (vm_page_speculative_count / 2 + vm_page_free_count > + vm_page_free_min) { + lowmem = false; + spl_free_fast_pressure = FALSE; + } + } + + /* + * Stay in a low memory condition for several seconds + * after we first detect that we are in it, giving the + * system (arc, xnu and userland) time to adapt + */ + if (!lowmem && recent_lowmem > 0) { + if (recent_lowmem + 4*hz < time_now) + lowmem = true; + else + recent_lowmem = 0; + } + + /* + * if we are in a lowmem "hangover", cure it with + * pressure, then wait for the pressure to take + * effect in arc.c code. triggered when we have had + * at least one lowmem in the previous few seconds + * -- possibly two (one that causes a reap, one + * that falls through to the 4 second hold above). + */ + if (recent_lowmem == time_now && early_lots_free && + reserve_low) { + /* + * we can't grab 64 MiB as a single segment, + * but otherwise have ample memory brought in from xnu, + * but recently we had lowmem... and still have lowmem. + * cure this condition with a dose of pressure. + */ + if (above_min_free_bytes < 0) { + int64_t old_p = spl_free_manual_pressure; + if (old_p <= -above_min_free_bytes) { + recent_lowmem = 0; + spl_free_manual_pressure = + -above_min_free_bytes; + goto justwait; + } + } + } + + base = new_spl_free; + + // adjust for available memory in spl_heap_arena + // cf arc_available_memory() + if (!emergency_lowmem) { + extern vmem_t *spl_default_arena; + int64_t heap_free = (int64_t)vmem_size_semi_atomic( + spl_heap_arena, VMEM_FREE); + // grabbed buckets_free up above; we are OK with + // change to it in the meanwhile, + // it'll get an update on the next run. + int64_t combined_free = heap_free + buckets_free; + + if (combined_free != 0) { + const int64_t mb = 1024*1024; + if (!lowmem && above_min_free_bytes > + (int64_t)PAGESIZE * 10000LL) { + if (above_min_free_bytes < 64LL * mb) + new_spl_free += combined_free / + 16; + else if (above_min_free_bytes < + 128LL * mb) + new_spl_free += combined_free / + 8; + else if (above_min_free_bytes < + 256LL * mb) + new_spl_free += combined_free / + 4; + else + new_spl_free += combined_free / + 2; + } else { + new_spl_free -= 16LL * mb; + } + } + + // memory footprint has gotten really big, + // decrease spl_free substantially + int64_t total_mem_used = (int64_t) + segkmem_total_mem_allocated; + if ((segkmem_total_mem_allocated * 100LL / + real_total_memory) > 70) { + new_spl_free -= total_mem_used / 64; + } else if ((segkmem_total_mem_allocated * 100LL / + real_total_memory) > 75) { + new_spl_free -= total_mem_used / 32; + lowmem = true; + } + } + + // Adjust in the face of a large ARC. + // We don't treat (zfs) metadata and non-metadata + // differently here, and leave policy with respect + // to the relative value of each up to arc.c. + // O3X arc.c does not (yet) take these arena sizes into + // account like Illumos's does. 
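+ // (The zio_pct logic below caps zfs_file_data at roughly 75% of
+ // real_total_memory in the normal case, 25% under lowmem and 5%
+ // under emergency_lowmem, and keeps a lowered cap in force for
+ // about ten seconds after it last fired.)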
+ uint64_t zio_size = vmem_size_semi_atomic(zio_arena_parent, + VMEM_ALLOC | VMEM_FREE); + // wrap this in a basic block for lexical scope SSA convenience + if (zio_size > 0) { + static uint64_t zio_last_too_big = 0; + static int64_t imposed_cap = 75; + const uint64_t seconds_of_lower_cap = 10*hz; + uint64_t now = time_now; + uint32_t zio_pct = (uint32_t)(zio_size * 100ULL / + real_total_memory); + // if not hungry for memory, shrink towards a + // 75% total memory cap on zfs_file_data + if (!lowmem && !emergency_lowmem && zio_pct > 75 && + (now > zio_last_too_big + seconds_of_lower_cap)) { + new_spl_free -= zio_size / 64; + zio_last_too_big = now; + imposed_cap = 75; + } else if (lowmem || emergency_lowmem) { + // shrink towards stricter caps if we are hungry + // for memory + const uint32_t lowmem_cap = 25; + const uint32_t emergency_lowmem_cap = 5; + // we don't want the lowest cap to be so low + // that we will not make any use of the fixed + // size reserve + if (lowmem && zio_pct > lowmem_cap) { + new_spl_free -= zio_size / 32; + zio_last_too_big = now; + imposed_cap = lowmem_cap; + } + if (emergency_lowmem && zio_pct > + emergency_lowmem_cap) { + new_spl_free -= zio_size / 8; + zio_last_too_big = now; + imposed_cap = emergency_lowmem_cap; + } + } + if (zio_last_too_big != now && + now < zio_last_too_big + seconds_of_lower_cap && + zio_pct > imposed_cap) { + new_spl_free -= zio_size / 64; + } + } + + // try to get 1/64 of spl_heap_arena freed up + if (emergency_lowmem && new_spl_free >= 0LL) { + extern vmem_t *spl_root_arena; + uint64_t root_size = vmem_size_semi_atomic( + spl_heap_arena, VMEM_ALLOC | VMEM_FREE); + uint64_t root_free = vmem_size_semi_atomic( + spl_heap_arena, VMEM_FREE); + int64_t difference = root_size - root_free; + int64_t target = root_size / 64; + if (difference < target) { + new_spl_free -= target; + } + // and we should definitely not be returning + // positive now + if (new_spl_free >= 0LL) + new_spl_free = -1024LL; + } + + double delta = (double)new_spl_free - (double)last_spl_free; + + boolean_t spl_free_is_negative = false; + + if (new_spl_free < 0LL) { + spl_stats.spl_spl_free_negative_count.value.ui64++; + spl_free_is_negative = true; + } + + // NOW set spl_free from calculated new_spl_free + spl_free = new_spl_free; + // the direct equivalent of : + // __c11_atomic_store(&spl_free, new_spl_free, + // __ATOMIC_SEQ_CST); + + /* + * Because we're already negative, arc is likely to have + * been signalled already. We can rely on the _maybe_ in + * spl-vmem.c:xnu_alloc_throttled() [XAT] to try to give + * arc a kick with greater probability. However, if we've + * gone negative several times, and have not tried a full + * kick in a long time, do so now; if the full kick is + * refused because there has been a kick too few minutes + * ago, try a gentler kick. We do this outside the lock, + * as spl_maybe_send_large_pressure may need to take a + * mutex, and we forbid further mutex entry when + * spl_free_lock is held. 
+ */ + + if (spl_free_is_negative) { + static volatile _Atomic uint32_t + negatives_since_last_kick = 0; + + if (negatives_since_last_kick++ > 8) { + if (spl_maybe_send_large_pressure(time_now, 360, + true) || + spl_maybe_send_large_pressure(time_now, 60, + false)) { + negatives_since_last_kick = 0; + } + } + } + + if (lowmem) + recent_lowmem = time_now; + + // maintain an exponential moving average for the ema kstat + if (last_update > hz) + alpha = 1.0; + else { + double td_tick = (double)(time_now - last_update); + alpha = td_tick / (double)(hz*50.0); // roughly 0.02 + } + + ema_new = (alpha * delta) + (1.0 - alpha)*ema_old; + spl_free_delta_ema = ema_new; + ema_old = ema_new; + + justwait: + mutex_enter(&spl_free_thread_lock); + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&spl_free_thread_cv, + &spl_free_thread_lock, MSEC2NSEC(10), 0, 0); + CALLB_CPR_SAFE_END(&cpr, &spl_free_thread_lock); + } + spl_free_thread_exit = FALSE; + dprintf("SPL: spl_free_thread_exit set to FALSE " \ + "and exiting: cv_broadcasting\n"); + spl_free_manual_pressure = 0; + cv_broadcast(&spl_free_thread_cv); + CALLB_CPR_EXIT(&cpr); + dprintf("SPL: %s thread_exit\n", __func__); + thread_exit(); +} + + +static int +spl_kstat_update(kstat_t *ksp, int rw) +{ + spl_stats_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + if (ks->spl_spl_free_manual_pressure.value.i64 != + spl_free_manual_pressure) { + spl_free_set_pressure( + ks->spl_spl_free_manual_pressure.value.i64 * 1024 * + 1024); + if (ks->spl_spl_free_manual_pressure.value.i64 > 0) { + spl_free_reap_caches(); + } + } + + if (ks->spl_spl_free_fast_pressure.value.i64 != + spl_free_fast_pressure) { + if (spl_free_wrapper() != 0) { + spl_free_set_fast_pressure(TRUE); + } + } + + if (ks->spl_bucket_tunable_large_span.value.ui64 != + spl_bucket_tunable_large_span) { + spl_set_bucket_tunable_large_span( + ks->spl_bucket_tunable_large_span.value.ui64); + } + + if (ks->spl_bucket_tunable_small_span.value.ui64 != + spl_bucket_tunable_small_span) { + spl_set_bucket_tunable_small_span( + ks->spl_bucket_tunable_small_span.value.ui64); + } + + if (ks->spl_frag_max_walk.value.ui64 != spl_frag_max_walk) { + spl_frag_max_walk = ks->spl_frag_max_walk.value.ui64; + } + + if (ks->kmem_free_to_slab_when_fragmented.value.ui64 != + kmem_free_to_slab_when_fragmented) { + kmem_free_to_slab_when_fragmented = + ks->kmem_free_to_slab_when_fragmented.value.ui64; + } + + } else { + ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; + ks->spl_active_threads.value.ui64 = zfs_threads; + ks->spl_active_mutex.value.ui64 = zfs_active_mutex; + ks->spl_active_rwlock.value.ui64 = zfs_active_rwlock; + ks->spl_active_tsd.value.ui64 = spl_tsd_size(); + ks->spl_spl_free.value.i64 = spl_free; + ks->spl_spl_free_manual_pressure.value.i64 = + spl_free_manual_pressure; + ks->spl_spl_free_fast_pressure.value.i64 = + spl_free_fast_pressure; + ks->spl_spl_free_delta_ema.value.i64 = spl_free_delta_ema; + ks->spl_osif_malloc_success.value.ui64 = + stat_osif_malloc_success; + ks->spl_osif_malloc_bytes.value.ui64 = stat_osif_malloc_bytes; + ks->spl_osif_free.value.ui64 = stat_osif_free; + ks->spl_osif_free_bytes.value.ui64 = stat_osif_free_bytes; + ks->spl_bucket_non_pow2_allocs.value.ui64 = + spl_bucket_non_pow2_allocs; + + ks->spl_vmem_unconditional_allocs.value.ui64 = + spl_vmem_unconditional_allocs; + ks->spl_vmem_unconditional_alloc_bytes.value.ui64 = + spl_vmem_unconditional_alloc_bytes; + ks->spl_vmem_conditional_allocs.value.ui64 = + spl_vmem_conditional_allocs; + 
ks->spl_vmem_conditional_alloc_bytes.value.ui64 = + spl_vmem_conditional_alloc_bytes; + ks->spl_vmem_conditional_alloc_deny.value.ui64 = + spl_vmem_conditional_alloc_deny; + ks->spl_vmem_conditional_alloc_deny_bytes.value.ui64 = + spl_vmem_conditional_alloc_deny_bytes; + + ks->spl_xat_success.value.ui64 = spl_xat_success; + ks->spl_xat_late_success.value.ui64 = spl_xat_late_success; + ks->spl_xat_late_success_nosleep.value.ui64 = + spl_xat_late_success_nosleep; + ks->spl_xat_pressured.value.ui64 = spl_xat_pressured; + ks->spl_xat_bailed.value.ui64 = spl_xat_bailed; + ks->spl_xat_bailed_contended.value.ui64 = + spl_xat_bailed_contended; + ks->spl_xat_lastalloc.value.ui64 = spl_xat_lastalloc; + ks->spl_xat_lastfree.value.ui64 = spl_xat_lastfree; + ks->spl_xat_forced.value.ui64 = spl_xat_forced; + ks->spl_xat_sleep.value.ui64 = spl_xat_sleep; + ks->spl_xat_late_deny.value.ui64 = spl_xat_late_deny; + ks->spl_xat_no_waiters.value.ui64 = spl_xat_no_waiters; + ks->spl_xft_wait.value.ui64 = spl_xft_wait; + + ks->spl_vba_parent_memory_appeared.value.ui64 = + spl_vba_parent_memory_appeared; + ks->spl_vba_parent_memory_blocked.value.ui64 = + spl_vba_parent_memory_blocked; + ks->spl_vba_hiprio_blocked.value.ui64 = spl_vba_hiprio_blocked; + ks->spl_vba_cv_timeout.value.ui64 = spl_vba_cv_timeout; + ks->spl_vba_loop_timeout.value.ui64 = spl_vba_loop_timeout; + ks->spl_vba_cv_timeout_blocked.value.ui64 = + spl_vba_cv_timeout_blocked; + ks->spl_vba_loop_timeout_blocked.value.ui64 = + spl_vba_loop_timeout_blocked; + ks->spl_vba_sleep.value.ui64 = spl_vba_sleep; + ks->spl_vba_loop_entries.value.ui64 = spl_vba_loop_entries; + + ks->spl_bucket_tunable_large_span.value.ui64 = + spl_bucket_tunable_large_span; + ks->spl_bucket_tunable_small_span.value.ui64 = + spl_bucket_tunable_small_span; + + ks->spl_buckets_mem_free.value.ui64 = spl_buckets_mem_free; + ks->spl_arc_no_grow_bits.value.ui64 = spl_arc_no_grow_bits; + ks->spl_arc_no_grow_count.value.ui64 = spl_arc_no_grow_count; + + ks->spl_frag_max_walk.value.ui64 = spl_frag_max_walk; + ks->spl_frag_walked_out.value.ui64 = spl_frag_walked_out; + ks->spl_frag_walk_cnt.value.ui64 = spl_frag_walk_cnt; + + ks->spl_arc_reclaim_avoided.value.ui64 = + spl_arc_reclaim_avoided; + + ks->kmem_free_to_slab_when_fragmented.value.ui64 = + kmem_free_to_slab_when_fragmented; + } + + return (0); +} + +void +spl_kmem_init(uint64_t xtotal_memory) +{ + int old_kmem_flags = kmem_flags; + int use_large_pages = 0; + size_t maxverify, minfirewall; + + dprintf("SPL: KMEM starting. Total memory %llu\n", xtotal_memory); + + // Initialise the kstat lock + mutex_init(&kmem_cache_lock, "kmem_cache_lock", MUTEX_DEFAULT, NULL); + mutex_init(&kmem_flags_lock, "kmem_flags_lock", MUTEX_DEFAULT, NULL); + mutex_init(&kmem_cache_kstat_lock, "kmem_kstat_lock", MUTEX_DEFAULT, + NULL); + + spl_kstat_init(); + + + /* + * Small-memory systems (< 24 MB) can't handle kmem_flags overhead. + */ + if (physmem < btop(24 << 20) && !(old_kmem_flags & KMF_STICKY)) + kmem_flags = 0; + + /* + * Don't do firewalled allocations if the heap is less than 1TB + * (i.e. on a 32-bit kernel) + * The resulting VM_NEXTFIT allocations would create too much + * fragmentation in a small heap. 
+ */ + maxverify = minfirewall = PAGESIZE / 2; + + + /* LINTED */ + ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE); + + list_create(&kmem_caches, sizeof (kmem_cache_t), + offsetof(kmem_cache_t, cache_link)); + + kernelheap_init(); + + kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE, + VM_SLEEP | VMC_NO_QCACHE); + + kmem_msb_arena = vmem_create("kmem_msb", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, kmem_metadata_arena, 0, + VMC_DUMPSAFE | VM_SLEEP); + + kmem_cache_arena = vmem_create("kmem_cache", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + kmem_hash_arena = vmem_create("kmem_hash", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + kmem_log_arena = vmem_create("kmem_log", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + /* temporary oversize arena for mod_read_system_file */ + kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 0, VM_SLEEP); + + // statically declared above kmem_reap_interval = 15 * hz; + + /* + * Read /etc/system. This is a chicken-and-egg problem because + * kmem_flags may be set in /etc/system, but mod_read_system_file() + * needs to use the allocator. The simplest solution is to create + * all the standard kmem caches, read /etc/system, destroy all the + * caches we just created, and then create them all again in light + * of the (possibly) new kmem_flags and other kmem tunables. + */ + + if (old_kmem_flags & KMF_STICKY) + kmem_flags = old_kmem_flags; + + if (!(kmem_flags & KMF_AUDIT)) + vmem_seg_size = offsetof(vmem_seg_t, vs_thread); + + if (kmem_maxverify == 0) + kmem_maxverify = maxverify; + + if (kmem_minfirewall == 0) + kmem_minfirewall = minfirewall; + + /* + * give segkmem a chance to figure out if we are using large pages + * for the kernel heap + */ + // use_large_pages = segkmem_lpsetup(); + use_large_pages = 0; + + /* + * To protect against corruption, we keep the actual number of callers + * KMF_LITE records seperate from the tunable. We arbitrarily clamp + * to 16, since the overhead for small buffers quickly gets out of + * hand. + * + * The real limit would depend on the needs of the largest KMC_NOHASH + * cache. + */ + kmem_lite_count = MIN(MAX(0, kmem_lite_pcs), 16); + kmem_lite_pcs = kmem_lite_count; + + kmem_cache_init(2, use_large_pages); + + if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) { + if (kmem_transaction_log_size == 0) + kmem_transaction_log_size = MIN(kmem_maxavail() / 50ULL, + PAGESIZE<<4); + kmem_transaction_log = kmem_log_init(kmem_transaction_log_size); + } + + if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) { + if (kmem_content_log_size == 0) + kmem_content_log_size = MIN(kmem_maxavail() / 50ULL, + PAGESIZE<<4); + kmem_content_log = kmem_log_init(kmem_content_log_size); + } + + kmem_failure_log = kmem_log_init(kmem_failure_log_size); + + kmem_slab_log = kmem_log_init(kmem_slab_log_size); + + spl_tsd_init(); + spl_rwlock_init(); + spl_taskq_init(); + + /* + * Warn about invalid or dangerous values of kmem_flags. + * Always warn about unsupported values. + */ + if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | + KMF_CONTENTS | KMF_LITE)) != 0) || + ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE)) + cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. 
" + "See the Solaris Tunable Parameters Reference Manual.", + kmem_flags); + +#ifdef DEBUG + if ((kmem_flags & KMF_DEBUG) == 0) + cmn_err(CE_NOTE, "kmem debugging disabled."); +#else + /* + * For non-debug kernels, the only "normal" flags are 0, KMF_LITE, + * KMF_REDZONE, and KMF_CONTENTS (the last because it is only enabled + * if KMF_AUDIT is set). We should warn the user about the performance + * penalty of KMF_AUDIT or KMF_DEADBEEF if they are set and KMF_LITE + * isn't set (since that disables AUDIT). + */ + if (!(kmem_flags & KMF_LITE) && + (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0) + cmn_err(CE_WARN, "High-overhead kmem debugging features " + "enabled (kmem_flags = 0x%x). Performance degradation " + "and large memory overhead possible. See the Solaris " + "Tunable Parameters Reference Manual.", kmem_flags); +#endif /* not DEBUG */ + + segkmem_zio_init(); + + kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP); + + kmem_ready = 1; + + // Install spl kstats + spl_ksp = kstat_create("spl", 0, "spl_misc", "misc", KSTAT_TYPE_NAMED, + sizeof (spl_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + + if (spl_ksp != NULL) { + spl_ksp->ks_data = &spl_stats; + spl_ksp->ks_update = spl_kstat_update; + kstat_install(spl_ksp); + } +} + +void +spl_kmem_fini(void) +{ + + kmem_cache_applyall(kmem_cache_magazine_disable, NULL, TQ_SLEEP); + + kstat_delete(spl_ksp); + + kmem_log_fini(kmem_slab_log); + kmem_log_fini(kmem_failure_log); + + if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) { + if (kmem_content_log_size == 0) + kmem_content_log_size = kmem_maxavail() / 50; + kmem_log_fini(kmem_content_log); + } + + if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) { + if (kmem_transaction_log_size == 0) + kmem_transaction_log_size = kmem_maxavail() / 50; + kmem_log_fini(kmem_transaction_log); + } + + // Destroy all the "general allocation" caches + kmem_alloc_caches_destroy(); + + // Destroy the VA associated caches + kmem_destroy_cache_by_name(KMEM_VA_PREFIX); + + kmem_qcache_destroy(); + // Destroy metadata caches + kmem_cache_destroy(kmem_bufctl_cache); + kmem_cache_destroy(kmem_bufctl_audit_cache); + kmem_cache_destroy(kmem_slab_cache); // Dont think this one + + // Some caches cannot be destroyed as + // they mutually reference each other. + // So we explicitly pull them apart piece-by-piece. + kmem_cache_fini(); + + segkmem_zio_fini(); + + // Now destroy the vmem arenas used by kmem. + vmem_destroy(kmem_default_arena); + vmem_destroy(kmem_va_arena); + vmem_destroy(kmem_oversize_arena); + vmem_destroy(kmem_log_arena); + vmem_destroy(kmem_hash_arena); + vmem_destroy(kmem_cache_arena); + vmem_destroy(kmem_msb_arena); + vmem_destroy(kmem_metadata_arena); + + kernelheap_fini(); + + list_destroy(&kmem_caches); + + mutex_destroy(&kmem_cache_kstat_lock); + mutex_destroy(&kmem_flags_lock); + mutex_destroy(&kmem_cache_lock); +} + +static void +kmem_move_init(void) +{ + kmem_defrag_cache = kmem_cache_create("kmem_defrag_cache", + sizeof (kmem_defrag_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + kmem_move_cache = kmem_cache_create("kmem_move_cache", + sizeof (kmem_move_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + /* + * kmem guarantees that move callbacks are sequential and that even + * across multiple caches no two moves ever execute simultaneously. + * Move callbacks are processed on a separate taskq so that client code + * does not interfere with internal maintenance tasks. 
+ */ + kmem_move_taskq = taskq_create("kmem_move_taskq", 1, + minclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE); +} + +void +kmem_move_fini(void) +{ + + taskq_wait(kmem_move_taskq); + taskq_destroy(kmem_move_taskq); + kmem_move_taskq = 0; + + kmem_cache_destroy(kmem_move_cache); + kmem_cache_destroy(kmem_defrag_cache); + +} + +void +spl_kmem_thread_init(void) +{ + kmem_move_init(); + + // Initialize the spl_free locks + mutex_init(&spl_free_thread_lock, "spl_free_thead_lock", MUTEX_DEFAULT, + NULL); + + kmem_taskq = taskq_create("kmem_taskq", 1, minclsyspri, + 300, INT_MAX, TASKQ_PREPOPULATE); + + spl_free_thread_exit = FALSE; + (void) cv_init(&spl_free_thread_cv, NULL, CV_DEFAULT, NULL); + (void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92); +} + +void +spl_kmem_thread_fini(void) +{ + shutting_down = 1; + + mutex_enter(&spl_free_thread_lock); + spl_free_thread_exit = TRUE; + while (spl_free_thread_exit) { + cv_signal(&spl_free_thread_cv); + cv_wait(&spl_free_thread_cv, &spl_free_thread_lock); + } + mutex_exit(&spl_free_thread_lock); + cv_destroy(&spl_free_thread_cv); + mutex_destroy(&spl_free_thread_lock); + + bsd_untimeout(kmem_update, 0); + bsd_untimeout(kmem_reap_timeout, &kmem_reaping); + bsd_untimeout(kmem_reap_timeout, &kmem_reaping_idspace); + + taskq_wait(kmem_taskq); + + taskq_destroy(kmem_taskq); + kmem_taskq = 0; + + kmem_move_fini(); + +} + +void +spl_kmem_mp_init(void) +{ + kmem_update_timeout(NULL); +} + +/* + * Return the slab of the allocated buffer, or NULL if the buffer is not + * allocated. This function may be called with a known slab address to determine + * whether or not the buffer is allocated, or with a NULL slab address to obtain + * an allocated buffer's slab. + */ +static kmem_slab_t * +kmem_slab_allocated(kmem_cache_t *cp, kmem_slab_t *sp, void *buf) +{ + kmem_bufctl_t *bcp, *bufbcp; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(sp == NULL || KMEM_SLAB_MEMBER(sp, buf)); + + if (cp->cache_flags & KMF_HASH) { + for (bcp = *KMEM_HASH(cp, buf); + (bcp != NULL) && (bcp->bc_addr != buf); + bcp = bcp->bc_next) { + continue; + } + ASSERT(sp != NULL && bcp != NULL ? sp == bcp->bc_slab : 1); + return (bcp == NULL ? NULL : bcp->bc_slab); + } + + if (sp == NULL) { + sp = KMEM_SLAB(cp, buf); + } + bufbcp = KMEM_BUFCTL(cp, buf); + for (bcp = sp->slab_head; + (bcp != NULL) && (bcp != bufbcp); + bcp = bcp->bc_next) { + continue; + } + return (bcp == NULL ? sp : NULL); +} + +static boolean_t +kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) +{ + long refcnt = sp->slab_refcnt; + + ASSERT(cp->cache_defrag != NULL); + + /* + * For code coverage we want to be able to move an object within the + * same slab (the only partial slab) even if allocating the destination + * buffer resulted in a completely allocated slab. + */ + if (flags & KMM_DEBUG) { + return ((flags & KMM_DESPERATE) || + ((sp->slab_flags & KMEM_SLAB_NOMOVE) == 0)); + } + + /* If we're desperate, we don't care if the client said NO. */ + if (flags & KMM_DESPERATE) { + return (refcnt < sp->slab_chunks); /* any partial */ + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + return (B_FALSE); + } + + if ((refcnt == 1) || kmem_move_any_partial) { + return (refcnt < sp->slab_chunks); + } + + /* + * The reclaim threshold is adjusted at each kmem_cache_scan() so that + * slabs with a progressively higher percentage of used buffers can be + * reclaimed until the cache as a whole is no longer fragmented. 
+ * + * sp->slab_refcnt kmd_reclaim_numer + * --------------- < ------------------ + * sp->slab_chunks KMEM_VOID_FRACTION + */ + return ((refcnt * KMEM_VOID_FRACTION) < + (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); +} + +/* + * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), + * or when the buffer is freed. + */ +static void +kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) { + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + avl_add(&cp->cache_partial_slabs, sp); + } + } else { + sp->slab_later_count = 0; + sp->slab_stuck_offset = (uint32_t)-1; + } +} + +static void +kmem_slab_move_no(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; + sp->slab_flags |= KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, from_buf); + avl_add(&cp->cache_partial_slabs, sp); +} + +static void kmem_move_end(kmem_cache_t *, kmem_move_t *); + +/* + * The move callback takes two buffer addresses, the buffer to be moved, and a + * newly allocated and constructed buffer selected by kmem as the destination. + * It also takes the size of the buffer and an optional user argument specified + * at cache creation time. kmem guarantees that the buffer to be moved has not + * been unmapped by the virtual memory subsystem. Beyond that, it cannot + * guarantee the present whereabouts of the buffer to be moved, so it is up to + * the client to safely determine whether or not it is still using the buffer. + * The client must not free either of the buffers passed to the move callback, + * since kmem wants to free them directly to the slab layer. The client response + * tells kmem which of the two buffers to free: + * + * YES kmem frees the old buffer (the move was successful) + * NO kmem frees the new buffer, marks the slab of the old buffer + * non-reclaimable to avoid bothering the client again + * LATER kmem frees the new buffer, increments slab_later_count + * DONT_KNOW kmem frees the new buffer + * DONT_NEED kmem frees both the old buffer and the new buffer + * + * The pending callback argument now being processed contains both of the + * buffers (old and new) passed to the move callback function, the slab of the + * old buffer, and flags related to the move request, such as whether or not the + * system was desperate for memory. + * + * Slabs are not freed while there is a pending callback, but instead are kept + * on a deadlist, which is drained after the last callback completes. This means + * that slabs are safe to access until kmem_move_end(), no matter how many of + * their buffers have been freed. Once slab_refcnt reaches zero, it stays at + * zero for as long as the slab remains on the deadlist and until the slab is + * freed. 
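+ *
+ * As an illustration only (no such callback exists in this file), a client
+ * move callback might look roughly like:
+ *
+ *	static kmem_cbrc_t
+ *	example_move(void *old, void *new, size_t size, void *arg)
+ *	{
+ *		if (!example_still_in_use(old))
+ *			return (KMEM_CBRC_DONT_NEED);
+ *		if (!example_safe_to_copy_now(old))
+ *			return (KMEM_CBRC_LATER);
+ *		bcopy(old, new, size);
+ *		example_repoint_references(old, new);
+ *		return (KMEM_CBRC_YES);
+ *	}
+ *
+ * The example_* helpers are hypothetical; the point is that the callback
+ * decides whether the old buffer is still live, optionally copies it into
+ * the kmem-supplied destination, and reports the outcome via the return
+ * value so kmem knows which buffer to free.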
+ */ +static void +kmem_move_buffer(kmem_move_t *callback) +{ + kmem_cbrc_t response; + kmem_slab_t *sp = callback->kmm_from_slab; + kmem_cache_t *cp = sp->slab_cache; + boolean_t free_on_slab; + + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf)); + + /* + * The number of allocated buffers on the slab may have changed since we + * last checked the slab's reclaimability (when the pending move was + * enqueued), or the client may have responded NO when asked to move + * another buffer on the same slab. + */ + if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + /* + * Checking the slab layer is easy, so we might as well do that here + * in case we can avoid bothering the client. + */ + mutex_enter(&cp->cache_lock); + free_on_slab = (kmem_slab_allocated(cp, sp, + callback->kmm_from_buf) == NULL); + mutex_exit(&cp->cache_lock); + + if (free_on_slab) { + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. + */ + if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, + KM_NOSLEEP, 1, caller()) != 0) { + kmem_move_end(cp, callback); + return; + } + } else if (cp->cache_constructor != NULL && + cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, + KM_NOSLEEP) != 0) { + atomic_inc_64(&cp->cache_alloc_fail); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + cp->cache_defrag->kmd_callbacks++; + cp->cache_defrag->kmd_thread = spl_current_thread(); + cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; + cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf; + DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *, + callback); + + response = cp->cache_move(callback->kmm_from_buf, + callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private); + + DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *, + callback, kmem_cbrc_t, response); + cp->cache_defrag->kmd_thread = NULL; + cp->cache_defrag->kmd_from_buf = NULL; + cp->cache_defrag->kmd_to_buf = NULL; + + if (response == KMEM_CBRC_YES) { + cp->cache_defrag->kmd_yes++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); + /* slab safe to access until kmem_move_end() */ + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + kmem_move_end(cp, callback); + return; + } + + switch (response) { + case KMEM_CBRC_NO: + cp->cache_defrag->kmd_no++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_no(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_LATER: + cp->cache_defrag->kmd_later++; + mutex_enter(&cp->cache_lock); + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + mutex_exit(&cp->cache_lock); + break; + } + + if (++sp->slab_later_count >= KMEM_DISBELIEF) { + kmem_slab_move_no(cp, sp, + callback->kmm_from_buf); + } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, + callback->kmm_from_buf); + } + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_NEED: + cp->cache_defrag->kmd_dont_need++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, + B_FALSE); + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; 
+ mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_KNOW: + /* + * If we don't know if we can move this buffer or not, + * we'll just assume that we can't: if the buffer is + * in fact free, then it is sitting in one of the + * per-CPU magazines or in a full magazine in the depot + * layer. Either way, because defrag is induced in the + * same logic that reaps a cache, it's likely that full + * magazines will be returned to the system soon + * (thereby accomplishing what we're trying to + * accomplish here: return those magazines to their + * slabs). Given this, any work that we might do now to + * locate a buffer in a magazine is wasted (and + * expensive!) work; we bump a counter in this case and + * otherwise assume that we can't move it. + */ + cp->cache_defrag->kmd_dont_know++; + break; + default: + panic("'%s' (%p) unexpected move callback " + "response %d\n", cp->cache_name, (void *)cp, + response); + } + + kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE); + kmem_move_end(cp, callback); +} + +/* Return B_FALSE if there is insufficient memory for the move request. */ +static boolean_t +kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) +{ + void *to_buf; + avl_index_t index; + kmem_move_t *callback, *pending; + ulong_t n; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + + callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); + + if (callback == NULL) + return (B_FALSE); + + callback->kmm_from_slab = sp; + callback->kmm_from_buf = buf; + callback->kmm_flags = flags; + + mutex_enter(&cp->cache_lock); + + n = avl_numnodes(&cp->cache_partial_slabs); + if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) { + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); /* there is no need for the move request */ + } + + pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index); + if (pending != NULL) { + /* + * If the move is already pending and we're desperate now, + * update the move flags. + */ + if (flags & KMM_DESPERATE) { + pending->kmm_flags |= KMM_DESPERATE; + } + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); + } + + to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs), + B_FALSE); + callback->kmm_to_buf = to_buf; + avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index); + + mutex_exit(&cp->cache_lock); + + if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, + callback, TQ_NOSLEEP)) { + mutex_enter(&cp->cache_lock); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + mutex_exit(&cp->cache_lock); + kmem_slab_free(cp, to_buf); + kmem_cache_free(kmem_move_cache, callback); + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) +{ + avl_index_t index; + + ASSERT(cp->cache_defrag != NULL); + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + mutex_enter(&cp->cache_lock); + VERIFY(avl_find(&cp->cache_defrag->kmd_moves_pending, + callback->kmm_from_buf, &index) != NULL); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + if (avl_is_empty(&cp->cache_defrag->kmd_moves_pending)) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + kmem_slab_t *sp; + + /* + * The last pending move completed. 
Release all slabs + * from the front of the dead list except for any slab + * at the tail that needs to be released from the context + * of kmem_move_buffers(). kmem deferred unmapping the + * buffers on these slabs in order to guarantee that + * buffers passed to the move callback have been touched + * only by kmem or by the client itself. + */ + while ((sp = list_remove_head(deadlist)) != NULL) { + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + break; + } + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + mutex_enter(&cp->cache_lock); + } + } + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); +} + +/* + * Move buffers from least used slabs first by scanning backwards from the end + * of the partial slab list. Scan at most max_scan candidate slabs and move + * buffers from at most max_slabs slabs (0 for all partial slabs in both cases). + * If desperate to reclaim memory, move buffers from any partial slab, otherwise + * skip slabs with a ratio of allocated buffers at or above the current + * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the + * scan is aborted) so that the caller can adjust the reclaimability threshold + * depending on how many reclaimable slabs it finds. + * + * kmem_move_buffers() drops and reacquires cache_lock every time it issues a + * move request, since it is not valid for kmem_move_begin() to call + * kmem_cache_alloc() or taskq_dispatch() with cache_lock held. + */ +static int +kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, + int flags) +{ + kmem_slab_t *sp; + void *buf; + int i, j; /* slab index, buffer index */ + int s; /* reclaimable slabs */ + int b; /* allocated (movable) buffers on reclaimable slab */ + boolean_t success; + int refcnt; + int nomove; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(kmem_move_cache != NULL); + ASSERT(cp->cache_move != NULL && cp->cache_defrag != NULL); + ASSERT((flags & KMM_DEBUG) ? !avl_is_empty(&cp->cache_partial_slabs) : + avl_numnodes(&cp->cache_partial_slabs) > 1); + + if (kmem_move_blocked) { + return (0); + } + + if (kmem_move_fulltilt) { + flags |= KMM_DESPERATE; + } + + if (max_scan == 0 || (flags & KMM_DESPERATE)) { + /* + * Scan as many slabs as needed to find the desired number of + * candidate slabs. + */ + max_scan = (size_t)-1; + } + + if (max_slabs == 0 || (flags & KMM_DESPERATE)) { + /* Find as many candidate slabs as possible. */ + max_slabs = (size_t)-1; + } + + sp = avl_last(&cp->cache_partial_slabs); + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + for (i = 0, s = 0; (i < max_scan) && (s < max_slabs) && (sp != NULL) && + ((sp != avl_first(&cp->cache_partial_slabs)) || + (flags & KMM_DEBUG)); + sp = AVL_PREV(&cp->cache_partial_slabs, sp), i++) { + + if (!kmem_slab_is_reclaimable(cp, sp, flags)) { + continue; + } + s++; + + /* Look for allocated buffers to move. */ + for (j = 0, b = 0, buf = sp->slab_base; + (j < sp->slab_chunks) && (b < sp->slab_refcnt); + buf = (((char *)buf) + cp->cache_chunksize), j++) { + + if (kmem_slab_allocated(cp, sp, buf) == NULL) { + continue; + } + + b++; + + /* + * Prevent the slab from being destroyed while we drop + * cache_lock and while the pending move is not yet + * registered. 
Flag the pending move while + * kmd_moves_pending may still be empty, since we can't + * yet rely on a non-zero pending move count to prevent + * the slab from being destroyed. + */ + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + /* + * Recheck refcnt and nomove after reacquiring the lock, + * since these control the order of partial slabs, and + * we want to know if we can pick up the scan where we + * left off. + */ + refcnt = sp->slab_refcnt; + nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE); + mutex_exit(&cp->cache_lock); + + success = kmem_move_begin(cp, sp, buf, flags); + + /* + * Now, before the lock is reacquired, kmem could + * process all pending move requests and purge the + * deadlist, so that upon reacquiring the lock, sp has + * been remapped. Or, the client may free all the + * objects on the slab while the pending moves are still + * on the taskq. Therefore, the KMEM_SLAB_MOVE_PENDING + * flag causes the slab to be put at the end of the + * deadlist and prevents it from being destroyed, since + * we plan to destroy it here after reacquiring the + * lock. + */ + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + + if (sp->slab_refcnt == 0) { + list_t *deadlist = + &cp->cache_defrag->kmd_deadlist; + list_remove(deadlist, sp); + + if (!avl_is_empty( + &cp->cache_defrag->kmd_moves_pending)) { + /* + * A pending move makes it unsafe to + * destroy the slab, because even though + * the move is no longer needed, the + * context where that is determined + * requires the slab to exist. + * Fortunately, a pending move also + * means we don't need to destroy the + * slab here, since it will get + * destroyed along with any other slabs + * on the deadlist after the last + * pending move completes. + */ + list_insert_head(deadlist, sp); + return (-1); + } + + /* + * Destroy the slab now if it was completely + * freed while we dropped cache_lock and there + * are no pending moves. Since slab_refcnt + * cannot change once it reaches zero, no new + * pending moves from that slab are possible. + */ + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + mutex_enter(&cp->cache_lock); + /* + * Since we can't pick up the scan where we left + * off, abort the scan and say nothing about the + * number of reclaimable slabs. + */ + return (-1); + } + + if (!success) { + /* + * Abort the scan if there is not enough memory + * for the request and say nothing about the + * number of reclaimable slabs. + */ + return (-1); + } + + /* + * The slab's position changed while the lock was + * dropped, so we don't know where we are in the + * sequence any more. + */ + if (sp->slab_refcnt != refcnt) { + /* + * If this is a KMM_DEBUG move, the slab_refcnt + * may have changed because we allocated a + * destination buffer on the same slab. In that + * case, we're not interested in counting it. + */ + return (-1); + } + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) + return (-1); + + /* + * Generating a move request allocates a destination + * buffer from the slab layer, bumping the first partial + * slab if it is completely allocated. If the current + * slab becomes the first partial slab as a result, we + * can't continue to scan backwards. 
+ * + * If this is a KMM_DEBUG move and we allocated the + * destination buffer from the last partial slab, then + * the buffer we're moving is on the same slab and our + * slab_refcnt has changed, causing us to return before + * reaching here if there are no partial slabs left. + */ + ASSERT(!avl_is_empty(&cp->cache_partial_slabs)); + if (sp == avl_first(&cp->cache_partial_slabs)) { + /* + * We're not interested in a second KMM_DEBUG + * move. + */ + goto end_scan; + } + } + } +end_scan: + + return (s); +} + +typedef struct kmem_move_notify_args { + kmem_cache_t *kmna_cache; + void *kmna_buf; +} kmem_move_notify_args_t; + +static void +kmem_cache_move_notify_task(void *arg) +{ + kmem_move_notify_args_t *args = arg; + kmem_cache_t *cp = args->kmna_cache; + void *buf = args->kmna_buf; + kmem_slab_t *sp; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(list_link_active(&cp->cache_link)); + + zfs_kmem_free(args, sizeof (kmem_move_notify_args_t)); + mutex_enter(&cp->cache_lock); + sp = kmem_slab_allocated(cp, NULL, buf); + + /* Ignore the notification if the buffer is no longer allocated. */ + if (sp == NULL) { + mutex_exit(&cp->cache_lock); + return; + } + + /* Ignore the notification if there's no reason to move the buffer. */ + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + /* + * So far the notification is not ignored. Ignore the + * notification if the slab is not marked by an earlier refusal + * to move a buffer. + */ + if (!(sp->slab_flags & KMEM_SLAB_NOMOVE) && + (sp->slab_later_count == 0)) { + mutex_exit(&cp->cache_lock); + return; + } + + kmem_slab_move_yes(cp, sp, buf); + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + mutex_exit(&cp->cache_lock); + /* see kmem_move_buffers() about dropping the lock */ + (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY); + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + if (sp->slab_refcnt == 0) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + list_remove(deadlist, sp); + + if (!avl_is_empty( + &cp->cache_defrag->kmd_moves_pending)) { + list_insert_head(deadlist, sp); + mutex_exit(&cp->cache_lock); + return; + } + + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + return; + } + } else { + kmem_slab_move_yes(cp, sp, buf); + } + mutex_exit(&cp->cache_lock); +} + +void +kmem_cache_move_notify(kmem_cache_t *cp, void *buf) +{ + kmem_move_notify_args_t *args; + + args = zfs_kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); + if (args != NULL) { + args->kmna_cache = cp; + args->kmna_buf = buf; + if (!taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_move_notify_task, args, + TQ_NOSLEEP)) + zfs_kmem_free(args, sizeof (kmem_move_notify_args_t)); + } +} + +static void +kmem_cache_defrag(kmem_cache_t *cp) +{ + size_t n; + + ASSERT(cp->cache_defrag != NULL); + + mutex_enter(&cp->cache_lock); + n = avl_numnodes(&cp->cache_partial_slabs); + if (n > 1) { + /* kmem_move_buffers() drops and reacquires cache_lock */ + cp->cache_defrag->kmd_defrags++; + (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); + } + mutex_exit(&cp->cache_lock); +} + +/* Is this cache above the fragmentation threshold? 
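+ * As a worked example (ratio chosen purely for illustration, not the
+ * actual tunable defaults): if kmem_frag_numer/kmem_frag_denom were 1/8,
+ * a cache with cache_buftotal == 1000 would be considered fragmented once
+ * more than 125 of its buffers sit free in the slab layer, since
+ * nfree * 8 > 1000 * 1 first holds at nfree == 126.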
*/ +static boolean_t +kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree) +{ + /* + * nfree kmem_frag_numer + * ------------------ > --------------- + * cp->cache_buftotal kmem_frag_denom + */ + return ((nfree * kmem_frag_denom) > + (cp->cache_buftotal * kmem_frag_numer)); +} + +static boolean_t +kmem_cache_is_fragmented(kmem_cache_t *cp, boolean_t *doreap) +{ + boolean_t fragmented; + uint64_t nfree; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + *doreap = B_FALSE; + + if (kmem_move_fulltilt) { + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + return (B_TRUE); + } + } else { + if ((cp->cache_complete_slab_count + avl_numnodes( + &cp->cache_partial_slabs)) < kmem_frag_minslabs) { + return (B_FALSE); + } + } + + nfree = cp->cache_bufslab; + fragmented = ((avl_numnodes(&cp->cache_partial_slabs) > 1) && + kmem_cache_frag_threshold(cp, nfree)); + + /* + * Free buffers in the magazine layer appear allocated from the point of + * view of the slab layer. We want to know if the slab layer would + * appear fragmented if we included free buffers from magazines that + * have fallen out of the working set. + */ + if (!fragmented) { + long reap; + + mutex_enter(&cp->cache_depot_lock); + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + mutex_exit(&cp->cache_depot_lock); + + nfree += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + if (kmem_cache_frag_threshold(cp, nfree)) { + *doreap = B_TRUE; + } + } + + return (fragmented); +} + +/* Called periodically from kmem_taskq */ +static void +kmem_cache_scan(kmem_cache_t *cp) +{ + boolean_t reap = B_FALSE; + kmem_defrag_t *kmd; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + mutex_enter(&cp->cache_lock); + + kmd = cp->cache_defrag; + if (kmd->kmd_consolidate > 0) { + kmd->kmd_consolidate--; + mutex_exit(&cp->cache_lock); + kmem_cache_reap(cp); + return; + } + + if (kmem_cache_is_fragmented(cp, &reap)) { + size_t slabs_found; + + /* + * Consolidate reclaimable slabs from the end of the partial + * slab list (scan at most kmem_reclaim_scan_range slabs to find + * reclaimable slabs). Keep track of how many candidate slabs we + * looked for and how many we actually found so we can adjust + * the definition of a candidate slab if we're having trouble + * finding them. + * + * kmem_move_buffers() drops and reacquires cache_lock. + */ + kmd->kmd_scans++; + slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, + kmem_reclaim_max_slabs, 0); + kmd->kmd_slabs_sought += kmem_reclaim_max_slabs; + kmd->kmd_slabs_found += slabs_found; + + if (++kmd->kmd_tries >= kmem_reclaim_scan_range) { + kmd->kmd_tries = 0; + + /* + * If we had difficulty finding candidate slabs in + * previous scans, adjust the threshold so that + * candidates are easier to find. + */ + if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, -1); + } else if ((kmd->kmd_slabs_found * 2) < + kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, 1); + } + kmd->kmd_slabs_sought = 0; + kmd->kmd_slabs_found = 0; + } + } else { + kmem_reset_reclaim_threshold(cp->cache_defrag); +#ifdef DEBUG + if (!avl_is_empty(&cp->cache_partial_slabs)) { + /* + * In a debug kernel we want the consolidator to + * run occasionally even when there is plenty of + * memory. 
+			 */
+			uint16_t debug_rand;
+
+			/*
+			 * smd: note that this only gets called for the
+			 * dnode cache, because only the dnode cache has
+			 * kmem_cache_set_move() applied to it.
+			 * brendon says move is voluntary and "tricky".
+			 * The reason this is not called for other caches is
+			 * that the caller, kmem_cache_update(), only calls
+			 * this function (kmem_cache_scan()) if there is a
+			 * move/defrag (same thing) associated with the
+			 * cache, so consider hoisting some of this code up
+			 * to kmem_cache_update().
+			 */
+
+			(void) random_get_bytes((uint8_t *)&debug_rand, 2);
+			if (!kmem_move_noreap &&
+			    ((debug_rand % kmem_mtb_reap) == 0)) {
+				mutex_exit(&cp->cache_lock);
+				kmem_mtb_reap_count++;
+				return;
+			} else if ((debug_rand % kmem_mtb_move) == 0) {
+				kmd->kmd_scans++;
+				(void) kmem_move_buffers(cp,
+				    kmem_reclaim_scan_range, 1, KMM_DEBUG);
+			}
+		}
+#endif	/* DEBUG */
+	}
+
+	mutex_exit(&cp->cache_lock);
+
+}
+
+// ===============================================================
+// Status
+// ===============================================================
+
+
+size_t
+kmem_size(void)
+{
+	return (total_memory); // smd
+}
+
+// This is used in arc_reclaim_needed(). If 1, reclaim is needed.
+// Returning 1 has the effect of throttling ARC, so be careful.
+int
+spl_vm_pool_low(void)
+{
+	bool m = spl_minimal_physmem_p_logic();
+
+	if (m)
+		return (0);
+	else
+		return (1);
+}
+
+// ===============================================================
+// String handling
+// ===============================================================
+
+char *
+kmem_strdup(const char *str)
+{
+	char *buf;
+	int len;
+	len = strlen(str) + 1;
+	buf = kmem_alloc(len, KM_SLEEP);
+	strlcpy(buf, str, len);
+	return (buf);
+}
+
+void
+kmem_strfree(char *str)
+{
+	zfs_kmem_free(str, strlen(str) + 1);
+}
+
+char *
+kvasprintf(const char *fmt, va_list ap)
+{
+	unsigned int len;
+	char *p;
+	va_list aq;
+
+	va_copy(aq, ap);
+	len = vsnprintf(NULL, 0, fmt, aq);
+	va_end(aq);
+	p = zfs_kmem_alloc(len+1, KM_SLEEP);
+	if (!p)
+		return (NULL);
+
+	vsnprintf(p, len+1, fmt, ap);
+
+	return (p);
+}
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+	va_list aq;
+	char *ptr;
+
+	do {
+		va_copy(aq, ap);
+		ptr = kvasprintf(fmt, aq);
+		va_end(aq);
+	} while (ptr == NULL);
+
+	return (ptr);
+}
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{ + va_list ap; + char *ptr; + + do { + va_start(ap, fmt); + ptr = kvasprintf(fmt, ap); + va_end(ap); + } while (ptr == NULL); + + return (ptr); +} + +char * +kmem_strstr(const char *in, const char *str) +{ + char c; + size_t len; + + c = *str++; + if (!c) + return ((char *)in); // Trivial empty string case + + len = strlen(str); + do { + char sc; + + do { + sc = *in++; + if (!sc) + return ((char *)0); + } while (sc != c); + } while (strncmp(in, str, len) != 0); + + return ((char *)(in - 1)); +} + + +// suppress timer and related logic for this kmem cache can live here +// three new per-kmem-cache stats: counters: non-vba-success non-vba-fail; +// flag: arc_no_grow +// from zfs/include/sys/spa.h + +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +typedef struct { + _Atomic(kmem_cache_t *)cp_metadata; + _Atomic(kmem_cache_t *)cp_filedata; + uint16_t pointed_to; + _Atomic int64_t suppress_count; + _Atomic uint64_t last_bumped; +} ksupp_t; + +typedef struct { + ksupp_t *ks_entry; +} iksupp_t; + +ksupp_t ksvec[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] = + { { NULL, NULL, false, 0, 0 } }; +iksupp_t iksvec[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] = + { { NULL } }; + +static bool spl_zio_no_grow_inited = false; + +/* + * Test that cp is in ks->cp_metadata or ks->cp_filedata; if so just return + * otherwise, choose the first (and possibly second) NULL + * and try to set it to cp. + * If successful, return. otherwise, sanity check that + * nobody has set ks->cp_metadata or ks->cp_filedata to cp already, and + * that ks->cp_metadata != ks->cp_filedata. + */ + +static void +ks_set_cp(ksupp_t *ks, kmem_cache_t *cp, const size_t cachenum) +{ + + ASSERT(cp != NULL); + ASSERT(ks != NULL); + + if (ks->cp_metadata == cp || ks->cp_filedata == cp) + return; + + const uint64_t b = cachenum; + + bool cp_is_metadata = false; + + vmem_t *vmp = cp->cache_arena; + + ASSERT(vmp == zio_metadata_arena || vmp == zio_arena); + + if (vmp == zio_metadata_arena) + cp_is_metadata = true; + + if (cp_is_metadata) { + for (uint32_t i = 0; ; i++) { + if (i >= 1000000) { + panic("SPL: %s: iterated out trying to set " + "ks->cp_metadata (%s)\n", __func__, + cp->cache_name); + } + kmem_cache_t *expected = NULL; + if (__c11_atomic_compare_exchange_strong( + &ks->cp_metadata, &expected, cp, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + dprintf("SPL: %s: set iskvec[%llu].ks->" + "cp_metadata (%s) OK\n", __func__, + b, cp->cache_name); + return; + } else if (ks->cp_metadata == cp) { + return; + } else if (ks->cp_metadata == NULL) { + continue; + } else { + panic("%s: CAS failed for iksvec[%llu]." 
+ "ks->cp_metadata: %s wanted %s set\n", + __func__, b, cp->cache_name, + ks->cp_metadata->cache_name); + } + } + } else { + for (int32_t j = 0; ; j++) { + if (j >= 1000000) { + panic("SPL: %s: iterated out trying to set " + "ks->cp_filedata (%s)\n", __func__, + cp->cache_name); + } + kmem_cache_t *expected = NULL; + if (__c11_atomic_compare_exchange_strong( + &ks->cp_filedata, &expected, cp, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + dprintf("SPL: %s: set iskvec[%llu].ks->" + "cp_filedata (%s) OK\n", __func__, + b, cp->cache_name); + return; + } else if (ks->cp_filedata == cp) { + return; + } else if (ks->cp_filedata == NULL) { + continue; + } else { + panic("%s: CAS failed for iksvec[%llu].ks->" + "cp_metadata: %s wanted %s set\n", + __func__, b, cp->cache_name, + ks->cp_filedata->cache_name); + } + } + } +} + +void +spl_zio_no_grow_init(void) +{ + // this is the logic from zio.c:zio_init() + + ASSERT(spl_zio_no_grow_inited == false); + + size_t c = 0; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c+1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + + while (!ISP2(p2)) + p2 &= p2 - 1; + + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (size <= 128 * 1024 && IS_P2ALIGNED(size, p2 >> 4)) { + align = MIN(p2 >> 4, PAGESIZE); + } else if (IS_P2ALIGNED(size, p2 >> 3)) { + align = MIN(p2 >> 3, PAGESIZE); + } + + if (align != 0) { + iksvec[c].ks_entry = &ksvec[c]; + iksvec[c].ks_entry->pointed_to++; + } + } + + while (--c != 0) { + ASSERT(iksvec[c].ks_entry != NULL); + ASSERT(iksvec[c].ks_entry->pointed_to > 0); + if (iksvec[c - 1].ks_entry == NULL) { + iksvec[c - 1].ks_entry = iksvec[c].ks_entry; + iksvec[c - 1].ks_entry->pointed_to++; + } + } + + spl_zio_no_grow_inited = true; + + dprintf("SPL: %s done.\n", __func__); +} + +static void +spl_zio_no_grow_clear() +{ + for (size_t c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + ksupp_t *ks = iksvec[c].ks_entry; + ks->cp_metadata = NULL; + ks->cp_filedata = NULL; + ks->pointed_to = false; + ks->suppress_count = 0; + ks->last_bumped = 0; + iksvec[c].ks_entry = NULL; + } +} + +void +spl_zio_no_grow_fini(void) +{ + // zio_fini() is at its end, so the kmem_caches are gone, + // consequently this is safe. 
+ spl_zio_no_grow_inited = false; + spl_zio_no_grow_clear(); + spl_zio_no_grow_init(); +} + +static void +spl_zio_set_no_grow(const size_t size, kmem_cache_t *cp, const size_t cachenum) +{ + ASSERT(spl_zio_no_grow_inited == true); + ASSERT(iksvec[cachenum].ks_entry != NULL); + + ksupp_t *ks = iksvec[cachenum].ks_entry; + + // maybe update size->cp mapping vector + + ks_set_cp(ks, cp, cachenum); + + if (ks->cp_metadata != cp && ks->cp_filedata != cp) { + panic("ks_cp_set bad for %s", cp->cache_name); + } + + // suppress the bucket for two allocations (is _Atomic) + ks->suppress_count += 2; + ks->last_bumped = zfs_lbolt(); +} + +bool +spl_zio_is_suppressed(const size_t size, const uint64_t now, + const boolean_t buf_is_metadata, kmem_cache_t **zp) +{ + + ASSERT(spl_zio_no_grow_inited == true); + + const size_t cachenum = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(cachenum, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + ksupp_t *ks = iksvec[cachenum].ks_entry; + + if (ks == NULL) { + return (false); + } else if (ks->pointed_to < 1) { + ASSERT(ks->pointed_to > 0); // throw an assertion + dprintf("SPL: %s: ERROR: iksvec[%llu].ks_entry->pointed_to " + "== %u for size %llu\n", __func__, (uint64_t)cachenum, + ks->pointed_to, (uint64_t)size); + return (false); + } else if (ks->suppress_count == 0) { + return (false); + } else { + const uint64_t two_minutes = 120 * hz; + if (ks->last_bumped + two_minutes >= now) { + ks->suppress_count = 0; + ks->last_bumped = now; + return (false); + } else { + ks->suppress_count--; + } + if (buf_is_metadata) { + if (ks->cp_metadata == NULL) { + ks_set_cp(ks, zp[cachenum], cachenum); + if (ks->cp_metadata != NULL) { + atomic_inc_64( + &ks->cp_metadata->arc_no_grow); + } else { + dprintf("WARNING: %s: ks_set_cp->" + "metadata == NULL after " + "ks_set_cp !size = %lu\n", + __func__, size); + } + } else { + atomic_inc_64(&ks->cp_metadata->arc_no_grow); + } + } else { + if (ks->cp_filedata == NULL) { + ks_set_cp(ks, zp[cachenum], cachenum); + if (ks->cp_filedata != NULL) { + atomic_inc_64( + &ks->cp_filedata->arc_no_grow); + } else { + dprintf("WARNING: %s: " + "ks_set_cp->filedata == NULL " + "after ks_set_cp !" + "size = %lu\n", + __func__, size); + } + } else { + atomic_inc_64(&ks->cp_filedata->arc_no_grow); + } + + } + return (true); + } +} + + +/* + * spl_zio_kmem_cache_alloc(): try to get an allocation without descending + * to the bucket layer, and if that fails, set a flag for spl_arc_no_grow() + * then perform the allocation normally. + */ + +void * +spl_zio_kmem_cache_alloc(kmem_cache_t *cp, int kmflag, size_t size, + size_t cachenum) +{ + // called by: + // spl_zio_kmem_cache_alloc(zio_buf_cache[size], kmflag, size, cachenum) + // spl_zio_kmem_cache_alloc(zio_data_buf_cache[size], kmflag, size, + // cachenum) + // those are e.g. + // kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPAMINBLOCKSHIFT] + // and are indexed as size_t cachenum = (size - 1) >> SPA_MIN~BLOCKSHIFT + // VERIFY3U(cachenum, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + // try to get memory from no lower than the bucket_heap + void *m = kmem_cache_alloc(cp, kmflag | KM_NO_VBA | KM_NOSLEEP); + + if (m != NULL) { + atomic_inc_64(&cp->no_vba_success); + return (m); + } + + atomic_inc_64(&cp->no_vba_fail); + + // we will have to go below the bucket_heap to a bucket arena. 
+	// if the bucket arena cannot obviously satisfy the allocation,
+	// and xnu is tight for memory, then we turn on the no_grow suppression
+
+	extern vmem_t *spl_vmem_bucket_arena_by_size(size_t);
+	extern uint64_t vmem_xnu_useful_bytes_free(void);
+	extern int vmem_canalloc_atomic(vmem_t *, size_t);
+
+	vmem_t *bvmp = spl_vmem_bucket_arena_by_size(size);
+
+	if (!vmem_canalloc_atomic(bvmp, size) &&
+	    vmem_xnu_useful_bytes_free() < 16ULL*1024ULL*1024ULL) {
+		spl_zio_set_no_grow(size, cp, cachenum);
+		atomic_inc_64(&cp->arc_no_grow_set);
+	}
+
+	// perform the allocation as requested
+	void *n = kmem_cache_alloc(cp, kmflag);
+
+	return (n);
+}
+
+/*
+ * Return true if the reclaim thread should be awakened
+ * because we do not have enough memory on hand.
+ */
+boolean_t
+spl_arc_reclaim_needed(const size_t bytes, kmem_cache_t **zp)
+{
+
+	/*
+	 * fast path:
+	 * if our argument is 0, then do the equivalent of
+	 * if (arc_available_memory() < 0) return (B_TRUE);
+	 * which is the traditional arc.c approach,
+	 * so arc_reclaim_needed() can simply call spl_arc_reclaim_needed(0)
+	 * if we desire.
+	 */
+	if (bytes == 0 && spl_free < 0) {
+		return (B_TRUE);
+	}
+
+	// copy some code from zio_buf_alloc()
+	size_t c = (bytes - 1) >> SPA_MINBLOCKSHIFT;
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	// if there is free memory in the kmem cache slab layer
+	// then we do not have to reclaim
+
+	if (zp[c]->cache_bufslab > 1) {
+		if (spl_free < 0)
+			atomic_inc_64(&spl_arc_reclaim_avoided);
+		return (B_FALSE);
+	}
+
+	extern uint64_t vmem_xnu_useful_bytes_free(void);
+	const uint64_t min_threshold = 64ULL*1024ULL*1024ULL;
+	const uint64_t pm_pct = real_total_memory >> 8;
+	const uint64_t high_threshold = MAX(min_threshold, (uint64_t)pm_pct);
+	const uint64_t low_threshold = bytes;
+
+	const uint64_t f = vmem_xnu_useful_bytes_free();
+
+	if (f <= low_threshold) {
+		return (B_TRUE);
+	} else if (f > high_threshold) {
+		if (spl_free < 0)
+			atomic_inc_64(&spl_arc_reclaim_avoided);
+		return (B_FALSE);
+	}
+
+	if (spl_free < 0) {
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
+/* small auxiliary function since we do not export struct kmem_cache to zfs */
+size_t
+kmem_cache_bufsize(kmem_cache_t *cp)
+{
+	return (cp->cache_bufsize);
+}
+
+/*
+ * Check that we would not get a KMERR_BADCACHE error in the event
+ * we did kmem_cache_free(cp, buf) in a DEBUG setting.
+ *
+ * returns: NULL if the buf is not found in any cache
+ *	cparg if the buf is found in cparg
+ *	a pointer to the cache the buf is found in, if not cparg
+ */
+
+kmem_cache_t *
+kmem_cache_buf_in_cache(kmem_cache_t *cparg, void *bufarg)
+{
+	kmem_cache_t *cp = cparg;
+	kmem_slab_t *sp;
+	void *buf = bufarg;
+
+	sp = kmem_findslab(cp, buf);
+	if (sp == NULL) {
+		for (cp = list_tail(&kmem_caches); cp != NULL;
+		    cp = list_prev(&kmem_caches, cp)) {
+			if ((sp = kmem_findslab(cp, buf)) != NULL)
+				break;
+		}
+	}
+
+	if (sp == NULL) {
+		printf("SPL: %s: KMERR_BADADDR orig cache = %s\n",
+		    __func__, cparg->cache_name);
+		return (NULL);
+	}
+
+	if (cp == NULL) {
+		printf("SPL: %s: ERROR cp == NULL; cparg == %s",
+		    __func__, cparg->cache_name);
+		return (NULL);
+	}
+
+	if (cp != cparg) {
+		printf("SPL: %s: KMERR_BADCACHE arg cache = %s but found "
+		    "in %s instead\n",
+		    __func__, cparg->cache_name, cp->cache_name);
+		return (cp);
+	}
+
+	ASSERT(cp == cparg);
+
+	return (cp);
+}
diff --git a/module/os/macos/spl/spl-kstat.c b/module/os/macos/spl/spl-kstat.c
new file mode 100644
index 0000000000..96939736c3
--- /dev/null
+++ b/module/os/macos/spl/spl-kstat.c
@@ -0,0 +1,1212 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ *
+ * Copyright (C) 2013 Jorgen Lundman
+ * Copyright (C) 2014 Brendon Humphrey
+ *
+ */
+
+/*
+ * Provides an implementation of kstat that is backed by OSX sysctls.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * We need to get dynamically allocated memory from the kernel allocator
+ * (Our needs are small, we won't blow the zone_map).
+ */
+extern void *kalloc(vm_size_t size);
+extern void kfree(void *data, vm_size_t size);
+
+/*
+ * Statically declared toplevel OID that all kstats
+ * will hang off.
+ */
+struct sysctl_oid_list sysctl__kstat_children;
+SYSCTL_DECL(_kstat);
+SYSCTL_NODE(, OID_AUTO, kstat, CTLFLAG_RW, 0, "kstat tree");
+
+/*
+ * Sysctl node tree structure.
+ *
+ * These are wired into the OSX sysctl structure
+ * and also stored in a list for easy
+ * location and destruction at shutdown time.
+ */
+typedef struct sysctl_tree_node {
+	char tn_kstat_name[KSTAT_STRLEN + 1];
+	struct sysctl_oid_list tn_children;
+	struct sysctl_oid tn_oid;
+	struct sysctl_tree_node *tn_next;
+} sysctl_tree_node_t;
+
+/*
+ * Each named kstat consists of one or more named
+ * fields which are implemented as OIDs parented
+ * off the kstat OID.
+ *
+ * To implement the kstat interface, we need to be able
+ * to call the update() function on the kstat to
+ * allow the owner to populate the kstat values from
+ * internal data.
+ *
+ * To do this we need the address of the kstat_named_t
+ * which contains the data value, and the owning kstat_t.
+ *
+ * OIDs allow a single void* user argument, so we will
+ * use a structure that contains both values and
+ * point to that.
+ */
+typedef struct sysctl_leaf {
+	kstat_t *l_ksp;
+	kstat_named_t *l_named;
+	struct sysctl_oid l_oid; /* kstats are backed w/sysctl */
+	char l_name[KSTAT_STRLEN + 1]; /* Name of the related sysctl */
+	int l_oid_registered; /* !0 = registered */
+} sysctl_leaf_t;
+
+/*
+ * Extended kstat structure -- for internal use only.
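+ *
+ * The embedded kstat_t is the first member, so the kstat_t pointer handed
+ * back by kstat_create() can simply be cast back to an ekstat_t pointer
+ * internally (as kstat_install() below does).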
+ */ +typedef struct ekstat { + kstat_t e_ks; /* the kstat itself */ + size_t e_size; /* total allocation size */ + kthread_t *e_owner; /* thread holding this kstat */ + kcondvar_t e_cv; /* wait for owner == NULL */ + /* contains the named values from the kstat */ + struct sysctl_oid_list e_children; + struct sysctl_oid e_oid; /* the kstat is itself an OID */ + /* array of OIDs that implement the children */ + sysctl_leaf_t *e_vals; + uint64_t e_num_vals; /* size of e_vals array */ +} ekstat_t; + +struct sysctl_tree_node *tree_nodes = 0; +struct sysctl_oid *e_sysctl = 0; + +/* sbuf_new() and family does exist in XNU, but Apple wont let us call them */ +#define M_SBUF 105 /* string buffers */ +#define SBMALLOC(size) _MALLOC(size, M_SBUF, M_WAITOK) +#define SBFREE(buf) FREE(buf, M_SBUF) + +#define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0) +#define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0) +#define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) +#define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) +#define SBUF_HASOVERFLOWED(s) ((s)->s_flags & SBUF_OVERFLOWED) +#define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) +#define SBUF_FREESPACE(s) ((s)->s_size - (s)->s_len - 1) +#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) + +#define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */ +#define SBUF_MAXEXTENDSIZE PAGE_SIZE +#define SBUF_MAXEXTENDINCR PAGE_SIZE + +void +sbuf_finish(struct sbuf *s) +{ + s->s_buf[s->s_len] = '\0'; + SBUF_CLEARFLAG(s, SBUF_OVERFLOWED); + SBUF_SETFLAG(s, SBUF_FINISHED); +} + +char * +sbuf_data(struct sbuf *s) +{ + return (s->s_buf); +} + +int +sbuf_len(struct sbuf *s) +{ + if (SBUF_HASOVERFLOWED(s)) { + return (-1); + } + return (s->s_len); +} + +void +sbuf_delete(struct sbuf *s) +{ + int isdyn; + if (SBUF_ISDYNAMIC(s)) { + SBFREE(s->s_buf); + } + isdyn = SBUF_ISDYNSTRUCT(s); + bzero(s, sizeof (*s)); + if (isdyn) { + SBFREE(s); + } +} + +static int +sbuf_extendsize(int size) +{ + int newsize; + + newsize = SBUF_MINEXTENDSIZE; + while (newsize < size) { + if (newsize < (int)SBUF_MAXEXTENDSIZE) { + newsize *= 2; + } else { + newsize += SBUF_MAXEXTENDINCR; + } + } + + return (newsize); +} + +static int +sbuf_extend(struct sbuf *s, int addlen) +{ + char *newbuf; + int newsize; + + if (!SBUF_CANEXTEND(s)) { + return (-1); + } + + newsize = sbuf_extendsize(s->s_size + addlen); + newbuf = (char *)SBMALLOC(newsize); + if (newbuf == NULL) { + return (-1); + } + bcopy(s->s_buf, newbuf, s->s_size); + if (SBUF_ISDYNAMIC(s)) { + SBFREE(s->s_buf); + } else { + SBUF_SETFLAG(s, SBUF_DYNAMIC); + } + s->s_buf = newbuf; + s->s_size = newsize; + return (0); +} + +struct sbuf * +sbuf_new(struct sbuf *s, char *buf, int length, int flags) +{ + flags &= SBUF_USRFLAGMSK; + if (s == NULL) { + s = (struct sbuf *)SBMALLOC(sizeof (*s)); + if (s == NULL) { + return (NULL); + } + bzero(s, sizeof (*s)); + s->s_flags = flags; + SBUF_SETFLAG(s, SBUF_DYNSTRUCT); + } else { + bzero(s, sizeof (*s)); + s->s_flags = flags; + } + s->s_size = length; + if (buf) { + s->s_buf = buf; + return (s); + } + if (flags & SBUF_AUTOEXTEND) { + s->s_size = sbuf_extendsize(s->s_size); + } + s->s_buf = (char *)SBMALLOC(s->s_size); + if (s->s_buf == NULL) { + if (SBUF_ISDYNSTRUCT(s)) { + SBFREE(s); + } + return (NULL); + } + SBUF_SETFLAG(s, SBUF_DYNAMIC); + return (s); +} + +int +sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) +{ + __builtin_va_list ap_copy; /* XXX tduffy - blame on him */ + int len; + + if (SBUF_HASOVERFLOWED(s)) { + return (-1); + } + + do { + 
va_copy(ap_copy, ap); + len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1, + fmt, ap_copy); + va_end(ap_copy); + } while (len > SBUF_FREESPACE(s) && + sbuf_extend(s, len - SBUF_FREESPACE(s)) == 0); + s->s_len += min(len, SBUF_FREESPACE(s)); + if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s)) { + SBUF_SETFLAG(s, SBUF_OVERFLOWED); + } + + if (SBUF_HASOVERFLOWED(s)) { + return (-1); + } + return (0); +} + +int +sbuf_printf(struct sbuf *s, const char *fmt, ...) +{ + va_list ap; + int result; + + va_start(ap, fmt); + result = sbuf_vprintf(s, fmt, ap); + va_end(ap); + return (result); +} + +static void +kstat_set_string(char *dst, const char *src) +{ + bzero(dst, KSTAT_STRLEN); + (void) strlcpy(dst, src, KSTAT_STRLEN); +} + +static struct sysctl_oid * +get_oid_with_name(struct sysctl_oid_list *list, char *name) +{ + struct sysctl_oid *oidp; + + SLIST_FOREACH(oidp, list, oid_link) { + if (strcmp(name, oidp->oid_name) == 0) { + return (oidp); + } + } + + return (0); +} + +static void +init_oid_tree_node(struct sysctl_oid_list *parent, char *name, + sysctl_tree_node_t *node) +{ + strlcpy(node->tn_kstat_name, name, KSTAT_STRLEN); + + node->tn_oid.oid_parent = parent; + node->tn_oid.oid_link.sle_next = 0; + node->tn_oid.oid_number = OID_AUTO; + node->tn_oid.oid_arg2 = 0; + node->tn_oid.oid_name = &node->tn_kstat_name[0]; + node->tn_oid.oid_descr = ""; + node->tn_oid.oid_version = SYSCTL_OID_VERSION; + node->tn_oid.oid_refcnt = 0; + node->tn_oid.oid_handler = 0; + node->tn_oid.oid_kind = CTLTYPE_NODE|CTLFLAG_RW|CTLFLAG_OID2; + node->tn_oid.oid_fmt = "N"; + node->tn_oid.oid_arg1 = (void*)(&node->tn_children); + + sysctl_register_oid(&node->tn_oid); + + node->tn_next = tree_nodes; + tree_nodes = node; +} + +static struct sysctl_oid_list * +get_kstat_parent(struct sysctl_oid_list *root, char *module_name, + char *class_name) +{ + struct sysctl_oid *the_module = 0; + struct sysctl_oid *the_class = 0; + sysctl_tree_node_t *new_node = 0; + struct sysctl_oid_list *container = root; + + /* + * Locate/create the module + */ + the_module = get_oid_with_name(root, module_name); + + if (!the_module) { + new_node = kalloc(sizeof (sysctl_tree_node_t)); + bzero(new_node, sizeof (sysctl_tree_node_t)); + init_oid_tree_node(root, module_name, new_node); + the_module = &new_node->tn_oid; + } + + /* + * Locate/create the class + */ + container = the_module->oid_arg1; + the_class = get_oid_with_name(container, class_name); + + if (!the_class) { + new_node = kalloc(sizeof (sysctl_tree_node_t)); + bzero(new_node, sizeof (sysctl_tree_node_t)); + init_oid_tree_node(container, class_name, new_node); + the_class = &new_node->tn_oid; + } + + container = the_class->oid_arg1; + return (container); +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + kfree(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = kalloc(ksp->ks_raw_bufsize); + + return (0); +} + +static void * +kstat_raw_default_addr(kstat_t *ksp, loff_t n) +{ + if (n == 0) + return (ksp->ks_data); + return (NULL); +} + +#define HD_COLUMN_MASK 0xff +#define HD_DELIM_MASK 0xff00 +#define HD_OMIT_COUNT (1 << 16) +#define HD_OMIT_HEX (1 << 17) +#define HD_OMIT_CHARS (1 << 18) + +void +sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr, + int flags) +{ + int i, 
j, k; + int cols; + const unsigned char *cp; + char delim; + + if ((flags & HD_DELIM_MASK) != 0) + delim = (flags & HD_DELIM_MASK) >> 8; + else + delim = ' '; + + if ((flags & HD_COLUMN_MASK) != 0) + cols = flags & HD_COLUMN_MASK; + else + cols = 16; + + cp = ptr; + for (i = 0; i < length; i += cols) { + if (hdr != NULL) + sbuf_printf(sb, "%s", hdr); + + if ((flags & HD_OMIT_COUNT) == 0) + sbuf_printf(sb, "%04x ", i); + + if ((flags & HD_OMIT_HEX) == 0) { + for (j = 0; j < cols; j++) { + k = i + j; + if (k < length) + sbuf_printf(sb, "%c%02x", delim, cp[k]); + else + sbuf_printf(sb, " "); + } + } + + if ((flags & HD_OMIT_CHARS) == 0) { + sbuf_printf(sb, " |"); + for (j = 0; j < cols; j++) { + k = i + j; + if (k >= length) + sbuf_printf(sb, " "); + else if (cp[k] >= ' ' && cp[k] <= '~') + sbuf_printf(sb, "%c", cp[k]); + else + sbuf_printf(sb, "."); + } + sbuf_printf(sb, "|"); + } + sbuf_printf(sb, "\n"); + } +} + +static int +kstat_handle_raw SYSCTL_HANDLER_ARGS +{ + struct sbuf *sb; + void *data; + kstat_t *ksp = arg1; + void *(*addr_op)(kstat_t *ksp, loff_t index); + int n, has_header, rc = 0; + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + if (sb == NULL) + return (ENOMEM); + + if (ksp->ks_raw_ops.addr) + addr_op = ksp->ks_raw_ops.addr; + else + addr_op = kstat_raw_default_addr; + + VERIFY3P(ksp->ks_lock, !=, NULL); + mutex_enter(ksp->ks_lock); + + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = kalloc(PAGE_SIZE); + + n = 0; + has_header = (ksp->ks_raw_ops.headers || + ksp->ks_raw_ops.seq_headers); + +restart_headers: + if (ksp->ks_raw_ops.headers) { + rc = ksp->ks_raw_ops.headers( + ksp->ks_raw_buf, ksp->ks_raw_bufsize); + } else if (ksp->ks_raw_ops.seq_headers) { + struct seq_file f; + + f.sf_buf = ksp->ks_raw_buf; + f.sf_size = ksp->ks_raw_bufsize; + rc = ksp->ks_raw_ops.seq_headers(&f); + } + if (has_header) { + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart_headers; + if (rc == 0) + sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + } + + while ((data = addr_op(ksp, n)) != NULL) { +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf, + ksp->ks_raw_bufsize, data); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (rc == 0) + sbuf_printf(sb, "%s", ksp->ks_raw_buf); + + } else { + ASSERT(ksp->ks_ndata == 1); + sbuf_hexdump(sb, ksp->ks_data, + ksp->ks_data_size, NULL, 0); + } + n++; + } + kfree(ksp->ks_raw_buf, PAGE_SIZE); + mutex_exit(ksp->ks_lock); + rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + sbuf_delete(sb); + return (rc); +} + +static int +kstat_handle_io SYSCTL_HANDLER_ARGS +{ + struct sbuf *sb; + kstat_t *ksp = arg1; + kstat_io_t *kip = ksp->ks_data; + int rc; + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + if (sb == NULL) + return (ENOMEM); + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + /* though wlentime & friends are signed, they will never be negative */ + sbuf_printf(sb, + "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " + "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + sbuf_finish(sb); + rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); + sbuf_delete(sb); + return (rc); +} + +static int +kstat_handle_i64 SYSCTL_HANDLER_ARGS +{ + int error = 0; + sysctl_leaf_t *params = (sysctl_leaf_t *)(arg1); + kstat_named_t 
*named = params->l_named; + kstat_t *ksp = params->l_ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + if (lock && !MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + + if (!error && req->newptr) { + /* + * Write request - first read add current values for the kstat + * (remember that is sysctl is likely only one of many + * values that make up the kstat). + */ + + /* Copy the new value from user space */ + (void) copyin(req->newptr, &named->value.i64, + sizeof (named->value.i64)); + + /* and invoke the update operation */ + if (ksp->ks_update) { + error = ksp->ks_update(ksp, KSTAT_WRITE); + } + } else { + /* + * Read request + */ + error = SYSCTL_OUT(req, &named->value.i64, sizeof (int64_t)); + } + + if (lock_needs_release) { + mutex_exit(lock); + } + + return (error); +} + +static int +kstat_handle_ui64 SYSCTL_HANDLER_ARGS +{ + int error = 0; + sysctl_leaf_t *params = (sysctl_leaf_t *)(arg1); + kstat_named_t *named = params->l_named; + kstat_t *ksp = params->l_ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + if (lock && !MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + + if (!error && req->newptr) { + /* + * Write request - first read add current values for the kstat + * (remember that is sysctl is likely only one of many + * values that make up the kstat). + */ + + /* Copy the new value from user space */ + (void) copyin(req->newptr, &named->value.ui64, + sizeof (named->value.ui64)); + + /* and invoke the update operation */ + if (ksp->ks_update) { + error = ksp->ks_update(ksp, KSTAT_WRITE); + } + } else { + /* + * Read request + */ + error = SYSCTL_OUT(req, &named->value.ui64, sizeof (uint64_t)); + } + + if (lock_needs_release) { + mutex_exit(lock); + } + + return (error); +} + +static int +kstat_handle_string SYSCTL_HANDLER_ARGS +{ + int error = 0; + sysctl_leaf_t *params = (sysctl_leaf_t *)(arg1); + kstat_named_t *named = params->l_named; + kstat_t *ksp = params->l_ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + if (lock && !MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + + if (!error && req->newptr) { + + /* Copy the new value from user space (copyin done by XNU) */ + kstat_named_setstr(named, (const char *)(req->newptr)); + + /* and invoke the update operation: last call out */ + if (ksp->ks_update) { + error = ksp->ks_update(ksp, KSTAT_WRITE); + } + + } else { + + error = SYSCTL_OUT(req, named->value.string.addr.ptr, + named->value.string.len); + } + + if (lock_needs_release) { + mutex_exit(lock); + } + + return (error); +} + +kstat_t * +kstat_create(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, ulong_t ks_ndata, uchar_t ks_flags) +{ + kstat_t *ksp = 0; + ekstat_t *e = 0; + size_t size = 0; + + if (ks_class == NULL) + ks_class = "misc"; + + /* + * Allocate memory for the new kstat header. + */ + size = sizeof (ekstat_t); + e = (ekstat_t *)kalloc(size); + bzero(e, size); + if (e == NULL) { + cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): " + "insufficient kernel memory", + ks_module, ks_instance, ks_name); + return (NULL); + } + e->e_size = size; + + cv_init(&e->e_cv, NULL, CV_DEFAULT, NULL); + + /* + * Initialize as many fields as we can. 
The caller may reset + * ks_lock, ks_update, ks_private, and ks_snapshot as necessary. + * Creators of virtual kstats may also reset ks_data. It is + * also up to the caller to initialize the kstat data section, + * if necessary. All initialization must be complete before + * calling kstat_install(). + */ + ksp = &e->e_ks; + + ksp->ks_crtime = gethrtime(); + kstat_set_string(ksp->ks_module, ks_module); + ksp->ks_instance = ks_instance; + kstat_set_string(ksp->ks_name, ks_name); + ksp->ks_type = ks_type; + kstat_set_string(ksp->ks_class, ks_class); + ksp->ks_flags = ks_flags | KSTAT_FLAG_INVALID; + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_update = kstat_default_update; + + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ASSERT(ks_ndata == 1); + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + panic("Undefined kstat type %d\n", ksp->ks_type); + } + + + + /* + * Initialise the sysctl that represents this kstat + */ + e->e_children.slh_first = 0; + + e->e_oid.oid_parent = get_kstat_parent(&sysctl__kstat_children, + ksp->ks_module, ksp->ks_class); + e->e_oid.oid_link.sle_next = 0; + e->e_oid.oid_number = OID_AUTO; + e->e_oid.oid_arg2 = 0; + e->e_oid.oid_name = ksp->ks_name; + e->e_oid.oid_descr = ""; + e->e_oid.oid_version = SYSCTL_OID_VERSION; + e->e_oid.oid_refcnt = 0; + e->e_oid.oid_handler = 0; + e->e_oid.oid_kind = CTLTYPE_NODE|CTLFLAG_RW|CTLFLAG_OID2; + e->e_oid.oid_fmt = "N"; + e->e_oid.oid_arg1 = (void*)(&e->e_children); + + /* If VIRTUAL we allocate memory to store data */ + if (ks_flags & KSTAT_FLAG_VIRTUAL) + ksp->ks_data = NULL; + else + ksp->ks_data = (void *)kmem_zalloc( + ksp->ks_data_size, KM_SLEEP); + + sysctl_register_oid(&e->e_oid); + + return (ksp); +} + +void +kstat_install(kstat_t *ksp) +{ + ekstat_t *e = (ekstat_t *)ksp; + kstat_named_t *named_base = 0; + sysctl_leaf_t *vals_base = 0; + sysctl_leaf_t *params = 0; + int oid_permissions = CTLFLAG_RD; + + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + + if (ksp->ks_flags & KSTAT_FLAG_WRITABLE) { + oid_permissions |= CTLFLAG_RW; + } + + // Create the leaf node OID objects + e->e_vals = (sysctl_leaf_t *)kalloc(ksp->ks_ndata * + sizeof (sysctl_leaf_t)); + bzero(e->e_vals, ksp->ks_ndata * sizeof (sysctl_leaf_t)); + e->e_num_vals = ksp->ks_ndata; + + named_base = (kstat_named_t *)(ksp->ks_data); + vals_base = e->e_vals; + + for (int i = 0; i < ksp->ks_ndata; i++) { + int oid_valid = 1; + + kstat_named_t *named = &named_base[i]; + sysctl_leaf_t *val = &vals_base[i]; + + // Perform basic initialisation of the sysctl. + // + // The sysctl: kstat.... 
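+ // (i.e. kstat.(module).(class).(name).(value), built from the + // kstat fields and the named value handled below)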
+ snprintf(val->l_name, KSTAT_STRLEN, "%s", named->name); + + val->l_oid.oid_parent = &e->e_children; + val->l_oid.oid_link.sle_next = 0; + val->l_oid.oid_number = OID_AUTO; + val->l_oid.oid_arg2 = 0; + val->l_oid.oid_name = val->l_name; + val->l_oid.oid_descr = ""; + val->l_oid.oid_version = SYSCTL_OID_VERSION; + val->l_oid.oid_refcnt = 0; + + // Based on the kstat type flags, provide location + // of data item and associated type and handler + // flags to the sysctl. + switch (named->data_type) { + case KSTAT_DATA_INT64: + params = (sysctl_leaf_t *)kalloc( + sizeof (sysctl_leaf_t)); + params->l_named = named; + params->l_ksp = ksp; + + val->l_oid.oid_handler = + kstat_handle_i64; + val->l_oid.oid_kind = CTLTYPE_QUAD | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "Q"; + val->l_oid.oid_arg1 = (void*)params; + params = 0; + break; + case KSTAT_DATA_UINT64: + params = (sysctl_leaf_t *)kalloc( + sizeof (sysctl_leaf_t)); + params->l_named = named; + params->l_ksp = ksp; + + val->l_oid.oid_handler = + kstat_handle_ui64; + val->l_oid.oid_kind = CTLTYPE_QUAD | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "Q"; + val->l_oid.oid_arg1 = (void*)params; + break; + case KSTAT_DATA_INT32: + val->l_oid.oid_handler = + sysctl_handle_int; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "I"; + val->l_oid.oid_arg1 = &named->value.i32; + break; + case KSTAT_DATA_UINT32: + val->l_oid.oid_handler = + sysctl_handle_int; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "IU"; + val->l_oid.oid_arg1 = + &named->value.ui32; + break; + case KSTAT_DATA_LONG: + val->l_oid.oid_handler = + sysctl_handle_long; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "L"; + val->l_oid.oid_arg1 = &named->value.l; + break; + case KSTAT_DATA_ULONG: + val->l_oid.oid_handler = + sysctl_handle_long; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "L"; + val->l_oid.oid_arg1 = &named->value.ul; + break; + case KSTAT_DATA_STRING: + params = (sysctl_leaf_t *)kalloc( + sizeof (sysctl_leaf_t)); + params->l_named = named; + params->l_ksp = ksp; + val->l_oid.oid_handler = + kstat_handle_string; + val->l_oid.oid_kind = CTLTYPE_STRING | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "S"; + val->l_oid.oid_arg1 = (void*)params; + + named->value.string.addr.ptr = NULL; + named->value.string.len = 0; + break; + + case KSTAT_DATA_CHAR: + default: + oid_valid = 0; + break; + } + + /* + * Finally publish the OID, provided that there were + * no issues initialising it. 
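+ * Entries with an unsupported data type (KSTAT_DATA_CHAR or any + * unrecognised type) are skipped and never registered.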
+ */ + if (oid_valid) { + sysctl_register_oid(&val->l_oid); + val->l_oid_registered = 1; + } else { + val->l_oid_registered = 0; + } + } + + } else if (ksp->ks_type == KSTAT_TYPE_RAW) { + + e->e_vals = (sysctl_leaf_t *)kalloc(sizeof (sysctl_leaf_t)); + bzero(e->e_vals, sizeof (sysctl_leaf_t)); + e->e_num_vals = 1; + sysctl_leaf_t *val = e->e_vals; + + snprintf(val->l_name, KSTAT_STRLEN, "%s", ksp->ks_name); + + val->l_oid.oid_parent = &e->e_children; + val->l_oid.oid_link.sle_next = 0; + val->l_oid.oid_number = OID_AUTO; + val->l_oid.oid_arg2 = 0; + val->l_oid.oid_name = val->l_name; + val->l_oid.oid_descr = ""; + val->l_oid.oid_version = SYSCTL_OID_VERSION; + val->l_oid.oid_refcnt = 0; + + if (ksp->ks_raw_ops.data) { + val->l_oid.oid_handler = + kstat_handle_raw; + val->l_oid.oid_kind = CTLTYPE_STRING | + CTLFLAG_RD; + val->l_oid.oid_fmt = "A"; + val->l_oid.oid_arg1 = (void *) ksp; + sysctl_register_oid(&val->l_oid); + } else { + val->l_oid.oid_handler = + kstat_handle_raw; + val->l_oid.oid_kind = CTLTYPE_OPAQUE | + CTLFLAG_RD; + val->l_oid.oid_fmt = "A"; + val->l_oid.oid_arg1 = (void *) ksp; + sysctl_register_oid(&val->l_oid); + } + + } else if (ksp->ks_type == KSTAT_TYPE_IO) { + + e->e_vals = (sysctl_leaf_t *)kalloc(sizeof (sysctl_leaf_t)); + bzero(e->e_vals, sizeof (sysctl_leaf_t)); + e->e_num_vals = 1; + sysctl_leaf_t *val = e->e_vals; + + snprintf(val->l_name, KSTAT_STRLEN, "%s", ksp->ks_name); + + val->l_oid.oid_parent = &e->e_children; + val->l_oid.oid_link.sle_next = 0; + val->l_oid.oid_number = OID_AUTO; + val->l_oid.oid_arg2 = 0; + val->l_oid.oid_name = val->l_name; + val->l_oid.oid_descr = ""; + val->l_oid.oid_version = SYSCTL_OID_VERSION; + val->l_oid.oid_refcnt = 0; + + val->l_oid.oid_handler = + kstat_handle_io; + val->l_oid.oid_kind = CTLTYPE_STRING | + CTLFLAG_RD; + val->l_oid.oid_fmt = "A"; + val->l_oid.oid_arg1 = (void *) ksp; + sysctl_register_oid(&val->l_oid); + } + + ksp->ks_flags &= ~KSTAT_FLAG_INVALID; +} + +static void +remove_child_sysctls(ekstat_t *e) +{ + kstat_t *ksp = &e->e_ks; + kstat_named_t *named_base = (kstat_named_t *)(ksp->ks_data); + sysctl_leaf_t *vals_base = e->e_vals; + + for (int i = 0; i < ksp->ks_ndata; i++) { + if (vals_base[i].l_oid_registered) { + sysctl_unregister_oid(&vals_base[i].l_oid); + vals_base[i].l_oid_registered = 0; + } + + if (named_base[i].data_type == KSTAT_DATA_INT64 || + named_base[i].data_type == KSTAT_DATA_UINT64 || + named_base[i].data_type == KSTAT_DATA_STRING) { + + sysctl_leaf_t *leaf = (sysctl_leaf_t *) + vals_base[i].l_oid.oid_arg1; /* params */ + kfree(leaf, sizeof (sysctl_leaf_t)); + + if (named_base[i].data_type == KSTAT_DATA_STRING) { + kstat_named_setstr(&named_base[i], NULL); + } + } + } +} + +void +kstat_delete(kstat_t *ksp) +{ + ekstat_t *e = (ekstat_t *)ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + // destroy the sysctl + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + + if (lock && MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + remove_child_sysctls(e); + + if (lock_needs_release) { + mutex_exit(lock); + } + } + + sysctl_unregister_oid(&e->e_oid); + + if (e->e_vals) { + kfree(e->e_vals, sizeof (sysctl_leaf_t) * e->e_num_vals); + } + + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); + + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); + + cv_destroy(&e->e_cv); + kfree(e, e->e_size); +} + +void +kstat_named_setstr(kstat_named_t *knp, const char *src) +{ + void *data; + int len; + + if (knp->data_type != 
KSTAT_DATA_STRING) + panic("kstat_named_setstr('%p', '%p'): " + "named kstat is not of type KSTAT_DATA_STRING", + (void *)knp, (void *)src); + + data = KSTAT_NAMED_STR_PTR(knp); + len = KSTAT_NAMED_STR_BUFLEN(knp); + + if (data != NULL && len > 0) { + + // If strings are the same, don't bother swapping them + if (src != NULL && + strcmp(src, data) == 0) + return; + + kfree(data, len); + KSTAT_NAMED_STR_PTR(knp) = NULL; + KSTAT_NAMED_STR_BUFLEN(knp) = 0; + } + + if (src == NULL) + return; + + len = strlen(src) + 1; + + data = kalloc(len); + strlcpy(data, src, len); + KSTAT_NAMED_STR_PTR(knp) = data; + KSTAT_NAMED_STR_BUFLEN(knp) = len; +} + +void +kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type) +{ + kstat_set_string(knp->name, name); + knp->data_type = data_type; + + if (data_type == KSTAT_DATA_STRING) + kstat_named_setstr(knp, NULL); +} + + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ +} + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ +} + +void +kstat_runq_enter(kstat_io_t *kiop) +{ +} + +void +kstat_runq_exit(kstat_io_t *kiop) +{ +} + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} + +void +__kstat_set_seq_raw_ops(kstat_t *ksp, + int (*headers)(struct seq_file *f), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.seq_headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} + +void +spl_kstat_init() +{ + /* + * Create the kstat root OID + */ + sysctl_register_oid(&sysctl__kstat); +} + +void +spl_kstat_fini() +{ + /* + * Destroy the kstat module/class/name tree + * + * Done in two passes, first unregisters all + * of the oids, second releases all the memory. + */ + + sysctl_tree_node_t *iter = tree_nodes; + while (iter) { + sysctl_tree_node_t *tn = iter; + iter = tn->tn_next; + sysctl_unregister_oid(&tn->tn_oid); + } + + while (tree_nodes) { + sysctl_tree_node_t *tn = tree_nodes; + tree_nodes = tn->tn_next; + kfree(tn, sizeof (sysctl_tree_node_t)); + } + + /* + * Destroy the root oid + */ + sysctl_unregister_oid(&sysctl__kstat); +} diff --git a/module/os/macos/spl/spl-list.c b/module/os/macos/spl/spl-list.c new file mode 100644 index 0000000000..ede7a29c42 --- /dev/null +++ b/module/os/macos/spl/spl-list.c @@ -0,0 +1,197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include + + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = node; \ + lnew->list_next = node->list_next; \ + node->list_next->list_prev = lnew; \ + node->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = node; \ + lnew->list_prev = node->list_prev; \ + node->list_prev->list_next = lnew; \ + node->list_prev = lnew; \ +} + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + lold->list_prev->list_next = lold->list_next; + lold->list_next->list_prev = lold->list_prev; + lold->list_next = lold->list_prev = NULL; +} + + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. 
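+ * + * Typical use: + * + *	list_move_tail(&dst, &src); + * + * appends every entry of 'src', in order, to the tail of 'dst' and + * leaves 'src' empty.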
+ */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/module/os/macos/spl/spl-mutex.c b/module/os/macos/spl/spl-mutex.c new file mode 100644 index 0000000000..b7b47f42c4 --- /dev/null +++ b/module/os/macos/spl/spl-mutex.c @@ -0,0 +1,415 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013,2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Not defined in headers +extern boolean_t lck_mtx_try_lock(lck_mtx_t *lck); + + +static lck_attr_t *zfs_lock_attr = NULL; +static lck_grp_attr_t *zfs_group_attr = NULL; + +static lck_grp_t *zfs_mutex_group = NULL; + +uint64_t zfs_active_mutex = 0; + +#ifdef SPL_DEBUG_MUTEX +#include +static list_t mutex_list; +static kmutex_t mutex_list_mutex; + + +struct leak { + list_node_t mutex_leak_node; + +#define SPL_DEBUG_MUTEX_MAXCHAR 32 + char location_file[SPL_DEBUG_MUTEX_MAXCHAR]; + char location_function[SPL_DEBUG_MUTEX_MAXCHAR]; + uint64_t location_line; + void *mp; + + uint64_t wdlist_locktime; // time lock was taken + char wdlist_file[32]; // storing holder + uint64_t wdlist_line; +}; + +static int wdlist_exit = 0; + +void +spl_wdlist_settime(void *mpleak, uint64_t value) +{ + struct leak *leak = (struct leak *)mpleak; + if (!leak) + return; + leak->wdlist_locktime = value; +} + +inline static void +spl_wdlist_check(void *ignored) +{ + struct leak *mp; + printf("SPL: Mutex watchdog is alive\n"); + + while (!wdlist_exit) { + delay(hz * SPL_MUTEX_WATCHDOG_SLEEP); + uint64_t noe = gethrestime_sec(); + lck_mtx_lock((lck_mtx_t *)&mutex_list_mutex.m_lock); + for (mp = list_head(&mutex_list); + mp; + mp = list_next(&mutex_list, mp)) { + uint64_t locktime = mp->wdlist_locktime; + if ((locktime > 0) && (noe > locktime) && + noe - locktime >= SPL_MUTEX_WATCHDOG_TIMEOUT) { + printf("SPL: mutex (%p) held for %llus by " + "'%s':%llu\n", mp, noe - + mp->wdlist_locktime, mp->wdlist_file, + mp->wdlist_line); + } // if old + } // for all + lck_mtx_unlock((lck_mtx_t *)&mutex_list_mutex.m_lock); + } // while not 
exit + + printf("SPL: watchdog thread exit\n"); + wdlist_exit = 2; + thread_exit(); +} +#endif + + +int +spl_mutex_subsystem_init(void) +{ + zfs_lock_attr = lck_attr_alloc_init(); + zfs_group_attr = lck_grp_attr_alloc_init(); + zfs_mutex_group = lck_grp_alloc_init("zfs-mutex", zfs_group_attr); + +#ifdef SPL_DEBUG_MUTEX + { + unsigned char mutex[128]; + int i; + + memset(mutex, 0xAF, sizeof (mutex)); + lck_mtx_init((lck_mtx_t *)&mutex[0], zfs_mutex_group, + zfs_lock_attr); + for (i = sizeof (mutex) -1; i >= 0; i--) + if (mutex[i] != 0xAF) + break; + + printf("SPL: mutex size is %u\n", i+1); + + } + + list_create(&mutex_list, sizeof (struct leak), + offsetof(struct leak, mutex_leak_node)); + lck_mtx_init((lck_mtx_t *)&mutex_list_mutex.m_lock, zfs_mutex_group, + zfs_lock_attr); + mutex_list_mutex.m_initialised = MUTEX_INIT; + + (void) thread_create(NULL, 0, spl_wdlist_check, 0, 0, 0, 0, 92); +#endif + return (0); +} + + + +void +spl_mutex_subsystem_fini(void) +{ +#ifdef SPL_DEBUG_MUTEX + uint64_t total = 0; + printf("Dumping leaked mutex allocations...\n"); + + wdlist_exit = 1; + + mutex_enter(&mutex_list_mutex); + while (1) { + struct leak *leak, *runner; + uint32_t found; + + leak = list_head(&mutex_list); + + if (leak) { + list_remove(&mutex_list, leak); + } + if (!leak) + break; + + // Run through list and count up how many times this leak is + // found, removing entries as we go. + for (found = 1, runner = list_head(&mutex_list); + runner; + runner = runner ? list_next(&mutex_list, runner) : + list_head(&mutex_list)) { + + if (strcmp(leak->location_file, runner->location_file) + == 0 && strcmp(leak->location_function, + runner->location_function) == 0 && + leak->location_line == runner->location_line) { + // Same place + found++; + list_remove(&mutex_list, runner); + FREE(runner, M_TEMP); + runner = NULL; + } // if same + + } // for all nodes + + printf(" mutex %p : %s %s %llu : # leaks: %u\n", + leak->mp, + leak->location_file, + leak->location_function, + leak->location_line, + found); + + FREE(leak, M_TEMP); + total += found; + + } + mutex_exit(&mutex_list_mutex); + printf("Dumped %llu leaked allocations. 
Wait for watchdog " + "to exit..\n", total); + + while (wdlist_exit != 2) + delay(hz>>4); + + lck_mtx_destroy((lck_mtx_t *)&mutex_list_mutex.m_lock, zfs_mutex_group); + list_destroy(&mutex_list); +#endif + + lck_attr_free(zfs_lock_attr); + zfs_lock_attr = NULL; + + lck_grp_attr_free(zfs_group_attr); + zfs_group_attr = NULL; + + lck_grp_free(zfs_mutex_group); + zfs_mutex_group = NULL; +} + + + +#ifdef SPL_DEBUG_MUTEX +void +spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc, + const char *file, const char *fn, int line) +#else +void +spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc) +#endif +{ + ASSERT(type != MUTEX_SPIN); + ASSERT(ibc == NULL); + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, !=, MUTEX_INIT); +#endif + + lck_mtx_init((lck_mtx_t *)&mp->m_lock, zfs_mutex_group, zfs_lock_attr); + mp->m_owner = NULL; + + atomic_inc_64(&zfs_active_mutex); + +#ifdef SPL_DEBUG_MUTEX + mp->m_initialised = MUTEX_INIT; + + struct leak *leak; + + MALLOC(leak, struct leak *, + sizeof (struct leak), M_TEMP, M_WAITOK); + + if (leak) { + bzero(leak, sizeof (struct leak)); + strlcpy(leak->location_file, file, SPL_DEBUG_MUTEX_MAXCHAR); + strlcpy(leak->location_function, fn, SPL_DEBUG_MUTEX_MAXCHAR); + leak->location_line = line; + leak->mp = mp; + + mutex_enter(&mutex_list_mutex); + list_link_init(&leak->mutex_leak_node); + list_insert_tail(&mutex_list, leak); + mp->leak = leak; + mutex_exit(&mutex_list_mutex); + } + leak->wdlist_locktime = 0; + leak->wdlist_file[0] = 0; + leak->wdlist_line = 0; +#endif +} + +void +spl_mutex_destroy(kmutex_t *mp) +{ + if (!mp) + return; + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + + if (mp->m_owner != 0) + panic("SPL: releasing held mutex"); + + lck_mtx_destroy((lck_mtx_t *)&mp->m_lock, zfs_mutex_group); + + atomic_dec_64(&zfs_active_mutex); + +#ifdef SPL_DEBUG_MUTEX + mp->m_initialised = MUTEX_DESTROYED; + + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + mutex_enter(&mutex_list_mutex); + list_remove(&mutex_list, leak); + mp->leak = NULL; + mutex_exit(&mutex_list_mutex); + FREE(leak, M_TEMP); + } +#endif +} + + + +#ifdef SPL_DEBUG_MUTEX +void +spl_mutex_enter(kmutex_t *mp, char *file, int line) +#else +void +spl_mutex_enter(kmutex_t *mp) +#endif +{ +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + + if (mp->m_owner == current_thread()) + panic("mutex_enter: locking against myself!"); + +#ifdef DEBUG + if (*((uint64_t *)mp) == 0xdeadbeefdeadbeef) { + panic("SPL: mutex_enter"); + } +#endif + + lck_mtx_lock((lck_mtx_t *)&mp->m_lock); + mp->m_owner = current_thread(); + +#ifdef SPL_DEBUG_MUTEX + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + leak->wdlist_locktime = gethrestime_sec(); + strlcpy(leak->wdlist_file, file, sizeof (leak->wdlist_file)); + leak->wdlist_line = line; + } +#endif + +} + +void +spl_mutex_exit(kmutex_t *mp) +{ +#ifdef DEBUG + if (*((uint64_t *)mp) == 0xdeadbeefdeadbeef) { + panic("SPL: mutex_exit"); + } +#endif + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + +#ifdef SPL_DEBUG_MUTEX + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + uint64_t locktime = leak->wdlist_locktime; + uint64_t noe = gethrestime_sec(); + if ((locktime > 0) && (noe > locktime) && + noe - locktime >= SPL_MUTEX_WATCHDOG_TIMEOUT) { + printf("SPL: mutex (%p) finally released after %llus " + "by '%s':%llu\n", leak, noe - leak->wdlist_locktime, + leak->wdlist_file, leak->wdlist_line); + } 
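+ /* The lock is being released; clear the watchdog bookkeeping. */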
+ leak->wdlist_locktime = 0; + leak->wdlist_file[0] = 0; + leak->wdlist_line = 0; + } +#endif + mp->m_owner = NULL; + lck_mtx_unlock((lck_mtx_t *)&mp->m_lock); +} + + +int +spl_mutex_tryenter(kmutex_t *mp) +{ + int held; + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + + if (mp->m_owner == current_thread()) + panic("mutex_tryenter: locking against myself!"); + + held = lck_mtx_try_lock((lck_mtx_t *)&mp->m_lock); + if (held) { + mp->m_owner = current_thread(); + +#ifdef SPL_DEBUG_MUTEX + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + leak->wdlist_locktime = gethrestime_sec(); + strlcpy(leak->wdlist_file, "tryenter", + sizeof (leak->wdlist_file)); + leak->wdlist_line = 123; + } +#endif + + } + return (held); +} + +int +spl_mutex_owned(kmutex_t *mp) +{ + return (mp->m_owner == current_thread()); +} + +struct kthread * +spl_mutex_owner(kmutex_t *mp) +{ + return (mp->m_owner); +} diff --git a/module/os/macos/spl/spl-osx.c b/module/os/macos/spl/spl-osx.c new file mode 100644 index 0000000000..b65cb66e4d --- /dev/null +++ b/module/os/macos/spl/spl-osx.c @@ -0,0 +1,488 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _task_user_ +#include + +#include +#include + +extern int system_inshutdown; + +static utsname_t utsname_static = { { 0 } }; + +unsigned int max_ncpus = 0; +uint64_t total_memory = 0; +uint64_t real_total_memory = 0; + +// Size in bytes of the memory allocated in seg_kmem +extern uint64_t segkmem_total_mem_allocated; + +extern char hostname[MAXHOSTNAMELEN]; + +utsname_t * +utsname(void) +{ + return (&utsname_static); +} + +/* + * Solaris delay is in ticks (hz) and Darwin uses microsecs + * 1 HZ is 10 milliseconds + */ +void +osx_delay(int ticks) +{ + if (ticks < 2) { + // IODelay spins and takes microseconds as an argument + // don't spend more than 10msec spinning. 
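+ // (one tick is 10 msec, so this spins for ticks * 10000 microseconds)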
+ IODelay(ticks * 10000); + return; + } + + // ticks are 10 msec units + int64_t ticks_to_go = (int64_t)ticks * 10LL; + // zfs_lbolt() is in 10 mec units + int64_t start_tick = (int64_t)zfs_lbolt(); + int64_t end_tick = start_tick + (int64_t)ticks_to_go; + + do { + IOSleep(ticks_to_go); + int64_t cur_tick = (int64_t)zfs_lbolt(); + ticks_to_go = (end_tick - cur_tick); + } while (ticks_to_go > 0); + +} + + +uint32_t +zone_get_hostid(void *zone) +{ + size_t len; + uint32_t myhostid = 0; + + len = sizeof (myhostid); + sysctlbyname("kern.hostid", &myhostid, &len, NULL, 0); + return (myhostid); +} + +extern void *(*__ihook_malloc)(size_t size); +extern void (*__ihook_free)(void *); + +const char * +spl_panicstr(void) +{ + return (NULL); +} + +int +spl_system_inshutdown(void) +{ + return (system_inshutdown); +} + +#include +typedef struct mach_header_64 kernel_mach_header_t; +#include +typedef struct nlist_64 kernel_nlist_t; + +typedef struct segment_command_64 kernel_segment_command_t; + +typedef struct _loaded_kext_summary { + char name[KMOD_MAX_NAME]; + uuid_t uuid; + uint64_t address; + uint64_t size; + uint64_t version; + uint32_t loadTag; + uint32_t flags; + uint64_t reference_list; +} OSKextLoadedKextSummary; + +typedef struct _loaded_kext_summary_header { + uint32_t version; + uint32_t entry_size; + uint32_t numSummaries; + uint32_t reserved; /* explicit alignment for gdb */ + OSKextLoadedKextSummary summaries[0]; +} OSKextLoadedKextSummaryHeader; + +extern OSKextLoadedKextSummaryHeader * gLoadedKextSummaries; + +typedef struct _cframe_t { + struct _cframe_t *prev; + uintptr_t caller; +#if PRINT_ARGS_FROM_STACK_FRAME + unsigned args[0]; +#endif +} cframe_t; + +extern kernel_mach_header_t _mh_execute_header; + +extern kmod_info_t *kmod; /* the list of modules */ + +extern addr64_t kvtophys(vm_offset_t va); + +static int +panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search, + const char *module_name) +{ + kernel_nlist_t *sym = NULL; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + unsigned int i; + char *strings, *bestsym = NULL; + vm_address_t bestaddr = 0, diff, curdiff; + + /* + * Assume that if it's loaded and linked into the kernel, + * it's a valid Mach-O + */ + cmd = (struct load_command *)&mh[1]; + for (i = 0; i < mh->ncmds; i++) { + if (cmd->cmd == LC_SEGMENT_64) { + kernel_segment_command_t *orig_sg = + (kernel_segment_command_t *)cmd; + + if (strncmp(SEG_TEXT, orig_sg->segname, + sizeof (orig_sg->segname)) == 0) + orig_ts = orig_sg; + else if (strncmp(SEG_LINKEDIT, orig_sg->segname, + sizeof (orig_sg->segname)) == 0) + orig_le = orig_sg; + /* pre-Lion i386 kexts have a single unnamed segment */ + else if (strncmp("", orig_sg->segname, + sizeof (orig_sg->segname)) == 0) + orig_ts = orig_sg; + } else if (cmd->cmd == LC_SYMTAB) + orig_st = (struct symtab_command *)cmd; + + cmd = (struct load_command *)((uintptr_t)cmd + cmd->cmdsize); + } + + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) + return (0); + + if ((search < orig_ts->vmaddr) || + (search >= orig_ts->vmaddr + orig_ts->vmsize)) { + /* search out of range for this mach header */ + return (0); + } + + sym = (kernel_nlist_t *)(uintptr_t)(orig_le->vmaddr + + orig_st->symoff - orig_le->fileoff); + strings = (char *)(uintptr_t)(orig_le->vmaddr + + orig_st->stroff - orig_le->fileoff); + diff = search; + + for (i = 0; i < orig_st->nsyms; i++) { + if (sym[i].n_type & N_STAB) continue; + + if (sym[i].n_value <= 
search) { + curdiff = search - (vm_address_t)sym[i].n_value; + if (curdiff < diff) { + diff = curdiff; + bestaddr = sym[i].n_value; + bestsym = strings + sym[i].n_un.n_strx; + } + } + } + + if (bestsym != NULL) { + if (diff != 0) { + printf("%s : %s + 0x%lx", module_name, bestsym, + (unsigned long)diff); + } else { + printf("%s : %s", module_name, bestsym); + } + return (1); + } + return (0); +} + + +static void +panic_print_kmod_symbol_name(vm_address_t search) +{ + uint_t i; + + if (gLoadedKextSummaries == NULL) + return; + for (i = 0; i < gLoadedKextSummaries->numSummaries; ++i) { + OSKextLoadedKextSummary *summary = + gLoadedKextSummaries->summaries + i; + + if ((search >= summary->address) && + (search < (summary->address + summary->size))) { + kernel_mach_header_t *header = + (kernel_mach_header_t *)(uintptr_t)summary->address; + if (panic_print_macho_symbol_name(header, search, + summary->name) == 0) { + printf("%s + %llu", summary->name, + (unsigned long)search - summary->address); + } + break; + } + } +} + + +static void +panic_print_symbol_name(vm_address_t search) +{ + /* try searching in the kernel */ + if (panic_print_macho_symbol_name(&_mh_execute_header, + search, "mach_kernel") == 0) { + /* that failed, now try to search for the right kext */ + panic_print_kmod_symbol_name(search); + } +} + + +void +spl_backtrace(char *thesignal) +{ + void *stackptr; + + printf("SPL: backtrace \"%s\"\n", thesignal); + +#if defined(__i386__) + __asm__ volatile("movl %%ebp, %0" : "=m" (stackptr)); +#elif defined(__x86_64__) + __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); +#endif + + int frame_index; + int nframes = 16; + cframe_t *frame = (cframe_t *)stackptr; + + for (frame_index = 0; frame_index < nframes; frame_index++) { + vm_offset_t curframep = (vm_offset_t)frame; + if (!curframep) + break; + if (curframep & 0x3) { + printf("SPL: Unaligned frame\n"); + break; + } + if (!kvtophys(curframep) || + !kvtophys(curframep + sizeof (cframe_t) - 1)) { + printf("SPL: No mapping exists for frame pointer\n"); + break; + } + printf("SPL: %p : 0x%lx ", frame, frame->caller); + panic_print_symbol_name((vm_address_t)frame->caller); + printf("\n"); + frame = frame->prev; + } +} + +int +getpcstack(uintptr_t *pcstack, int pcstack_limit) +{ + int depth = 0; + void *stackptr; + +#if defined(__i386__) + __asm__ volatile("movl %%ebp, %0" : "=m" (stackptr)); +#elif defined(__x86_64__) + __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); +#endif + + int frame_index; + int nframes = pcstack_limit; + cframe_t *frame = (cframe_t *)stackptr; + + for (frame_index = 0; frame_index < nframes; frame_index++) { + vm_offset_t curframep = (vm_offset_t)frame; + if (!curframep) + break; + if (curframep & 0x3) { + break; + } + if (!kvtophys(curframep) || + !kvtophys(curframep + sizeof (cframe_t) - 1)) { + break; + } + pcstack[depth++] = frame->caller; + frame = frame->prev; + } + + return (depth); +} + +void +print_symbol(uintptr_t symbol) +{ + printf("SPL: "); + panic_print_symbol_name((vm_address_t)(symbol)); + printf("\n"); +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + int ret = 0; + + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) + bcopy(from, to, len); + else + ret = copyin((user_addr_t)from, (void *)to, len); + + return (ret); +} + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + int ret = 0; + + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + bcopy(from, to, len); + } 
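+ /* 'to' is a user-space address; copyout() performs the checked copy */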
else { + ret = copyout(from, (user_addr_t)to, len); + } + + return (ret); +} + +/* + * Technically, this call does not exist in illumos, but we use it for + * consistency. + */ +int +ddi_copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) +{ + int ret; + size_t local_done; + +#undef copyinstr + ret = copyinstr((user_addr_t)uaddr, kaddr, len, &local_done); + if (done != NULL) + *done = local_done; + return (ret); +} + +kern_return_t +spl_start(kmod_info_t *ki, void *d) +{ + printf("SPL: loading\n"); + + int ncpus; + size_t len = sizeof (ncpus); + + /* + * Boot load time is excessively early, so we have to wait + * until certain subsystems are available. Surely there is + * a more elegant way to do this wait? + */ + + while (current_proc() == NULL) { + printf("SPL: waiting for kernel init...\n"); + delay(hz>>1); + } + + while (1) { + len = sizeof (total_memory); + sysctlbyname("hw.memsize", &total_memory, &len, NULL, 0); + if (total_memory != 0) break; + + printf("SPL: waiting for sysctl...\n"); + delay(hz>>1); + } + + sysctlbyname("hw.logicalcpu_max", &max_ncpus, &len, NULL, 0); + if (!max_ncpus) max_ncpus = 1; + + /* + * Setting the total memory to physmem * 50% here, since kmem is + * not in charge of all memory and we need to leave some room for + * the OS X allocator. We internally add pressure if we step over it + */ + real_total_memory = total_memory; + total_memory = total_memory * 50ULL / 100ULL; + physmem = total_memory / PAGE_SIZE; + + len = sizeof (utsname_static.sysname); + sysctlbyname("kern.ostype", &utsname_static.sysname, &len, NULL, 0); + + /* + * For some reason, (CTLFLAG_KERN is not set) looking up hostname + * returns 1. So we set it to uuid just to give it *something*. + * As it happens, ZFS sets the nodename on init. + */ + len = sizeof (utsname_static.nodename); + sysctlbyname("kern.uuid", &utsname_static.nodename, &len, NULL, 0); + + len = sizeof (utsname_static.release); + sysctlbyname("kern.osrelease", &utsname_static.release, &len, NULL, 0); + + len = sizeof (utsname_static.version); + sysctlbyname("kern.version", &utsname_static.version, &len, NULL, 0); + + strlcpy(utsname_static.nodename, hostname, + sizeof (utsname_static.nodename)); + + spl_mutex_subsystem_init(); + spl_kmem_init(total_memory); + spl_vnode_init(); + spl_kmem_thread_init(); + spl_kmem_mp_init(); + + return (KERN_SUCCESS); +} + +kern_return_t +spl_stop(kmod_info_t *ki, void *d) +{ + spl_kmem_thread_fini(); + spl_vnode_fini(); + spl_taskq_fini(); + spl_rwlock_fini(); + spl_tsd_fini(); + spl_kmem_fini(); + spl_kstat_fini(); + spl_mutex_subsystem_fini(); + + return (KERN_SUCCESS); +} diff --git a/module/os/macos/spl/spl-policy.c b/module/os/macos/spl/spl-policy.c new file mode 100644 index 0000000000..f49a080f4a --- /dev/null +++ b/module/os/macos/spl/spl-policy.c @@ -0,0 +1,184 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include + +int +spl_priv_check_cred(kauth_cred_t cred, int priv, __unused int flags) +{ + int error; + + if (kauth_cred_getuid(cred) == 0) { + error = 0; + goto out; + } + + /* + * The default is deny, so if no policies have granted it, reject + * with a privilege error here. + */ + error = EPERM; +out: + return (error); +} + +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_UNMOUNT, 0)); +} + +int +secpolicy_nfs(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_NFS_DAEMON, 0)); +} + +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_ZFS_POOL_CONFIG, 0)); +} + +int +secpolicy_zfs(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_MOUNT, 0)); +} + +int +secpolicy_zinject(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_ZFS_INJECT, 0)); +} + +int +secpolicy_vnode_any_access(const cred_t *cr, vnode_t *vp, uid_t owner) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_access2(const cred_t *cr, vnode_t *vp, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, int flags, + int unlocked_access(void *, int, cred_t *), + void *node) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_stky_modify(const cred_t *cred) +{ + return (EPERM); +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, vattr_t *vap, const vattr_t *ovap, + cred_t *cr) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_remove(struct vnode *vp, const cred_t *cr) +{ + return (0); +} + +int +secpolicy_vnode_create_gid(const cred_t *cred) +{ + return (0); +} + +int +secpolicy_vnode_setids_setgids(struct vnode *vp, const cred_t *cr, + gid_t gid) +{ + return (0); +} + +int +secpolicy_vnode_setdac(struct vnode *vp, const cred_t *cr, uid_t u) +{ + return (0); +} + +int +secpolicy_vnode_chown(struct vnode *vp, const cred_t *cr, uid_t u) +{ + return (0); +} + +int +secpolicy_vnode_setid_retain(const cred_t *cr, int fal) +{ + return (0); +} + +int +secpolicy_xvattr(vattr_t *vap, uid_t uid, const cred_t *cr, mode_t mod) +{ + return (0); +} + +int +secpolicy_setid_clear(vattr_t *vap, const cred_t *cr) +{ + return (0); +} + +int +secpolicy_basic_link(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_fs_mount_clearopts(const cred_t *cr, struct mount *mp) +{ + return (0); +} + +int +secpolicy_fs_mount(const cred_t *cr, struct vnode *vp, struct mount *mp) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_MOUNT, 0)); +} + +int +secpolicy_zfs_proc(cred_t *cr, proc_t *proc) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_MOUNT, 0)); +} diff --git a/module/os/macos/spl/spl-proc.c b/module/os/macos/spl/spl-proc.c new file mode 100644 index 0000000000..9c90559f0f --- /dev/null +++ b/module/os/macos/spl/spl-proc.c @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +struct proc { + void *nothing; +}; + +struct proc p0 = {0}; diff --git a/module/os/macos/spl/spl-proc_list.c b/module/os/macos/spl/spl-proc_list.c new file mode 100644 index 0000000000..566c71c992 --- /dev/null +++ b/module/os/macos/spl/spl-proc_list.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +extern void *kalloc(vm_size_t size); +extern void kfree(void *data, vm_size_t size); + +typedef struct procfs_list_iter { + procfs_list_t *pli_pl; + void *pli_elt; +} pli_t; + +void +seq_printf(struct seq_file *f, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + (void) vsnprintf(f->sf_buf, f->sf_size, fmt, adx); + va_end(adx); +} + +static int +procfs_list_update(kstat_t *ksp, int rw) +{ + procfs_list_t *pl = ksp->ks_private; + + if (rw == KSTAT_WRITE) + pl->pl_clear(pl); + + return (0); +} + +static int +procfs_list_data(char *buf, size_t size, void *data) +{ + pli_t *p; + void *elt; + procfs_list_t *pl; + struct seq_file f; + + p = data; + pl = p->pli_pl; + elt = p->pli_elt; + kfree(p, sizeof (*p)); + f.sf_buf = buf; + f.sf_size = size; + return (pl->pl_show(&f, elt)); +} + +static void * +procfs_list_addr(kstat_t *ksp, loff_t n) +{ + procfs_list_t *pl = ksp->ks_private; + void *elt = ksp->ks_private1; + pli_t *p = NULL; + + + if (n == 0) + ksp->ks_private1 = list_head(&pl->pl_list); + else if (elt) + ksp->ks_private1 = list_next(&pl->pl_list, elt); + + if (ksp->ks_private1) { + p = kalloc(sizeof (*p)); + p->pli_pl = pl; + p->pli_elt = ksp->ks_private1; + } + + return (p); +} + + +void +procfs_list_install(const char *module, + const char *submodule, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + kstat_t *procfs_kstat; + + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_show = show; + procfs_list->pl_show_header = show_header; + procfs_list->pl_clear = clear; + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; + + procfs_kstat = kstat_create(module, 0, name, submodule, + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (procfs_kstat) { + procfs_kstat->ks_lock = &procfs_list->pl_lock; + procfs_kstat->ks_ndata = UINT32_MAX; + procfs_kstat->ks_private = procfs_list; + procfs_kstat->ks_update = procfs_list_update; + kstat_set_seq_raw_ops(procfs_kstat, show_header, + procfs_list_data, procfs_list_addr); + kstat_install(procfs_kstat); + procfs_list->pl_private = procfs_kstat; + } +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{ +} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + kstat_delete(procfs_list->pl_private); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} diff --git a/module/os/macos/spl/spl-processor.c b/module/os/macos/spl/spl-processor.c new file mode 100644 index 0000000000..6587aa7aee --- /dev/null +++ b/module/os/macos/spl/spl-processor.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include + +extern int cpu_number(void); + +uint32_t +getcpuid() +{ + return ((uint32_t)cpu_number()); +} + +uint64_t +spl_cpuid_features(void) +{ + i386_cpu_info_t *info; + + info = cpuid_info(); + return (info->cpuid_features); +} + +uint64_t +spl_cpuid_leaf7_features(void) +{ + i386_cpu_info_t *info; + + info = cpuid_info(); + return (info->cpuid_leaf7_features); +} diff --git a/module/os/macos/spl/spl-rwlock.c b/module/os/macos/spl/spl-rwlock.c new file mode 100644 index 0000000000..22fc260080 --- /dev/null +++ b/module/os/macos/spl/spl-rwlock.c @@ -0,0 +1,397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +static lck_attr_t *zfs_rwlock_attr = NULL; +static lck_grp_attr_t *zfs_rwlock_group_attr = NULL; +static lck_grp_t *zfs_rwlock_group = NULL; + +uint64_t zfs_active_rwlock = 0; + +#ifdef SPL_DEBUG_RWLOCK +#include +static list_t rwlock_list; +static kmutex_t rwlock_list_mutex; +struct leak { + list_node_t rwlock_leak_node; + +#define SPL_DEBUG_RWLOCK_MAXCHAR 32 + char location_file[SPL_DEBUG_RWLOCK_MAXCHAR]; + char location_function[SPL_DEBUG_RWLOCK_MAXCHAR]; + uint64_t location_line; + void *mp; + + uint64_t wdlist_locktime; // time lock was taken + char wdlist_file[32]; // storing holder + uint64_t wdlist_line; +}; + +#endif + +/* + * We run rwlock with DEBUG on for now, as it protects against + * uninitialised access etc, and almost no cost. 
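+ * (Initialisation is tracked via the rw_pad magic value; see rw_isinit().)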
+ */ +#ifndef DEBUG +#define DEBUG +#endif + +#ifdef DEBUG +int +rw_isinit(krwlock_t *rwlp) +{ + if (rwlp->rw_pad != 0x012345678) + return (0); + return (1); +} +#endif + + +#ifdef SPL_DEBUG_RWLOCK +void +rw_initx(krwlock_t *rwlp, char *name, krw_type_t type, __unused void *arg, + const char *file, const char *fn, int line) +#else +void +rw_init(krwlock_t *rwlp, char *name, krw_type_t type, __unused void *arg) +#endif +{ + ASSERT(type != RW_DRIVER); + +#ifdef DEBUG + VERIFY3U(rwlp->rw_pad, !=, 0x012345678); +#endif + + lck_rw_init((lck_rw_t *)&rwlp->rw_lock[0], + zfs_rwlock_group, zfs_rwlock_attr); + rwlp->rw_owner = NULL; + rwlp->rw_readers = 0; +#ifdef DEBUG + rwlp->rw_pad = 0x012345678; +#endif + atomic_inc_64(&zfs_active_rwlock); + +#ifdef SPL_DEBUG_RWLOCK + struct leak *leak; + + MALLOC(leak, struct leak *, + sizeof (struct leak), M_TEMP, M_WAITOK); + + if (leak) { + bzero(leak, sizeof (struct leak)); + strlcpy(leak->location_file, file, SPL_DEBUG_RWLOCK_MAXCHAR); + strlcpy(leak->location_function, fn, SPL_DEBUG_RWLOCK_MAXCHAR); + leak->location_line = line; + leak->mp = rwlp; + + mutex_enter(&rwlock_list_mutex); + list_link_init(&leak->rwlock_leak_node); + list_insert_tail(&rwlock_list, leak); + rwlp->leak = leak; + mutex_exit(&rwlock_list_mutex); + } + leak->wdlist_locktime = 0; + leak->wdlist_file[0] = 0; + leak->wdlist_line = 0; +#endif +} + +void +rw_destroy(krwlock_t *rwlp) +{ +#ifdef DEBUG + VERIFY3U(rwlp->rw_pad, ==, 0x012345678); +#endif + + lck_rw_destroy((lck_rw_t *)&rwlp->rw_lock[0], zfs_rwlock_group); +#ifdef DEBUG + rwlp->rw_pad = 0x99; +#endif + atomic_dec_64(&zfs_active_rwlock); + ASSERT(rwlp->rw_owner == NULL); + ASSERT(rwlp->rw_readers == 0); + +#ifdef SPL_DEBUG_RWLOCK + if (rwlp->leak) { + struct leak *leak = (struct leak *)rwlp->leak; + mutex_enter(&rwlock_list_mutex); + list_remove(&rwlock_list, leak); + rwlp->leak = NULL; + mutex_exit(&rwlock_list_mutex); + FREE(leak, M_TEMP); + } +#endif +} + +void +rw_enter(krwlock_t *rwlp, krw_t rw) +{ +#ifdef DEBUG + if (rwlp->rw_pad != 0x012345678) + panic("rwlock %p not initialised\n", rwlp); +#endif + + if (rw == RW_READER) { + lck_rw_lock_shared((lck_rw_t *)&rwlp->rw_lock[0]); + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); + ASSERT(rwlp->rw_owner == 0); + } else { + if (rwlp->rw_owner == current_thread()) + panic("rw_enter: locking against myself!"); + lck_rw_lock_exclusive((lck_rw_t *)&rwlp->rw_lock[0]); + ASSERT(rwlp->rw_owner == 0); + ASSERT(rwlp->rw_readers == 0); + rwlp->rw_owner = current_thread(); + } +} + +/* + * kernel private from osfmk/kern/locks.h + */ +extern boolean_t lck_rw_try_lock(lck_rw_t *lck, lck_rw_type_t lck_rw_type); + +int +rw_tryenter(krwlock_t *rwlp, krw_t rw) +{ + int held = 0; + +#ifdef DEBUG + if (rwlp->rw_pad != 0x012345678) + panic("rwlock %p not initialised\n", rwlp); +#endif + + if (rw == RW_READER) { + held = lck_rw_try_lock((lck_rw_t *)&rwlp->rw_lock[0], + LCK_RW_TYPE_SHARED); + if (held) + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); + } else { + if (rwlp->rw_owner == current_thread()) + panic("rw_tryenter: locking against myself!"); + held = lck_rw_try_lock((lck_rw_t *)&rwlp->rw_lock[0], + LCK_RW_TYPE_EXCLUSIVE); + if (held) + rwlp->rw_owner = current_thread(); + } + + return (held); +} + +/* + * It appears a difference between Darwin's + * lck_rw_lock_shared_to_exclusive() and Solaris's rw_tryupgrade() and + * FreeBSD's sx_try_upgrade() is that on failure to upgrade, the prior + * held shared/reader lock is lost on Darwin, but retained on + * Solaris/FreeBSD. 
We could re-acquire the lock in this situation, + * but it enters a possibility of blocking, when tryupgrade is meant + * to be non-blocking. + * Also note that XNU's lck_rw_lock_shared_to_exclusive() is always + * blocking (when waiting on readers), which means we can not use it. + */ +int +rw_tryupgrade(krwlock_t *rwlp) +{ + int held = 0; + + if (rwlp->rw_owner == current_thread()) + panic("rw_enter: locking against myself!"); + + /* More readers than us? give up */ + if (rwlp->rw_readers != 1) + return (0); + + /* + * It is ON. We need to drop our READER lock, and try to + * grab the WRITER as quickly as possible. + */ + atomic_dec_32((volatile uint32_t *)&rwlp->rw_readers); + lck_rw_unlock_shared((lck_rw_t *)&rwlp->rw_lock[0]); + + /* Grab the WRITER lock */ + held = lck_rw_try_lock((lck_rw_t *)&rwlp->rw_lock[0], + LCK_RW_TYPE_EXCLUSIVE); + + if (held) { + /* Looks like we won */ + rwlp->rw_owner = current_thread(); + ASSERT(rwlp->rw_readers == 0); + return (1); + } + + /* + * The worst has happened, we failed to grab WRITE lock, either + * due to another WRITER lock, or, some READER came along. + * IllumOS implementation returns with the READER lock again + * so we need to grab it. + */ + rw_enter(rwlp, RW_READER); + return (0); + +} + +void +rw_exit(krwlock_t *rwlp) +{ + if (rwlp->rw_owner == current_thread()) { + rwlp->rw_owner = NULL; + ASSERT(rwlp->rw_readers == 0); + lck_rw_unlock_exclusive((lck_rw_t *)&rwlp->rw_lock[0]); + } else { + atomic_dec_32((volatile uint32_t *)&rwlp->rw_readers); + ASSERT(rwlp->rw_owner == 0); + lck_rw_unlock_shared((lck_rw_t *)&rwlp->rw_lock[0]); + } +} + + +int +rw_lock_held(krwlock_t *rwlp) +{ + /* + * ### not sure about this one ### + */ + return (rwlp->rw_owner == current_thread() || rwlp->rw_readers > 0); +} + +int +rw_write_held(krwlock_t *rwlp) +{ + return (rwlp->rw_owner == current_thread()); +} + +void +rw_downgrade(krwlock_t *rwlp) +{ + if (rwlp->rw_owner != current_thread()) + panic("SPL: rw_downgrade not WRITE lock held\n"); + rwlp->rw_owner = NULL; + lck_rw_lock_exclusive_to_shared((lck_rw_t *)&rwlp->rw_lock[0]); + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); +} + +int +spl_rwlock_init(void) +{ + zfs_rwlock_attr = lck_attr_alloc_init(); + zfs_rwlock_group_attr = lck_grp_attr_alloc_init(); + zfs_rwlock_group = lck_grp_alloc_init("zfs-rwlock", + zfs_rwlock_group_attr); + +#ifdef SPL_DEBUG_RWLOCK + list_create(&rwlock_list, sizeof (struct leak), + offsetof(struct leak, rwlock_leak_node)); + lck_mtx_init((lck_mtx_t *)&rwlock_list_mutex.m_lock, + zfs_rwlock_group, zfs_rwlock_attr); +#endif + + return (0); +} + +void +spl_rwlock_fini(void) +{ + +#ifdef SPL_DEBUG_RWLOCK + uint64_t total = 0; + printf("Dumping leaked rwlock allocations...\n"); + + mutex_enter(&rwlock_list_mutex); + while (1) { + struct leak *leak, *runner; + uint32_t found; + + leak = list_head(&rwlock_list); + + if (leak) { + list_remove(&rwlock_list, leak); + } + if (!leak) break; + + // Run through list and count up how many times this leak is + // found, removing entries as we go. + for (found = 1, runner = list_head(&rwlock_list); + runner; + runner = runner ? 
list_next(&rwlock_list, runner) : + list_head(&rwlock_list)) { + + if (strcmp(leak->location_file, runner->location_file) + == 0 && + strcmp(leak->location_function, + runner->location_function) == 0 && + leak->location_line == runner->location_line) { + // Same place + found++; + list_remove(&rwlock_list, runner); + FREE(runner, M_TEMP); + runner = NULL; + } // if same + + } // for all nodes + + printf(" rwlock %p : %s %s %llu : # leaks: %u\n", + leak->mp, + leak->location_file, + leak->location_function, + leak->location_line, + found); + + FREE(leak, M_TEMP); + total += found; + + } + mutex_exit(&rwlock_list_mutex); + printf("Dumped %llu leaked allocations.\n", total); + + lck_mtx_destroy((lck_mtx_t *)&rwlock_list_mutex.m_lock, + zfs_rwlock_group); + list_destroy(&rwlock_list); +#endif + + lck_grp_free(zfs_rwlock_group); + zfs_rwlock_group = NULL; + + lck_grp_attr_free(zfs_rwlock_group_attr); + zfs_rwlock_group_attr = NULL; + + lck_attr_free(zfs_rwlock_attr); + zfs_rwlock_attr = NULL; + + ASSERT3U(zfs_active_rwlock, ==, 0); +} diff --git a/module/os/macos/spl/spl-seg_kmem.c b/module/os/macos/spl/spl-seg_kmem.c new file mode 100644 index 0000000000..f2b36280f3 --- /dev/null +++ b/module/os/macos/spl/spl-seg_kmem.c @@ -0,0 +1,289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include + +#include +#include +// ugly: smd +#ifdef kmem_free +#undef kmem_free +#endif + +#include +#include +#include + +#include + +/* + * seg_kmem is the primary kernel memory segment driver. It + * maps the kernel heap [kernelheap, ekernelheap), module text, + * and all memory which was allocated before the VM was initialized + * into kas. + * + * Pages which belong to seg_kmem are hashed into &kvp vnode at + * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1. + * They must never be paged out since segkmem_fault() is a no-op to + * prevent recursive faults. + * + * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on + * __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86 + * supports relocation the #ifdef kludges can be removed. + * + * seg_kmem pages may be subject to relocation by page_relocate(), + * provided that the HAT supports it; if this is so, segkmem_reloc + * will be set to a nonzero value. All boot time allocated memory as + * well as static memory is considered off limits to relocation. + * Pages are "relocatable" if p_state does not have P_NORELOC set, so + * we request P_NORELOC pages for memory that isn't safe to relocate. + * + * The kernel heap is logically divided up into four pieces: + * + * heap32_arena is for allocations that require 32-bit absolute + * virtual addresses (e.g. 
code that uses 32-bit pointers/offsets). + * + * heap_core is for allocations that require 2GB *relative* + * offsets; in other words all memory from heap_core is within + * 2GB of all other memory from the same arena. This is a requirement + * of the addressing modes of some processors in supervisor code. + * + * heap_arena is the general heap arena. + * + * static_arena is the static memory arena. Allocations from it + * are not subject to relocation so it is safe to use the memory + * physical address as well as the virtual address (e.g. the VA to + * PA translations are static). Caches may import from static_arena; + * all other static memory allocations should use static_alloc_arena. + * + * On some platforms which have limited virtual address space, seg_kmem + * may share [kernelheap, ekernelheap) with seg_kp; if this is so, + * segkp_bitmap is non-NULL, and each bit represents a page of virtual + * address space which is actually seg_kp mapped. + */ + +/* + * Rough stubbed Port for XNU. + * + * Copyright (c) 2014 Brendon Humphrey (brendon.humphrey@mac.com) + */ + + +#ifdef _KERNEL +#define XNU_KERNEL_PRIVATE +#include +extern vm_map_t kernel_map; + +/* + * These extern prototypes has to be carefully checked against XNU source + * in case Apple changes them. They are not defined in the "allowed" parts + * of the kernel.framework + */ +typedef uint8_t vm_tag_t; + +/* + * Tag we use to identify memory we have allocated + * + * (VM_KERN_MEMORY_KEXT - mach_vm_statistics.h) + */ +#define SPL_TAG 6 + +/* + * In kernel lowlevel form of malloc. + */ +extern kern_return_t kernel_memory_allocate(vm_map_t map, void **addrp, + vm_size_t size, vm_offset_t mask, int flags, vm_tag_t tag); + +/* + * Free memory + */ +extern void kmem_free(vm_map_t map, void *addr, vm_size_t size); + +#endif /* _KERNEL */ + +typedef int page_t; + +void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); +void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); + +/* Total memory held allocated */ +uint64_t segkmem_total_mem_allocated = 0; + +/* primary kernel heap arena */ +vmem_t *heap_arena; + +/* qcaches for zio and abd arenas */ +vmem_t *zio_arena_parent; + +/* arena for allocating file data */ +vmem_t *zio_arena; + +/* and for allocation of zfs metadata */ +vmem_t *zio_metadata_arena; + +#ifdef _KERNEL +extern uint64_t total_memory; +uint64_t stat_osif_malloc_success = 0; +uint64_t stat_osif_free = 0; +uint64_t stat_osif_malloc_bytes = 0; +uint64_t stat_osif_free_bytes = 0; +#endif + +void * +osif_malloc(uint64_t size) +{ +#ifdef _KERNEL + void *tr; + + kern_return_t kr = kernel_memory_allocate(kernel_map, + &tr, size, PAGESIZE, 0, SPL_TAG); + + if (kr == KERN_SUCCESS) { + atomic_inc_64(&stat_osif_malloc_success); + atomic_add_64(&segkmem_total_mem_allocated, size); + atomic_add_64(&stat_osif_malloc_bytes, size); + return (tr); + } else { + // well, this can't really happen, kernel_memory_allocate + // would panic instead + return (NULL); + } +#else + return (malloc(size)); +#endif +} + +void +osif_free(void* buf, uint64_t size) +{ +#ifdef _KERNEL + kmem_free(kernel_map, buf, size); + atomic_inc_64(&stat_osif_free); + atomic_sub_64(&segkmem_total_mem_allocated, size); + atomic_add_64(&stat_osif_free_bytes, size); +#else + free(buf); +#endif /* _KERNEL */ +} + +/* + * Configure vmem, such that the heap arena is fed, + * and drains to the kernel low level allocator. 
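+ *
+ * Illustrative sketch only (assumes the vmem interfaces declared in this
+ * port's sys/vmem.h): a consumer would allocate from the heap arena with
+ *
+ *	void *buf = vmem_alloc(heap_arena, 2 * PAGESIZE, VM_SLEEP);
+ *	...
+ *	vmem_free(heap_arena, buf, 2 * PAGESIZE);
+ *
+ * and the arena imports and releases its spans through segkmem_alloc()
+ * and segkmem_free() below, which in turn call osif_malloc()/osif_free().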
+ */ +void +kernelheap_init() +{ + heap_arena = vmem_init("heap", NULL, 0, PAGESIZE, segkmem_alloc, + segkmem_free); +} + + +void +kernelheap_fini(void) +{ + vmem_fini(heap_arena); +} + +void * +segkmem_alloc(vmem_t *vmp, size_t size, int maybe_unmasked_vmflag) +{ + return (osif_malloc(size)); +} + +void +segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +{ + osif_free(inaddr, size); + // since this is mainly called by spl_root_arena and free_arena, + // do we really want to wake up a waiter, just because we have + // transferred from one to the other? + // we already have vmem_add_a_gibibyte waking up waiters + // so specializing here seems wasteful + // (originally included in vmem_experiments) + // cv_signal(&vmp->vm_cv); +} + +/* + * OSX does not use separate heaps for the ZIO buffers, + * the ZFS code is structured such that the zio caches will + * fallback to using the kmem_default arena same + * as all the other caches. + */ +// smd: we nevertheless plumb in an arena with heap as parent, so that +// we can track stats and maintain the VM_ / qc settings differently +void +segkmem_zio_init() +{ + // note: from startup.c and vm_machparam: SEGZIOMINSIZE = 512M. + // and SEGZSIOMAXSIZE = 512G; if physmem is between the two, then + // segziosize is (physmem - SEGZIOMAXSIZE) / 2. + + // Illumos does not segregate zio_metadata_arena out of heap, + // almost exclusively for reasons involving panic dump data + // retention. However, parenting zio_metadata_arena to + // spl_root_arena and giving it its own qcaches provides better + // kstat observability *and* noticeably better performance in + // realworld (zfs/dmu) metadata-heavy activity. Additionally, + // the qcaches pester spl_heap_arena only for slabs 256k and bigger, + // and each of the qcache entries (powers of two from PAGESIZE to + // 64k) are *exact-fit* and therefore dramatically reduce internal + // fragmentation and more than pay off for the extra code and (tiny) + // extra data for holding the arenas' segment tables. + + extern vmem_t *spl_heap_arena; + + zio_arena_parent = vmem_create("zfs_qcache", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, spl_heap_arena, + 16 * 1024, VM_SLEEP | VMC_TIMEFREE); + + ASSERT(zio_arena_parent != NULL); + + zio_arena = vmem_create("zfs_file_data", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, zio_arena_parent, + 0, VM_SLEEP); + + zio_metadata_arena = vmem_create("zfs_metadata", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, zio_arena_parent, + 0, VM_SLEEP); + + ASSERT(zio_arena != NULL); + ASSERT(zio_metadata_arena != NULL); + + extern void spl_zio_no_grow_init(void); + spl_zio_no_grow_init(); +} + +void +segkmem_zio_fini(void) +{ + if (zio_arena) { + vmem_destroy(zio_arena); + } + if (zio_metadata_arena) { + vmem_destroy(zio_metadata_arena); + } + if (zio_arena_parent) { + vmem_destroy(zio_arena_parent); + } +} diff --git a/module/os/macos/spl/spl-taskq.c b/module/os/macos/spl/spl-taskq.c new file mode 100644 index 0000000000..7040c66f11 --- /dev/null +++ b/module/os/macos/spl/spl-taskq.c @@ -0,0 +1,2529 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * Copyright (C) 2015, 2020 Jorgen Lundman + */ + +/* + * Kernel task queues: general-purpose asynchronous task scheduling. + * + * A common problem in kernel programming is the need to schedule tasks + * to be performed later, by another thread. There are several reasons + * you may want or need to do this: + * + * (1) The task isn't time-critical, but your current code path is. + * + * (2) The task may require grabbing locks that you already hold. + * + * (3) The task may need to block (e.g. to wait for memory), but you + * cannot block in your current context. + * + * (4) Your code path can't complete because of some condition, but you can't + * sleep or fail, so you queue the task for later execution when condition + * disappears. + * + * (5) You just want a simple way to launch multiple tasks in parallel. + * + * Task queues provide such a facility. In its simplest form (used when + * performance is not a critical consideration) a task queue consists of a + * single list of tasks, together with one or more threads to service the + * list. There are some cases when this simple queue is not sufficient: + * + * (1) The task queues are very hot and there is a need to avoid data and lock + * contention over global resources. + * + * (2) Some tasks may depend on other tasks to complete, so they can't be put in + * the same list managed by the same thread. + * + * (3) Some tasks may block for a long time, and this should not block other + * tasks in the queue. + * + * To provide useful service in such cases we define a "dynamic task queue" + * which has an individual thread for each of the tasks. These threads are + * dynamically created as they are needed and destroyed when they are not in + * use. The API for managing task pools is the same as for managing task queues + * with the exception of a taskq creation flag TASKQ_DYNAMIC which tells that + * dynamic task pool behavior is desired. + * + * Dynamic task queues may also place tasks in the normal queue (called "backing + * queue") when task pool runs out of resources. Users of task queues may + * disallow such queued scheduling by specifying TQ_NOQUEUE in the dispatch + * flags. + * + * The backing task queue is also used for scheduling internal tasks needed for + * dynamic task queue maintenance. + * + * INTERFACES ================================================================== + * + * taskq_t *taskq_create(name, nthreads, pri, minalloc, maxall, flags); + * + * Create a taskq with specified properties. + * Possible 'flags': + * + * TASKQ_DYNAMIC: Create task pool for task management. If this flag is + * specified, 'nthreads' specifies the maximum number of threads in + * the task queue. Task execution order for dynamic task queues is + * not predictable. + * + * If this flag is not specified (default case) a + * single-list task queue is created with 'nthreads' threads + * servicing it. 
Entries in this queue are managed by + * taskq_ent_alloc() and taskq_ent_free() which try to keep the + * task population between 'minalloc' and 'maxalloc', but the + * latter limit is only advisory for TQ_SLEEP dispatches and the + * former limit is only advisory for TQ_NOALLOC dispatches. If + * TASKQ_PREPOPULATE is set in 'flags', the taskq will be + * prepopulated with 'minalloc' task structures. + * + * Since non-DYNAMIC taskqs are queues, tasks are guaranteed to be + * executed in the order they are scheduled if nthreads == 1. + * If nthreads > 1, task execution order is not predictable. + * + * TASKQ_PREPOPULATE: Prepopulate task queue with threads. + * Also prepopulate the task queue with 'minalloc' task structures. + * + * TASKQ_THREADS_CPU_PCT: This flag specifies that 'nthreads' should be + * interpreted as a percentage of the # of online CPUs on the + * system. The taskq subsystem will automatically adjust the + * number of threads in the taskq in response to CPU online + * and offline events, to keep the ratio. nthreads must be in + * the range [0,100]. + * + * The calculation used is: + * + * MAX((ncpus_online * percentage)/100, 1) + * + * This flag is not supported for DYNAMIC task queues. + * This flag is not compatible with TASKQ_CPR_SAFE. + * + * TASKQ_CPR_SAFE: This flag specifies that users of the task queue will + * use their own protocol for handling CPR issues. This flag is not + * supported for DYNAMIC task queues. This flag is not compatible + * with TASKQ_THREADS_CPU_PCT. + * + * The 'pri' field specifies the default priority for the threads that + * service all scheduled tasks. + * + * taskq_t *taskq_create_instance(name, instance, nthreads, pri, minalloc, + * maxall, flags); + * + * Like taskq_create(), but takes an instance number (or -1 to indicate + * no instance). + * + * taskq_t *taskq_create_proc(name, nthreads, pri, minalloc, maxall, proc, + * flags); + * + * Like taskq_create(), but creates the taskq threads in the specified + * system process. If proc != &p0, this must be called from a thread + * in that process. + * + * taskq_t *taskq_create_sysdc(name, nthreads, minalloc, maxall, proc, + * dc, flags); + * + * Like taskq_create_proc(), but the taskq threads will use the + * System Duty Cycle (SDC) scheduling class with a duty cycle of dc. + * + * void taskq_destroy(tap): + * + * Waits for any scheduled tasks to complete, then destroys the taskq. + * Caller should guarantee that no new tasks are scheduled in the closing + * taskq. + * + * taskqid_t taskq_dispatch(tq, func, arg, flags): + * + * Dispatches the task "func(arg)" to taskq. The 'flags' indicates whether + * the caller is willing to block for memory. The function returns an + * opaque value which is zero iff dispatch fails. If flags is TQ_NOSLEEP + * or TQ_NOALLOC and the task can't be dispatched, taskq_dispatch() fails + * and returns (taskqid_t)0. + * + * ASSUMES: func != NULL. + * + * Possible flags: + * TQ_NOSLEEP: Do not wait for resources; may fail. + * + * TQ_NOALLOC: Do not allocate memory; may fail. May only be used with + * non-dynamic task queues. + * + * TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to + * lack of available resources and fail. If this flag is not + * set, and the task pool is exhausted, the task may be scheduled + * in the backing queue. This flag may ONLY be used with dynamic + * task queues. + * + * NOTE: This flag should always be used when a task queue is used + * for tasks that may depend on each other for completion. 
+ * Enqueueing dependent tasks may create deadlocks. + * + * TQ_SLEEP: May block waiting for resources. May still fail for + * dynamic task queues if TQ_NOQUEUE is also specified, otherwise + * always succeed. + * + * TQ_FRONT: Puts the new task at the front of the queue. Be careful. + * + * NOTE: Dynamic task queues are much more likely to fail in + * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it + * is important to have backup strategies handling such failures. + * + * void taskq_dispatch_ent(tq, func, arg, flags, tqent) + * + * This is a light-weight form of taskq_dispatch(), that uses a + * preallocated taskq_ent_t structure for scheduling. As a + * result, it does not perform allocations and cannot ever fail. + * Note especially that it cannot be used with TASKQ_DYNAMIC + * taskqs. The memory for the tqent must not be modified or used + * until the function (func) is called. (However, func itself + * may safely modify or free this memory, once it is called.) + * Note that the taskq framework will NOT free this memory. + * + * void taskq_wait(tq): + * + * Waits for all previously scheduled tasks to complete. + * + * NOTE: It does not stop any new task dispatches. + * Do NOT call taskq_wait() from a task: it will cause deadlock. + * + * void taskq_suspend(tq) + * + * Suspend all task execution. Tasks already scheduled for a dynamic task + * queue will still be executed, but all new scheduled tasks will be + * suspended until taskq_resume() is called. + * + * int taskq_suspended(tq) + * + * Returns 1 if taskq is suspended and 0 otherwise. It is intended to + * ASSERT that the task queue is suspended. + * + * void taskq_resume(tq) + * + * Resume task queue execution. + * + * int taskq_member(tq, thread) + * + * Returns 1 if 'thread' belongs to taskq 'tq' and 0 otherwise. The + * intended use is to ASSERT that a given function is called in taskq + * context only. + * + * system_taskq + * + * Global system-wide dynamic task queue for common uses. It may be used by + * any subsystem that needs to schedule tasks and does not need to manage + * its own task queues. It is initialized quite early during system boot. + * + * IMPLEMENTATION ============================================================== + * + * This is schematic representation of the task queue structures. + * + * taskq: + * +-------------+ + * | tq_lock | +---< taskq_ent_free() + * +-------------+ | + * |... | | tqent: tqent: + * +-------------+ | +------------+ +------------+ + * | tq_freelist |-->| tqent_next |--> ... ->| tqent_next | + * +-------------+ +------------+ +------------+ + * |... | | ... | | ... | + * +-------------+ +------------+ +------------+ + * | tq_task | | + * | | +-------------->taskq_ent_alloc() + * +--------------------------------------------------------------------------+ + * | | | tqent tqent | + * | +---------------------+ +--> +------------+ +--> +------------+ | + * | | ... | | | func, arg | | | func, arg | | + * +>+---------------------+ <---|-+ +------------+ <---|-+ +------------+ | + * | tq_taskq.tqent_next | ----+ | | tqent_next | --->+ | | tqent_next |--+ + * +---------------------+ | +------------+ ^ | +------------+ + * +-| tq_task.tqent_prev | +--| tqent_prev | | +--| tqent_prev | ^ + * | +---------------------+ +------------+ | +------------+ | + * | |... | | ... | | | ... | | + * | +---------------------+ +------------+ | +------------+ | + * | ^ | | + * | | | | + * +--------------------------------------+--------------+ TQ_APPEND() -+ + * | | | + * |... 
| taskq_thread()-----+ + * +-------------+ + * | tq_buckets |--+-------> [ NULL ] (for regular task queues) + * +-------------+ | + * | DYNAMIC TASK QUEUES: + * | + * +-> taskq_bucket[nCPU] taskq_bucket_dispatch() + * +-------------------+ ^ + * +--->| tqbucket_lock | | + * | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | ^ + * | +-------------------+<--+--------+<--...+--------+ | + * | | ... | | thread | | thread | | + * | +-------------------+ +--------+ +--------+ | + * | +-------------------+ | + * taskq_dispatch()--+--->| tqbucket_lock | TQ_APPEND()------+ + * TQ_HASH() | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | + * | +-------------------+<--+--------+<--...+--------+ + * | | ... | | thread | | thread | + * | +-------------------+ +--------+ +--------+ + * +---> ... + * + * + * Task queues use tq_task field to link new entry in the queue. The queue is a + * circular doubly-linked list. Entries are put in the end of the list with + * TQ_APPEND() and processed from the front of the list by taskq_thread() in + * FIFO order. Task queue entries are cached in the free list managed by + * taskq_ent_alloc() and taskq_ent_free() functions. + * + * All threads used by task queues mark t_taskq field of the thread to + * point to the task queue. + * + * Taskq Thread Management ----------------------------------------------------- + * + * Taskq's non-dynamic threads are managed with several variables and flags: + * + * * tq_nthreads - The number of threads in taskq_thread() for the + * taskq. + * + * * tq_active - The number of threads not waiting on a CV in + * taskq_thread(); includes newly created threads + * not yet counted in tq_nthreads. + * + * * tq_nthreads_target + * - The number of threads desired for the taskq. + * + * * tq_flags & TASKQ_CHANGING + * - Indicates that tq_nthreads != tq_nthreads_target. + * + * * tq_flags & TASKQ_THREAD_CREATED + * - Indicates that a thread is being created in the taskq. + * + * During creation, tq_nthreads and tq_active are set to 0, and + * tq_nthreads_target is set to the number of threads desired. The + * TASKQ_CHANGING flag is set, and taskq_thread_create() is called to + * create the first thread. taskq_thread_create() increments tq_active, + * sets TASKQ_THREAD_CREATED, and creates the new thread. + * + * Each thread starts in taskq_thread(), clears the TASKQ_THREAD_CREATED + * flag, and increments tq_nthreads. It stores the new value of + * tq_nthreads as its "thread_id", and stores its thread pointer in the + * tq_threadlist at the (thread_id - 1). We keep the thread_id space + * densely packed by requiring that only the largest thread_id can exit during + * normal adjustment. The exception is during the destruction of the + * taskq; once tq_nthreads_target is set to zero, no new threads will be created + * for the taskq queue, so every thread can exit without any ordering being + * necessary. + * + * Threads will only process work if their thread id is <= tq_nthreads_target. + * + * When TASKQ_CHANGING is set, threads will check the current thread target + * whenever they wake up, and do whatever they can to apply its effects. 
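+ *
+ * For example (an illustrative walk-through of the flags above): raising
+ * tq_nthreads_target from 1 to 3 sets TASKQ_CHANGING; a woken thread
+ * notices the new target and calls taskq_thread_create() (which sets
+ * TASKQ_THREAD_CREATED and bumps tq_active); each new thread then clears
+ * TASKQ_THREAD_CREATED and increments tq_nthreads, until tq_nthreads
+ * reaches tq_nthreads_target and TASKQ_CHANGING is cleared.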
+ * + * TASKQ_THREAD_CPU_PCT -------------------------------------------------------- + * + * When a taskq is created with TASKQ_THREAD_CPU_PCT, we store their requested + * percentage in tq_threads_ncpus_pct, start them off with the correct thread + * target, and add them to the taskq_cpupct_list for later adjustment. + * + * We register taskq_cpu_setup() to be called whenever a CPU changes state. It + * walks the list of TASKQ_THREAD_CPU_PCT taskqs, adjusts their nthread_target + * if need be, and wakes up all of the threads to process the change. + * + * Dynamic Task Queues Implementation ------------------------------------------ + * + * For a dynamic task queues there is a 1-to-1 mapping between a thread and + * taskq_ent_structure. Each entry is serviced by its own thread and each thread + * is controlled by a single entry. + * + * Entries are distributed over a set of buckets. To avoid using modulo + * arithmetics the number of buckets is 2^n and is determined as the nearest + * power of two roundown of the number of CPUs in the system. Tunable + * variable 'taskq_maxbuckets' limits the maximum number of buckets. Each entry + * is attached to a bucket for its lifetime and can't migrate to other buckets. + * + * Entries that have scheduled tasks are not placed in any list. The dispatch + * function sets their "func" and "arg" fields and signals the corresponding + * thread to execute the task. Once the thread executes the task it clears the + * "func" field and places an entry on the bucket cache of free entries pointed + * by "tqbucket_freelist" field. ALL entries on the free list should have "func" + * field equal to NULL. The free list is a circular doubly-linked list identical + * in structure to the tq_task list above, but entries are taken from it in LIFO + * order - the last freed entry is the first to be allocated. The + * taskq_bucket_dispatch() function gets the most recently used entry from the + * free list, sets its "func" and "arg" fields and signals a worker thread. + * + * After executing each task a per-entry thread taskq_d_thread() places its + * entry on the bucket free list and goes to a timed sleep. If it wakes up + * without getting new task it removes the entry from the free list and destroys + * itself. The thread sleep time is controlled by a tunable variable + * `taskq_thread_timeout'. + * + * There are various statistics kept in the bucket which allows for later + * analysis of taskq usage patterns. Also, a global copy of taskq creation and + * death statistics is kept in the global taskq data structure. Since thread + * creation and death happen rarely, updating such global data does not present + * a performance problem. + * + * NOTE: Threads are not bound to any CPU and there is absolutely no association + * between the bucket and actual thread CPU, so buckets are used only to + * split resources and reduce resource contention. Having threads attached + * to the CPU denoted by a bucket may reduce number of times the job + * switches between CPUs. + * + * Current algorithm creates a thread whenever a bucket has no free + * entries. It would be nice to know how many threads are in the running + * state and don't create threads if all CPUs are busy with existing + * tasks, but it is unclear how such strategy can be implemented. + * + * Currently buckets are created statically as an array attached to task + * queue. On some system with nCPUs < max_ncpus it may waste system + * memory. 
One solution may be allocation of buckets when they are first + * touched, but it is not clear how useful it is. + * + * SUSPEND/RESUME implementation ----------------------------------------------- + * + * Before executing a task taskq_thread() (executing non-dynamic task + * queues) obtains taskq's thread lock as a reader. The taskq_suspend() + * function gets the same lock as a writer blocking all non-dynamic task + * execution. The taskq_resume() function releases the lock allowing + * taskq_thread to continue execution. + * + * For dynamic task queues, each bucket is marked as TQBUCKET_SUSPEND by + * taskq_suspend() function. After that taskq_bucket_dispatch() always + * fails, so that taskq_dispatch() will either enqueue tasks for a + * suspended backing queue or fail if TQ_NOQUEUE is specified in dispatch + * flags. + * + * NOTE: taskq_suspend() does not immediately block any tasks already + * scheduled for dynamic task queues. It only suspends new tasks + * scheduled after taskq_suspend() was called. + * + * taskq_member() function works by comparing a thread t_taskq pointer with + * the passed thread pointer. + * + * LOCKS and LOCK Hierarchy ---------------------------------------------------- + * + * There are three locks used in task queues: + * + * 1) The taskq_t's tq_lock, protecting global task queue state. + * + * 2) Each per-CPU bucket has a lock for bucket management. + * + * 3) The global taskq_cpupct_lock, which protects the list of + * TASKQ_THREADS_CPU_PCT taskqs. + * + * If both (1) and (2) are needed, tq_lock should be taken *after* the bucket + * lock. + * + * If both (1) and (3) are needed, tq_lock should be taken *after* + * taskq_cpupct_lock. + * + * DEBUG FACILITIES ------------------------------------------------------------ + * + * For DEBUG kernels it is possible to induce random failures to + * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of + * taskq_dmtbf and taskq_smtbf tunables control the mean time between induced + * failures for dynamic and static task queues respectively. + * + * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics. + * + * TUNABLES -------------------------------------------------------------------- + * + * system_taskq_size - Size of the global system_taskq. + * This value is multiplied by nCPUs to determine + * actual size. + * Default value: 64 + * + * taskq_minimum_nthreads_max + * - Minimum size of the thread list for a taskq. + * Useful for testing different thread pool + * sizes by overwriting tq_nthreads_target. + * + * taskq_thread_timeout - Maximum idle time for taskq_d_thread() + * Default value: 5 minutes + * + * taskq_maxbuckets - Maximum number of buckets in any task queue + * Default value: 128 + * + * taskq_search_depth - Maximum # of buckets searched for a free entry + * Default value: 4 + * + * taskq_dmtbf - Mean time between induced dispatch failures + * for dynamic task queues. + * Default value: UINT_MAX (no induced failures) + * + * taskq_smtbf - Mean time between induced dispatch failures + * for static task queues. + * Default value: UINT_MAX (no induced failures) + * + * CONDITIONAL compilation ----------------------------------------------------- + * + * TASKQ_STATISTIC - If set will enable bucket statistic (default). 
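+ *
+ * EXAMPLE ---------------------------------------------------------------
+ *
+ * A minimal usage sketch of the interfaces documented above (illustrative
+ * only; "my_func" and "my_arg" are placeholders supplied by the caller):
+ *
+ *	taskq_t *tq = taskq_create("example_taskq", 4, minclsyspri,
+ *	    4, 512, TASKQ_PREPOPULATE);
+ *
+ *	if (taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP) == (taskqid_t)0)
+ *		(handle dispatch failure)
+ *
+ *	taskq_wait(tq);
+ *	taskq_destroy(tq);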
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For throttlefree */ +#include +#include +#include + +static kmem_cache_t *taskq_ent_cache, *taskq_cache; + +static uint_t taskq_tsd; + +/* + * Pseudo instance numbers for taskqs without explicitly provided instance. + */ +static vmem_t *taskq_id_arena; + +/* Global system task queue for common use */ +taskq_t *system_taskq = NULL; +taskq_t *system_delay_taskq = NULL; + +/* + * Maximum number of entries in global system taskq is + * system_taskq_size * max_ncpus + */ +#define SYSTEM_TASKQ_SIZE 64 +int system_taskq_size = SYSTEM_TASKQ_SIZE; + +/* + * Minimum size for tq_nthreads_max; useful for those who want to play around + * with increasing a taskq's tq_nthreads_target. + */ +int taskq_minimum_nthreads_max = 1; + +/* + * We want to ensure that when taskq_create() returns, there is at least + * one thread ready to handle requests. To guarantee this, we have to wait + * for the second thread, since the first one cannot process requests until + * the second thread has been created. + */ +#define TASKQ_CREATE_ACTIVE_THREADS 2 + +/* Maximum percentage allowed for TASKQ_THREADS_CPU_PCT */ +#define TASKQ_CPUPCT_MAX_PERCENT 1000 +int taskq_cpupct_max_percent = TASKQ_CPUPCT_MAX_PERCENT; + +/* + * Dynamic task queue threads that don't get any work within + * taskq_thread_timeout destroy themselves + */ +#define TASKQ_THREAD_TIMEOUT (60 * 5) +int taskq_thread_timeout = TASKQ_THREAD_TIMEOUT; + +#define TASKQ_MAXBUCKETS 128 +int taskq_maxbuckets = TASKQ_MAXBUCKETS; + +/* + * When a bucket has no available entries another buckets are tried. + * taskq_search_depth parameter limits the amount of buckets that we search + * before failing. This is mostly useful in systems with many CPUs where we may + * spend too much time scanning busy buckets. + */ +#define TASKQ_SEARCH_DEPTH 4 +int taskq_search_depth = TASKQ_SEARCH_DEPTH; + +/* + * Hashing function: mix various bits of x. May be pretty much anything. + */ +#define TQ_HASH(x) ((x) ^ ((x) >> 11) ^ ((x) >> 17) ^ ((x) ^ 27)) + +/* + * We do not create any new threads when the system is low on memory and start + * throttling memory allocations. The following macro tries to estimate such + * condition. + */ +#ifdef __APPLE__ +#define ENOUGH_MEMORY() (!spl_vm_pool_low()) +#else +#define ENOUGH_MEMORY() (freemem > throttlefree) +#endif + +/* + * Static functions. + */ +static taskq_t *taskq_create_common(const char *, int, int, pri_t, int, + int, proc_t *, uint_t, uint_t); +static void taskq_thread(void *); +static void taskq_d_thread(taskq_ent_t *); +static void taskq_bucket_extend(void *); +static int taskq_constructor(void *, void *, int); +static void taskq_destructor(void *, void *); +static int taskq_ent_constructor(void *, void *, int); +static void taskq_ent_destructor(void *, void *); +static taskq_ent_t *taskq_ent_alloc(taskq_t *, int); +static void taskq_ent_free(taskq_t *, taskq_ent_t *); +static int taskq_ent_exists(taskq_t *, task_func_t, void *); +static taskq_ent_t *taskq_bucket_dispatch(taskq_bucket_t *, task_func_t, + void *); + +/* + * Task queues kstats. 
+ */ +struct taskq_kstat { + kstat_named_t tq_pid; + kstat_named_t tq_tasks; + kstat_named_t tq_executed; + kstat_named_t tq_maxtasks; + kstat_named_t tq_totaltime; + kstat_named_t tq_nalloc; + kstat_named_t tq_nactive; + kstat_named_t tq_pri; + kstat_named_t tq_nthreads; +} taskq_kstat = { + { "pid", KSTAT_DATA_UINT64 }, + { "tasks", KSTAT_DATA_UINT64 }, + { "executed", KSTAT_DATA_UINT64 }, + { "maxtasks", KSTAT_DATA_UINT64 }, + { "totaltime", KSTAT_DATA_UINT64 }, + { "nactive", KSTAT_DATA_UINT64 }, + { "nalloc", KSTAT_DATA_UINT64 }, + { "priority", KSTAT_DATA_UINT64 }, + { "threads", KSTAT_DATA_UINT64 }, +}; + +struct taskq_d_kstat { + kstat_named_t tqd_pri; + kstat_named_t tqd_btasks; + kstat_named_t tqd_bexecuted; + kstat_named_t tqd_bmaxtasks; + kstat_named_t tqd_bnalloc; + kstat_named_t tqd_bnactive; + kstat_named_t tqd_btotaltime; + kstat_named_t tqd_hits; + kstat_named_t tqd_misses; + kstat_named_t tqd_overflows; + kstat_named_t tqd_tcreates; + kstat_named_t tqd_tdeaths; + kstat_named_t tqd_maxthreads; + kstat_named_t tqd_nomem; + kstat_named_t tqd_disptcreates; + kstat_named_t tqd_totaltime; + kstat_named_t tqd_nalloc; + kstat_named_t tqd_nfree; +} taskq_d_kstat = { + { "priority", KSTAT_DATA_UINT64 }, + { "btasks", KSTAT_DATA_UINT64 }, + { "bexecuted", KSTAT_DATA_UINT64 }, + { "bmaxtasks", KSTAT_DATA_UINT64 }, + { "bnalloc", KSTAT_DATA_UINT64 }, + { "bnactive", KSTAT_DATA_UINT64 }, + { "btotaltime", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "overflows", KSTAT_DATA_UINT64 }, + { "tcreates", KSTAT_DATA_UINT64 }, + { "tdeaths", KSTAT_DATA_UINT64 }, + { "maxthreads", KSTAT_DATA_UINT64 }, + { "nomem", KSTAT_DATA_UINT64 }, + { "disptcreates", KSTAT_DATA_UINT64 }, + { "totaltime", KSTAT_DATA_UINT64 }, + { "nalloc", KSTAT_DATA_UINT64 }, + { "nfree", KSTAT_DATA_UINT64 }, +}; + +static kmutex_t taskq_kstat_lock; +static kmutex_t taskq_d_kstat_lock; +static int taskq_kstat_update(kstat_t *, int); +static int taskq_d_kstat_update(kstat_t *, int); + +/* + * List of all TASKQ_THREADS_CPU_PCT taskqs. + */ +static list_t taskq_cpupct_list; /* protected by cpu_lock */ + +/* + * Collect per-bucket statistic when TASKQ_STATISTIC is defined. + */ +#define TASKQ_STATISTIC 1 + +#if TASKQ_STATISTIC +#define TQ_STAT(b, x) b->tqbucket_stat.x++ +#else +#define TQ_STAT(b, x) +#endif + +/* + * Random fault injection. + */ +uint_t taskq_random; +uint_t taskq_dmtbf = UINT_MAX; /* mean time between injected failures */ +uint_t taskq_smtbf = UINT_MAX; /* mean time between injected failures */ + +/* + * TQ_NOSLEEP dispatches on dynamic task queues are always allowed to fail. + * + * TQ_NOSLEEP dispatches on static task queues can't arbitrarily fail because + * they could prepopulate the cache and make sure that they do not use more + * then minalloc entries. So, fault injection in this case insures that + * either TASKQ_PREPOPULATE is not set or there are more entries allocated + * than is specified by minalloc. TQ_NOALLOC dispatches are always allowed + * to fail, but for simplicity we treat them identically to TQ_NOSLEEP + * dispatches. 
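+ *
+ * For example (DEBUG kernels only): setting taskq_dmtbf to 100 causes
+ * roughly one in every 100 TQ_NOSLEEP dispatches to a dynamic task queue
+ * to fail, which is useful for exercising callers' failure paths.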
+ */ +#ifdef DEBUG +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & TQ_NOSLEEP) && \ + taskq_random < 1771875 / taskq_dmtbf) { \ + return (0); \ + } + +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & (TQ_NOSLEEP | TQ_NOALLOC)) && \ + (!(tq->tq_flags & TASKQ_PREPOPULATE) || \ + (tq->tq_nalloc > tq->tq_minalloc)) && \ + (taskq_random < (1771875 / taskq_smtbf))) { \ + mutex_exit(&tq->tq_lock); \ + return (0); \ + } +#else +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) +#endif + +#define IS_EMPTY(l) (((l).tqent_prev == (l).tqent_next) && \ + ((l).tqent_prev == &(l))) + +/* + * Append `tqe' in the end of the doubly-linked list denoted by l. + */ +#define TQ_APPEND(l, tqe) { \ + tqe->tqent_next = &l; \ + tqe->tqent_prev = l.tqent_prev; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} +/* + * Prepend 'tqe' to the beginning of l + */ +#define TQ_PREPEND(l, tqe) { \ + tqe->tqent_next = l.tqent_next; \ + tqe->tqent_prev = &l; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} + +/* + * Schedule a task specified by func and arg into the task queue entry tqe. + */ +#define TQ_DO_ENQUEUE(tq, tqe, func, arg, front) { \ + ASSERT(MUTEX_HELD(&tq->tq_lock)); \ + _NOTE(CONSTCOND) \ + if (front) { \ + TQ_PREPEND(tq->tq_task, tqe); \ + } else { \ + TQ_APPEND(tq->tq_task, tqe); \ + } \ + tqe->tqent_func = (func); \ + tqe->tqent_arg = (arg); \ + tq->tq_tasks++; \ + if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \ + tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed; \ + cv_signal(&tq->tq_dispatch_cv); \ + DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \ +} + +#define TQ_ENQUEUE(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 0) + +#define TQ_ENQUEUE_FRONT(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 1) + +/* + * Do-nothing task which may be used to prepopulate thread caches. 
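+ * For example, taskq_dispatch(tq, nulltask, NULL, TQ_SLEEP) exercises the
+ * dispatch path without performing any work.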
+ */ +/*ARGSUSED*/ +void +nulltask(void *unused) +{ +} + +/*ARGSUSED*/ +static int +taskq_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_t *tq = buf; + + bzero(tq, sizeof (taskq_t)); + + mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); + cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_exit_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); + + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_destructor(void *buf, void *cdrarg) +{ + taskq_t *tq = buf; + + ASSERT(tq->tq_nthreads == 0); + ASSERT(tq->tq_buckets == NULL); + ASSERT(tq->tq_tcreates == 0); + ASSERT(tq->tq_tdeaths == 0); + + mutex_destroy(&tq->tq_lock); + rw_destroy(&tq->tq_threadlock); + cv_destroy(&tq->tq_dispatch_cv); + cv_destroy(&tq->tq_exit_cv); + cv_destroy(&tq->tq_wait_cv); + cv_destroy(&tq->tq_maxalloc_cv); +} + +/*ARGSUSED*/ +static int +taskq_ent_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_ent_t *tqe = buf; + + tqe->tqent_thread = NULL; + cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL); +#ifdef __APPLE__ + /* Simulate TS_STOPPED */ + mutex_init(&tqe->tqent_thread_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tqe->tqent_thread_cv, NULL, CV_DEFAULT, NULL); +#endif /* __APPLE__ */ + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_ent_destructor(void *buf, void *cdrarg) +{ + taskq_ent_t *tqe = buf; + + ASSERT(tqe->tqent_thread == NULL); + cv_destroy(&tqe->tqent_cv); +#ifdef __APPLE__ + /* See comment in taskq_d_thread(). */ + mutex_destroy(&tqe->tqent_thread_lock); + cv_destroy(&tqe->tqent_thread_cv); +#endif /* __APPLE__ */ +} + +int +spl_taskq_init(void) +{ + tsd_create(&taskq_tsd, NULL); + + taskq_ent_cache = kmem_cache_create("taskq_ent_cache", + sizeof (taskq_ent_t), 0, taskq_ent_constructor, + taskq_ent_destructor, NULL, NULL, NULL, 0); + taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t), + 0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0); + taskq_id_arena = vmem_create("taskq_id_arena", + (void *)1, INT32_MAX, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + list_create(&taskq_cpupct_list, sizeof (taskq_t), + offsetof(taskq_t, tq_cpupct_link)); + + mutex_init(&taskq_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&taskq_d_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +void +spl_taskq_fini(void) +{ + mutex_destroy(&taskq_d_kstat_lock); + mutex_destroy(&taskq_kstat_lock); + + if (taskq_cache) { + kmem_cache_destroy(taskq_cache); + taskq_cache = NULL; + } + if (taskq_ent_cache) { + kmem_cache_destroy(taskq_ent_cache); + taskq_ent_cache = NULL; + } + + list_destroy(&taskq_cpupct_list); + + vmem_destroy(taskq_id_arena); + + tsd_destroy(&taskq_tsd); +} + + + + +static void +taskq_update_nthreads(taskq_t *tq, uint_t ncpus) +{ + uint_t newtarget = TASKQ_THREADS_PCT(ncpus, tq->tq_threads_ncpus_pct); + +#ifndef __APPLE__ + ASSERT(MUTEX_HELD(&cpu_lock)); +#endif + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* We must be going from non-zero to non-zero; no exiting. 
*/ + ASSERT3U(tq->tq_nthreads_target, !=, 0); + ASSERT3U(newtarget, !=, 0); + + ASSERT3U(newtarget, <=, tq->tq_nthreads_max); + if (newtarget != tq->tq_nthreads_target) { + tq->tq_flags |= TASKQ_CHANGING; + tq->tq_nthreads_target = newtarget; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + } +} + +#ifndef __APPLE__ +/* No dynamic CPU add/remove in XNU, so we can just use static ncpu math */ + +/* called during task queue creation */ +static void +taskq_cpupct_install(taskq_t *tq, cpupart_t *cpup) +{ + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + + mutex_enter(&cpu_lock); + mutex_enter(&tq->tq_lock); + tq->tq_cpupart = cpup->cp_id; + taskq_update_nthreads(tq, cpup->cp_ncpus); + mutex_exit(&tq->tq_lock); + + list_insert_tail(&taskq_cpupct_list, tq); + mutex_exit(&cpu_lock); +} + +static void +taskq_cpupct_remove(taskq_t *tq) +{ + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + + mutex_enter(&cpu_lock); + list_remove(&taskq_cpupct_list, tq); + mutex_exit(&cpu_lock); +} + +/*ARGSUSED*/ +static int +taskq_cpu_setup(cpu_setup_t what, int id, void *arg) +{ + taskq_t *tq; + cpupart_t *cp = cpu[id]->cpu_part; + uint_t ncpus = cp->cp_ncpus; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(ncpus > 0); + + switch (what) { + case CPU_OFF: + case CPU_CPUPART_OUT: + /* offlines are called *before* the cpu is offlined. */ + if (ncpus > 1) + ncpus--; + break; + + case CPU_ON: + case CPU_CPUPART_IN: + break; + + default: + return (0); /* doesn't affect cpu count */ + } + + for (tq = list_head(&taskq_cpupct_list); tq != NULL; + tq = list_next(&taskq_cpupct_list, tq)) { + + mutex_enter(&tq->tq_lock); + /* + * If the taskq is part of the cpuset which is changing, + * update its nthreads_target. + */ + if (tq->tq_cpupart == cp->cp_id) { + taskq_update_nthreads(tq, ncpus); + } + mutex_exit(&tq->tq_lock); + } + return (0); +} + +void +taskq_mp_init(void) +{ + mutex_enter(&cpu_lock); + register_cpu_setup_func(taskq_cpu_setup, NULL); + /* + * Make sure we're up to date. At this point in boot, there is only + * one processor set, so we only have to update the current CPU. + */ + (void) taskq_cpu_setup(CPU_ON, CPU->cpu_id, NULL); + mutex_exit(&cpu_lock); +} +#endif /* __APPLE__ */ + + +/* + * Create global system dynamic task queue. + */ +void +system_taskq_init(void) +{ + system_taskq = taskq_create_common("system_taskq", 0, + system_taskq_size * max_ncpus, minclsyspri, 4, 512, &p0, 0, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); + system_delay_taskq = taskq_create("system_delay_taskq", max_ncpus, + minclsyspri, 0, 0, 0); +} + + +void +system_taskq_fini(void) +{ + if (system_taskq) + taskq_destroy(system_delay_taskq); + if (system_taskq) + taskq_destroy(system_taskq); + system_taskq = NULL; +} + +/* + * taskq_ent_alloc() + * + * Allocates a new taskq_ent_t structure either from the free list or from the + * cache. Returns NULL if it can't be allocated. + * + * Assumes: tq->tq_lock is held. + */ +static taskq_ent_t * +taskq_ent_alloc(taskq_t *tq, int flags) +{ + int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + taskq_ent_t *tqe; + clock_t wait_time; + clock_t wait_rv; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* + * TQ_NOALLOC allocations are allowed to use the freelist, even if + * we are below tq_minalloc. 
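+	 * (TQ_NOALLOC callers never allocate from the kmem cache, so the
+	 * freelist is the only place such an entry can come from.)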
+ */ +again: if ((tqe = tq->tq_freelist) != NULL && + ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) { + tq->tq_freelist = tqe->tqent_next; + } else { + if (flags & TQ_NOALLOC) + return (NULL); + + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (kmflags & KM_NOSLEEP) + return (NULL); + + /* + * We don't want to exceed tq_maxalloc, but we can't + * wait for other tasks to complete (and thus free up + * task structures) without risking deadlock with + * the caller. So, we just delay for one second + * to throttle the allocation rate. If we have tasks + * complete before one second timeout expires then + * taskq_ent_free will signal us and we will + * immediately retry the allocation (reap free). + */ + wait_time = ddi_get_lbolt() + hz; + while (tq->tq_freelist == NULL) { + tq->tq_maxalloc_wait++; + wait_rv = cv_timedwait(&tq->tq_maxalloc_cv, + &tq->tq_lock, wait_time); + tq->tq_maxalloc_wait--; + if (wait_rv == -1) + break; + } + if (tq->tq_freelist) + goto again; /* reap freelist */ + + } + mutex_exit(&tq->tq_lock); + + tqe = kmem_cache_alloc(taskq_ent_cache, kmflags); + + mutex_enter(&tq->tq_lock); + if (tqe != NULL) + tq->tq_nalloc++; + } + return (tqe); +} + +/* + * taskq_ent_free() + * + * Free taskq_ent_t structure by either putting it on the free list or freeing + * it to the cache. + * + * Assumes: tq->tq_lock is held. + */ +static void +taskq_ent_free(taskq_t *tq, taskq_ent_t *tqe) +{ + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + tqe->tqent_next = tq->tq_freelist; + tq->tq_freelist = tqe; + } else { + tq->tq_nalloc--; + mutex_exit(&tq->tq_lock); + kmem_cache_free(taskq_ent_cache, tqe); + mutex_enter(&tq->tq_lock); + } + + if (tq->tq_maxalloc_wait) + cv_signal(&tq->tq_maxalloc_cv); +} + +/* + * taskq_ent_exists() + * + * Return 1 if taskq already has entry for calling 'func(arg)'. + * + * Assumes: tq->tq_lock is held. + */ +static int +taskq_ent_exists(taskq_t *tq, task_func_t func, void *arg) +{ + taskq_ent_t *tqe; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + for (tqe = tq->tq_task.tqent_next; tqe != &tq->tq_task; + tqe = tqe->tqent_next) + if ((tqe->tqent_func == func) && (tqe->tqent_arg == arg)) + return (1); + return (0); +} + +/* + * Dispatch a task "func(arg)" to a free entry of bucket b. + * + * Assumes: no bucket locks is held. + * + * Returns: a pointer to an entry if dispatch was successful. + * NULL if there are no free entries or if the bucket is suspended. + */ +static taskq_ent_t * +taskq_bucket_dispatch(taskq_bucket_t *b, task_func_t func, void *arg) +{ + taskq_ent_t *tqe; + + ASSERT(MUTEX_NOT_HELD(&b->tqbucket_lock)); + ASSERT(func != NULL); + + mutex_enter(&b->tqbucket_lock); + + ASSERT(b->tqbucket_nfree != 0 || IS_EMPTY(b->tqbucket_freelist)); + ASSERT(b->tqbucket_nfree == 0 || !IS_EMPTY(b->tqbucket_freelist)); + + /* + * Get en entry from the freelist if there is one. + * Schedule task into the entry. 
+ */ + if ((b->tqbucket_nfree != 0) && + !(b->tqbucket_flags & TQBUCKET_SUSPEND)) { + tqe = b->tqbucket_freelist.tqent_prev; + + ASSERT(tqe != &b->tqbucket_freelist); + ASSERT(tqe->tqent_thread != NULL); + + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + b->tqbucket_nalloc++; + b->tqbucket_nfree--; + tqe->tqent_func = func; + tqe->tqent_arg = arg; + TQ_STAT(b, tqs_hits); + cv_signal(&tqe->tqent_cv); + DTRACE_PROBE2(taskq__d__enqueue, taskq_bucket_t *, b, + taskq_ent_t *, tqe); + } else { + tqe = NULL; + TQ_STAT(b, tqs_misses); + } + mutex_exit(&b->tqbucket_lock); + return (tqe); +} + +/* + * Dispatch a task. + * + * Assumes: func != NULL + * + * Returns: NULL if dispatch failed. + * non-NULL if task dispatched successfully. + * Actual return value is the pointer to taskq entry that was used to + * dispatch a task. This is useful for debugging. + */ +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_bucket_t *bucket = NULL; /* Which bucket needs extension */ + taskq_ent_t *tqe = NULL; + taskq_ent_t *tqe1; + uint_t bsize; + + ASSERT(tq != NULL); + ASSERT(func != NULL); + + if (!(tq->tq_flags & TASKQ_DYNAMIC)) { + /* + * TQ_NOQUEUE flag can't be used with non-dynamic task queues. + */ + ASSERT(!(flags & TQ_NOQUEUE)); + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags); + + if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) { + mutex_exit(&tq->tq_lock); + return (0); + } + /* Make sure we start without any flags */ + tqe->tqent_un.tqent_flags = 0; + + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } + mutex_exit(&tq->tq_lock); + return ((taskqid_t)tqe); + } + + /* + * Dynamic taskq dispatching. + */ + ASSERT(!(flags & (TQ_NOALLOC | TQ_FRONT))); + TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flags); + + bsize = tq->tq_nbuckets; + + if (bsize == 1) { + /* + * In a single-CPU case there is only one bucket, so get + * entry directly from there. + */ + if ((tqe = taskq_bucket_dispatch(tq->tq_buckets, func, arg)) + != NULL) + return ((taskqid_t)tqe); /* Fastpath */ + bucket = tq->tq_buckets; + } else { + int loopcount; + taskq_bucket_t *b; + // uintptr_t h = ((uintptr_t)CPU + (uintptr_t)arg) >> 3; + uintptr_t h = ((uintptr_t)(cpu_number()<<3) + + (uintptr_t)arg) >> 3; + + h = TQ_HASH(h); + + /* + * The 'bucket' points to the original bucket that we hit. If we + * can't allocate from it, we search other buckets, but only + * extend this one. + */ + b = &tq->tq_buckets[h & (bsize - 1)]; + ASSERT(b->tqbucket_taskq == tq); /* Sanity check */ + + /* + * Do a quick check before grabbing the lock. If the bucket does + * not have free entries now, chances are very small that it + * will after we take the lock, so we just skip it. + */ + if (b->tqbucket_nfree != 0) { + if ((tqe = taskq_bucket_dispatch(b, func, arg)) != NULL) + return ((taskqid_t)tqe); /* Fastpath */ + } else { + TQ_STAT(b, tqs_misses); + } + + bucket = b; + loopcount = MIN(taskq_search_depth, bsize); + /* + * If bucket dispatch failed, search loopcount number of buckets + * before we give up and fail. 
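+		 * (The search simply walks the neighbouring buckets
+		 * h+1, h+2, ... modulo the number of buckets.)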
+ */ + do { + b = &tq->tq_buckets[++h & (bsize - 1)]; + ASSERT(b->tqbucket_taskq == tq); /* Sanity check */ + loopcount--; + + if (b->tqbucket_nfree != 0) { + tqe = taskq_bucket_dispatch(b, func, arg); + } else { + TQ_STAT(b, tqs_misses); + } + } while ((tqe == NULL) && (loopcount > 0)); + } + + /* + * At this point we either scheduled a task and (tqe != NULL) or failed + * (tqe == NULL). Try to recover from fails. + */ + + /* + * For KM_SLEEP dispatches, try to extend the bucket and retry dispatch. + */ + if ((tqe == NULL) && !(flags & TQ_NOSLEEP)) { + /* + * taskq_bucket_extend() may fail to do anything, but this is + * fine - we deal with it later. If the bucket was successfully + * extended, there is a good chance that taskq_bucket_dispatch() + * will get this new entry, unless someone is racing with us and + * stealing the new entry from under our nose. + * taskq_bucket_extend() may sleep. + */ + taskq_bucket_extend(bucket); + TQ_STAT(bucket, tqs_disptcreates); + if ((tqe = taskq_bucket_dispatch(bucket, func, arg)) != NULL) + return ((taskqid_t)tqe); + } + + ASSERT(bucket != NULL); + + /* + * Since there are not enough free entries in the bucket, add a + * taskq entry to extend it in the background using backing queue + * (unless we already have a taskq entry to perform that extension). + */ + mutex_enter(&tq->tq_lock); + if (!taskq_ent_exists(tq, taskq_bucket_extend, bucket)) { + if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) { + TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket); + } else { + TQ_STAT(bucket, tqs_nomem); + } + } + + /* + * Dispatch failed and we can't find an entry to schedule a task. + * Revert to the backing queue unless TQ_NOQUEUE was asked. + */ + if ((tqe == NULL) && !(flags & TQ_NOQUEUE)) { + if ((tqe = taskq_ent_alloc(tq, flags)) != NULL) { + TQ_ENQUEUE(tq, tqe, func, arg); + } else { + TQ_STAT(bucket, tqs_nomem); + } + } + mutex_exit(&tq->tq_lock); + + return ((taskqid_t)tqe); +} + +/* + * FIXME, Linux has added the ability to start taskq with a given + * delay. + */ +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + clock_t timo; + + /* If it has already expired, just dispatch */ + timo = expire_time - ddi_get_lbolt(); + if (timo <= 0) + return (taskq_dispatch(tq, func, arg, flags)); + + /* Insert delayed code here: */ + return (0); +} + +void +taskq_init_ent(taskq_ent_t *t) +{ + memset(t, 0, sizeof (*t)); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *tqe) +{ + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + tqe->tqent_un.tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } + mutex_exit(&tq->tq_lock); +} + +/* + * Allow our caller to ask if there are tasks pending on the queue. + */ +boolean_t +taskq_empty(taskq_t *tq) +{ + boolean_t rv; + + mutex_enter(&tq->tq_lock); + rv = (tq->tq_task.tqent_next == &tq->tq_task) && (tq->tq_active == 0); + mutex_exit(&tq->tq_lock); + + return (rv); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (IS_EMPTY(*t)); +} + +/* + * Wait for all pending tasks to complete. + * Calling taskq_wait from a task will cause deadlock. 
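+ * In this port a NULL tq is tolerated and the call simply returns.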
+ */ +void +taskq_wait(taskq_t *tq) +{ +#ifndef __APPLE__ + ASSERT(tq != curthread->t_taskq); +#endif + + if (tq == NULL) + return; + + mutex_enter(&tq->tq_lock); + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + mutex_exit(&tq->tq_lock); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + while (b->tqbucket_nalloc > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + } + } +} + +/* + * ZOL implements taskq_wait_id() that can wait for a specific + * taskq to finish, rather than all active taskqs. Until it is + * implemented, we wait for all to complete. + */ +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + return (taskq_wait(tq)); +} + +/* + * Suspend execution of tasks. + * + * Tasks in the queue part will be suspended immediately upon return from this + * function. Pending tasks in the dynamic part will continue to execute, but all + * new tasks will be suspended. + */ +void +taskq_suspend(taskq_t *tq) +{ + rw_enter(&tq->tq_threadlock, RW_WRITER); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + b->tqbucket_flags |= TQBUCKET_SUSPEND; + mutex_exit(&b->tqbucket_lock); + } + } + /* + * Mark task queue as being suspended. Needed for taskq_suspended(). + */ + mutex_enter(&tq->tq_lock); + ASSERT(!(tq->tq_flags & TASKQ_SUSPENDED)); + tq->tq_flags |= TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); +} + +/* + * returns: 1 if tq is suspended, 0 otherwise. + */ +int +taskq_suspended(taskq_t *tq) +{ + return ((tq->tq_flags & TASKQ_SUSPENDED) != 0); +} + +/* + * Resume taskq execution. + */ +void +taskq_resume(taskq_t *tq) +{ + ASSERT(RW_WRITE_HELD(&tq->tq_threadlock)); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + b->tqbucket_flags &= ~TQBUCKET_SUSPEND; + mutex_exit(&b->tqbucket_lock); + } + } + mutex_enter(&tq->tq_lock); + ASSERT(tq->tq_flags & TASKQ_SUSPENDED); + tq->tq_flags &= ~TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); + + rw_exit(&tq->tq_threadlock); +} + +int +taskq_member(taskq_t *tq, kthread_t *thread) +{ + return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, thread)); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} + +/* + * Cancel an already dispatched task given the task id. Still pending tasks + * will be immediately canceled, and if the task is active the function will + * block until it completes. Preallocated tasks which are canceled must be + * freed by the caller. + */ +int +taskq_cancel_id(taskq_t *tq, taskqid_t id) +{ + // taskq_t *task = (taskq_t *) id; + + /* So we want to tell task to stop, and wait until it does */ + if (!EMPTY_TASKQ(tq)) + taskq_wait(tq); + + return (0); +} + +/* + * Creates a thread in the taskq. We only allow one outstanding create at + * a time. We drop and reacquire the tq_lock in order to avoid blocking other + * taskq activity while thread_create() or lwp_kernel_create() run. + * + * The first time we're called, we do some additional setup, and do not + * return until there are enough threads to start servicing requests. 
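+ * ("Enough" here means tq_nthreads has reached TASKQ_CREATE_ACTIVE_THREADS,
+ * currently 2, or the thread target if that is smaller.)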
+ */ +static void +taskq_thread_create(taskq_t *tq) +{ + kthread_t *t; + const boolean_t first = (tq->tq_nthreads == 0); + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + ASSERT(tq->tq_nthreads < tq->tq_nthreads_target); + ASSERT(!(tq->tq_flags & TASKQ_THREAD_CREATED)); + + tq->tq_flags |= TASKQ_THREAD_CREATED; + tq->tq_active++; + mutex_exit(&tq->tq_lock); + + /* + * With TASKQ_DUTY_CYCLE the new thread must have an LWP + * as explained in ../disp/sysdc.c (for the msacct data). + * Otherwise simple kthreads are preferred. + */ + if ((tq->tq_flags & TASKQ_DUTY_CYCLE) != 0) { + /* Enforced in taskq_create_common */ + printf("SPL: taskq_thread_create(TASKQ_DUTY_CYCLE) seen\n"); +#ifndef __APPLE__ + ASSERT3P(tq->tq_proc, !=, &p0); + t = lwp_kernel_create(tq->tq_proc, taskq_thread, tq, TS_RUN, + tq->tq_pri); +#endif + } else { + t = thread_create(NULL, 0, taskq_thread, tq, 0, tq->tq_proc, + TS_RUN, tq->tq_pri); + } + + if (!first) { + mutex_enter(&tq->tq_lock); + return; + } + + /* + * We know the thread cannot go away, since tq cannot be + * destroyed until creation has completed. We can therefore + * safely dereference t. + */ + if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { +#ifdef __APPLE__ + mutex_enter(&tq->tq_lock); + taskq_update_nthreads(tq, max_ncpus); + mutex_exit(&tq->tq_lock); +#else + taskq_cpupct_install(tq, t->t_cpupart); +#endif + } + mutex_enter(&tq->tq_lock); + + /* Wait until we can service requests. */ + while (tq->tq_nthreads != tq->tq_nthreads_target && + tq->tq_nthreads < TASKQ_CREATE_ACTIVE_THREADS) { + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + } +} + +/* + * Common "sleep taskq thread" function, which handles CPR stuff, as well + * as giving a nice common point for debuggers to find inactive threads. + */ +static clock_t +taskq_thread_wait(taskq_t *tq, kmutex_t *mx, kcondvar_t *cv, + callb_cpr_t *cprinfo, clock_t timeout) +{ + clock_t ret = 0; + + if (!(tq->tq_flags & TASKQ_CPR_SAFE)) { + CALLB_CPR_SAFE_BEGIN(cprinfo); + } + if ((signed long)timeout < 0) + cv_wait(cv, mx); + else + ret = cv_reltimedwait(cv, mx, timeout, TR_CLOCK_TICK); + + if (!(tq->tq_flags & TASKQ_CPR_SAFE)) { + CALLB_CPR_SAFE_END(cprinfo, mx); + } + + return (ret); +} + +/* + * Worker thread for processing task queue. + */ +static void +taskq_thread(void *arg) +{ + int thread_id; + + taskq_t *tq = arg; + taskq_ent_t *tqe; + callb_cpr_t cprinfo; + hrtime_t start, end; + boolean_t freeit; + + CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, + tq->tq_name); + + tsd_set(taskq_tsd, tq); + mutex_enter(&tq->tq_lock); + thread_id = ++tq->tq_nthreads; + ASSERT(tq->tq_flags & TASKQ_THREAD_CREATED); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + tq->tq_flags &= ~TASKQ_THREAD_CREATED; + + VERIFY3S(thread_id, <=, tq->tq_nthreads_max); + + if (tq->tq_nthreads_max == 1) + tq->tq_thread = (kthread_t *)curthread; + else + tq->tq_threadlist[thread_id - 1] = (kthread_t *)curthread; + + /* Allow taskq_create_common()'s taskq_thread_create() to return. */ + if (tq->tq_nthreads == TASKQ_CREATE_ACTIVE_THREADS) + cv_broadcast(&tq->tq_wait_cv); + + for (;;) { + if (tq->tq_flags & TASKQ_CHANGING) { + /* See if we're no longer needed */ + if (thread_id > tq->tq_nthreads_target) { + /* + * To preserve the one-to-one mapping between + * thread_id and thread, we must exit from + * highest thread ID to least. + * + * However, if everyone is exiting, the order + * doesn't matter, so just exit immediately. 
+ * (this is safe, since you must wait for + * nthreads to reach 0 after setting + * tq_nthreads_target to 0) + */ + if (thread_id == tq->tq_nthreads || + tq->tq_nthreads_target == 0) + break; + + /* Wait for higher thread_ids to exit */ + (void) taskq_thread_wait(tq, &tq->tq_lock, + &tq->tq_exit_cv, &cprinfo, -1); + continue; + } + + /* + * If no thread is starting taskq_thread(), we can + * do some bookkeeping. + */ + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) { + /* Check if we've reached our target */ + if (tq->tq_nthreads == tq->tq_nthreads_target) { + tq->tq_flags &= ~TASKQ_CHANGING; + cv_broadcast(&tq->tq_wait_cv); + } + /* Check if we need to create a thread */ + if (tq->tq_nthreads < tq->tq_nthreads_target) { + taskq_thread_create(tq); + continue; /* tq_lock was dropped */ + } + } + } + if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) { + if (--tq->tq_active == 0) + cv_broadcast(&tq->tq_wait_cv); + (void) taskq_thread_wait(tq, &tq->tq_lock, + &tq->tq_dispatch_cv, &cprinfo, -1); + tq->tq_active++; + continue; + } + + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + mutex_exit(&tq->tq_lock); + + /* + * For prealloc'd tasks, we don't free anything. We + * have to check this now, because once we call the + * function for a prealloc'd taskq, we can't touch the + * tqent any longer (calling the function returns the + * ownershp of the tqent back to caller of + * taskq_dispatch.) + */ + if ((!(tq->tq_flags & TASKQ_DYNAMIC)) && + (tqe->tqent_un.tqent_flags & TQENT_FLAG_PREALLOC)) { + /* clear pointers to assist assertion checks */ + tqe->tqent_next = tqe->tqent_prev = NULL; + freeit = B_FALSE; + } else { + freeit = B_TRUE; + } + + rw_enter(&tq->tq_threadlock, RW_READER); + start = gethrtime(); + DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq, + taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE2(taskq__exec__end, taskq_t *, tq, + taskq_ent_t *, tqe); + end = gethrtime(); + rw_exit(&tq->tq_threadlock); + + mutex_enter(&tq->tq_lock); + tq->tq_totaltime += end - start; + tq->tq_executed++; + + if (freeit) + taskq_ent_free(tq, tqe); + } + + if (tq->tq_nthreads_max == 1) + tq->tq_thread = NULL; + else + tq->tq_threadlist[thread_id - 1] = NULL; + + /* We're exiting, and therefore no longer active */ + ASSERT(tq->tq_active > 0); + tq->tq_active--; + + ASSERT(tq->tq_nthreads > 0); + tq->tq_nthreads--; + + /* Wake up anyone waiting for us to exit */ + cv_broadcast(&tq->tq_exit_cv); + if (tq->tq_nthreads == tq->tq_nthreads_target) { + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) + tq->tq_flags &= ~TASKQ_CHANGING; + + cv_broadcast(&tq->tq_wait_cv); + } + + tsd_set(taskq_tsd, NULL); + + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); + +} + +/* + * Worker per-entry thread for dynamic dispatches. + */ +static void +taskq_d_thread(taskq_ent_t *tqe) +{ + taskq_bucket_t *bucket = tqe->tqent_un.tqent_bucket; + taskq_t *tq = bucket->tqbucket_taskq; + kmutex_t *lock = &bucket->tqbucket_lock; + kcondvar_t *cv = &tqe->tqent_cv; + callb_cpr_t cprinfo; + clock_t w; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, tq->tq_name); + +#ifdef __APPLE__ + /* + * There's no way in Mac OS X KPI to create a thread + * in a suspended state (TS_STOPPED). So instead we + * use tqent_thread as a flag and wait for it to get + * initialized. 
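+ *
+ * In outline, the handshake with taskq_bucket_extend() (the creator,
+ * later in this file) is:
+ *
+ *	creator:  tqe->tqent_thread = (kthread_t *)0xCEDEC0DE;
+ *	creator:  thread_create(... taskq_d_thread, tqe ...);
+ *	worker:   while (tqe->tqent_thread == (kthread_t *)0xCEDEC0DE)
+ *	              cv_wait(&tqe->tqent_thread_cv, &tqe->tqent_thread_lock);
+ *	creator:  tqe->tqent_thread = thread;  then cv_signal()s the worker
+ *
+ * so the worker does not start work on the entry until the creator has
+ * finished publishing the real thread pointer.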
+ */ + mutex_enter(&tqe->tqent_thread_lock); + while (tqe->tqent_thread == (kthread_t *)0xCEDEC0DE) + cv_wait(&tqe->tqent_thread_cv, &tqe->tqent_thread_lock); + mutex_exit(&tqe->tqent_thread_lock); +#endif + + mutex_enter(lock); + + for (;;) { + /* + * If a task is scheduled (func != NULL), execute it, otherwise + * sleep, waiting for a job. + */ + if (tqe->tqent_func != NULL) { + hrtime_t start; + hrtime_t end; + + ASSERT(bucket->tqbucket_nalloc > 0); + + /* + * It is possible to free the entry right away before + * actually executing the task so that subsequent + * dispatches may immediately reuse it. But this, + * effectively, creates a two-length queue in the entry + * and may lead to a deadlock if the execution of the + * current task depends on the execution of the next + * scheduled task. So, we keep the entry busy until the + * task is processed. + */ + + mutex_exit(lock); + start = gethrtime(); + DTRACE_PROBE3(taskq__d__exec__start, taskq_t *, tq, + taskq_bucket_t *, bucket, taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE3(taskq__d__exec__end, taskq_t *, tq, + taskq_bucket_t *, bucket, taskq_ent_t *, tqe); + end = gethrtime(); + mutex_enter(lock); + bucket->tqbucket_totaltime += end - start; + + /* + * Return the entry to the bucket free list. + */ + tqe->tqent_func = NULL; + TQ_APPEND(bucket->tqbucket_freelist, tqe); + bucket->tqbucket_nalloc--; + bucket->tqbucket_nfree++; + ASSERT(!IS_EMPTY(bucket->tqbucket_freelist)); + /* + * taskq_wait() waits for nalloc to drop to zero on + * tqbucket_cv. + */ + cv_signal(&bucket->tqbucket_cv); + } + + /* + * At this point the entry must be in the bucket free list - + * either because it was there initially or because it just + * finished executing a task and put itself on the free list. + */ + ASSERT(bucket->tqbucket_nfree > 0); + /* + * Go to sleep unless we are closing. + * If a thread is sleeping too long, it dies. + */ + if (! (bucket->tqbucket_flags & TQBUCKET_CLOSE)) { + w = taskq_thread_wait(tq, lock, cv, + &cprinfo, taskq_thread_timeout * hz); + } + + /* + * At this point we may be in two different states: + * + * (1) tqent_func is set which means that a new task is + * dispatched and we need to execute it. + * + * (2) Thread is sleeping for too long or we are closing. In + * both cases destroy the thread and the entry. + */ + + /* If func is NULL we should be on the freelist. */ + ASSERT((tqe->tqent_func != NULL) || + (bucket->tqbucket_nfree > 0)); + /* If func is non-NULL we should be allocated */ + ASSERT((tqe->tqent_func == NULL) || + (bucket->tqbucket_nalloc > 0)); + + /* Check freelist consistency */ + ASSERT((bucket->tqbucket_nfree > 0) || + IS_EMPTY(bucket->tqbucket_freelist)); + ASSERT((bucket->tqbucket_nfree == 0) || + !IS_EMPTY(bucket->tqbucket_freelist)); + + if ((tqe->tqent_func == NULL) && + ((w == -1) || (bucket->tqbucket_flags & TQBUCKET_CLOSE))) { + /* + * This thread is sleeping for too long or we are + * closing - time to die. + * Thread creation/destruction happens rarely, + * so grabbing the lock is not a big performance issue. + * The bucket lock is dropped by CALLB_CPR_EXIT(). + */ + + /* Remove the entry from the free list. 
*/ + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + ASSERT(bucket->tqbucket_nfree > 0); + bucket->tqbucket_nfree--; + + TQ_STAT(bucket, tqs_tdeaths); + cv_signal(&bucket->tqbucket_cv); + tqe->tqent_thread = NULL; + mutex_enter(&tq->tq_lock); + tq->tq_tdeaths++; + mutex_exit(&tq->tq_lock); + CALLB_CPR_EXIT(&cprinfo); + kmem_cache_free(taskq_ent_cache, tqe); + thread_exit(); + } + } +} + + +/* + * Taskq creation. May sleep for memory. + * Always use automatically generated instances to avoid kstat name space + * collisions. + */ + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, &p0, 0, flags | TASKQ_NOINSTANCE)); +} + +/* + * Create an instance of task queue. It is legal to create task queues with the + * same name and different instances. + * + * taskq_create_instance is used by ddi_taskq_create() where it gets the + * instance from ddi_get_instance(). In some cases the instance is not + * initialized and is set to -1. This case is handled as if no instance was + * passed at all. + */ +taskq_t * +taskq_create_instance(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + ASSERT((instance >= 0) || (instance == -1)); + + if (instance < 0) { + flags |= TASKQ_NOINSTANCE; + } + + return (taskq_create_common(name, instance, nthreads, + pri, minalloc, maxalloc, &p0, 0, flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, proc_t *proc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); +#ifndef __APPLE__ + ASSERT(proc->p_flag & SSYS); +#endif + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, proc, 0, flags | TASKQ_NOINSTANCE)); +} + +taskq_t * +taskq_create_sysdc(const char *name, int nthreads, int minalloc, + int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); +#ifndef __APPLE__ + ASSERT(proc->p_flag & SSYS); +#endif + return (taskq_create_common(name, 0, nthreads, minclsyspri, minalloc, + maxalloc, proc, dc, flags | TASKQ_NOINSTANCE | TASKQ_DUTY_CYCLE)); +} + +static taskq_t * +taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); +#ifdef __APPLE__ + uint_t ncpus = max_ncpus; +#else + uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus); +#endif + uint_t bsize; /* # of buckets - always power of 2 */ + int max_nthreads; + + /* + * We are not allowed to use TASKQ_DYNAMIC with taskq_dispatch_ent() + * but that is done by spa.c - so we will simply mask DYNAMIC out. + */ + flags &= ~TASKQ_DYNAMIC; + + /* + * TASKQ_DYNAMIC, TASKQ_CPR_SAFE and TASKQ_THREADS_CPU_PCT are all + * mutually incompatible. 
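+ *
+ * (IMPLY(A, B) asserts "A implies B": it fires only when A is set and B
+ * is not. So, for example, a caller passing both TASKQ_CPR_SAFE and
+ * TASKQ_THREADS_CPU_PCT would trip the third check below.)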
+ */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_CPR_SAFE)); + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_THREADS_CPU_PCT)); + IMPLY((flags & TASKQ_CPR_SAFE), !(flags & TASKQ_THREADS_CPU_PCT)); + + /* Cannot have DYNAMIC with DUTY_CYCLE */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_DUTY_CYCLE)); + + /* Cannot have DUTY_CYCLE with a p0 kernel process */ + IMPLY((flags & TASKQ_DUTY_CYCLE), proc != &p0); + + /* Cannot have DC_BATCH without DUTY_CYCLE */ + ASSERT((flags & (TASKQ_DUTY_CYCLE|TASKQ_DC_BATCH)) != TASKQ_DC_BATCH); + + ASSERT(proc != NULL); + + bsize = 1 << (highbit(ncpus) - 1); + ASSERT(bsize >= 1); + bsize = MIN(bsize, taskq_maxbuckets); + + if (flags & TASKQ_DYNAMIC) { + ASSERT3S(nthreads, >=, 1); + tq->tq_maxsize = nthreads; + + /* For dynamic task queues use just one backup thread */ + nthreads = max_nthreads = 1; + + } else if (flags & TASKQ_THREADS_CPU_PCT) { + uint_t pct; + ASSERT3S(nthreads, >=, 0); + pct = nthreads; + + if (pct > taskq_cpupct_max_percent) + pct = taskq_cpupct_max_percent; + + /* + * If you're using THREADS_CPU_PCT, the process for the + * taskq threads must be curproc. This allows any pset + * binding to be inherited correctly. If proc is &p0, + * we won't be creating LWPs, so new threads will be assigned + * to the default processor set. + */ + /* ASSERT(curproc == proc || proc == &p0); */ + tq->tq_threads_ncpus_pct = pct; + nthreads = 1; /* corrected in taskq_thread_create() */ + max_nthreads = TASKQ_THREADS_PCT(max_ncpus, pct); + + } else { + ASSERT3S(nthreads, >=, 1); + max_nthreads = nthreads; + } + + if (max_nthreads < taskq_minimum_nthreads_max) + max_nthreads = taskq_minimum_nthreads_max; + + /* + * Make sure the name is 0-terminated, and conforms to the rules for + * C indentifiers + */ + (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1); + strident_canon(tq->tq_name, TASKQ_NAMELEN + 1); + + tq->tq_flags = flags | TASKQ_CHANGING; + tq->tq_active = 0; + tq->tq_instance = instance; + tq->tq_nthreads_target = nthreads; + tq->tq_nthreads_max = max_nthreads; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nbuckets = bsize; + tq->tq_proc = proc; + tq->tq_pri = pri; + tq->tq_DC = dc; + list_link_init(&tq->tq_cpupct_link); + + if (max_nthreads > 1) + tq->tq_threadlist = kmem_alloc( + sizeof (kthread_t *) * max_nthreads, KM_SLEEP); + + mutex_enter(&tq->tq_lock); + if (flags & TASKQ_PREPOPULATE) { + while (minalloc-- > 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + } + + /* + * Before we start creating threads for this taskq, take a + * zone hold so the zone can't go away before taskq_destroy + * makes sure all the taskq threads are gone. This hold is + * similar in purpose to those taken by zthread_create(). + */ +#ifndef __APPLE__ + zone_hold(tq->tq_proc->p_zone); +#endif + /* + * Create the first thread, which will create any other threads + * necessary. taskq_thread_create will not return until we have + * enough threads to be able to process requests. 
+ */ + taskq_thread_create(tq); + mutex_exit(&tq->tq_lock); + + if (flags & TASKQ_DYNAMIC) { + taskq_bucket_t *bucket = kmem_zalloc(sizeof (taskq_bucket_t) * + bsize, KM_SLEEP); + int b_id; + + tq->tq_buckets = bucket; + + /* Initialize each bucket */ + for (b_id = 0; b_id < bsize; b_id++, bucket++) { + mutex_init(&bucket->tqbucket_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&bucket->tqbucket_cv, NULL, CV_DEFAULT, NULL); + bucket->tqbucket_taskq = tq; + bucket->tqbucket_freelist.tqent_next = + bucket->tqbucket_freelist.tqent_prev = + &bucket->tqbucket_freelist; + if (flags & TASKQ_PREPOPULATE) + taskq_bucket_extend(bucket); + } + } + + /* + * Install kstats. + * We have two cases: + * 1) Instance is provided to taskq_create_instance(). In this case it + * should be >= 0 and we use it. + * + * 2) Instance is not provided and is automatically generated + */ + if (flags & TASKQ_NOINSTANCE) { + instance = tq->tq_instance = + (int)(uintptr_t)vmem_alloc(taskq_id_arena, 1, VM_SLEEP); + } + + if (flags & TASKQ_DYNAMIC) { + if ((tq->tq_kstat = kstat_create("unix", instance, + tq->tq_name, "taskq_d", KSTAT_TYPE_NAMED, + sizeof (taskq_d_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + tq->tq_kstat->ks_lock = &taskq_d_kstat_lock; + tq->tq_kstat->ks_data = &taskq_d_kstat; + tq->tq_kstat->ks_update = taskq_d_kstat_update; + tq->tq_kstat->ks_private = tq; + kstat_install(tq->tq_kstat); + } + } else { + if ((tq->tq_kstat = kstat_create("unix", instance, tq->tq_name, + "taskq", KSTAT_TYPE_NAMED, + sizeof (taskq_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + tq->tq_kstat->ks_lock = &taskq_kstat_lock; + tq->tq_kstat->ks_data = &taskq_kstat; + tq->tq_kstat->ks_update = taskq_kstat_update; + tq->tq_kstat->ks_private = tq; + kstat_install(tq->tq_kstat); + } + } + + return (tq); +} + +/* + * taskq_destroy(). + * + * Assumes: by the time taskq_destroy is called no one will use this task queue + * in any way and no one will try to dispatch entries in it. + */ +void +taskq_destroy(taskq_t *tq) +{ + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE)); + + /* + * Destroy kstats. + */ + if (tq->tq_kstat != NULL) { + kstat_delete(tq->tq_kstat); + tq->tq_kstat = NULL; + } + + /* + * Destroy instance if needed. + */ + if (tq->tq_flags & TASKQ_NOINSTANCE) { + vmem_free(taskq_id_arena, (void *)(uintptr_t)(tq->tq_instance), + 1); + tq->tq_instance = 0; + } + + /* + * Unregister from the cpupct list. + */ +#ifndef __APPLE__ + if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { + taskq_cpupct_remove(tq); + } +#endif + + /* + * Wait for any pending entries to complete. + */ + taskq_wait(tq); + + mutex_enter(&tq->tq_lock); + ASSERT((tq->tq_task.tqent_next == &tq->tq_task) && + (tq->tq_active == 0)); + + /* notify all the threads that they need to exit */ + tq->tq_nthreads_target = 0; + + tq->tq_flags |= TASKQ_CHANGING; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + + while (tq->tq_nthreads != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + + if (tq->tq_nthreads_max != 1) + kmem_free(tq->tq_threadlist, sizeof (kthread_t *) * + tq->tq_nthreads_max); + + tq->tq_minalloc = 0; + while (tq->tq_nalloc != 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + + mutex_exit(&tq->tq_lock); + + /* + * Mark each bucket as closing and wakeup all sleeping threads. 
+ */ + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + taskq_ent_t *tqe; + + mutex_enter(&b->tqbucket_lock); + + b->tqbucket_flags |= TQBUCKET_CLOSE; + /* Wakeup all sleeping threads */ + + for (tqe = b->tqbucket_freelist.tqent_next; + tqe != &b->tqbucket_freelist; tqe = tqe->tqent_next) + cv_signal(&tqe->tqent_cv); + + ASSERT(b->tqbucket_nalloc == 0); + + /* + * At this point we waited for all pending jobs to complete (in + * both the task queue and the bucket and no new jobs should + * arrive. Wait for all threads to die. + */ + while (b->tqbucket_nfree > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + mutex_destroy(&b->tqbucket_lock); + cv_destroy(&b->tqbucket_cv); + } + + if (tq->tq_buckets != NULL) { + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + kmem_free(tq->tq_buckets, + sizeof (taskq_bucket_t) * tq->tq_nbuckets); + + /* Cleanup fields before returning tq to the cache */ + tq->tq_buckets = NULL; + tq->tq_tcreates = 0; + tq->tq_tdeaths = 0; + } else { + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + } + + /* + * Now that all the taskq threads are gone, we can + * drop the zone hold taken in taskq_create_common + */ +#ifndef __APPLE__ + zone_rele(tq->tq_proc->p_zone); +#endif + + tq->tq_threads_ncpus_pct = 0; + tq->tq_totaltime = 0; + tq->tq_tasks = 0; + tq->tq_maxtasks = 0; + tq->tq_executed = 0; + kmem_cache_free(taskq_cache, tq); +} + +/* + * Extend a bucket with a new entry on the free list and attach a worker thread + * to it. + * + * Argument: pointer to the bucket. + * + * This function may quietly fail. It is only used by taskq_dispatch() which + * handles such failures properly. + */ +static void +taskq_bucket_extend(void *arg) +{ + taskq_ent_t *tqe; + taskq_bucket_t *b = (taskq_bucket_t *)arg; + taskq_t *tq = b->tqbucket_taskq; + int nthreads; +#ifdef __APPLE__ + kthread_t *thread; +#endif + + if (! ENOUGH_MEMORY()) { + TQ_STAT(b, tqs_nomem); + return; + } + + mutex_enter(&tq->tq_lock); + + /* + * Observe global taskq limits on the number of threads. + */ + if (tq->tq_tcreates++ - tq->tq_tdeaths > tq->tq_maxsize) { + tq->tq_tcreates--; + mutex_exit(&tq->tq_lock); + return; + } + mutex_exit(&tq->tq_lock); + + tqe = kmem_cache_alloc(taskq_ent_cache, KM_NOSLEEP); + + if (tqe == NULL) { + mutex_enter(&tq->tq_lock); + TQ_STAT(b, tqs_nomem); + tq->tq_tcreates--; + mutex_exit(&tq->tq_lock); + return; + } + + ASSERT(tqe->tqent_thread == NULL); + + tqe->tqent_un.tqent_bucket = b; + +#ifdef __APPLE__ + /* + * There's no way in Mac OS X KPI to create a thread + * in a suspended state (TS_STOPPED). So instead we + * use tqent_thread as a flag and the thread must wait + * for it to be initialized (below). + */ + tqe->tqent_thread = (kthread_t *)0xCEDEC0DE; + thread = thread_create(NULL, 0, (void (*)(void *))taskq_d_thread, + tqe, 0, pp0, TS_RUN, tq->tq_pri); +#else + + /* + * Create a thread in a TS_STOPPED state first. If it is successfully + * created, place the entry on the free list and start the thread. + */ + tqe->tqent_thread = thread_create(NULL, 0, taskq_d_thread, tqe, + 0, tq->tq_proc, TS_STOPPED, tq->tq_pri); +#endif /* __APPLE__ */ + + /* + * Once the entry is ready, link it to the the bucket free list. 
+ */ + mutex_enter(&b->tqbucket_lock); + tqe->tqent_func = NULL; + TQ_APPEND(b->tqbucket_freelist, tqe); + b->tqbucket_nfree++; + TQ_STAT(b, tqs_tcreates); + +#if TASKQ_STATISTIC + nthreads = b->tqbucket_stat.tqs_tcreates - + b->tqbucket_stat.tqs_tdeaths; + b->tqbucket_stat.tqs_maxthreads = MAX(nthreads, + b->tqbucket_stat.tqs_maxthreads); +#endif + + mutex_exit(&b->tqbucket_lock); + /* + * Start the stopped thread. + */ +#ifdef __APPLE__ + mutex_enter(&tqe->tqent_thread_lock); + tqe->tqent_thread = thread; + cv_signal(&tqe->tqent_thread_cv); + mutex_exit(&tqe->tqent_thread_lock); +#else + thread_lock(tqe->tqent_thread); + tqe->tqent_thread->t_taskq = tq; + tqe->tqent_thread->t_schedflag |= TS_ALLSTART; + setrun_locked(tqe->tqent_thread); + thread_unlock(tqe->tqent_thread); +#endif /* __APPLE__ */ +} + +static int +taskq_kstat_update(kstat_t *ksp, int rw) +{ + struct taskq_kstat *tqsp = &taskq_kstat; + taskq_t *tq = ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (EACCES); + +#ifdef __APPLE__ + tqsp->tq_pid.value.ui64 = 0; /* kernel_task'd pid is 0 */ +#else + tqsp->tq_pid.value.ui64 = proc_pid(tq->tq_proc->p_pid); +#endif + tqsp->tq_tasks.value.ui64 = tq->tq_tasks; + tqsp->tq_executed.value.ui64 = tq->tq_executed; + tqsp->tq_maxtasks.value.ui64 = tq->tq_maxtasks; + tqsp->tq_totaltime.value.ui64 = tq->tq_totaltime; + tqsp->tq_nactive.value.ui64 = tq->tq_active; + tqsp->tq_nalloc.value.ui64 = tq->tq_nalloc; + tqsp->tq_pri.value.ui64 = tq->tq_pri; + tqsp->tq_nthreads.value.ui64 = tq->tq_nthreads; + return (0); +} + +static int +taskq_d_kstat_update(kstat_t *ksp, int rw) +{ + struct taskq_d_kstat *tqsp = &taskq_d_kstat; + taskq_t *tq = ksp->ks_private; + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + + tqsp->tqd_btasks.value.ui64 = tq->tq_tasks; + tqsp->tqd_bexecuted.value.ui64 = tq->tq_executed; + tqsp->tqd_bmaxtasks.value.ui64 = tq->tq_maxtasks; + tqsp->tqd_bnalloc.value.ui64 = tq->tq_nalloc; + tqsp->tqd_bnactive.value.ui64 = tq->tq_active; + tqsp->tqd_btotaltime.value.ui64 = tq->tq_totaltime; + tqsp->tqd_pri.value.ui64 = tq->tq_pri; + + tqsp->tqd_hits.value.ui64 = 0; + tqsp->tqd_misses.value.ui64 = 0; + tqsp->tqd_overflows.value.ui64 = 0; + tqsp->tqd_tcreates.value.ui64 = 0; + tqsp->tqd_tdeaths.value.ui64 = 0; + tqsp->tqd_maxthreads.value.ui64 = 0; + tqsp->tqd_nomem.value.ui64 = 0; + tqsp->tqd_disptcreates.value.ui64 = 0; + tqsp->tqd_totaltime.value.ui64 = 0; + tqsp->tqd_nalloc.value.ui64 = 0; + tqsp->tqd_nfree.value.ui64 = 0; + + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + tqsp->tqd_hits.value.ui64 += b->tqbucket_stat.tqs_hits; + tqsp->tqd_misses.value.ui64 += b->tqbucket_stat.tqs_misses; + tqsp->tqd_overflows.value.ui64 += b->tqbucket_stat.tqs_overflow; + tqsp->tqd_tcreates.value.ui64 += b->tqbucket_stat.tqs_tcreates; + tqsp->tqd_tdeaths.value.ui64 += b->tqbucket_stat.tqs_tdeaths; + tqsp->tqd_maxthreads.value.ui64 += + b->tqbucket_stat.tqs_maxthreads; + tqsp->tqd_nomem.value.ui64 += b->tqbucket_stat.tqs_nomem; + tqsp->tqd_disptcreates.value.ui64 += + b->tqbucket_stat.tqs_disptcreates; + tqsp->tqd_totaltime.value.ui64 += b->tqbucket_totaltime; + tqsp->tqd_nalloc.value.ui64 += b->tqbucket_nalloc; + tqsp->tqd_nfree.value.ui64 += b->tqbucket_nfree; + } + return (0); +} + +int +EMPTY_TASKQ(taskq_t *tq) +{ +#ifdef _KERNEL + return ((tq)->tq_task.tqent_next == &(tq)->tq_task); +#else + return (tq->tq_task.tqent_next == &tq->tq_task || tq->tq_active == 0); +#endif +} diff --git 
a/module/os/macos/spl/spl-thread.c b/module/os/macos/spl/spl-thread.c new file mode 100644 index 0000000000..886190cba7 --- /dev/null +++ b/module/os/macos/spl/spl-thread.c @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +uint64_t zfs_threads = 0; + +kthread_t * +spl_thread_create( + caddr_t stk, + size_t stksize, + void (*proc)(void *), + void *arg, + size_t len, + /* struct proc *pp, */ + int state, +#ifdef SPL_DEBUG_THREAD + char *filename, + int line, +#endif + pri_t pri) +{ + kern_return_t result; + thread_t thread; + +#ifdef SPL_DEBUG_THREAD + printf("Start thread pri %d by '%s':%d\n", pri, + filename, line); +#endif + + result = kernel_thread_start((thread_continue_t)proc, arg, &thread); + + if (result != KERN_SUCCESS) + return (NULL); + + /* Improve the priority when asked to do so */ + if (pri > minclsyspri) { + thread_precedence_policy_data_t policy; + + /* + * kernel priorities (osfmk/kern/sched.h) + * + * 96 Reserved (real-time) + * 95 Kernel mode only + * A + * + + * (16 levels) + * + + * V + * 80 Kernel mode only + * 79 System high priority + * + * spl/include/sys/sysmacros.h + * #define maxclsyspri 89 + * #define minclsyspri 81 BASEPRI_KERNEL + * #define defclsyspri 81 BASEPRI_KERNEL + * + * Calling policy.importance = 10 will create + * a default pri (81) at pri (91). + * + * So asking for pri (85) we do 85-81 = 4. + * + * IllumOS priorities are: + * #define MAXCLSYSPRI 99 + * #define MINCLSYSPRI 60 + */ + + policy.importance = (pri - minclsyspri); + + thread_policy_set(thread, + THREAD_PRECEDENCE_POLICY, + (thread_policy_t)&policy, + THREAD_PRECEDENCE_POLICY_COUNT); + } + + thread_deallocate(thread); + + atomic_inc_64(&zfs_threads); + + return ((kthread_t *)thread); +} + +kthread_t * +spl_current_thread(void) +{ + thread_t cur_thread = current_thread(); + return ((kthread_t *)cur_thread); +} + +void +spl_thread_exit(void) +{ + atomic_dec_64(&zfs_threads); + + tsd_thread_exit(); + (void) thread_terminate(current_thread()); +} + + +/* + * IllumOS has callout.c - place it here until we find a better place + */ +callout_id_t +timeout_generic(int type, void (*func)(void *), void *arg, + hrtime_t expiration, hrtime_t resolution, int flags) +{ + struct timespec ts; + hrt2ts(expiration, &ts); + bsd_timeout(func, arg, &ts); + /* + * bsd_untimeout() requires func and arg to cancel the timeout, so + * pass it back as the callout_id. 
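+ *
+ * A caller would therefore cancel a pending timeout with the same pair,
+ * roughly as follows (illustrative only; example_cb and example_arg are
+ * hypothetical, and the type/resolution/flags arguments are ignored here):
+ *
+ *	(void) timeout_generic(0, example_cb, example_arg,
+ *	    SEC2NSEC(5), 0, 0);
+ *	...
+ *	bsd_untimeout(example_cb, example_arg);
+ *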
If we one day were to implement + * untimeout_generic() they would pass it back to us + */ + return ((callout_id_t)arg); +} diff --git a/module/os/macos/spl/spl-time.c b/module/os/macos/spl/spl-time.c new file mode 100644 index 0000000000..151691d60b --- /dev/null +++ b/module/os/macos/spl/spl-time.c @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include + +/* + * gethrtime() provides high-resolution timestamps with + * machine-dependent origin Hence its primary use is to specify + * intervals. + */ + +static hrtime_t +zfs_abs_to_nano(uint64_t elapsed) +{ + static mach_timebase_info_data_t sTimebaseInfo = { 0, 0 }; + + /* + * If this is the first time we've run, get the timebase. + * We can use denom == 0 to indicate that sTimebaseInfo is + * uninitialised because it makes no sense to have a zero + * denominator in a fraction. + */ + + if (sTimebaseInfo.denom == 0) { + (void) clock_timebase_info(&sTimebaseInfo); + } + + /* + * Convert to nanoseconds. + * return (elapsed * (uint64_t)sTimebaseInfo.numer) / + * (uint64_t)sTimebaseInfo.denom; + * + * Provided the final result is representable in 64 bits the + * following maneuver will deliver that result without intermediate + * overflow. 
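+ *
+ * In symbols (a restatement of the final else branch below): write
+ * elapsed = eta32 * 2^32 + eps32 and numer * eta32 = q32 * denom + r32.
+ * Then
+ *
+ *	elapsed * numer / denom
+ *	    = q32 * 2^32 + (r32 * 2^32 + numer * eps32) / denom
+ *
+ * and, as noted above, every intermediate product here stays within
+ * 64 bits whenever the final result does.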
+ */ + if (sTimebaseInfo.denom == sTimebaseInfo.numer) + return (elapsed); + else if (sTimebaseInfo.denom == 1) + return (elapsed * (uint64_t)sTimebaseInfo.numer); + else { + /* Decompose elapsed = eta32 * 2^32 + eps32: */ + uint64_t eta32 = elapsed >> 32; + uint64_t eps32 = elapsed & 0x00000000ffffffffLL; + + uint32_t numer = sTimebaseInfo.numer; + uint32_t denom = sTimebaseInfo.denom; + + /* Form product of elapsed64 (decomposed) and numer: */ + uint64_t mu64 = numer * eta32; + uint64_t lambda64 = numer * eps32; + + /* Divide the constituents by denom: */ + uint64_t q32 = mu64/denom; + uint64_t r32 = mu64 - (q32 * denom); /* mu64 % denom */ + + return ((q32 << 32) + ((r32 << 32) + lambda64) / denom); + } +} + + +hrtime_t +gethrtime(void) +{ + static uint64_t start = 0; + if (start == 0) + start = mach_absolute_time(); + return (zfs_abs_to_nano(mach_absolute_time() - start)); +} + + +void +gethrestime(struct timespec *ts) +{ + nanotime(ts); +} + +time_t +gethrestime_sec(void) +{ + struct timeval tv; + + microtime(&tv); + return (tv.tv_sec); +} + +void +hrt2ts(hrtime_t hrt, struct timespec *tsp) +{ + uint32_t sec, nsec, tmp; + + tmp = (uint32_t)(hrt >> 30); + sec = tmp - (tmp >> 2); + sec = tmp - (sec >> 5); + sec = tmp + (sec >> 1); + sec = tmp - (sec >> 6) + 7; + sec = tmp - (sec >> 3); + sec = tmp + (sec >> 1); + sec = tmp + (sec >> 3); + sec = tmp + (sec >> 4); + tmp = (sec << 7) - sec - sec - sec; + tmp = (tmp << 7) - tmp - tmp - tmp; + tmp = (tmp << 7) - tmp - tmp - tmp; + nsec = (uint32_t)hrt - (tmp << 9); + while (nsec >= NANOSEC) { + nsec -= NANOSEC; + sec++; + } + tsp->tv_sec = (time_t)sec; + tsp->tv_nsec = nsec; +} diff --git a/module/os/macos/spl/spl-tsd.c b/module/os/macos/spl/spl-tsd.c new file mode 100644 index 0000000000..6ca970a9f9 --- /dev/null +++ b/module/os/macos/spl/spl-tsd.c @@ -0,0 +1,389 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2014 Jorgen Lundman + * + * A thread will call tsd_create(&key, dtor) to allocate a new + * "variable" placement, called a "key". In illumos, this is the index + * into an array of dtors. (If dtor is passed as NULL, TSD internally + * set it to an empty function). So if the dtor array[i] is NULL, it + * is "free" and can be allocated. (returned as *key = i). + * illumos will grow this dtor array with realloc when required. + * Then Any Thread can set a value on this "key index", and this value + * is specific to each thread by calling tsd_set(key, value). + * And can be retrieved with tsd_get(key). + * When tsd_destroy(key) is called, we need to loop through all + * threads different "values", and call the dtor on each one. 
+ * Likewise, we need to know when a thread exists, so we can clean up + * the values (by calling dtor for each one) so we patch into the + * thread_exit() call, to also call tsd_thread_exit(). + * + * In OsX, we build an array of the dtors, and return the key index, + * this is to store the dtor, and know which "key" values are valid. + * Then we build an AVL tree, indexed by , to store + * each thread's value. This allows us to do key access quick. + * On thread_exit, we iterate the dtor array, and for each key + * remove . + * On tsd_destroy(key), we use AVL find nearest with , then + * avl_next as long as key remains the same, to remove each thread value. + * + * Note a key of "0" is considered "invalid" in IllumOS, so we return + * a "1" based index, even though internally it is 0 based. + * + */ + +#include +#include +#include +#include +#include + +/* Initial size of array, and realloc growth size */ +#define TSD_ALLOC_SIZE 10 + +/* array of dtors, allocated in init */ +static dtor_func_t *tsd_dtor_array = NULL; +static uint32_t tsd_dtor_size = 0; +static avl_tree_t tsd_tree; + +struct spl_tsd_node_s +{ + /* The index/key */ + uint_t tsd_key; + thread_t tsd_thread; + + /* The payload */ + void *tsd_value; + + /* Internal mumbo */ + avl_node_t tsd_link_node; +}; +typedef struct spl_tsd_node_s spl_tsd_node_t; + +static kmutex_t spl_tsd_mutex; + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. + * When setting the tsd initially it will be slower due to additional + * required locking and potential memory allocations. + * If the value is set to NULL, we also release it. + */ +int +tsd_set(uint_t key, void *value) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + uint_t i; + + /* Invalid key values? */ + if ((key < 1) || + (key >= tsd_dtor_size)) { + return (EINVAL); + } + + i = key - 1; + + /* + * First handle the easy case, already has a node/value + * so we just need to find it, update it. + */ + + search.tsd_key = i; + search.tsd_thread = current_thread(); + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + mutex_exit(&spl_tsd_mutex); + + if (entry) { + + /* If value is set to NULL, release it as well */ + if (value == NULL) { + mutex_enter(&spl_tsd_mutex); + avl_remove(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + kmem_free(entry, sizeof (*entry)); + return (0); + } + entry->tsd_value = value; + return (0); + } + + /* No node, we need to create a new one and insert it. */ + /* But if the value is NULL, then why create one eh? */ + if (value == NULL) + return (0); + + entry = kmem_alloc(sizeof (spl_tsd_node_t), KM_SLEEP); + + entry->tsd_key = i; + entry->tsd_thread = current_thread(); + entry->tsd_value = value; + + mutex_enter(&spl_tsd_mutex); + avl_add(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + return (0); +} + +/* + * tsd_get - get thread specific data for specified thread + * @key: lookup key + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get_by_thread(uint_t key, thread_t thread) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + uint_t i; + + /* Invalid key values? 
*/ + if ((key < 1) || + (key >= tsd_dtor_size)) { + return (NULL); + } + + i = key - 1; + + search.tsd_key = i; + search.tsd_thread = thread; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + mutex_exit(&spl_tsd_mutex); + + return (entry ? entry->tsd_value : NULL); +} + +void * +tsd_get(uint_t key) +{ + return (tsd_get_by_thread(key, current_thread())); +} + +static void +tsd_internal_dtor(void *value) +{ +} + +/* + * Create TSD for a pid and fill in key with unique value, remember the dtor + * + * We cheat and create an entry with pid=0, to keep the dtor. + */ +void +tsd_create(uint_t *keyp, dtor_func_t dtor) +{ + uint_t i; + + if (*keyp) + return; + + // Iterate the dtor_array, looking for first NULL + for (i = 0; i < TSD_ALLOC_SIZE; i++) { + if (tsd_dtor_array[i] == NULL) break; + } + + /* Do we need to grow the list? */ + if (i >= tsd_dtor_size) { + printf("SPL: tsd list growing not implemented\n"); + return; + } + + if (dtor == NULL) + dtor = tsd_internal_dtor; + + tsd_dtor_array[i] = dtor; + + *keyp = i + 1; +} + +void +tsd_destroy(uint_t *keyp) +{ + spl_tsd_node_t *entry = NULL, *next = NULL; + spl_tsd_node_t search; + avl_index_t loc; + dtor_func_t dtor = NULL; + uint_t i; + + /* Invalid key values? */ + if ((*keyp < 1) || + (*keyp >= tsd_dtor_size)) { + return; + } + + i = *keyp - 1; + *keyp = 0; + + ASSERT(tsd_dtor_array[i] != NULL); + + dtor = tsd_dtor_array[i]; + tsd_dtor_array[i] = NULL; + + /* + * For each thread; + * if it has a value + * call the dtor + */ + search.tsd_key = i; + search.tsd_thread = NULL; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + + /* + * "entry" should really be NULL here, as we searched for the + * NULL thread + */ + if (entry == NULL) + entry = avl_nearest(&tsd_tree, loc, AVL_AFTER); + + /* Now, free node, and go to next, as long as the key matches */ + while (entry && (entry->tsd_key == i)) { + next = AVL_NEXT(&tsd_tree, entry); + + /* If we have a value, call the dtor for this thread */ + if (entry->tsd_value) + dtor(entry->tsd_value); + + avl_remove(&tsd_tree, entry); + + kmem_free(entry, sizeof (*entry)); + + entry = next; + } + + mutex_exit(&spl_tsd_mutex); +} + + + +/* + * A thread is exiting, clear out any tsd values it might have. 
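+ *
+ * For reference, the consumer-side lifecycle that leads here is roughly
+ * (illustrative only; key, dtor and val are hypothetical):
+ *
+ *	tsd_create(&key, dtor);		once, reserves a key slot
+ *	tsd_set(key, val);		per thread, attaches a value
+ *	val = tsd_get(key);		per thread, reads it back
+ *	tsd_destroy(&key);		once, runs dtor on every value
+ *
+ * This function is the per-thread half of that cleanup, run at exit.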
+ */ +void +tsd_thread_exit(void) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + int i; + + search.tsd_thread = current_thread(); + + /* For all defined dtor/values */ + for (i = 0; i < tsd_dtor_size; i++) { + + /* If not allocated, skip */ + if (tsd_dtor_array[i] == NULL) continue; + + /* Find out of this thread has a value */ + search.tsd_key = i; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + if (entry) avl_remove(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + if (entry == NULL) continue; + + /* If we have a value, call dtor */ + if (entry->tsd_value) + tsd_dtor_array[i](entry->tsd_value); + + kmem_free(entry, sizeof (*entry)); + } // for all i +} + +static int +tsd_tree_cmp(const void *arg1, const void *arg2) +{ + const spl_tsd_node_t *node1 = arg1; + const spl_tsd_node_t *node2 = arg2; + if (node1->tsd_key > node2->tsd_key) + return (1); + if (node1->tsd_key < node2->tsd_key) + return (-1); + if (node1->tsd_thread > node2->tsd_thread) + return (1); + if (node1->tsd_thread < node2->tsd_thread) + return (-1); + return (0); +} + +int +spl_tsd_init(void) +{ + tsd_dtor_array = kmem_zalloc(sizeof (dtor_func_t) * TSD_ALLOC_SIZE, + KM_SLEEP); + tsd_dtor_size = TSD_ALLOC_SIZE; + + mutex_init(&spl_tsd_mutex, NULL, MUTEX_DEFAULT, NULL); + avl_create(&tsd_tree, tsd_tree_cmp, + sizeof (spl_tsd_node_t), + offsetof(spl_tsd_node_t, tsd_link_node)); + return (0); +} + + +uint64_t +spl_tsd_size(void) +{ + return (avl_numnodes(&tsd_tree)); +} + +void +spl_tsd_fini(void) +{ + spl_tsd_node_t *entry = NULL; + void *cookie = NULL; + + printf("SPL: tsd unloading %llu\n", spl_tsd_size()); + + mutex_enter(&spl_tsd_mutex); + cookie = NULL; + while ((entry = avl_destroy_nodes(&tsd_tree, &cookie))) { + kmem_free(entry, sizeof (*entry)); + } + mutex_exit(&spl_tsd_mutex); + + avl_destroy(&tsd_tree); + mutex_destroy(&spl_tsd_mutex); + + kmem_free(tsd_dtor_array, sizeof (dtor_func_t) * tsd_dtor_size); + tsd_dtor_size = 0; +} diff --git a/module/os/macos/spl/spl-vmem.c b/module/os/macos/spl/spl-vmem.c new file mode 100644 index 0000000000..f3692f33b7 --- /dev/null +++ b/module/os/macos/spl/spl-vmem.c @@ -0,0 +1,3940 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2017 Sean Doran + */ + +/* + * Big Theory Statement for the virtual memory allocator. 
+ * + * For a more complete description of the main ideas, see: + * + * Jeff Bonwick and Jonathan Adams, + * + * Magazines and vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources. + * + * Proceedings of the 2001 Usenix Conference. + * Available as http://www.usenix.org/event/usenix01/bonwick.html + * + * + * 1. General Concepts + * ------------------- + * + * 1.1 Overview + * ------------ + * We divide the kernel address space into a number of logically distinct + * pieces, or *arenas*: text, data, heap, stack, and so on. Within these + * arenas we often subdivide further; for example, we use heap addresses + * not only for the kernel heap (kmem_alloc() space), but also for DVMA, + * bp_mapin(), /dev/kmem, and even some device mappings like the TOD chip. + * The kernel address space, therefore, is most accurately described as + * a tree of arenas in which each node of the tree *imports* some subset + * of its parent. The virtual memory allocator manages these arenas and + * supports their natural hierarchical structure. + * + * 1.2 Arenas + * ---------- + * An arena is nothing more than a set of integers. These integers most + * commonly represent virtual addresses, but in fact they can represent + * anything at all. For example, we could use an arena containing the + * integers minpid through maxpid to allocate process IDs. vmem_create() + * and vmem_destroy() create and destroy vmem arenas. In order to + * differentiate between arenas used for adresses and arenas used for + * identifiers, the VMC_IDENTIFIER flag is passed to vmem_create(). This + * prevents identifier exhaustion from being diagnosed as general memory + * failure. + * + * 1.3 Spans + * --------- + * We represent the integers in an arena as a collection of *spans*, or + * contiguous ranges of integers. For example, the kernel heap consists + * of just one span: [kernelheap, ekernelheap). Spans can be added to an + * arena in two ways: explicitly, by vmem_add(), or implicitly, by + * importing, as described in Section 1.5 below. + * + * 1.4 Segments + * ------------ + * Spans are subdivided into *segments*, each of which is either allocated + * or free. A segment, like a span, is a contiguous range of integers. + * Each allocated segment [addr, addr + size) represents exactly one + * vmem_alloc(size) that returned addr. Free segments represent the space + * between allocated segments. If two free segments are adjacent, we + * coalesce them into one larger segment; that is, if segments [a, b) and + * [b, c) are both free, we merge them into a single segment [a, c). + * The segments within a span are linked together in increasing-address order + * so we can easily determine whether coalescing is possible. + * + * Segments never cross span boundaries. When all segments within + * an imported span become free, we return the span to its source. + * + * 1.5 Imported Memory + * ------------------- + * As mentioned in the overview, some arenas are logical subsets of + * other arenas. For example, kmem_va_arena (a virtual address cache + * that satisfies most kmem_slab_create() requests) is just a subset + * of heap_arena (the kernel heap) that provides caching for the most + * common slab sizes. When kmem_va_arena runs out of virtual memory, + * it *imports* more from the heap; we say that heap_arena is the + * *vmem source* for kmem_va_arena. vmem_create() allows you to + * specify any existing vmem arena as the source for your new arena. 
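+ *
+ * A sketch of the idiom (the names here are illustrative, not arenas
+ * created by this file):
+ *
+ *	child = vmem_create("example_child", NULL, 0, PAGESIZE,
+ *	    vmem_alloc, vmem_free, parent, 0, VM_SLEEP);
+ *
+ * gives "example_child" no storage of its own; it imports PAGESIZE-quantum
+ * spans from "parent" on demand, via vmem_alloc()/vmem_free().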
+ * Topologically, since every arena is a child of at most one source, + * the set of all arenas forms a collection of trees. + * + * 1.6 Constrained Allocations + * --------------------------- + * Some vmem clients are quite picky about the kind of address they want. + * For example, the DVMA code may need an address that is at a particular + * phase with respect to some alignment (to get good cache coloring), or + * that lies within certain limits (the addressable range of a device), + * or that doesn't cross some boundary (a DMA counter restriction) -- + * or all of the above. vmem_xalloc() allows the client to specify any + * or all of these constraints. + * + * 1.7 The Vmem Quantum + * -------------------- + * Every arena has a notion of 'quantum', specified at vmem_create() time, + * that defines the arena's minimum unit of currency. Most commonly the + * quantum is either 1 or PAGESIZE, but any power of 2 is legal. + * All vmem allocations are guaranteed to be quantum-aligned. + * + * 1.8 Quantum Caching + * ------------------- + * A vmem arena may be so hot (frequently used) that the scalability of vmem + * allocation is a significant concern. We address this by allowing the most + * common allocation sizes to be serviced by the kernel memory allocator, + * which provides low-latency per-cpu caching. The qcache_max argument to + * vmem_create() specifies the largest allocation size to cache. + * + * 1.9 Relationship to Kernel Memory Allocator + * ------------------------------------------- + * Every kmem cache has a vmem arena as its slab supplier. The kernel memory + * allocator uses vmem_alloc() and vmem_free() to create and destroy slabs. + * + * + * 2. Implementation + * ----------------- + * + * 2.1 Segment lists and markers + * ----------------------------- + * The segment structure (vmem_seg_t) contains two doubly-linked lists. + * + * The arena list (vs_anext/vs_aprev) links all segments in the arena. + * In addition to the allocated and free segments, the arena contains + * special marker segments at span boundaries. Span markers simplify + * coalescing and importing logic by making it easy to tell both when + * we're at a span boundary (so we don't coalesce across it), and when + * a span is completely free (its neighbors will both be span markers). + * + * Imported spans will have vs_import set. + * + * The next-of-kin list (vs_knext/vs_kprev) links segments of the same type: + * (1) for allocated segments, vs_knext is the hash chain linkage; + * (2) for free segments, vs_knext is the freelist linkage; + * (3) for span marker segments, vs_knext is the next span marker. + * + * 2.2 Allocation hashing + * ---------------------- + * We maintain a hash table of all allocated segments, hashed by address. + * This allows vmem_free() to discover the target segment in constant time. + * vmem_update() periodically resizes hash tables to keep hash chains short. + * + * 2.3 Freelist management + * ----------------------- + * We maintain power-of-2 freelists for free segments, i.e. free segments + * of size >= 2^n reside in vmp->vm_freelist[n]. To ensure constant-time + * allocation, vmem_xalloc() looks not in the first freelist that *might* + * satisfy the allocation, but in the first freelist that *definitely* + * satisfies the allocation (unless VM_BESTFIT is specified, or all larger + * freelists are empty). 
For example, a 1000-byte allocation will be + * satisfied not from the 512..1023-byte freelist, whose members *might* + * contains a 1000-byte segment, but from a 1024-byte or larger freelist, + * the first member of which will *definitely* satisfy the allocation. + * This ensures that vmem_xalloc() works in constant time. + * + * We maintain a bit map to determine quickly which freelists are non-empty. + * vmp->vm_freemap & (1 << n) is non-zero iff vmp->vm_freelist[n] is non-empty. + * + * The different freelists are linked together into one large freelist, + * with the freelist heads serving as markers. Freelist markers simplify + * the maintenance of vm_freemap by making it easy to tell when we're taking + * the last member of a freelist (both of its neighbors will be markers). + * + * 2.4 Vmem Locking + * ---------------- + * For simplicity, all arena state is protected by a per-arena lock. + * For very hot arenas, use quantum caching for scalability. + * + * 2.5 Vmem Population + * ------------------- + * Any internal vmem routine that might need to allocate new segment + * structures must prepare in advance by calling vmem_populate(), which + * will preallocate enough vmem_seg_t's to get is through the entire + * operation without dropping the arena lock. + * + * 2.6 Auditing + * ------------ + * If KMF_AUDIT is set in kmem_flags, we audit vmem allocations as well. + * Since virtual addresses cannot be scribbled on, there is no equivalent + * in vmem to redzone checking, deadbeef, or other kmem debugging features. + * Moreover, we do not audit frees because segment coalescing destroys the + * association between an address and its segment structure. Auditing is + * thus intended primarily to keep track of who's consuming the arena. + * Debugging support could certainly be extended in the future if it proves + * necessary, but we do so much live checking via the allocation hash table + * that even non-DEBUG systems get quite a bit of sanity checking already. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VMEM_INITIAL 21 /* early vmem arenas */ +#define VMEM_SEG_INITIAL 800 + +/* + * Adding a new span to an arena requires two segment structures: one to + * represent the span, and one to represent the free segment it contains. + */ +#define VMEM_SEGS_PER_SPAN_CREATE 2 + +/* + * Allocating a piece of an existing segment requires 0-2 segment structures + * depending on how much of the segment we're allocating. + * + * To allocate the entire segment, no new segment structures are needed; we + * simply move the existing segment structure from the freelist to the + * allocation hash table. + * + * To allocate a piece from the left or right end of the segment, we must + * split the segment into two pieces (allocated part and remainder), so we + * need one new segment structure to represent the remainder. + * + * To allocate from the middle of a segment, we need two new segment strucures + * to represent the remainders on either side of the allocated part. + */ +#define VMEM_SEGS_PER_EXACT_ALLOC 0 +#define VMEM_SEGS_PER_LEFT_ALLOC 1 +#define VMEM_SEGS_PER_RIGHT_ALLOC 1 +#define VMEM_SEGS_PER_MIDDLE_ALLOC 2 + +/* + * vmem_populate() preallocates segment structures for vmem to do its work. + * It must preallocate enough for the worst case, which is when we must import + * a new span and then allocate from the middle of it. 
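+ *
+ * With the constants above, that worst case works out to
+ * VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC = 2 + 2 = 4
+ * preallocated segment structures per allocation.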
+ */ +#define VMEM_SEGS_PER_ALLOC_MAX \ +(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC) + +/* + * The segment structures themselves are allocated from vmem_seg_arena, so + * we have a recursion problem when vmem_seg_arena needs to populate itself. + * We address this by working out the maximum number of segment structures + * this act will require, and multiplying by the maximum number of threads + * that we'll allow to do it simultaneously. + * + * The worst-case segment consumption to populate vmem_seg_arena is as + * follows (depicted as a stack trace to indicate why events are occurring): + * + * (In order to lower the fragmentation in the heap_arena, we specify a + * minimum import size for the vmem_metadata_arena which is the same size + * as the kmem_va quantum cache allocations. This causes the worst-case + * allocation from the vmem_metadata_arena to be 3 segments.) + * + * vmem_alloc(vmem_seg_arena) -> 2 segs (span create + exact alloc) + * segkmem_alloc(vmem_metadata_arena) + * vmem_alloc(vmem_metadata_arena) -> 3 segs (span create + left alloc) + * vmem_alloc(heap_arena) -> 1 seg (left alloc) + * page_create() + * hat_memload() + * kmem_cache_alloc() + * kmem_slab_create() + * vmem_alloc(hat_memload_arena) -> 2 segs (span create + exact alloc) + * segkmem_alloc(heap_arena) + * vmem_alloc(heap_arena) -> 1 seg (left alloc) + * page_create() + * hat_memload() -> (hat layer won't recurse further) + * + * The worst-case consumption for each arena is 3 segment structures. + * Of course, a 3-seg reserve could easily be blown by multiple threads. + * Therefore, we serialize all allocations from vmem_seg_arena (which is OK + * because they're rare). We cannot allow a non-blocking allocation to get + * tied up behind a blocking allocation, however, so we use separate locks + * for VM_SLEEP and VM_NOSLEEP allocations. Similarly, VM_PUSHPAGE allocations + * must not block behind ordinary VM_SLEEPs. In addition, if the system is + * panicking then we must keep enough resources for panic_thread to do its + * work. Thus we have at most four threads trying to allocate from + * vmem_seg_arena, and each thread consumes at most three segment structures, + * so we must maintain a 12-seg reserve. + */ +#define VMEM_POPULATE_RESERVE 12 + +/* + * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures + * so that it can satisfy the worst-case allocation *and* participate in + * worst-case allocation from vmem_seg_arena. 
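+ *
+ * That is VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX = 12 + 4 = 16
+ * segment structures kept in reserve for each arena.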
+ */ +#define VMEM_MINFREE (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX) + +static vmem_t vmem0[VMEM_INITIAL]; +static vmem_t *vmem_populator[VMEM_INITIAL]; +static uint32_t vmem_id; +static uint32_t vmem_populators; +static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL]; +static vmem_seg_t *vmem_segfree; +static kmutex_t vmem_list_lock; +static kmutex_t vmem_segfree_lock; +static kmutex_t vmem_sleep_lock; +static kmutex_t vmem_nosleep_lock; +static kmutex_t vmem_pushpage_lock; +static kmutex_t vmem_panic_lock; +static kmutex_t vmem_xnu_alloc_lock; +static vmem_t *vmem_list; +static vmem_t *vmem_metadata_arena; +static vmem_t *vmem_seg_arena; +static vmem_t *vmem_hash_arena; +static vmem_t *vmem_vmem_arena; +vmem_t *spl_default_arena; // The bottom-most arena for SPL +static vmem_t *spl_default_arena_parent; // dummy arena as a placeholder +#define VMEM_BUCKETS 13 +#define VMEM_BUCKET_LOWBIT 12 +#define VMEM_BUCKET_HIBIT 24 +static vmem_t *vmem_bucket_arena[VMEM_BUCKETS]; +vmem_t *spl_heap_arena; +static void *spl_heap_arena_initial_alloc; +static size_t spl_heap_arena_initial_alloc_size = 0; +#define NUMBER_OF_ARENAS_IN_VMEM_INIT 21 +/* vmem_update() every 15 seconds */ +static struct timespec vmem_update_interval = {15, 0}; +uint32_t vmem_mtbf; /* mean time between failures [default: off] */ +size_t vmem_seg_size = sizeof (vmem_seg_t); + +// must match with include/sys/vmem_impl.h +static vmem_kstat_t vmem_kstat_template = { + { "mem_inuse", KSTAT_DATA_UINT64 }, + { "mem_import", KSTAT_DATA_UINT64 }, + { "mem_total", KSTAT_DATA_UINT64 }, + { "vmem_source", KSTAT_DATA_UINT32 }, + { "alloc", KSTAT_DATA_UINT64 }, + { "free", KSTAT_DATA_UINT64 }, + { "wait", KSTAT_DATA_UINT64 }, + { "fail", KSTAT_DATA_UINT64 }, + { "lookup", KSTAT_DATA_UINT64 }, + { "search", KSTAT_DATA_UINT64 }, + { "populate_fail", KSTAT_DATA_UINT64 }, + { "contains", KSTAT_DATA_UINT64 }, + { "contains_search", KSTAT_DATA_UINT64 }, + { "parent_alloc", KSTAT_DATA_UINT64 }, + { "parent_free", KSTAT_DATA_UINT64 }, + { "threads_waiting", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, +}; + + +/* + * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k'). 
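+ * For example, VMEM_INSERT(vprev, vsp, k) splices vsp into the k-list
+ * immediately after vprev, and VMEM_DELETE(vsp, k) unlinks it again;
+ * the 'a' variants maintain the address-ordered arena list the same way.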
+ */ +#define VMEM_INSERT(vprev, vsp, type) \ +{ \ +vmem_seg_t *_vnext = (vprev)->vs_##type##next; \ +(vsp)->vs_##type##next = (_vnext); \ +(vsp)->vs_##type##prev = (vprev); \ +(vprev)->vs_##type##next = (vsp); \ +(_vnext)->vs_##type##prev = (vsp); \ +} + +#define VMEM_DELETE(vsp, type) \ +{ \ +vmem_seg_t *_vprev = (vsp)->vs_##type##prev; \ +vmem_seg_t *_vnext = (vsp)->vs_##type##next; \ +(_vprev)->vs_##type##next = (_vnext); \ +(_vnext)->vs_##type##prev = (_vprev); \ +} + +// vmem thread block count +uint64_t spl_vmem_threads_waiting = 0; + +// number of allocations > minalloc +uint64_t spl_bucket_non_pow2_allocs = 0; + +// allocator kstats +uint64_t spl_vmem_unconditional_allocs = 0; +uint64_t spl_vmem_unconditional_alloc_bytes = 0; +uint64_t spl_vmem_conditional_allocs = 0; +uint64_t spl_vmem_conditional_alloc_bytes = 0; +uint64_t spl_vmem_conditional_alloc_deny = 0; +uint64_t spl_vmem_conditional_alloc_deny_bytes = 0; + +// bucket allocator kstat +uint64_t spl_xat_success = 0; +uint64_t spl_xat_late_success = 0; +uint64_t spl_xat_late_success_nosleep = 0; +uint64_t spl_xat_pressured = 0; +uint64_t spl_xat_bailed = 0; +uint64_t spl_xat_bailed_contended = 0; +uint64_t spl_xat_lastalloc = 0; +uint64_t spl_xat_lastfree = 0; +uint64_t spl_xat_forced = 0; +uint64_t spl_xat_sleep = 0; +uint64_t spl_xat_late_deny = 0; +uint64_t spl_xat_no_waiters = 0; +uint64_t spl_xft_wait = 0; + +uint64_t spl_vba_parent_memory_appeared = 0; +uint64_t spl_vba_parent_memory_blocked = 0; +uint64_t spl_vba_hiprio_blocked = 0; +uint64_t spl_vba_cv_timeout = 0; +uint64_t spl_vba_loop_timeout = 0; +uint64_t spl_vba_cv_timeout_blocked = 0; +uint64_t spl_vba_loop_timeout_blocked = 0; +uint64_t spl_vba_sleep = 0; +uint64_t spl_vba_loop_entries = 0; + +// bucket minimum span size tunables +uint64_t spl_bucket_tunable_large_span = 0; +uint64_t spl_bucket_tunable_small_span = 0; + +// for XAT & XATB visibility into VBA queue +static _Atomic uint32_t spl_vba_threads[VMEM_BUCKETS] = { 0 }; +static uint32_t + vmem_bucket_id_to_bucket_number[NUMBER_OF_ARENAS_IN_VMEM_INIT] = { 0 }; +boolean_t spl_arc_no_grow(size_t, boolean_t, kmem_cache_t **); +_Atomic uint64_t spl_arc_no_grow_bits = 0; +uint64_t spl_arc_no_grow_count = 0; + +// compare span ages this many steps from the head of the freelist +uint64_t spl_frag_max_walk = 1000; +uint64_t spl_frag_walked_out = 0; +uint64_t spl_frag_walk_cnt = 0; + +extern void spl_free_set_emergency_pressure(int64_t p); +extern uint64_t segkmem_total_mem_allocated; +extern uint64_t total_memory; + +extern void IOSleep(unsigned milliseconds); + +/* + * Get a vmem_seg_t from the global segfree list. + */ +static vmem_seg_t * +vmem_getseg_global(void) +{ + vmem_seg_t *vsp; + + mutex_enter(&vmem_segfree_lock); + if ((vsp = vmem_segfree) != NULL) + vmem_segfree = vsp->vs_knext; + mutex_exit(&vmem_segfree_lock); + + if (vsp != NULL) + vsp->vs_span_createtime = 0; + + return (vsp); +} + +/* + * Put a vmem_seg_t on the global segfree list. + */ +static void +vmem_putseg_global(vmem_seg_t *vsp) +{ + mutex_enter(&vmem_segfree_lock); + vsp->vs_knext = vmem_segfree; + vmem_segfree = vsp; + mutex_exit(&vmem_segfree_lock); +} + +/* + * Get a vmem_seg_t from vmp's segfree list. + */ +static vmem_seg_t * +vmem_getseg(vmem_t *vmp) +{ + vmem_seg_t *vsp; + + ASSERT(vmp->vm_nsegfree > 0); + + vsp = vmp->vm_segfree; + vmp->vm_segfree = vsp->vs_knext; + vmp->vm_nsegfree--; + + return (vsp); +} + +/* + * Put a vmem_seg_t on vmp's segfree list. 
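+ * (The per-arena segfree list is a simple LIFO threaded through vs_knext;
+ * vmem_getseg() above pops from the same head that is pushed here.)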
+ */
+static void
+vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
+{
+ vsp->vs_knext = vmp->vm_segfree;
+ vmp->vm_segfree = vsp;
+ vmp->vm_nsegfree++;
+}
+
+
+/*
+ * Add vsp to the appropriate freelist, at the appropriate location,
+ * keeping the freelist sorted by age.
+ */
+
+/*
+ * return true when we continue the for loop in
+ * vmem_freelist_insert_sort_by_time
+ */
+static inline bool
+flist_sort_compare(bool newfirst,
+ const vmem_seg_t *vhead,
+ const vmem_seg_t *nextlist,
+ vmem_seg_t *p, vmem_seg_t *to_insert)
+{
+ /*
+ * vsp is the segment we are inserting into the freelist
+ * p is a freelist pointer or an element inside a non-empty freelist
+ * if we return false, then vsp is inserted immediately after p,
+ */
+
+ // always enter the for loop if we're at the front of a flist
+ if (p == vhead)
+ return (true);
+
+ const vmem_seg_t *n = p->vs_knext;
+
+ if (n == nextlist || n == NULL) {
+ // if we are at the tail of the flist, then
+ // insert vsp between p and n
+ return (false);
+ }
+
+ if (n->vs_import == true && to_insert->vs_import == false) {
+ /*
+ * put non-imported segments before imported segments
+ * no matter what their respective create times are,
+ * thereby making imported segments more likely to "age out"
+ */
+ return (false); // inserts to_insert between p and n
+ }
+
+ if (newfirst == true) {
+ if (n->vs_span_createtime < to_insert->vs_span_createtime) {
+ // n is older than me, so insert me between p and n
+ return (false);
+ }
+ } else {
+ if (n->vs_span_createtime > to_insert->vs_span_createtime) {
+ // n is newer than me, so insert me between p and n
+ return (false);
+ }
+ }
+ // continue iterating
+ return (true);
+}
+
+static void
+vmem_freelist_insert_sort_by_time(vmem_t *vmp, vmem_seg_t *vsp)
+{
+ ASSERT(vmp->vm_cflags & VMC_TIMEFREE);
+ ASSERT(vsp->vs_span_createtime > 0);
+
+ const bool newfirst = 0 == (vmp->vm_cflags & VMC_OLDFIRST);
+
+ const uint64_t abs_max_walk_steps = 1ULL << 30ULL;
+ uint32_t max_walk_steps = (uint32_t)MIN(spl_frag_max_walk,
+ abs_max_walk_steps);
+
+ vmem_seg_t *vprev;
+
+ ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
+
+ /*
+ * in vmem_create_common() the freelists are arranged:
+ * freelist[0].vs_kprev = NULL, freelist[VMEM_FREELISTS].vs_knext = NULL
+ * freelist[1].vs_kprev = freelist[0], freelist[1].vs_knext =
+ * freelist[2] ...
+ * from vmem_freelist_insert():
+ * VS_SIZE is the segment size (->vs_end - ->vs_start), so say 8k-512
+ * highbit is the highest bit set PLUS 1, so in this case would be the
+ * 16k list. so below, vprev is therefore pointing to the 8k list
+ * in vmem_alloc, the unconstrained allocation takes, for an 8k-512
+ * block: vsp = flist[8k].vs_knext
+ * and calls vmem_seg_create() which sends any leftovers from vsp
+ * to vmem_freelist_insert
+ *
+ * vmem_freelist_insert would take the seg (as above, 8k-512 size),
+ * vprev points to the 16k list, and VMEM_INSERT(vprev, vsp, k)
+ * inserts the segment immediately after
+ *
+ * so vmem_seg_create(...8k-512...) pushes to the head of the 8k list,
+ * and vmem_alloc(...8k-512...) will pull from the head of the 8k list
+ *
+ * below we may want to push to the TAIL of the 8k list, which is
+ * just before flist[16k].
+ */ + + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + + int my_listnum = highbit(VS_SIZE(vsp)) - 1; + + ASSERT(my_listnum >= 1); + ASSERT(my_listnum < VMEM_FREELISTS); + + int next_listnum = my_listnum + 1; + + const vmem_seg_t *nextlist = + (vmem_seg_t *)&vmp->vm_freelist[next_listnum]; + + ASSERT(vsp->vs_span_createtime != 0); + if (vsp->vs_span_createtime == 0) { + printf("SPL: %s: WARNING: vsp->vs_span_createtime == 0 (%s)!\n", + __func__, vmp->vm_name); + } + + // continuing our example, starts with p at flist[8k] + // and n at the following freelist entry + + const vmem_seg_t *vhead = vprev; + vmem_seg_t *p = vprev; + vmem_seg_t *n = p->vs_knext; + + // walk from the freelist head looking for + // a segment whose creation time is earlier than + // the segment to be inserted's creation time, + // then insert before that segment. + + for (uint32_t step = 0; + flist_sort_compare(newfirst, vhead, nextlist, p, vsp) == true; + step++) { + // iterating while predecessor pointer p was created + // at a later tick than funcarg vsp. + // + // below we set p to n and update n. + ASSERT(n != NULL); + if (n == nextlist) { + dprintf("SPL: %s: at marker (%s)(steps: %u) " + "p->vs_start, end == %lu, %lu\n", + __func__, vmp->vm_name, step, + (uintptr_t)p->vs_start, (uintptr_t)p->vs_end); + // IOSleep(1); + // the next entry is the next marker (e.g. 16k marker) + break; + } + if (n->vs_start == 0) { + // from vmem_freelist_delete, this is a head + dprintf("SPL: %s: n->vs_start == 0 (%s)(steps: %u) " + "p->vs_start, end == %lu, %lu\n", + __func__, vmp->vm_name, step, + (uintptr_t)p->vs_start, (uintptr_t)p->vs_end); + // IOSleep(1); + break; + } + if (step >= max_walk_steps) { + ASSERT(nextlist->vs_kprev != NULL); + // we have walked far enough. + // put this segment at the tail of the freelist. + if (nextlist->vs_kprev != NULL) { + n = (vmem_seg_t *)nextlist; + p = nextlist->vs_kprev; + } + dprintf("SPL: %s: walked out (%s)\n", __func__, + vmp->vm_name); + // IOSleep(1); + atomic_inc_64(&spl_frag_walked_out); + break; + } + if (n->vs_knext == NULL) { + dprintf("SPL: %s: n->vs_knext == NULL (my_listnum " + "== %d)\n", __func__, my_listnum); + // IOSleep(1); + break; + } + p = n; + n = n->vs_knext; + atomic_inc_64(&spl_frag_walk_cnt); + } + + ASSERT(p != NULL); + + // insert segment between p and n + + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(p, vsp, k); + + cv_broadcast(&vmp->vm_cv); +} + +/* + * Add vsp to the appropriate freelist. + */ +static void +vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + + if (vmp->vm_cflags & VMC_TIMEFREE) { + vmem_freelist_insert_sort_by_time(vmp, vsp); + return; + } + + vmem_seg_t *vprev; + + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(vprev, vsp, k); + + cv_broadcast(&vmp->vm_cv); +} + +/* + * Take vsp from the freelist. + */ +static void +vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + ASSERT(vsp->vs_type == VMEM_FREE); + + if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) { + /* + * The segments on both sides of 'vsp' are freelist heads, + * so taking vsp leaves the freelist at vsp->vs_kprev empty. 
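+ * In that case the freelist's bit must also be cleared from vm_freemap,
+ * which is what the XOR below does.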
+ */ + ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev)); + vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev); + } + VMEM_DELETE(vsp, k); +} + +/* + * Add vsp to the allocated-segment hash table and update kstats. + */ +static void +vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t **bucket; + + vsp->vs_type = VMEM_ALLOC; + bucket = VMEM_HASH(vmp, vsp->vs_start); + vsp->vs_knext = *bucket; + *bucket = vsp; + + if (vmem_seg_size == sizeof (vmem_seg_t)) { + // vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack, + // VMEM_STACK_DEPTH); + // vsp->vs_thread = curthread; + vsp->vs_depth = 0; + vsp->vs_thread = 0; + vsp->vs_timestamp = gethrtime(); + } else { + vsp->vs_depth = 0; + } + + vmp->vm_kstat.vk_alloc.value.ui64++; + vmp->vm_kstat.vk_mem_inuse.value.ui64 += VS_SIZE(vsp); +} + +/* + * Remove vsp from the allocated-segment hash table and update kstats. + */ +static vmem_seg_t * +vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size) +{ + vmem_seg_t *vsp, **prev_vspp; + + prev_vspp = VMEM_HASH(vmp, addr); + while ((vsp = *prev_vspp) != NULL) { + if (vsp->vs_start == addr) { + *prev_vspp = vsp->vs_knext; + break; + } + vmp->vm_kstat.vk_lookup.value.ui64++; + prev_vspp = &vsp->vs_knext; + } + + if (vsp == NULL) + panic("vmem_hash_delete(%p, %lx, %lu): bad free " + "(name: %s, addr, size)", + (void *)vmp, addr, size, vmp->vm_name); + if (VS_SIZE(vsp) != size) + panic("vmem_hash_delete(%p, %lx, %lu): (%s) wrong size" + "(expect %lu)", + (void *)vmp, addr, size, vmp->vm_name, VS_SIZE(vsp)); + + vmp->vm_kstat.vk_free.value.ui64++; + vmp->vm_kstat.vk_mem_inuse.value.ui64 -= size; + + return (vsp); +} + +/* + * Create a segment spanning the range [start, end) and add it to the arena. + */ +static vmem_seg_t * +vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) +{ + vmem_seg_t *newseg = vmem_getseg(vmp); + + newseg->vs_start = start; + newseg->vs_end = end; + newseg->vs_type = 0; + newseg->vs_import = 0; + newseg->vs_span_createtime = 0; + + VMEM_INSERT(vprev, newseg, a); + + return (newseg); +} + +/* + * Remove segment vsp from the arena. + */ +static void +vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(vsp->vs_type != VMEM_ROTOR); + VMEM_DELETE(vsp, a); + + vmem_putseg(vmp, vsp); +} + +/* + * Add the span [vaddr, vaddr + size) to vmp and update kstats. + */ +static vmem_seg_t * +vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import) +{ + vmem_seg_t *newseg, *span; + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((start | end) & (vmp->vm_quantum - 1)) + panic("vmem_span_create(%p, %p, %lu): misaligned (%s)", + (void *)vmp, vaddr, size, vmp->vm_name); + + span = vmem_seg_create(vmp, vmp->vm_seg0.vs_aprev, start, end); + span->vs_type = VMEM_SPAN; + span->vs_import = import; + + hrtime_t t = 0; + if (vmp->vm_cflags & VMC_TIMEFREE) { + t = gethrtime(); + } + span->vs_span_createtime = t; + + VMEM_INSERT(vmp->vm_seg0.vs_kprev, span, k); + + newseg = vmem_seg_create(vmp, span, start, end); + newseg->vs_span_createtime = t; + + vmem_freelist_insert(vmp, newseg); + + if (import) + vmp->vm_kstat.vk_mem_import.value.ui64 += size; + vmp->vm_kstat.vk_mem_total.value.ui64 += size; + + return (newseg); +} + +/* + * Remove span vsp from vmp and update kstats. 
+ */ +static void +vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t *span = vsp->vs_aprev; + size_t size = VS_SIZE(vsp); + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + ASSERT(span->vs_type == VMEM_SPAN); + + if (span->vs_import) + vmp->vm_kstat.vk_mem_import.value.ui64 -= size; + vmp->vm_kstat.vk_mem_total.value.ui64 -= size; + + VMEM_DELETE(span, k); + + vmem_seg_destroy(vmp, vsp); + vmem_seg_destroy(vmp, span); +} + +/* + * Allocate the subrange [addr, addr + size) from segment vsp. + * If there are leftovers on either side, place them on the freelist. + * Returns a pointer to the segment representing [addr, addr + size). + */ +static vmem_seg_t * +vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) +{ + uintptr_t vs_start = vsp->vs_start; + uintptr_t vs_end = vsp->vs_end; + size_t vs_size = vs_end - vs_start; + size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + uintptr_t addr_end = addr + realsize; + + ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0); + ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0); + ASSERT(vsp->vs_type == VMEM_FREE); + ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1); + ASSERT(addr - 1 <= addr_end - 1); + + hrtime_t parent_seg_span_createtime = vsp->vs_span_createtime; + + /* + * If we're allocating from the start of the segment, and the + * remainder will be on the same freelist, we can save quite + * a bit of work. + */ + if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + vsp->vs_start = addr_end; + vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size); + vsp->vs_span_createtime = parent_seg_span_createtime; + vmem_hash_insert(vmp, vsp); + return (vsp); + } + + vmem_freelist_delete(vmp, vsp); + + if (vs_end != addr_end) { + vmem_seg_t *v = vmem_seg_create(vmp, vsp, addr_end, vs_end); + v->vs_span_createtime = parent_seg_span_createtime; + vmem_freelist_insert(vmp, v); + } + + if (vs_start != addr) { + vmem_seg_t *v = + vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr); + v->vs_span_createtime = parent_seg_span_createtime; + vmem_freelist_insert(vmp, v); + } + + vsp->vs_start = addr; + vsp->vs_end = addr + size; + + vsp->vs_span_createtime = parent_seg_span_createtime; + + vmem_hash_insert(vmp, vsp); + return (vsp); +} + +/* + * Returns 1 if we are populating, 0 otherwise. + * Call it if we want to prevent recursion from HAT. + */ +int +vmem_is_populator() +{ + return (mutex_owner(&vmem_sleep_lock) == curthread || + mutex_owner(&vmem_nosleep_lock) == curthread || + mutex_owner(&vmem_pushpage_lock) == curthread || + mutex_owner(&vmem_panic_lock) == curthread); +} + +/* + * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures. + */ +static int +vmem_populate(vmem_t *vmp, int vmflag) +{ + char *p; + vmem_seg_t *vsp; + ssize_t nseg; + size_t size; + kmutex_t *lp; + int i; + + while (vmp->vm_nsegfree < VMEM_MINFREE && + (vsp = vmem_getseg_global()) != NULL) + vmem_putseg(vmp, vsp); + + if (vmp->vm_nsegfree >= VMEM_MINFREE) + return (1); + + /* + * If we're already populating, tap the reserve. 
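+ * (The reserve was sized by VMEM_POPULATE_RESERVE above: at most four
+ * populating threads, each needing at most three segment structures.)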
+ */ + if (vmem_is_populator()) { + ASSERT(vmp->vm_cflags & VMC_POPULATOR); + return (1); + } + + mutex_exit(&vmp->vm_lock); + + // if (panic_thread == curthread) + // lp = &vmem_panic_lock; + // else + + if (vmflag & VM_NOSLEEP) + lp = &vmem_nosleep_lock; + else if (vmflag & VM_PUSHPAGE) + lp = &vmem_pushpage_lock; + else + lp = &vmem_sleep_lock; + + mutex_enter(lp); + + nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE; + size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum); + nseg = size / vmem_seg_size; + + /* + * The following vmem_alloc() may need to populate vmem_seg_arena + * and all the things it imports from. When doing so, it will tap + * each arena's reserve to prevent recursion (see the block comment + * above the definition of VMEM_POPULATE_RESERVE). + */ + p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_KMFLAGS); + if (p == NULL) { + mutex_exit(lp); + mutex_enter(&vmp->vm_lock); + vmp->vm_kstat.vk_populate_fail.value.ui64++; + return (0); + } + + /* + * Restock the arenas that may have been depleted during population. + */ + for (i = 0; i < vmem_populators; i++) { + mutex_enter(&vmem_populator[i]->vm_lock); + while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE) + vmem_putseg(vmem_populator[i], + (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + mutex_exit(&vmem_populator[i]->vm_lock); + } + + mutex_exit(lp); + mutex_enter(&vmp->vm_lock); + + /* + * Now take our own segments. + */ + ASSERT(nseg >= VMEM_MINFREE); + while (vmp->vm_nsegfree < VMEM_MINFREE) + vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + /* + * Give the remainder to charity. + */ + while (nseg > 0) + vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + return (1); +} + +/* + * Advance a walker from its previous position to 'afterme'. + * Note: may drop and reacquire vmp->vm_lock. + */ +static void +vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme) +{ + vmem_seg_t *vprev = walker->vs_aprev; + vmem_seg_t *vnext = walker->vs_anext; + vmem_seg_t *vsp = NULL; + + VMEM_DELETE(walker, a); + + if (afterme != NULL) + VMEM_INSERT(afterme, walker, a); + + /* + * The walker segment's presence may have prevented its neighbors + * from coalescing. If so, coalesce them now. + */ + if (vprev->vs_type == VMEM_FREE) { + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vnext->vs_start); + ASSERT(vprev->vs_span_createtime == + vnext->vs_span_createtime); + vmem_freelist_delete(vmp, vnext); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vnext->vs_end; + vmem_freelist_insert(vmp, vprev); + vmem_seg_destroy(vmp, vnext); + } + vsp = vprev; + } else if (vnext->vs_type == VMEM_FREE) { + vsp = vnext; + } + + /* + * vsp could represent a complete imported span, + * in which case we must return it to the source. + */ + if (vsp != NULL && vsp->vs_aprev->vs_import && + vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + void *vaddr = (void *)vsp->vs_start; + size_t size = VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_freelist_delete(vmp, vsp); + vmem_span_destroy(vmp, vsp); + vmp->vm_kstat.vk_parent_free.value.ui64++; + mutex_exit(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + mutex_enter(&vmp->vm_lock); + } +} + +/* + * VM_NEXTFIT allocations deliberately cycle through all virtual addresses + * in an arena, so that we avoid reusing addresses for as long as possible. + * This helps to catch used-after-freed bugs. 
It's also the perfect policy + * for allocating things like process IDs, where we want to cycle through + * all values in order. + */ +static void * +vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + vmem_seg_t *vsp, *rotor; + uintptr_t addr; + size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + size_t vs_size; + + mutex_enter(&vmp->vm_lock); + + if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) { + mutex_exit(&vmp->vm_lock); + return (NULL); + } + + /* + * The common case is that the segment right after the rotor is free, + * and large enough that extracting 'size' bytes won't change which + * freelist it's on. In this case we can avoid a *lot* of work. + * Instead of the normal vmem_seg_alloc(), we just advance the start + * address of the victim segment. Instead of moving the rotor, we + * create the new segment structure *behind the rotor*, which has + * the same effect. And finally, we know we don't have to coalesce + * the rotor's neighbors because the new segment lies between them. + */ + rotor = &vmp->vm_rotor; + vsp = rotor->vs_anext; + if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize && + P2SAMEHIGHBIT(vs_size, vs_size - realsize)) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + addr = vsp->vs_start; + vsp->vs_start = addr + realsize; + hrtime_t t = vsp->vs_span_createtime; + vmem_hash_insert(vmp, + vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size)); + vsp->vs_span_createtime = t; + mutex_exit(&vmp->vm_lock); + return ((void *)addr); + } + + /* + * Starting at the rotor, look for a segment large enough to + * satisfy the allocation. + */ + for (;;) { + atomic_inc_64(&vmp->vm_kstat.vk_search.value.ui64); + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + vsp = vsp->vs_anext; + if (vsp == rotor) { + /* + * We've come full circle. One possibility is that the + * there's actually enough space, but the rotor itself + * is preventing the allocation from succeeding because + * it's sitting between two free segments. Therefore, + * we advance the rotor and see if that liberates a + * suitable segment. + */ + vmem_advance(vmp, rotor, rotor->vs_anext); + vsp = rotor->vs_aprev; + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + /* + * If there's a lower arena we can import from, or it's + * a VM_NOSLEEP allocation, let vmem_xalloc() handle it. + * Otherwise, wait until another thread frees something. + */ + if (vmp->vm_source_alloc != NULL || + (vmflag & VM_NOSLEEP)) { + mutex_exit(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, + vmflag & (VM_KMFLAGS | VM_NEXTFIT))); + } + atomic_inc_64(&vmp->vm_kstat.vk_wait.value.ui64); + atomic_inc_64( + &vmp->vm_kstat.vk_threads_waiting.value.ui64); + atomic_inc_64(&spl_vmem_threads_waiting); + if (spl_vmem_threads_waiting > 1) + dprintf("SPL: %s: waiting for %lu sized alloc " + "after full circle of %s, waiting " + "threads %llu, total threads waiting " + "= %llu.\n", + __func__, size, vmp->vm_name, + vmp->vm_kstat.vk_threads_waiting.value.ui64, + spl_vmem_threads_waiting); + cv_wait(&vmp->vm_cv, &vmp->vm_lock); + atomic_dec_64(&spl_vmem_threads_waiting); + atomic_dec_64( + &vmp->vm_kstat.vk_threads_waiting.value.ui64); + vsp = rotor->vs_anext; + } + } + + /* + * We found a segment. Extract enough space to satisfy the allocation. 
+ */ + addr = vsp->vs_start; + vsp = vmem_seg_alloc(vmp, vsp, addr, size); + ASSERT(vsp->vs_type == VMEM_ALLOC && + vsp->vs_start == addr && vsp->vs_end == addr + size); + + /* + * Advance the rotor to right after the newly-allocated segment. + * That's where the next VM_NEXTFIT allocation will begin searching. + */ + vmem_advance(vmp, rotor, vsp); + mutex_exit(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Checks if vmp is guaranteed to have a size-byte buffer somewhere on its + * freelist. If size is not a power-of-2, it can return a false-negative. + * + * Used to decide if a newly imported span is superfluous after re-acquiring + * the arena lock. + */ +static int +vmem_canalloc(vmem_t *vmp, size_t size) +{ + int hb; + int flist = 0; + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1ULL << hb)); + + return (flist); +} + +// Convenience functions for use when gauging +// allocation ability when not holding the lock. +// These are unreliable because vmp->vm_freemap is +// liable to change immediately after being examined. +int +vmem_canalloc_lock(vmem_t *vmp, size_t size) +{ + mutex_enter(&vmp->vm_lock); + int i = vmem_canalloc(vmp, size); + mutex_exit(&vmp->vm_lock); + return (i); +} + +int +vmem_canalloc_atomic(vmem_t *vmp, size_t size) +{ + int hb; + int flist = 0; + + ulong_t freemap = + __c11_atomic_load((_Atomic ulong_t *)&vmp->vm_freemap, + __ATOMIC_SEQ_CST); + + if (ISP2(size)) + flist = lowbit(P2ALIGN(freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(freemap, 1ULL << hb)); + + return (flist); +} + +static inline uint64_t +spl_vmem_xnu_useful_bytes_free(void) +{ + extern volatile unsigned int vm_page_free_wanted; + extern volatile unsigned int vm_page_free_count; + extern volatile unsigned int vm_page_free_min; + + if (vm_page_free_wanted > 0) + return (0); + + uint64_t bytes_free = (uint64_t)vm_page_free_count * (uint64_t)PAGESIZE; + uint64_t bytes_min = (uint64_t)vm_page_free_min * (uint64_t)PAGESIZE; + + if (bytes_free <= bytes_min) + return (0); + + uint64_t useful_free = bytes_free - bytes_min; + + return (useful_free); +} + +uint64_t +vmem_xnu_useful_bytes_free(void) +{ + return (spl_vmem_xnu_useful_bytes_free()); +} + + +static void * +spl_vmem_malloc_unconditionally_unlocked(size_t size) +{ + extern void *osif_malloc(uint64_t); + atomic_inc_64(&spl_vmem_unconditional_allocs); + atomic_add_64(&spl_vmem_unconditional_alloc_bytes, size); + return (osif_malloc(size)); +} + +static void * +spl_vmem_malloc_unconditionally(size_t size) +{ + mutex_enter(&vmem_xnu_alloc_lock); + void *m = spl_vmem_malloc_unconditionally_unlocked(size); + mutex_exit(&vmem_xnu_alloc_lock); + return (m); +} + +static void * +spl_vmem_malloc_if_no_pressure(size_t size) +{ + // The mutex serializes concurrent callers, providing time for + // the variables in spl_vmem_xnu_useful_bytes_free() to be updated. 
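+ // Only allocate when XNU reports more than MAX(size, 1 MiB) of free
+ // memory above vm_page_free_min; otherwise count the denial and return
+ // NULL so the caller can apply pressure and retry.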
+ mutex_enter(&vmem_xnu_alloc_lock); + if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, 1024ULL*1024ULL))) { + extern void *osif_malloc(uint64_t); + void *p = osif_malloc(size); + if (p != NULL) { + spl_vmem_conditional_allocs++; + spl_vmem_conditional_alloc_bytes += size; + } + mutex_exit(&vmem_xnu_alloc_lock); + return (p); + } else { + spl_vmem_conditional_alloc_deny++; + spl_vmem_conditional_alloc_deny_bytes += size; + mutex_exit(&vmem_xnu_alloc_lock); + return (NULL); + } +} + +/* + * Allocate size bytes at offset phase from an align boundary such that the + * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) + * that does not straddle a nocross-aligned boundary. + */ +void * +vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, + size_t nocross, void *minaddr, void *maxaddr, int vmflag) +{ + vmem_seg_t *vsp; + vmem_seg_t *vbest = NULL; + uintptr_t addr, taddr, start, end; + uintptr_t align = (align_arg != 0) ? align_arg : vmp->vm_quantum; + void *vaddr, *xvaddr = NULL; + size_t xsize; + int hb, flist, resv; + uint32_t mtbf; + + if ((align | phase | nocross) & (vmp->vm_quantum - 1)) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters not vm_quantum aligned", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if (nocross != 0 && + (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "overconstrained allocation", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if (phase >= align || (align & (align - 1)) != 0 || + (nocross & (nocross - 1)) != 0) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters inconsistent or invalid", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + mutex_enter(&vmp->vm_lock); + for (;;) { + if (vmp->vm_nsegfree < VMEM_MINFREE && + !vmem_populate(vmp, vmflag)) + break; +do_alloc: + /* + * highbit() returns the highest bit + 1, which is exactly + * what we want: we want to search the first freelist whose + * members are *definitely* large enough to satisfy our + * allocation. However, there are certain cases in which we + * want to look at the next-smallest freelist (which *might* + * be able to satisfy the allocation): + * + * (1) The size is exactly a power of 2, in which case + * the smaller freelist is always big enough; + * + * (2) All other freelists are empty; + * + * (3) We're in the highest possible freelist, which is + * always empty (e.g. the 4GB freelist on 32-bit systems); + * + * (4) We're doing a best-fit or first-fit allocation. + */ + if ((size & (size - 1)) == 0) { + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + } else { + hb = highbit(size); + if ((vmp->vm_freemap >> hb) == 0 || + hb == VMEM_FREELISTS || + (vmflag & (VM_BESTFIT | VM_FIRSTFIT))) + hb--; + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + for (vbest = NULL, vsp = (flist == 0) ? NULL : + vmp->vm_freelist[flist - 1].vs_knext; + vsp != NULL; vsp = vsp->vs_knext) { + atomic_inc_64(&vmp->vm_kstat.vk_search.value.ui64); + if (vsp->vs_start == 0) { + /* + * We're moving up to a larger freelist, + * so if we've already found a candidate, + * the fit can't possibly get any better. + */ + if (vbest != NULL) + break; + /* + * Find the next non-empty freelist. 
+ */ + flist = lowbit(P2ALIGN(vmp->vm_freemap, + VS_SIZE(vsp))); + if (flist-- == 0) + break; + vsp = (vmem_seg_t *)&vmp->vm_freelist[flist]; + ASSERT(vsp->vs_knext->vs_type == VMEM_FREE); + continue; + } + if (vsp->vs_end - 1 < (uintptr_t)minaddr) + continue; + if (vsp->vs_start > (uintptr_t)maxaddr - 1) + continue; + start = MAX(vsp->vs_start, (uintptr_t)minaddr); + end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1; + taddr = P2PHASEUP(start, align, phase); + if (P2BOUNDARY(taddr, size, nocross)) + taddr += + P2ROUNDUP(P2NPHASE(taddr, nocross), align); + if ((taddr - start) + size > end - start || + (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest))) + continue; + vbest = vsp; + addr = taddr; + if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size) + break; + } + if (vbest != NULL) + break; + ASSERT(xvaddr == NULL); + if (size == 0) + panic("vmem_xalloc(): size == 0"); + if (vmp->vm_source_alloc != NULL && nocross == 0 && + minaddr == NULL && maxaddr == NULL) { + size_t aneeded, asize; + size_t aquantum = MAX(vmp->vm_quantum, + vmp->vm_source->vm_quantum); + size_t aphase = phase; + if ((align > aquantum) && + !(vmp->vm_cflags & VMC_XALIGN)) { + aphase = (P2PHASE(phase, aquantum) != 0) ? + align - vmp->vm_quantum : align - aquantum; + ASSERT(aphase >= phase); + } + aneeded = MAX(size + aphase, vmp->vm_min_import); + asize = P2ROUNDUP(aneeded, aquantum); + + if (asize < size) { + /* + * The rounding induced overflow; return NULL + * if we are permitted to fail the allocation + * (and explicitly panic if we aren't). + */ + if ((vmflag & VM_NOSLEEP) && + !(vmflag & VM_PANIC)) { + mutex_exit(&vmp->vm_lock); + return (NULL); + } + + panic("vmem_xalloc(): size overflow"); + } + + /* + * Determine how many segment structures we'll consume. + * The calculation must be precise because if we're + * here on behalf of vmem_populate(), we are taking + * segments from a very limited reserve. + */ + if (size == asize && !(vmp->vm_cflags & VMC_XALLOC)) + resv = VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_EXACT_ALLOC; + else if (phase == 0 && + align <= vmp->vm_source->vm_quantum) + resv = VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_LEFT_ALLOC; + else + resv = VMEM_SEGS_PER_ALLOC_MAX; + + ASSERT(vmp->vm_nsegfree >= resv); + vmp->vm_nsegfree -= resv; /* reserve our segs */ + mutex_exit(&vmp->vm_lock); + if (vmp->vm_cflags & VMC_XALLOC) { + // size_t oasize = asize; + vaddr = ((vmem_ximport_t *) + vmp->vm_source_alloc)(vmp->vm_source, + &asize, align, vmflag & VM_KMFLAGS); + // ASSERT(asize >= oasize); + ASSERT(P2PHASE(asize, + vmp->vm_source->vm_quantum) == 0); + ASSERT(!(vmp->vm_cflags & VMC_XALIGN) || + IS_P2ALIGNED(vaddr, align)); + } else { + atomic_inc_64( + &vmp->vm_kstat.vk_parent_alloc.value.ui64); + vaddr = vmp->vm_source_alloc(vmp->vm_source, + asize, vmflag & (VM_KMFLAGS | VM_NEXTFIT)); + } + mutex_enter(&vmp->vm_lock); + vmp->vm_nsegfree += resv; /* claim reservation */ + aneeded = size + align - vmp->vm_quantum; + aneeded = P2ROUNDUP(aneeded, vmp->vm_quantum); + if (vaddr != NULL) { + /* + * Since we dropped the vmem lock while + * calling the import function, other + * threads could have imported space + * and made our import unnecessary. In + * order to save space, we return + * excess imports immediately. + */ + // but if there are threads waiting below, + // do not return the excess import, rather + // wake those threads up so they can use it. 
+ if (asize > aneeded && + vmp->vm_source_free != NULL && + vmp->vm_kstat.vk_threads_waiting.value.ui64 + == 0 && vmem_canalloc(vmp, aneeded)) { + ASSERT(resv >= + VMEM_SEGS_PER_MIDDLE_ALLOC); + xvaddr = vaddr; + xsize = asize; + goto do_alloc; + } else if ( + vmp->vm_kstat.vk_threads_waiting.value.ui64 + > 0) { + vmp->vm_kstat.vk_excess.value.ui64++; + cv_broadcast(&vmp->vm_cv); + } + vbest = vmem_span_create(vmp, vaddr, asize, 1); + addr = P2PHASEUP(vbest->vs_start, align, phase); + break; + } else if (vmem_canalloc(vmp, aneeded)) { + /* + * Our import failed, but another thread + * added sufficient free memory to the arena + * to satisfy our request. Go back and + * grab it. + */ + ASSERT(resv >= VMEM_SEGS_PER_MIDDLE_ALLOC); + goto do_alloc; + } + } + + /* + * If the requestor chooses to fail the allocation attempt + * rather than reap wait and retry - get out of the loop. + */ + if (vmflag & VM_ABORT) + break; + mutex_exit(&vmp->vm_lock); + +#if 0 + if (vmp->vm_cflags & VMC_IDENTIFIER) + kmem_reap_idspace(); + else + kmem_reap(); +#endif + + mutex_enter(&vmp->vm_lock); + if (vmflag & VM_NOSLEEP) + break; + atomic_inc_64(&vmp->vm_kstat.vk_wait.value.ui64); + atomic_inc_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + atomic_inc_64(&spl_vmem_threads_waiting); + if (spl_vmem_threads_waiting > 0) { + dprintf("SPL: %s: vmem waiting for %lu sized alloc " + "for %s, waiting threads %llu, total threads " + "waiting = %llu\n", + __func__, size, vmp->vm_name, + vmp->vm_kstat.vk_threads_waiting.value.ui64, + spl_vmem_threads_waiting); + extern int64_t spl_free_set_and_wait_pressure(int64_t, + boolean_t, clock_t); + extern int64_t spl_free_manual_pressure_wrapper(void); + mutex_exit(&vmp->vm_lock); + // release other waiting threads + spl_free_set_pressure(0); + int64_t target_pressure = size * + spl_vmem_threads_waiting; + int64_t delivered_pressure = + spl_free_set_and_wait_pressure(target_pressure, + TRUE, USEC2NSEC(500)); + dprintf("SPL: %s: pressure %lld targeted, %lld " + "delivered\n", __func__, target_pressure, + delivered_pressure); + mutex_enter(&vmp->vm_lock); + } + cv_wait(&vmp->vm_cv, &vmp->vm_lock); + atomic_dec_64(&spl_vmem_threads_waiting); + atomic_dec_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + } + if (vbest != NULL) { + ASSERT(vbest->vs_type == VMEM_FREE); + ASSERT(vbest->vs_knext != vbest); + /* re-position to end of buffer */ + if (vmflag & VM_ENDALLOC) { + addr += ((vbest->vs_end - (addr + size)) / align) * + align; + } + (void) vmem_seg_alloc(vmp, vbest, addr, size); + mutex_exit(&vmp->vm_lock); + if (xvaddr) { + atomic_inc_64(&vmp->vm_kstat.vk_parent_free.value.ui64); + vmp->vm_source_free(vmp->vm_source, xvaddr, xsize); + } + ASSERT(P2PHASE(addr, align) == phase); + ASSERT(!P2BOUNDARY(addr, size, nocross)); + ASSERT(addr >= (uintptr_t)minaddr); + ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); + return ((void *)addr); + } + if (0 == (vmflag & VM_NO_VBA)) { + vmp->vm_kstat.vk_fail.value.ui64++; + } + mutex_exit(&vmp->vm_lock); + if (vmflag & VM_PANIC) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "cannot satisfy mandatory allocation", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + ASSERT(xvaddr == NULL); + return (NULL); +} + +/* + * Free the segment [vaddr, vaddr + size), where vaddr was a constrained + * allocation. vmem_xalloc() and vmem_xfree() must always be paired because + * both routines bypass the quantum caches. 
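+ * In other words, memory obtained with vmem_xalloc() must be returned with
+ * vmem_xfree(), never vmem_free(), and vice versa.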
+ */ +void +vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) +{ + vmem_seg_t *vsp, *vnext, *vprev; + + mutex_enter(&vmp->vm_lock); + + vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size); + vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum); + + /* + * Attempt to coalesce with the next segment. + */ + vnext = vsp->vs_anext; + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vsp->vs_end == vnext->vs_start); + vmem_freelist_delete(vmp, vnext); + vsp->vs_end = vnext->vs_end; + vmem_seg_destroy(vmp, vnext); + } + + /* + * Attempt to coalesce with the previous segment. + */ + vprev = vsp->vs_aprev; + if (vprev->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vsp->vs_start); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vsp->vs_end; + vmem_seg_destroy(vmp, vsp); + vsp = vprev; + } + + /* + * If the entire span is free, return it to the source. + */ + if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + vaddr = (void *)vsp->vs_start; + size = VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_span_destroy(vmp, vsp); + vmp->vm_kstat.vk_parent_free.value.ui64++; + mutex_exit(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + } else { + vmem_freelist_insert(vmp, vsp); + mutex_exit(&vmp->vm_lock); + } +} + +/* + * Allocate size bytes from arena vmp. Returns the allocated address + * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, + * and may also specify best-fit, first-fit, or next-fit allocation policy + * instead of the default instant-fit policy. VM_SLEEP allocations are + * guaranteed to succeed. + */ +void * +vmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + vmem_seg_t *vsp; + uintptr_t addr; + int hb; + int flist = 0; + uint32_t mtbf; + + if (size - 1 < vmp->vm_qcache_max) + return (kmem_cache_alloc(vmp->vm_qcache[(size - 1) >> + vmp->vm_qshift], vmflag & VM_KMFLAGS)); + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + if (vmflag & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + if (vmflag & (VM_BESTFIT | VM_FIRSTFIT)) + return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0, + NULL, NULL, vmflag)); + if (vmp->vm_cflags & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + /* + * Unconstrained instant-fit allocation from the segment list. + */ + mutex_enter(&vmp->vm_lock); + + if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) { + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + if (flist-- == 0) { + mutex_exit(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, vmflag)); + } + + ASSERT(size <= (1UL << flist)); + vsp = vmp->vm_freelist[flist].vs_knext; + addr = vsp->vs_start; + if (vmflag & VM_ENDALLOC) { + addr += vsp->vs_end - (addr + size); + } + (void) vmem_seg_alloc(vmp, vsp, addr, size); + mutex_exit(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Free the segment [vaddr, vaddr + size). + */ +void +vmem_free(vmem_t *vmp, void *vaddr, size_t size) +{ + if (size - 1 < vmp->vm_qcache_max) + kmem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], + vaddr); + else + vmem_xfree(vmp, vaddr, size); +} + +/* + * Determine whether arena vmp contains the segment [vaddr, vaddr + size). 
+ */ +int +vmem_contains(vmem_t *vmp, void *vaddr, size_t size) +{ + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + + mutex_enter(&vmp->vm_lock); + vmp->vm_kstat.vk_contains.value.ui64++; + for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) { + vmp->vm_kstat.vk_contains_search.value.ui64++; + ASSERT(vsp->vs_type == VMEM_SPAN); + if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1) + break; + } + mutex_exit(&vmp->vm_lock); + return (vsp != seg0); +} + +/* + * Add the span [vaddr, vaddr + size) to arena vmp. + */ +void * +vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag) +{ + if (vaddr == NULL || size == 0) + panic("vmem_add(%p, %p, %lu): bad arguments", + (void *)vmp, vaddr, size); + + ASSERT(!vmem_contains(vmp, vaddr, size)); + + mutex_enter(&vmp->vm_lock); + if (vmem_populate(vmp, vmflag)) + (void) vmem_span_create(vmp, vaddr, size, 0); + else + vaddr = NULL; + mutex_exit(&vmp->vm_lock); + return (vaddr); +} + +/* + * Walk the vmp arena, applying func to each segment matching typemask. + * If VMEM_REENTRANT is specified, the arena lock is dropped across each + * call to func(); otherwise, it is held for the duration of vmem_walk() + * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks + * are *not* necessarily consistent, so they may only be used when a hint + * is adequate. + */ +void +vmem_walk(vmem_t *vmp, int typemask, + void (*func)(void *, void *, size_t), void *arg) +{ + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t walker; + + if (typemask & VMEM_WALKER) + return; + + bzero(&walker, sizeof (walker)); + walker.vs_type = VMEM_WALKER; + + mutex_enter(&vmp->vm_lock); + VMEM_INSERT(seg0, &walker, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) { + if (vsp->vs_type & typemask) { + void *start = (void *)vsp->vs_start; + size_t size = VS_SIZE(vsp); + if (typemask & VMEM_REENTRANT) { + vmem_advance(vmp, &walker, vsp); + mutex_exit(&vmp->vm_lock); + func(arg, start, size); + mutex_enter(&vmp->vm_lock); + vsp = &walker; + } else { + func(arg, start, size); + } + } + } + vmem_advance(vmp, &walker, NULL); + mutex_exit(&vmp->vm_lock); +} + +/* + * Return the total amount of memory whose type matches typemask. Thus: + * + * typemask VMEM_ALLOC yields total memory allocated (in use). + * typemask VMEM_FREE yields total memory free (available). + * typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size. 
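+ *
+ * vmem_size_locked() and vmem_size_semi_atomic() below report the same
+ * quantities while holding the arena lock or snapshotting the kstats.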
+ */ +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + int64_t size = 0; + + if (typemask & VMEM_ALLOC) + size += (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + if (typemask & VMEM_FREE) + size += (int64_t)vmp->vm_kstat.vk_mem_total.value.ui64 - + (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + if (size < 0) + size = 0; + + return ((size_t)size); +} + +size_t +vmem_size_locked(vmem_t *vmp, int typemask) +{ + boolean_t m = (mutex_owner(&vmp->vm_lock) == curthread); + + if (!m) + mutex_enter(&vmp->vm_lock); + size_t s = vmem_size(vmp, typemask); + if (!m) + mutex_exit(&vmp->vm_lock); + return (s); +} + +size_t +vmem_size_semi_atomic(vmem_t *vmp, int typemask) +{ + int64_t size = 0; + uint64_t inuse = 0; + uint64_t total = 0; + + __sync_swap(&total, vmp->vm_kstat.vk_mem_total.value.ui64); + __sync_swap(&inuse, vmp->vm_kstat.vk_mem_inuse.value.ui64); + + int64_t inuse_signed = (int64_t)inuse; + int64_t total_signed = (int64_t)total; + + if (typemask & VMEM_ALLOC) + size += inuse_signed; + if (typemask & VMEM_FREE) + size += total_signed - inuse_signed; + + if (size < 0) + size = 0; + + return ((size_t)size); +} + +size_t +spl_vmem_size(vmem_t *vmp, int typemask) +{ + return (vmem_size_locked(vmp, typemask)); +} + +/* + * Create an arena called name whose initial span is [base, base + size). + * The arena's natural unit of currency is quantum, so vmem_alloc() + * guarantees quantum-aligned results. The arena may import new spans + * by invoking afunc() on source, and may return those spans by invoking + * ffunc() on source. To make small allocations fast and scalable, + * the arena offers high-performance caching for each integer multiple + * of quantum up to qcache_max. + */ +static vmem_t * +vmem_create_common(const char *name, void *base, size_t size, size_t quantum, + void *(*afunc)(vmem_t *, size_t, int), + void (*ffunc)(vmem_t *, void *, size_t), + vmem_t *source, size_t qcache_max, int vmflag) +{ + int i; + size_t nqcache; + vmem_t *vmp, *cur, **vmpp; + vmem_seg_t *vsp; + vmem_freelist_t *vfp; + uint32_t id = atomic_inc_32_nv(&vmem_id); + + if (vmem_vmem_arena != NULL) { + vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t), + vmflag & VM_KMFLAGS); + } else { + ASSERT(id <= VMEM_INITIAL); + vmp = &vmem0[id - 1]; + } + + /* An identifier arena must inherit from another identifier arena */ + ASSERT(source == NULL || ((source->vm_cflags & VMC_IDENTIFIER) == + (vmflag & VMC_IDENTIFIER))); + + if (vmp == NULL) + return (NULL); + bzero(vmp, sizeof (vmem_t)); + + (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name); + mutex_init(&vmp->vm_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmp->vm_cv, NULL, CV_DEFAULT, NULL); + vmp->vm_cflags = vmflag; + vmflag &= VM_KMFLAGS; + + hrtime_t hrnow = gethrtime(); + + vmp->vm_createtime = hrnow; + + vmp->vm_quantum = quantum; + vmp->vm_qshift = highbit(quantum) - 1; + nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX); + + for (i = 0; i <= VMEM_FREELISTS; i++) { + vfp = &vmp->vm_freelist[i]; + vfp->vs_end = 1UL << i; + vfp->vs_knext = (vmem_seg_t *)(vfp + 1); + vfp->vs_kprev = (vmem_seg_t *)(vfp - 1); + } + + vmp->vm_freelist[0].vs_kprev = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0; + vmp->vm_hash_table = vmp->vm_hash0; + vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1; + vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); + + vsp = &vmp->vm_seg0; + vsp->vs_anext = vsp; + vsp->vs_aprev = vsp; + vsp->vs_knext = vsp; + vsp->vs_kprev = vsp; + vsp->vs_type = VMEM_SPAN; + 
vsp->vs_span_createtime = hrnow; + + vsp = &vmp->vm_rotor; + vsp->vs_type = VMEM_ROTOR; + VMEM_INSERT(&vmp->vm_seg0, vsp, a); + + bcopy(&vmem_kstat_template, &vmp->vm_kstat, sizeof (vmem_kstat_t)); + + vmp->vm_id = id; + if (source != NULL) + vmp->vm_kstat.vk_source_id.value.ui32 = source->vm_id; + vmp->vm_source = source; + vmp->vm_source_alloc = afunc; + vmp->vm_source_free = ffunc; + + /* + * Some arenas (like vmem_metadata and kmem_metadata) cannot + * use quantum caching to lower fragmentation. Instead, we + * increase their imports, giving a similar effect. + */ + if (vmp->vm_cflags & VMC_NO_QCACHE) { + if (qcache_max > VMEM_NQCACHE_MAX && ISP2(qcache_max)) { + vmp->vm_min_import = qcache_max; + } else { + vmp->vm_min_import = + VMEM_QCACHE_SLABSIZE(nqcache << vmp->vm_qshift); + } + nqcache = 0; + } + + if (nqcache != 0) { + ASSERT(!(vmflag & VM_NOSLEEP)); + vmp->vm_qcache_max = nqcache << vmp->vm_qshift; + for (i = 0; i < nqcache; i++) { + char buf[VMEM_NAMELEN + 21]; + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%lu", + vmp->vm_name, (i + 1) * quantum); + vmp->vm_qcache[i] = kmem_cache_create(buf, + (i + 1) * quantum, quantum, NULL, NULL, NULL, + NULL, vmp, KMC_QCACHE | KMC_NOTOUCH); + } + } + + if ((vmp->vm_ksp = kstat_create("vmem", vmp->vm_id, vmp->vm_name, + "vmem", KSTAT_TYPE_NAMED, sizeof (vmem_kstat_t) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { + vmp->vm_ksp->ks_data = &vmp->vm_kstat; + kstat_install(vmp->vm_ksp); + } + + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != NULL) + vmpp = &cur->vm_next; + *vmpp = vmp; + mutex_exit(&vmem_list_lock); + + if (vmp->vm_cflags & VMC_POPULATOR) { + ASSERT(vmem_populators < VMEM_INITIAL); + vmem_populator[atomic_inc_32_nv(&vmem_populators) - 1] = vmp; + mutex_enter(&vmp->vm_lock); + (void) vmem_populate(vmp, vmflag | VM_PANIC); + mutex_exit(&vmp->vm_lock); + } + + if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) { + vmem_destroy(vmp); + return (NULL); + } + + return (vmp); +} + +vmem_t * +vmem_xcreate(const char *name, void *base, size_t size, size_t quantum, + vmem_ximport_t *afunc, vmem_free_t *ffunc, vmem_t *source, + size_t qcache_max, int vmflag) +{ + ASSERT(!(vmflag & (VMC_POPULATOR | VMC_XALLOC))); + vmflag &= ~(VMC_POPULATOR | VMC_XALLOC); + + return (vmem_create_common(name, base, size, quantum, + (vmem_alloc_t *)afunc, ffunc, source, qcache_max, + vmflag | VMC_XALLOC)); +} + +vmem_t * +vmem_create(const char *name, void *base, size_t size, size_t quantum, + vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source, + size_t qcache_max, int vmflag) +{ + ASSERT(!(vmflag & (VMC_XALLOC | VMC_XALIGN))); + vmflag &= ~(VMC_XALLOC | VMC_XALIGN); + + return (vmem_create_common(name, base, size, quantum, + afunc, ffunc, source, qcache_max, vmflag)); +} + +/* + * Destroy arena vmp. + */ +void +vmem_destroy(vmem_t *vmp) +{ + vmem_t *cur, **vmpp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t *vsp, *anext; + size_t leaked; + + /* + * set vm_nsegfree to zero because vmem_free_span_list + * would have already freed vm_segfree. + */ + vmp->vm_nsegfree = 0; + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != vmp) + vmpp = &cur->vm_next; + *vmpp = vmp->vm_next; + mutex_exit(&vmem_list_lock); + + leaked = vmem_size(vmp, VMEM_ALLOC); + if (leaked != 0) + printf("SPL: vmem_destroy('%s'): leaked %lu %s\n", + vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? 
+ "identifiers" : "bytes");
+
+ if (vmp->vm_hash_table != vmp->vm_hash0)
+ vmem_free(vmem_hash_arena, vmp->vm_hash_table,
+ (vmp->vm_hash_mask + 1) * sizeof (void *));
+
+ /*
+ * Give back the segment structures for anything that's left in the
+ * arena, e.g. the primary spans and their free segments.
+ */
+ VMEM_DELETE(&vmp->vm_rotor, a);
+ for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) {
+ anext = vsp->vs_anext;
+ vmem_putseg_global(vsp);
+ }
+
+ while (vmp->vm_nsegfree > 0)
+ vmem_putseg_global(vmem_getseg(vmp));
+
+ kstat_delete(vmp->vm_ksp);
+
+ mutex_destroy(&vmp->vm_lock);
+ cv_destroy(&vmp->vm_cv);
+ vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
+}
+
+
+/*
+ * Destroy arena vmp.
+ */
+void
+vmem_destroy_internal(vmem_t *vmp)
+{
+ vmem_t *cur, **vmpp;
+ vmem_seg_t *seg0 = &vmp->vm_seg0;
+ vmem_seg_t *vsp, *anext;
+ size_t leaked;
+
+ mutex_enter(&vmem_list_lock);
+ vmpp = &vmem_list;
+ while ((cur = *vmpp) != vmp)
+ vmpp = &cur->vm_next;
+ *vmpp = vmp->vm_next;
+ mutex_exit(&vmem_list_lock);
+
+ leaked = vmem_size(vmp, VMEM_ALLOC);
+ if (leaked != 0)
+ printf("SPL: vmem_destroy('%s'): leaked %lu %s\n",
+ vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
+ "identifiers" : "bytes");
+
+ if (vmp->vm_hash_table != vmp->vm_hash0)
+ if (vmem_hash_arena != NULL)
+ vmem_free(vmem_hash_arena, vmp->vm_hash_table,
+ (vmp->vm_hash_mask + 1) * sizeof (void *));
+
+ /*
+ * Give back the segment structures for anything that's left in the
+ * arena, e.g. the primary spans and their free segments.
+ */
+ VMEM_DELETE(&vmp->vm_rotor, a);
+ for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) {
+ anext = vsp->vs_anext;
+ vmem_putseg_global(vsp);
+ }
+
+ while (vmp->vm_nsegfree > 0)
+ vmem_putseg_global(vmem_getseg(vmp));
+
+ if (!(vmp->vm_cflags & VMC_IDENTIFIER) &&
+ vmem_size(vmp, VMEM_ALLOC) != 0)
+ printf("SPL: vmem_destroy('%s'): STILL %lu bytes at "
+ "kstat_delete() time\n",
+ vmp->vm_name, vmem_size(vmp, VMEM_ALLOC));
+
+ kstat_delete(vmp->vm_ksp);
+
+ mutex_destroy(&vmp->vm_lock);
+ cv_destroy(&vmp->vm_cv);
+
+ // Alas, to free, requires access to "vmem_vmem_arena" the very thing
+ // we release first.
+ // vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
+}
+
+/*
+ * Only shrink vmem hashtable if it is 1<<vmem_rescale_minshift times larger
+ * than necessary.
+ */
+int vmem_rescale_minshift = 3;
+
+/*
+ * Rescale the hash table to keep the hash chains short.
+ */
+static void
+vmem_hash_rescale(vmem_t *vmp)
+{
+ vmem_seg_t **old_table, **new_table, *vsp;
+ size_t old_size, new_size, h, nseg;
+
+ nseg = (size_t)(vmp->vm_kstat.vk_alloc.value.ui64 -
+ vmp->vm_kstat.vk_free.value.ui64);
+
+ new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
+ old_size = vmp->vm_hash_mask + 1;
+
+ if ((old_size >> vmem_rescale_minshift) <= new_size &&
+ new_size <= (old_size << 1))
+ return;
+
+ new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
+ VM_NOSLEEP);
+ if (new_table == NULL)
+ return;
+ bzero(new_table, new_size * sizeof (void *));
+
+ mutex_enter(&vmp->vm_lock);
+
+ old_size = vmp->vm_hash_mask + 1;
+ old_table = vmp->vm_hash_table;
+
+ vmp->vm_hash_mask = new_size - 1;
+ vmp->vm_hash_table = new_table;
+ vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
+
+ for (h = 0; h < old_size; h++) {
+ vsp = old_table[h];
+ while (vsp != NULL) {
+ uintptr_t addr = vsp->vs_start;
+ vmem_seg_t *next_vsp = vsp->vs_knext;
+ vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
+ vsp->vs_knext = *hash_bucket;
+ *hash_bucket = vsp;
+ vsp = next_vsp;
+ }
+ }
+
+ mutex_exit(&vmp->vm_lock);
+
+ if (old_table != vmp->vm_hash0)
+ vmem_free(vmem_hash_arena, old_table,
+ old_size * sizeof (void *));
+}
+
+/*
+ * Perform periodic maintenance on all vmem arenas.
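+ * (Re-arms itself through bsd_timeout() every vmem_update_interval,
+ * i.e. every 15 seconds.)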
+ */ + +void +vmem_update(void *dummy) +{ + vmem_t *vmp; + + mutex_enter(&vmem_list_lock); + for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) { + /* + * If threads are waiting for resources, wake them up + * periodically so they can issue another kmem_reap() + * to reclaim resources cached by the slab allocator. + */ + cv_broadcast(&vmp->vm_cv); + + /* + * Rescale the hash table to keep the hash chains short. + */ + vmem_hash_rescale(vmp); + } + mutex_exit(&vmem_list_lock); + + (void) bsd_timeout(vmem_update, dummy, &vmem_update_interval); +} + +void +vmem_qcache_reap(vmem_t *vmp) +{ + int i; + + /* + * Reap any quantum caches that may be part of this vmem. + */ + for (i = 0; i < VMEM_NQCACHE_MAX; i++) + if (vmp->vm_qcache[i]) + kmem_cache_reap_now(vmp->vm_qcache[i]); +} + +/* given a size, return the appropriate vmem_bucket_arena[] entry */ + +static inline uint16_t +vmem_bucket_number(size_t size) +{ + // For VMEM_BUCKET_HIBIT == 12, + // vmem_bucket_arena[n] holds allocations from 2^[n+11]+1 to 2^[n+12], + // so for [n] = 0, 2049-4096, for [n]=5 65537-131072, + // for [n]=7 (256k+1)-512k + // set hb: 512k == 19, 256k+1 == 19, 256k == 18, ... + const int hb = highbit(size-1); + + int bucket = hb - VMEM_BUCKET_LOWBIT; + + // very large allocations go into the 16 MiB bucket + if (hb > VMEM_BUCKET_HIBIT) + bucket = VMEM_BUCKET_HIBIT - VMEM_BUCKET_LOWBIT; + + // very small allocations go into the 4 kiB bucket + if (bucket < 0) + bucket = 0; + + return (bucket); +} + +static inline vmem_t * +vmem_bucket_arena_by_size(size_t size) +{ + uint16_t bucket = vmem_bucket_number(size); + + return (vmem_bucket_arena[bucket]); +} + +vmem_t * +spl_vmem_bucket_arena_by_size(size_t size) +{ + return (vmem_bucket_arena_by_size(size)); +} + +static inline void +vmem_bucket_wake_all_waiters(void) +{ + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + const int bucket = i - VMEM_BUCKET_LOWBIT; + vmem_t *bvmp = vmem_bucket_arena[bucket]; + cv_broadcast(&bvmp->vm_cv); + } + cv_broadcast(&spl_heap_arena->vm_cv); +} + +/* + * xnu_alloc_throttled_bail() : spin looking for memory + * + */ + +static inline void * +xnu_alloc_throttled_bail(uint64_t now_ticks, vmem_t *calling_vmp, + size_t size, int vmflags) +{ + // spin looking for memory + const uint64_t bigtarget = MAX(size, 16ULL*1024ULL*1024ULL); + static volatile _Atomic bool alloc_lock = false; + static volatile _Atomic uint64_t force_time = 0; + + uint64_t timeout_ticks = hz / 2; + if (vmflags & VM_PUSHPAGE) + timeout_ticks = hz / 4; + + uint64_t timeout_time = now_ticks + timeout_ticks; + + for (uint32_t suspends = 0, blocked_suspends = 0, + try_no_pressure = 0; /* empty */; /* empty */) { + if (force_time + timeout_ticks > timeout_time) { + // another thread has forced an allocation + // by timing out. push our deadline into the future. + timeout_time = force_time + timeout_ticks; + } + if (alloc_lock) { + blocked_suspends++; + IOSleep(1); + } else if (spl_vmem_xnu_useful_bytes_free() >= bigtarget) { + bool f = false; + // if alloc_lock == f then alloc_lock = true and result + // is true otherwise result is false and f = true + if (!__c11_atomic_compare_exchange_strong(&alloc_lock, + &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + /* + * avoid (highly unlikely) data race on + * alloc_lock. if alloc_lock has become true + * while we were in the else if expression + * then we effectively optimize away the + * (relaxed) load of alloc_lock (== true) + * into f and continue. 
+ */ + continue; + } + // alloc_lock is now visible as true to all threads + try_no_pressure++; + void *m = spl_vmem_malloc_if_no_pressure(size); + if (m != NULL) { + uint64_t ticks = zfs_lbolt() - now_ticks; + dprintf("SPL: %s returning %llu bytes after " + "%llu ticks (hz=%u, seconds = %llu), " + "%u suspends, %u blocked, %u tries (%s)\n", + __func__, (uint64_t)size, + ticks, hz, ticks/hz, suspends, + blocked_suspends, try_no_pressure, + calling_vmp->vm_name); + // atomic seq cst, so is published to all + // threads + alloc_lock = false; + return (m); + } else { + alloc_lock = false; + spl_free_set_emergency_pressure(bigtarget); + suspends++; + IOSleep(1); + } + } else if (zfs_lbolt() > timeout_time) { + bool f = false; + if (!__c11_atomic_compare_exchange_strong(&alloc_lock, + &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + // avoid (highly unlikely) data race on + // alloc_lock as above + continue; + } + void *mp = spl_vmem_malloc_unconditionally(size); + uint64_t now = zfs_lbolt(); + uint64_t ticks = now - now_ticks; + force_time = now; + dprintf("SPL: %s TIMEOUT %llu bytes after " + "%llu ticks (hz=%u, seconds=%llu), " + "%u suspends, %u blocked, %u tries (%s)\n", + __func__, (uint64_t)size, + ticks, hz, ticks/hz, suspends, + blocked_suspends, try_no_pressure, + calling_vmp->vm_name); + alloc_lock = false; + atomic_inc_64(&spl_xat_forced); + return (mp); + } else { + spl_free_set_emergency_pressure(bigtarget); + suspends++; + IOSleep(1); + } + } +} + +static void * +xnu_alloc_throttled(vmem_t *bvmp, size_t size, int vmflag) +{ + // the caller is one of the bucket arenas. + // null_vmp will be spl_default_arena_parent, which is + // just a placeholder. + + uint64_t now = zfs_lbolt(); + const uint64_t entry_now = now; + + void *m = spl_vmem_malloc_if_no_pressure(size); + + if (m != NULL) { + atomic_inc_64(&spl_xat_success); + spl_xat_lastalloc = gethrtime(); + // wake up waiters on all the arena condvars + // since there is apparently no memory shortage. + vmem_bucket_wake_all_waiters(); + return (m); + } else { + spl_free_set_emergency_pressure((int64_t)size); + } + + if (vmflag & VM_PANIC) { + // force an allocation now to avoid a panic + spl_xat_lastalloc = gethrtime(); + spl_free_set_emergency_pressure(4LL * (int64_t)size); + void *p = spl_vmem_malloc_unconditionally(size); + // p cannot be NULL (unconditional kernel malloc always works + // or panics) + // therefore: success, wake all waiters on alloc|free condvar + // wake up arena waiters to let them know there is memory + // available in the arena; let waiters on other bucket arenas + // continue sleeping. + cv_broadcast(&bvmp->vm_cv); + return (p); + } + + if (vmflag & VM_NOSLEEP) { + spl_free_set_emergency_pressure(MAX(2LL * (int64_t)size, + 16LL*1024LL*1024LL)); + /* cheating a bit, but not really waiting */ + kpreempt(KPREEMPT_SYNC); + void *p = spl_vmem_malloc_if_no_pressure(size); + if (p != NULL) { + atomic_inc_64(&spl_xat_late_success_nosleep); + cv_broadcast(&bvmp->vm_cv); + spl_xat_lastalloc = gethrtime(); + } + // if p == NULL, then there will be an increment in + // the fail kstat + return (p); + } + + /* + * Loop for a while trying to satisfy VM_SLEEP allocations. + * + * If we are able to allocate memory, then return the pointer. + * + * We return NULL if some other thread's activity has caused + * sufficient memory to appear in this arena that we can satisfy + * the allocation. 
+ * + * We call xnu_alloc_throttle_bail() after a few milliseconds of + * waiting; it will either return a pointer to newly allocated + * memory or NULL. We return the result. + * + */ + + const uint32_t bucket_number = + vmem_bucket_id_to_bucket_number[bvmp->vm_id]; + static volatile _Atomic uint32_t waiters = 0; + + waiters++; + + if (waiters == 1UL) + atomic_inc_64(&spl_xat_no_waiters); + + static _Atomic uint32_t max_waiters_seen = 0; + + if (waiters > max_waiters_seen) { + max_waiters_seen = waiters; + dprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, + max_waiters_seen); + } + + boolean_t local_xat_pressured = false; + + for (; /* empty */; /* empty */) { + clock_t wait_time = USEC2NSEC(500UL * MAX(waiters, 1UL)); + mutex_enter(&bvmp->vm_lock); + spl_xat_sleep++; + if (local_xat_pressured) { + spl_xat_pressured++; + local_xat_pressured = false; + } + (void) cv_timedwait_hires(&bvmp->vm_cv, &bvmp->vm_lock, + wait_time, 0, 0); + mutex_exit(&bvmp->vm_lock); + now = zfs_lbolt(); + // We may be here because of a broadcast to &vmp->vm_cv, + // causing xnu to schedule all the sleepers in priority-weighted + // FIFO order. Because of the mutex_exit(), the sections below + // here may be entered concurrently. + // spl_vmem_malloc_if_no_pressure does a mutex, so avoid calling + // it unless there is a chance it will succeed. + if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, + 16ULL*1024ULL*1024ULL))) { + void *a = spl_vmem_malloc_if_no_pressure(size); + if (a != NULL) { + atomic_inc_64(&spl_xat_late_success); + spl_xat_lastalloc = gethrtime(); + waiters--; + // Wake up all waiters on the bucket arena + // locks, since the system apparently has + // memory again. + vmem_bucket_wake_all_waiters(); + return (a); + } else { + // Probably vm_page_free_count changed while + // we were in the mutex queue in + // spl_vmem_malloc_if_no_pressure(). There is + // therefore no point in doing the bail-out + // check below, so go back to the top of the + // for loop. + atomic_inc_64(&spl_xat_late_deny); + continue; + } + } + if (now > entry_now + hz / 4 || + spl_vba_threads[bucket_number] > 1UL) { + // If there are other threads waiting for us + // in vba() then when we satisfy this allocation, + // we satisfy more than one thread, so invoke XATB(). + // Otherwise, if we have had no luck for 250 ms, then + // switch to XATB() which is much more aggressive. + if (spl_vba_threads[bucket_number] > 1UL) + atomic_inc_64(&spl_xat_bailed_contended); + atomic_inc_64(&spl_xat_bailed); + static _Atomic uint32_t bailing_threads = 0; + static _Atomic uint32_t max_bailers_seen = 0; + bailing_threads++; + if (bailing_threads > max_bailers_seen) { + max_bailers_seen = bailing_threads; + dprintf("SPL: %s: max_bailers_seen increased " + "to %u\n", __func__, max_bailers_seen); + } + void *b = + xnu_alloc_throttled_bail(now, bvmp, size, vmflag); + bailing_threads--; + spl_xat_lastalloc = gethrtime(); + // wake up waiters on the arena lock, + // since they now have memory they can use. 
+ cv_broadcast(&bvmp->vm_cv); + // open turnstile after having bailed, rather + // than before + waiters--; + return (b); + } else if (now - entry_now > 0 && + ((now - entry_now) % (hz/10))) { + spl_free_set_emergency_pressure(MAX(size, + 16LL*1024LL*1024LL)); + local_xat_pressured = true; + } + } +} + +static void +xnu_free_throttled(vmem_t *vmp, void *vaddr, size_t size) +{ + extern void osif_free(void *, uint64_t); + + // Serialize behind a (short) spin-sleep delay, giving + // xnu time to do freelist management and + // PT teardowns + + // In the usual case there is only one thread in this function, + // so we proceed waitlessly to osif_free(). + + // When there are multiple threads here, we delay the 2nd and later. + + // Explict race: + // The osif_free() is not protected by the vmem_xnu_alloc_lock + // mutex; that is just used for implementing the delay. Consequently, + // the waiters on the same lock in spl_vmem_malloc_if_no_pressure may + // falsely see too small a value for vm_page_free_count. We don't + // care in part because xnu performs poorly when doing + // free-then-allocate anwyay. + + // a_waiters gauges the loop exit checking and sleep duration; + // it is a count of the number of threads trying to do work + // in this function. + static volatile _Atomic uint32_t a_waiters = 0; + + // is_freeing protects the osif_free() call; see comment below + static volatile _Atomic bool is_freeing = false; + + a_waiters++; // generates "lock incl ..." + + static _Atomic uint32_t max_waiters_seen = 0; + + if (a_waiters > max_waiters_seen) { + max_waiters_seen = a_waiters; + dprintf("SPL: %s: max_waiters_seen increased to %u\n", + __func__, max_waiters_seen); + } + + for (uint32_t iter = 0; a_waiters > 1UL; iter++) { + // there is more than one thread here, so suspend and + // sleep for 1 ms + atomic_inc_64(&spl_xft_wait); + IOSleep(1); + // If are growing old in this loop, then see if + // anyone else is still in osif_free. If not, + // we can exit. + if (iter >= a_waiters) { + // if is_freeing == f, then set is_freeing to true with + // release semantics (i.e. "push" it to other cores) + // then break; otherwise, set f to true relaxedly (i.e., + // optimize it out) + bool f = false; + if (__c11_atomic_compare_exchange_weak(&is_freeing, + &f, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { + break; + } + } + } + // If there is more than one thread in this function, osif_free() is + // protected by is_freeing. Release it after the osif_free() + // call has been made and the lastfree bookkeeping has been done. + osif_free(vaddr, size); + spl_xat_lastfree = gethrtime(); + is_freeing = false; + a_waiters--; + kpreempt(KPREEMPT_SYNC); + // since we just gave back xnu enough to satisfy an allocation + // in at least the smaller buckets, let's wake up anyone in + // the cv_wait() in vmem_xalloc([bucket_#], ...) + vmem_bucket_wake_all_waiters(); +} + +// return 0 if the bit was unset before the atomic OR. +static inline bool +vba_atomic_lock_bucket(volatile _Atomic uint16_t *bbap, uint16_t bucket_bit) +{ + + // We use a test-and-set of the appropriate bit + // in buckets_busy_allocating; if it was not set, + // then break out of the loop. + // + // This compiles into an orl, cmpxchgw instruction pair. + // the return from __c11_atomic_fetch_or() is the + // previous value of buckets_busy_allocating. 
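+	/*
+	 * Illustrative aside (a sketch of the standard C11 semantics relied
+	 * on below, not additional logic): __c11_atomic_fetch_or() returns
+	 * the value the word held *before* the OR, so a single atomic RMW
+	 * both sets the bit and reports whether we were the ones to set it.
+	 * For example, with bucket_bit == 0x0004:
+	 *
+	 *   word was 0x0001: prev == 0x0001, word becomes 0x0005,
+	 *       (prev & bucket_bit) == 0  -> we acquired the bit lock
+	 *   word was 0x0005: prev == 0x0005, word is unchanged,
+	 *       (prev & bucket_bit) != 0  -> another thread already holds it
+	 */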
+ + uint16_t prev = + __c11_atomic_fetch_or(bbap, bucket_bit, __ATOMIC_SEQ_CST); + if (prev & bucket_bit) + return (false); // we did not acquire the bit lock here + else + return (true); // we turned the bit from 0 to 1 +} + +static void * +vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) +{ + + if (vmflags & VM_NO_VBA) + return (NULL); + + // caller is spl_heap_arena looking for memory. + // null_vmp will be spl_default_arena_parent, and so + // is just a placeholder. + + vmem_t *calling_arena = spl_heap_arena; + + static volatile _Atomic uint32_t hipriority_allocators = 0; + boolean_t local_hipriority_allocator = false; + + if (0 != (vmflags & (VM_PUSHPAGE | VM_NOSLEEP | VM_PANIC | VM_ABORT))) { + local_hipriority_allocator = true; + hipriority_allocators++; + } + + if (!ISP2(size)) + atomic_inc_64(&spl_bucket_non_pow2_allocs); + + vmem_t *bvmp = vmem_bucket_arena_by_size(size); + + // there are 13 buckets, so use a 16-bit scalar to hold + // a set of bits, where each bit corresponds to an in-progress + // vmem_alloc(bucket, ...) below. + + static volatile _Atomic uint16_t buckets_busy_allocating = 0; + const uint16_t bucket_number = vmem_bucket_number(size); + const uint16_t bucket_bit = (uint16_t)1 << bucket_number; + + spl_vba_threads[bucket_number]++; + + static volatile _Atomic uint32_t waiters = 0; + + // First, if we are VM_SLEEP, check for memory, try some pressure, + // and if that doesn't work, force entry into the loop below. + + bool loop_once = false; + + if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && + ! vmem_canalloc_atomic(bvmp, size)) { + if (spl_vmem_xnu_useful_bytes_free() < (MAX(size, + 16ULL*1024ULL*1024ULL))) { + spl_free_set_emergency_pressure(size); + IOSleep(1); + if (!vmem_canalloc_atomic(bvmp, size) && + (spl_vmem_xnu_useful_bytes_free() < (MAX(size, + 16ULL*1024ULL*1024ULL)))) { + loop_once = true; + } + } + } + + // spin-sleep: if we would need to go to the xnu allocator. + // + // We want to avoid a burst of allocs from bucket_heap's children + // successively hitting a low-memory condition, or alternatively + // each successfully importing memory from xnu when they can share + // a single import. + // + // We also want to take advantage of any memory that becomes available + // in bucket_heap. + // + // If there is more than one thread in this function (~ few percent) + // then the subsequent threads are put into the loop below. They + // can escape the loop if they are [1]non-waiting allocations, or + // [2]if they become the only waiting thread, or + // [3]if the cv_timedwait_hires returns -1 (which represents EWOULDBLOCK + // from msleep() which gets it from _sleep()'s THREAD_TIMED_OUT) + // allocating in the bucket, or [4]if this thread has (rare condition) + // spent a quarter of a second in the loop. 
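+	/*
+	 * Illustrative walkthrough of the gate below (no new logic): the
+	 * first thread to arrive leaves waiters at 1, so the loop condition
+	 * (waiters > 1) is false and it falls straight through to
+	 * vmem_alloc(bvmp, ...).  A second thread arriving before the first
+	 * has finished raises waiters to 2 and stays in the loop, leaving
+	 * only through one of the four escape routes listed above, or
+	 * through the canalloc fast path once memory shows up in the
+	 * bucket.  loop_once forces even a lone thread through one pass
+	 * when the checks above found neither bucket memory nor xnu memory.
+	 */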
+ + if (waiters++ > 1 || loop_once) { + atomic_inc_64(&spl_vba_loop_entries); + } + + static _Atomic uint32_t max_waiters_seen = 0; + + if (waiters > max_waiters_seen) { + max_waiters_seen = waiters; + dprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, + max_waiters_seen); + } + + // local counters, to be added atomically to global kstat variables + uint64_t local_memory_blocked = 0, local_cv_timeout = 0; + uint64_t local_loop_timeout = 0; + uint64_t local_cv_timeout_blocked = 0, local_loop_timeout_blocked = 0; + uint64_t local_sleep = 0, local_hipriority_blocked = 0; + + const uint64_t loop_ticks = 25; // a tick is 10 msec, so 250 msec + const uint64_t hiprio_loop_ticks = 4; // 40 msec + + for (uint64_t entry_time = zfs_lbolt(), + loop_timeout = entry_time + loop_ticks, + hiprio_timeout = entry_time + hiprio_loop_ticks, timedout = 0; + waiters > 1UL || loop_once; /* empty */) { + loop_once = false; + // non-waiting allocations should proceeed to vmem_alloc() + // immediately + if (vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { + break; + } + if (vmem_canalloc_atomic(bvmp, size)) { + // We can probably vmem_alloc(bvmp, size, vmflags). + // At worst case it will give us a NULL and we will + // end up on the vmp's cv_wait. + // + // We can have threads with different bvmp + // taking this exit, and will proceed concurrently. + // + // However, we should protect against a burst of + // callers hitting the same bvmp before the allocation + // results are reflected in + // vmem_canalloc_atomic(bvmp, ...) + if (local_hipriority_allocator == false && + hipriority_allocators > 0) { + // more high priority allocations are wanted, + // so this thread stays here + local_hipriority_blocked++; + } else if (vba_atomic_lock_bucket( + &buckets_busy_allocating, bucket_bit)) { + // we are not being blocked by another allocator + // to the same bucket, or any higher priority + // allocator + atomic_inc_64(&spl_vba_parent_memory_appeared); + break; + // The vmem_alloc() should return extremely + // quickly from an INSTANTFIT allocation that + // canalloc predicts will succeed. + } else { + // another thread is trying to use the free + // memory in the bucket_## arena; there might + // still be free memory there after its + // allocation is completed, and there might be + // excess in the bucket_heap arena, so stick + // around in this loop. + local_memory_blocked++; + cv_broadcast(&bvmp->vm_cv); + } + } + if (timedout > 0) { + if (local_hipriority_allocator == false && + hipriority_allocators > 0) { + local_hipriority_blocked++; + } else if (vba_atomic_lock_bucket( + &buckets_busy_allocating, bucket_bit)) { + if (timedout & 1) + local_cv_timeout++; + if (timedout & 6 || zfs_lbolt() >= loop_timeout) + local_loop_timeout++; + break; + } else { + if (timedout & 1) { + local_cv_timeout_blocked++; + } + if (timedout & 6) { + local_loop_timeout_blocked++; + } else if (zfs_lbolt() > loop_timeout) { + timedout |= 2; + } + // flush the current thread in xat() out of + // xat()'s for() loop and into xat_bail() + cv_broadcast(&bvmp->vm_cv); + } + } + // The bucket is already allocating, or the bucket needs + // more memory to satisfy vmem_allocat(bvmp, size, VM_NOSLEEP), + // or we want to give the bucket some time to acquire more + // memory. 
+ // substitute for the vmp arena's cv_wait in vmem_xalloc() + // (vmp is the bucket_heap AKA spl_heap_arena) + mutex_enter(&calling_arena->vm_lock); + local_sleep++; + if (local_sleep >= 1000ULL) { + atomic_add_64(&spl_vba_sleep, local_sleep - 1ULL); + local_sleep = 1ULL; + atomic_add_64(&spl_vba_cv_timeout_blocked, + local_cv_timeout_blocked); + local_cv_timeout_blocked = 0; + atomic_add_64(&spl_vba_loop_timeout_blocked, + local_loop_timeout_blocked); + local_loop_timeout_blocked = 0; + atomic_add_64(&spl_vba_hiprio_blocked, + local_hipriority_blocked); + local_hipriority_blocked = 0; + if (local_memory_blocked > 1ULL) { + atomic_add_64(&spl_vba_parent_memory_blocked, + local_memory_blocked - 1ULL); + local_memory_blocked = 1ULL; + } + } + clock_t wait_time = MSEC2NSEC(30); + if (timedout > 0 || local_memory_blocked > 0) { + wait_time = MSEC2NSEC(1); + } + int ret = cv_timedwait_hires(&calling_arena->vm_cv, + &calling_arena->vm_lock, + wait_time, 0, 0); + // We almost certainly have exited because of a + // signal/broadcast, but maybe just timed out. + // Either way, recheck memory. + mutex_exit(&calling_arena->vm_lock); + if (ret == -1) { + // cv_timedwait_hires timer expired + timedout |= 1; + cv_broadcast(&bvmp->vm_cv); + } else if ((timedout & 2) == 0) { + // we were awakened; check to see if we have been + // in the for loop for a long time + uint64_t n = zfs_lbolt(); + if (n > loop_timeout) { + timedout |= 2; + extern uint64_t real_total_memory; + spl_free_set_emergency_pressure( + real_total_memory / 64LL); + // flush the current thread in xat() out of + // xat()'s for() loop and into xat_bail() + cv_broadcast(&bvmp->vm_cv); + } else if (local_hipriority_allocator && + n > hiprio_timeout && waiters > 1UL) { + timedout |= 4; + } + } + } + + /* + * Turn on the exclusion bit in buckets_busy_allocating, to + * prevent multiple threads from calling vmem_alloc() on the + * same bucket arena concurrently rather than serially. + * + * This principally reduces the liklihood of asking xnu for + * more memory when other memory is or becomes available. + * + * This exclusion only applies to VM_SLEEP allocations; + * others (VM_PANIC, VM_NOSLEEP, VM_ABORT) will go to + * vmem_alloc() concurrently with any other threads. + * + * Since we aren't doing a test-and-set operation like above, + * we can just use |= and &= below and get correct atomic + * results, instead of using: + * + * __c11_atomic_fetch_or(&buckets_busy_allocating, + * bucket_bit, __ATOMIC_SEQ_CST); + * with the &= down below being written as + * __c11_atomic_fetch_and(&buckets_busy_allocating, + * ~bucket_bit, __ATOMIC_SEQ_CST); + * + * and this makes a difference with no optimization either + * compiling the whole file or with __attribute((optnone)) + * in front of the function decl. In particular, the non- + * optimized version that uses the builtin __c11_atomic_fetch_{and,or} + * preserves the C program order in the machine language output, + * inersting cmpxchgws, while all optimized versions, and the + * non-optimized version using the plainly-written version, reorder + * the "orw regr, memory" and "andw register, memory" (these are atomic + * RMW operations in x86-64 when the memory is naturally aligned) so + * that the strong memory model x86-64 promise that later loads see the + * results of earlier stores. 
+ * + * clang+llvm simply are good at optimizing _Atomics and + * the optimized code differs only in line numbers and + * among all three approaches (as plainly written, using + * the __c11_atomic_fetch_{or,and} with sequential consistency, + * or when compiling with at least -O optimization so an + * atomic_or_16(&buckets_busy_allocating) built with GCC intrinsics + * is actually inlined rather than a function call). + * + */ + + // in case we left the loop by being the only waiter, stop the + // next thread arriving from leaving the for loop because + // vmem_canalloc(bvmp, that_thread's_size) is true. + + buckets_busy_allocating |= bucket_bit; + + // update counters + if (local_sleep > 0) + atomic_add_64(&spl_vba_sleep, local_sleep); + if (local_memory_blocked > 0) + atomic_add_64(&spl_vba_parent_memory_blocked, + local_memory_blocked); + if (local_cv_timeout > 0) + atomic_add_64(&spl_vba_cv_timeout, local_cv_timeout); + if (local_cv_timeout_blocked > 0) + atomic_add_64(&spl_vba_cv_timeout_blocked, + local_cv_timeout_blocked); + if (local_loop_timeout > 0) + atomic_add_64(&spl_vba_loop_timeout, local_loop_timeout); + if (local_loop_timeout_blocked > 0) + atomic_add_64(&spl_vba_loop_timeout_blocked, + local_loop_timeout_blocked); + if (local_hipriority_blocked > 0) + atomic_add_64(&spl_vba_hiprio_blocked, + local_hipriority_blocked); + + // There is memory in this bucket, or there are no other waiters, + // or we aren't a VM_SLEEP allocation, or we iterated out of the + // for loop. + // vmem_alloc() and vmem_xalloc() do their own mutex serializing + // on bvmp->vm_lock, so we don't have to here. + // + // vmem_alloc may take some time to return (especially for VM_SLEEP + // allocations where we did not take the vm_canalloc(bvmp...) break out + // of the for loop). Therefore, if we didn't enter the for loop at all + // because waiters was 0 when we entered this function, + // subsequent callers will enter the for loop. + + void *m = vmem_alloc(bvmp, size, vmflags); + + // allow another vmem_canalloc() through for this bucket + // by atomically turning off the appropriate bit + + /* + * Except clang+llvm DTRT because of _Atomic, could be written as: + * __c11_atomic_fetch_and(&buckets_busy_allocating, + * ~bucket_bit, __ATOMIC_SEQ_CST); + * + * On processors with more relaxed memory models, it might be + * more efficient to do so with release semantics here, and + * in the atomic |= above, with acquire semantics in the bit tests, + * but on the other hand it may be hard to do better than clang+llvm. + */ + + buckets_busy_allocating &= ~bucket_bit; + + if (local_hipriority_allocator) + hipriority_allocators--; + + // if we got an allocation, wake up the arena cv waiters + // to let them try to exit the for(;;) loop above and + // exit the cv_wait() in vmem_xalloc(vmp, ...) 
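+	/*
+	 * A small clarifying sketch of the equivalence described above
+	 * (assuming only standard C11 atomics): with
+	 * "volatile _Atomic uint16_t w", the plainly written
+	 *
+	 *     w |= bucket_bit;        w &= ~bucket_bit;
+	 *
+	 * are atomic read-modify-write operations with seq_cst ordering,
+	 * i.e. the same operations as
+	 *
+	 *     (void) __c11_atomic_fetch_or(&w, bucket_bit, __ATOMIC_SEQ_CST);
+	 *     (void) __c11_atomic_fetch_and(&w, ~bucket_bit, __ATOMIC_SEQ_CST);
+	 *
+	 * except that the previous value is discarded, which is why the
+	 * acquire path uses vba_atomic_lock_bucket() (it needs the previous
+	 * value) while the set and clear here do not.
+	 */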
+ + if (m != NULL) { + cv_broadcast(&calling_arena->vm_cv); + } + + waiters--; + spl_vba_threads[bucket_number]--; + return (m); +} + +static void +vmem_bucket_free(vmem_t *null_vmp, void *vaddr, size_t size) +{ + vmem_t *calling_arena = spl_heap_arena; + + vmem_free(vmem_bucket_arena_by_size(size), vaddr, size); + + // wake up arena waiters to let them try an alloc + cv_broadcast(&calling_arena->vm_cv); +} + +static inline int64_t +vmem_bucket_arena_free(uint16_t bucket) +{ + VERIFY(bucket < VMEM_BUCKETS); + return ((int64_t)vmem_size_semi_atomic(vmem_bucket_arena[bucket], + VMEM_FREE)); +} + +static inline int64_t +vmem_bucket_arena_used(int bucket) +{ + VERIFY(bucket < VMEM_BUCKETS); + return ((int64_t)vmem_size_semi_atomic(vmem_bucket_arena[bucket], + VMEM_ALLOC)); +} + + +int64_t +vmem_buckets_size(int typemask) +{ + int64_t total_size = 0; + + for (int i = 0; i < VMEM_BUCKETS; i++) { + int64_t u = vmem_bucket_arena_used(i); + int64_t f = vmem_bucket_arena_free(i); + if (typemask & VMEM_ALLOC) + total_size += u; + if (typemask & VMEM_FREE) + total_size += f; + } + if (total_size < 0) + total_size = 0; + + return ((size_t)total_size); +} + +static uint64_t +spl_validate_bucket_span_size(uint64_t val) +{ + if (!ISP2(val)) { + printf("SPL: %s: WARNING %llu is not a power of two, " + "not changing.\n", __func__, val); + return (0); + } + if (val < 128ULL*1024ULL || val > 16ULL*1024ULL*1024ULL) { + printf("SPL: %s: WARNING %llu is out of range [128k - 16M], " + "not changing.\n", __func__, val); + return (0); + } + return (val); +} + +static inline void +spl_modify_bucket_span_size(int bucket, uint64_t size) +{ + vmem_t *bvmp = vmem_bucket_arena[bucket]; + + mutex_enter(&bvmp->vm_lock); + bvmp->vm_min_import = size; + mutex_exit(&bvmp->vm_lock); +} + +static inline void +spl_modify_bucket_array() +{ + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + // i = 12, bucket = 0, contains allocs from 8192 to 16383 bytes, + // and should never ask xnu for < 16384 bytes, so as to avoid + // asking xnu for a non-power-of-two size. 
+ const int bucket = i - VMEM_BUCKET_LOWBIT; + const uint32_t bucket_alloc_minimum_size = 1UL << (uint32_t)i; + const uint32_t bucket_parent_alloc_minimum_size = + bucket_alloc_minimum_size * 2UL; + + switch (i) { + // see vmem_init() below for details + case 16: + case 17: + spl_modify_bucket_span_size(bucket, + MAX(spl_bucket_tunable_small_span, + bucket_parent_alloc_minimum_size)); + break; + default: + spl_modify_bucket_span_size(bucket, + MAX(spl_bucket_tunable_large_span, + bucket_parent_alloc_minimum_size)); + break; + } + } +} + +static inline void +spl_printf_bucket_span_sizes(void) +{ + // this doesn't have to be super-exact + dprintf("SPL: %s: ", __func__); + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + int bnum = i - VMEM_BUCKET_LOWBIT; + vmem_t *bvmp = vmem_bucket_arena[bnum]; + dprintf("%llu ", (uint64_t)bvmp->vm_min_import); + } + dprintf("\n"); +} + +static inline void +spl_set_bucket_spans(uint64_t l, uint64_t s) +{ + if (spl_validate_bucket_span_size(l) && + spl_validate_bucket_span_size(s)) { + atomic_swap_64(&spl_bucket_tunable_large_span, l); + atomic_swap_64(&spl_bucket_tunable_small_span, s); + spl_modify_bucket_array(); + } +} + +void +spl_set_bucket_tunable_large_span(uint64_t size) +{ + uint64_t s = 0; + + mutex_enter(&vmem_xnu_alloc_lock); + atomic_swap_64(&s, spl_bucket_tunable_small_span); + spl_set_bucket_spans(size, s); + mutex_exit(&vmem_xnu_alloc_lock); + + spl_printf_bucket_span_sizes(); +} + +void +spl_set_bucket_tunable_small_span(uint64_t size) +{ + uint64_t l = 0; + + mutex_enter(&vmem_xnu_alloc_lock); + atomic_swap_64(&l, spl_bucket_tunable_large_span); + spl_set_bucket_spans(l, size); + mutex_exit(&vmem_xnu_alloc_lock); + + spl_printf_bucket_span_sizes(); +} + +static void * +spl_vmem_default_alloc(vmem_t *vmp, size_t size, int vmflags) +{ + extern void *osif_malloc(uint64_t); + return (osif_malloc(size)); +} + +static void +spl_vmem_default_free(vmem_t *vmp, void *vaddr, size_t size) +{ + extern void osif_free(void *, uint64_t); + osif_free(vaddr, size); +} + +vmem_t * +vmem_init(const char *heap_name, + void *heap_start, size_t heap_size, size_t heap_quantum, + void *(*heap_alloc)(vmem_t *, size_t, int), + void (*heap_free)(vmem_t *, void *, size_t)) +{ + uint32_t id; + int nseg = VMEM_SEG_INITIAL; + vmem_t *heap; + + // XNU mutexes need initialisation + mutex_init(&vmem_list_lock, "vmem_list_lock", MUTEX_DEFAULT, + NULL); + mutex_init(&vmem_segfree_lock, "vmem_segfree_lock", MUTEX_DEFAULT, + NULL); + mutex_init(&vmem_sleep_lock, "vmem_sleep_lock", MUTEX_DEFAULT, + NULL); + mutex_init(&vmem_nosleep_lock, "vmem_nosleep_lock", MUTEX_DEFAULT, + NULL); + mutex_init(&vmem_pushpage_lock, "vmem_pushpage_lock", MUTEX_DEFAULT, + NULL); + mutex_init(&vmem_panic_lock, "vmem_panic_lock", MUTEX_DEFAULT, + NULL); + mutex_init(&vmem_xnu_alloc_lock, "vmem_xnu_alloc_lock", MUTEX_DEFAULT, + NULL); + + while (--nseg >= 0) + vmem_putseg_global(&vmem_seg0[nseg]); + + /* + * On OSX we ultimately have to use the OS allocator + * as the ource and sink of memory as it is allocated + * and freed. + * + * The spl_root_arena_parent is needed in order to provide a + * base arena with an always-NULL afunc and ffunc in order to + * end the searches done by vmem_[x]alloc and vm_xfree; it + * serves no other purpose; its stats will always be zero. 
+ * + */ + + // id 0 + spl_default_arena_parent = vmem_create("spl_default_arena_parent", + NULL, 0, heap_quantum, NULL, NULL, NULL, 0, VM_SLEEP); + + // illumos/openzfs has a gigantic pile of memory that it can use + // for its first arena; + // o3x is not so lucky, so we start with this + static char initial_default_block[16ULL*1024ULL*1024ULL] + __attribute__((aligned(4096))) = { 0 }; + + // The default arena is very low-bandwidth; it supplies the initial + // large allocation for the heap arena below, and it serves as the + // parent of the vmem_metadata arena. It will typically do only 2 + // or 3 parent_alloc calls (to spl_vmem_default_alloc) in total. + + spl_default_arena = vmem_create("spl_default_arena", // id 1 + initial_default_block, 16ULL*1024ULL*1024ULL, + heap_quantum, spl_vmem_default_alloc, spl_vmem_default_free, + spl_default_arena_parent, 16ULL*1024ULL*1024ULL, + VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); + + VERIFY(spl_default_arena != NULL); + + // The bucket arenas satisfy allocations & frees from the bucket heap + // that are dispatched to the bucket whose power-of-two label is the + // smallest allocation that vmem_bucket_allocate will ask for. + // + // The bucket arenas in turn exchange memory with XNU's allocator/freer + // in large spans (~ 1 MiB is stable on all systems but creates bucket + // fragmentation) + // + // Segregating by size constrains internal fragmentation within the + // bucket and provides kstat.vmem visiblity and span-size policy to + // be applied to particular buckets (notably the sources of most + // allocations, see the comments below) + // + // For VMEM_BUCKET_HIBIT == 12, + // vmem_bucket_arena[n] holds allocations from 2^[n+11]+1 to 2^[n+12], + // so for [n] = 0, 2049-4096, for [n]=5 65537-131072, + // for [n]=7 (256k+1)-512k + // + // so "kstat.vmvm.vmem.bucket_1048576" should be read as the bucket + // arena containing allocations 1 MiB and smaller, but larger + // than 512 kiB. + + // create arenas for the VMEM_BUCKETS, id 2 - id 14 + + extern uint64_t real_total_memory; + VERIFY3U(real_total_memory, >=, 1024ULL*1024ULL*1024ULL); + + // adjust minimum bucket span size for memory size + // see comments in the switch below + // large span: 1 MiB and bigger on large-memory (> 32 GiB) systems + // small span: 256 kiB and bigger on large-memory systems + const uint64_t k = 1024ULL; + const uint64_t qm = 256ULL * k; + const uint64_t m = 1024ULL* k; + const uint64_t big = MAX(real_total_memory / (k * 32ULL), m); + const uint64_t small = MAX(real_total_memory / (k * 128ULL), qm); + spl_bucket_tunable_large_span = MIN(big, 16ULL * m); + spl_bucket_tunable_small_span = small; + dprintf("SPL: %s: real_total_memory %llu, large spans %llu, small " + "spans %llu\n", __func__, real_total_memory, + spl_bucket_tunable_large_span, spl_bucket_tunable_small_span); + char *buf = vmem_alloc(spl_default_arena, VMEM_NAMELEN + 21, VM_SLEEP); + for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + size_t minimum_allocsize = 0; + const uint64_t bucket_largest_size = (1ULL << (uint64_t)i); + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%llu", + "bucket", bucket_largest_size); + dprintf("SPL: %s creating arena %s (i == %d)\n", __func__, buf, + i); + switch (i) { + case 15: + case 16: + /* + * With the arrival of abd, the 2^15 (== 32768) and 2^16 + * buckets are by far the most busy, holding + * respectively the qcache spans of kmem_va (the + * kmem_alloc et al. 
heap) and zfs_qcache (notably the + * source for the abd_chunk arena) + * + * The lifetime of early (i.e., after import and mount) + * allocations can be highly variable, leading + * to persisting fragmentation from the first eviction + * after arc has grown large. This can happen if, for + * example, there substantial import and mounting (and + * mds/mdworker and backupd scanning) activity before a + * user logs in and starts demanding memory in userland + * (e.g. by firing up a browser or mail app). + * + * Crucially, this makes it difficult to give back + * memory to xnu without holding the ARC size down for + * long periods of time. + * + * We can mitigate this by exchanging smaller + * amounts of memory with xnu for these buckets. + * There are two downsides: xnu's memory + * freelist will be prone to greater + * fragmentation, which will affect all + * allocation and free activity using xnu's + * allocator including kexts other than our; and + * we are likely to have more waits in the throttled + * alloc function, as more threads are likely to require + * slab importing into the kmem layer and fewer threads + * can be satisfied by a small allocation vs a large + * one. + * + * The import sizes are sysadmin-tunable by setting + * kstat.spl.misc.spl_misc.spl_tunable_small_span + * to a power-of-two number of bytes in zsysctl.conf + * should a sysadmin prefer non-early allocations to + * be larger or smaller depending on system performance + * and workload. + * + * However, a zfs booting system must use the defaults + * here for the earliest allocations, therefore they. + * should be only large enough to protect system + * performance if the sysadmin never changes the tunable + * span sizes. + */ + minimum_allocsize = MAX(spl_bucket_tunable_small_span, + bucket_largest_size * 4); + break; + default: + /* + * These buckets are all relatively low bandwidth and + * with relatively uniform lifespans for most + * allocations (borrowed arc buffers dominate). + * They should be large enough that they do not + * pester xnu. + */ + minimum_allocsize = MAX(spl_bucket_tunable_large_span, + bucket_largest_size * 4); + break; + } + dprintf("SPL: %s setting bucket %d (%d) to size %llu\n", + __func__, i, (int)(1 << i), (uint64_t)minimum_allocsize); + const int bucket_number = i - VMEM_BUCKET_LOWBIT; + vmem_t *b = vmem_create(buf, NULL, 0, heap_quantum, + xnu_alloc_throttled, xnu_free_throttled, + spl_default_arena_parent, minimum_allocsize, + VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE | VMC_TIMEFREE); + VERIFY(b != NULL); + b->vm_min_import = minimum_allocsize; + b->vm_source = b; + vmem_bucket_arena[bucket_number] = b; + vmem_bucket_id_to_bucket_number[b->vm_id] = bucket_number; + } + + vmem_free(spl_default_arena, buf, VMEM_NAMELEN + 21); + // spl_heap_arena, the bucket heap, is the primary interface + // to the vmem system + + // all arenas not rooted to vmem_metadata will be rooted to + // spl_heap arena. + + spl_heap_arena = vmem_create("bucket_heap", // id 15 + NULL, 0, heap_quantum, + vmem_bucket_alloc, vmem_bucket_free, spl_default_arena_parent, 0, + VM_SLEEP | VMC_TIMEFREE | VMC_OLDFIRST); + + VERIFY(spl_heap_arena != NULL); + + // add a fixed-sized allocation to spl_heap_arena; this reduces the + // need to talk to the bucket arenas by a substantial margin + // (kstat.vmem.vmem.bucket_heap.{alloc+free} is much greater than + // kstat.vmem.vmem.bucket_heap.parent_{alloc+free}, and improves with + // increasing initial fixed allocation size. 
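+	/*
+	 * Worked example of the reservation sizing just below (illustrative
+	 * only; the thresholds are those in the code that follows): a
+	 * machine with 16 GiB or more of real_total_memory gets a single
+	 * 1 GiB span, allocated once from spl_default_arena and handed to
+	 * spl_heap_arena with vmem_add(), so any bucket_heap traffic that
+	 * fits inside that reserve never has to reach the bucket arenas
+	 * (and hence never has to reach xnu) at all.
+	 */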
+ + const size_t mib = 1024ULL * 1024ULL; + const size_t gib = 1024ULL * mib; + size_t resv_size = 128ULL * mib; + extern uint64_t real_total_memory; + + if (real_total_memory >= 4ULL * gib) + resv_size = 256ULL * mib; + if (real_total_memory >= 8ULL * gib) + resv_size = 512ULL * mib; + if (real_total_memory >= 16ULL * gib) + resv_size = gib; + + dprintf("SPL: %s adding fixed allocation of %llu to the bucket_heap\n", + __func__, (uint64_t)resv_size); + + spl_heap_arena_initial_alloc = vmem_add(spl_heap_arena, + vmem_alloc(spl_default_arena, resv_size, VM_SLEEP), + resv_size, VM_SLEEP); + + VERIFY(spl_heap_arena_initial_alloc != NULL); + + spl_heap_arena_initial_alloc_size = resv_size; + + // kstat.vmem.vmem.heap : kmem_cache_alloc() and similar calls + // to handle in-memory datastructures other than arc and zio buffers. + + heap = vmem_create(heap_name, // id 16 + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, spl_heap_arena, 0, + VM_SLEEP); + + VERIFY(heap != NULL); + + // Root all the low bandwidth metadata arenas to the default arena. + // The vmem_metadata allocations will all be 32 kiB or larger, + // and the total allocation will generally cap off around 24 MiB. + + vmem_metadata_arena = vmem_create("vmem_metadata", // id 17 + NULL, 0, heap_quantum, vmem_alloc, vmem_free, spl_default_arena, + 8 * PAGESIZE, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); + + VERIFY(vmem_metadata_arena != NULL); + + vmem_seg_arena = vmem_create("vmem_seg", // id 18 + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP | VMC_POPULATOR); + + VERIFY(vmem_seg_arena != NULL); + + vmem_hash_arena = vmem_create("vmem_hash", // id 19 + NULL, 0, 8, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP); + + VERIFY(vmem_hash_arena != NULL); + + vmem_vmem_arena = vmem_create("vmem_vmem", // id 20 + vmem0, sizeof (vmem0), 1, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP); + + VERIFY(vmem_vmem_arena != NULL); + + // 21 (0-based) vmem_create before this line. - macroized + // NUMBER_OF_ARENAS_IN_VMEM_INIT + for (id = 0; id < vmem_id; id++) { + (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t), + 1, 0, 0, &vmem0[id], &vmem0[id + 1], + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + } + + dprintf("SPL: starting vmem_update() thread\n"); + vmem_update(NULL); + + return (heap); +} + +struct free_slab { + vmem_t *vmp; + size_t slabsize; + void *slab; + list_node_t next; +}; +static list_t freelist; + +static void vmem_fini_freelist(void *vmp, void *start, size_t size) +{ + struct free_slab *fs; + + MALLOC(fs, struct free_slab *, sizeof (struct free_slab), M_TEMP, + M_WAITOK); + fs->vmp = vmp; + fs->slabsize = size; + fs->slab = start; + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); +} + +void +vmem_free_span_list(void) +{ + int total = 0; + int total_count = 0; + struct free_slab *fs; +// int release = 1; + + while ((fs = list_head(&freelist))) { + total_count++; + total += fs->slabsize; + list_remove(&freelist, fs); + /* + * Commenting out due to BSOD during uninstallation, + * will revisit later. 
+ * + * for (int id = 0; id < VMEM_INITIAL; id++) { + * if (&vmem0[id] == fs->slab) { + * release = 0; + * break; + * } + * } + * + * if (release) + * fs->vmp->vm_source_free(fs->vmp, fs->slab, + * fs->slabsize); + * release = 1; + * + */ + FREE(fs, M_TEMP); + } +} + +static void +vmem_fini_void(void *vmp, void *start, uint32_t size) +{ +} + +void +vmem_fini(vmem_t *heap) +{ + struct free_slab *fs; + uint64_t total; + + bsd_untimeout(vmem_update, NULL); + + dprintf("SPL: %s: stopped vmem_update. Creating list and walking " + "arenas.\n", __func__); + + /* Create a list of slabs to free by walking the list of allocs */ + list_create(&freelist, sizeof (struct free_slab), + offsetof(struct free_slab, next)); + + /* Walk to list of allocations */ + + /* + * walking with VMEM_REENTRANT causes segment consolidation and + * freeing of spans the freelist contains a list of segments that + * are still allocated at the time of the walk; unfortunately the + * lists cannot be exact without complex multiple passes, locking, + * and a more complex vmem_fini_freelist(). + * + * Walking without VMEM_REENTRANT can produce a nearly-exact list + * of unfreed spans, which Illumos would then free directly after + * the list is complete. + * + * Unfortunately in O3X, that lack of exactness can lead to a panic + * caused by attempting to free to xnu memory that we already freed + * to xnu. Fortunately, we can get a sense of what would have been + * destroyed after the (non-reentrant) walking, and we printf that + * at the end of this function. + */ + + // Walk all still-alive arenas from leaves to the root + + vmem_walk(heap, VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, heap); + + vmem_walk(heap, VMEM_ALLOC, vmem_fini_freelist, heap); + + vmem_free_span_list(); + dprintf("\nSPL: %s destroying heap\n", __func__); + vmem_destroy(heap); // PARENT: spl_heap_arena + + dprintf("SPL: %s: walking spl_heap_arena, aka bucket_heap (pass 1)\n", + __func__); + + vmem_walk(spl_heap_arena, VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, + spl_heap_arena); + + dprintf("SPL: %s: calling vmem_xfree(spl_default_arena, ptr, %llu);\n", + __func__, (uint64_t)spl_heap_arena_initial_alloc_size); + + // forcibly remove the initial alloc from spl_heap_arena arena, whether + // or not it is empty. below this point, any activity on + // spl_default_arena other than a non-reentrant(!) walk and a destroy + // is unsafe (UAF or MAF). + // However, all the children of spl_heap_arena should now be destroyed. 
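+	/*
+	 * Clarifying note on the vmem_xfree() below (no new behaviour): it
+	 * returns the reserve that vmem_init() created with
+	 * vmem_add(spl_heap_arena, vmem_alloc(spl_default_arena, resv_size,
+	 * VM_SLEEP), resv_size, VM_SLEEP).  The span is given back through
+	 * spl_default_arena even though spl_heap_arena may still hold
+	 * segments carved from it, which is why the comment above restricts
+	 * what can safely be done below this point to non-reentrant walks
+	 * and destroys.
+	 */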
+ + vmem_xfree(spl_default_arena, spl_heap_arena_initial_alloc, + spl_heap_arena_initial_alloc_size); + + printf("SPL: %s: walking spl_heap_arena, aka bucket_heap (pass 2)\n", + __func__); + + vmem_walk(spl_heap_arena, VMEM_ALLOC, vmem_fini_freelist, + spl_heap_arena); + vmem_free_span_list(); + + printf("SPL: %s: walking bucket arenas...\n", __func__); + + for (int i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + const int bucket = i - VMEM_BUCKET_LOWBIT; + vmem_walk(vmem_bucket_arena[bucket], + VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, + vmem_bucket_arena[bucket]); + + vmem_walk(vmem_bucket_arena[bucket], VMEM_ALLOC, + vmem_fini_freelist, vmem_bucket_arena[bucket]); + } + vmem_free_span_list(); + + dprintf("SPL: %s destroying spl_bucket_arenas...", __func__); + for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + vmem_t *vmpt = vmem_bucket_arena[i - VMEM_BUCKET_LOWBIT]; + dprintf(" %llu", (1ULL << i)); + vmem_destroy(vmpt); // parent: spl_default_arena_parent + } + dprintf("\n"); + + printf("SPL: %s: walking vmem metadata-related arenas...\n", __func__); + + vmem_walk(vmem_vmem_arena, VMEM_ALLOC | VMEM_REENTRANT, + vmem_fini_void, vmem_vmem_arena); + + vmem_walk(vmem_vmem_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_vmem_arena); + + vmem_free_span_list(); + + // We should not do VMEM_REENTRANT on vmem_seg_arena or + // vmem_hash_arena or below to avoid causing work in + // vmem_seg_arena and vmem_hash_arena. + + vmem_walk(vmem_seg_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_seg_arena); + + vmem_free_span_list(); + + vmem_walk(vmem_hash_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_hash_arena); + vmem_free_span_list(); + + vmem_walk(vmem_metadata_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_metadata_arena); + + vmem_free_span_list(); + dprintf("SPL: %s walking the root arena (spl_default_arena)...\n", + __func__); + + vmem_walk(spl_default_arena, VMEM_ALLOC, + vmem_fini_freelist, spl_default_arena); + + vmem_free_span_list(); + + dprintf("SPL: %s destroying bucket heap\n", __func__); + // PARENT: spl_default_arena_parent (but depends on buckets) + vmem_destroy(spl_heap_arena); + + // destroying the vmem_vmem arena and any arena afterwards + // requires the use of vmem_destroy_internal(), which does + // not talk to vmem_vmem_arena like vmem_destroy() does. + // dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); + // vmem_destroy_internal(vmem_vmem_arena); + // parent: vmem_metadata_arena + + // destroying the seg arena means we must no longer + // talk to vmem_populate() + dprintf("SPL: %s destroying vmem_seg_arena\n", __func__); + vmem_destroy(vmem_seg_arena); + + // vmem_hash_arena may be freed-to in vmem_destroy_internal() + // so it should be just before the vmem_metadata_arena. + dprintf("SPL: %s destroying vmem_hash_arena\n", __func__); + vmem_destroy(vmem_hash_arena); // parent: vmem_metadata_arena + vmem_hash_arena = NULL; + + // XXX: if we panic on unload below here due to destroyed mutex, + // vmem_init() will need some reworking (e.g. have + // vmem_metadata_arena talk directly to xnu), or alternatively a + // vmem_destroy_internal_internal() function that does not touch + // vmem_hash_arena will need writing. 
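+	/*
+	 * For orientation, the remaining teardown below proceeds from the
+	 * metadata side back to the roots, in the order the comments above
+	 * require: vmem_metadata_arena, then spl_default_arena, then
+	 * spl_default_arena_parent, and finally vmem_vmem_arena via
+	 * vmem_destroy_internal(), which, unlike vmem_destroy(), does not
+	 * go back to vmem_vmem_arena to release the vmem_t.
+	 */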
+ + dprintf("SPL: %s destroying vmem_metadata_arena\n", __func__); + vmem_destroy(vmem_metadata_arena); // parent: spl_default_arena + + dprintf("\nSPL: %s destroying spl_default_arena\n", __func__); + vmem_destroy(spl_default_arena); // parent: spl_default_arena_parent + dprintf("\nSPL: %s destroying spl_default_arena_parant\n", __func__); + vmem_destroy(spl_default_arena_parent); + + dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); + vmem_destroy_internal(vmem_vmem_arena); + + printf("SPL: arenas removed, now try destroying mutexes... "); + + printf("vmem_xnu_alloc_lock "); + mutex_destroy(&vmem_xnu_alloc_lock); + printf("vmem_panic_lock "); + mutex_destroy(&vmem_panic_lock); + printf("vmem_pushpage_lock "); + mutex_destroy(&vmem_pushpage_lock); + printf("vmem_nosleep_lock "); + mutex_destroy(&vmem_nosleep_lock); + printf("vmem_sleep_lock "); + mutex_destroy(&vmem_sleep_lock); + printf("vmem_segfree_lock "); + mutex_destroy(&vmem_segfree_lock); + printf("vmem_list_lock "); + mutex_destroy(&vmem_list_lock); + + printf("\nSPL: %s: walking list of live slabs at time of call to %s\n", + __func__, __func__); + + // annoyingly, some of these should be returned to xnu, but + // we have no idea which have already been freed to xnu, and + // freeing a second time results in a panic. + + /* Now release the list of allocs to built above */ + total = 0; + uint64_t total_count = 0; + while ((fs = list_head(&freelist))) { + total_count++; + total += fs->slabsize; + list_remove(&freelist, fs); + // extern void segkmem_free(vmem_t *, void *, size_t); + // segkmem_free(fs->vmp, fs->slab, fs->slabsize); + FREE(fs, M_TEMP); + } + printf("SPL: WOULD HAVE released %llu bytes (%llu spans) from arenas\n", + total, total_count); + list_destroy(&freelist); + printf("SPL: %s: Brief delay for readability...\n", __func__); + delay(hz); + printf("SPL: %s: done!\n", __func__); +} + +/* + * return true if inuse is much smaller than imported + */ +static inline bool +bucket_fragmented(const uint16_t bn, const uint64_t now) +{ + + // early during uptime, just let buckets grow. + + if (now < 600 * hz) + return (false); + + // if there has been no pressure in the past five minutes, + // then we will just let the bucket grow. 
+ + const uint64_t timeout = 5ULL * 60ULL * hz; + + if (spl_free_last_pressure_wrapper() + timeout < now) + return (false); + + const vmem_t *vmp = vmem_bucket_arena[bn]; + + const int64_t imported = + (int64_t)vmp->vm_kstat.vk_mem_import.value.ui64; + const int64_t inuse = + (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + const int64_t tiny = 64LL*1024LL*1024LL; + const int64_t small = tiny * 2LL; // 128 M + const int64_t medium = small * 2LL; // 256 + const int64_t large = medium * 2LL; // 512 + const int64_t huge = large * 2LL; // 1 G + const int64_t super_huge = huge * 2LL; // 2 + + const int64_t amount_free = imported - inuse; + + if (amount_free <= tiny || imported <= small) + return (false); + + const int64_t percent_free = (amount_free * 100LL) / imported; + + if (percent_free > 75LL) { + return (true); + } else if (imported <= medium) { + return (percent_free >= 50); + } else if (imported <= large) { + return (percent_free >= 33); + } else if (imported <= huge) { + return (percent_free >= 25); + } else if (imported <= super_huge) { + return (percent_free >= 15); + } else { + return (percent_free >= 10); + } +} + +/* + * return true if the bucket for size is fragmented + */ +static inline bool +spl_arc_no_grow_impl(const uint16_t b, const size_t size, + const boolean_t buf_is_metadata, kmem_cache_t **kc) +{ + static _Atomic uint8_t frag_suppression_counter[VMEM_BUCKETS] = { 0 }; + + const uint64_t now = zfs_lbolt(); + + const bool fragmented = bucket_fragmented(b, now); + + if (fragmented) { + if (size < 32768) { + // Don't suppress small qcached blocks when the + // qcache size (bucket_262144) is fragmented, + // since they will push everything else towards + // the tails of ARC lists without eating up a large + // amount of space themselves. + return (false); + } + const uint32_t b_bit = (uint32_t)1 << (uint32_t)b; + spl_arc_no_grow_bits |= b_bit; + const uint32_t sup_at_least_every = MIN(b_bit, 255); + const uint32_t sup_at_most_every = MAX(b_bit, 16); + const uint32_t sup_every = MIN(sup_at_least_every, + sup_at_most_every); + if (frag_suppression_counter[b] >= sup_every) { + frag_suppression_counter[b] = 0; + return (true); + } else { + frag_suppression_counter[b]++; + return (false); + } + } else { + const uint32_t b_bit = (uint32_t)1 << (uint32_t)b; + spl_arc_no_grow_bits &= ~b_bit; + } + + extern bool spl_zio_is_suppressed(const size_t, const uint64_t, + const boolean_t, kmem_cache_t **); + + return (spl_zio_is_suppressed(size, now, buf_is_metadata, kc)); +} + +static inline uint16_t +vmem_bucket_number_arc_no_grow(const size_t size) +{ + // qcaching on arc + if (size < 128*1024) + return (vmem_bucket_number(262144)); + else + return (vmem_bucket_number(size)); +} + +boolean_t +spl_arc_no_grow(size_t size, boolean_t buf_is_metadata, kmem_cache_t **zp) +{ + const uint16_t b = vmem_bucket_number_arc_no_grow(size); + + const bool rv = spl_arc_no_grow_impl(b, size, buf_is_metadata, zp); + + if (rv) { + atomic_inc_64(&spl_arc_no_grow_count); + } + + return ((boolean_t)rv); +} diff --git a/module/os/macos/spl/spl-vnode.c b/module/os/macos/spl/spl-vnode.c new file mode 100644 index 0000000000..bec92021b8 --- /dev/null +++ b/module/os/macos/spl/spl-vnode.c @@ -0,0 +1,497 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + + +int +vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, + struct vnode **vpp, enum create crwhy, mode_t umask) +{ + vfs_context_t vctx; + int fmode; + int error; + + fmode = filemode; + if (crwhy) + fmode |= O_CREAT; + // TODO I think this should be 'fmode' instead of 'filemode' + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_open(pnamep, filemode, createmode, 0, vpp, vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +int +vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, + struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp) +{ + char *path; + int pathlen = MAXPATHLEN; + int error; + + path = (char *)kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + error = vn_getpath(startvp, path, &pathlen); + if (error == 0) { + strlcat(path, pnamep, MAXPATHLEN); + error = vn_open(path, seg, filemode, createmode, vpp, crwhy, + umask); + } + + kmem_free(path, MAXPATHLEN); + return (error); +} + +extern errno_t vnode_rename(const char *, const char *, int, vfs_context_t); + +errno_t +vnode_rename(const char *from, const char *to, int flags, vfs_context_t vctx) +{ + /* + * We need proper KPI changes to be able to safely update + * the zpool.cache file. For now, we return EPERM. + */ + return (EPERM); +} + +int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + + error = vnode_rename(from, to, 0, vctx); + + (void) vfs_context_rele(vctx); + + return (error); +} + +extern errno_t vnode_remove(const char *, int, enum vtype, vfs_context_t); + +errno_t +vnode_remove(const char *name, int flag, enum vtype type, vfs_context_t vctx) +{ + /* + * Now that zed ZFS Event Daemon can handle the rename of zpool.cache + * we will silence this limitation, and look in zed.d/config.sync.sh + */ + return (EPERM); +} + + +int +vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) +{ + vfs_context_t vctx; + enum vtype type; + int error; + + type = dirflag == RMDIRECTORY ? 
VDIR : VREG; + + vctx = vfs_context_create((vfs_context_t)0); + + error = vnode_remove(fnamep, 0, type, vctx); + + (void) vfs_context_rele(vctx); + + return (error); +} + +int +VOP_SPACE(struct vnode *vp, int cmd, struct flock *fl, int flags, offset_t off, + cred_t *cr, void *ctx) +{ + int error = 0; +#ifdef F_PUNCHHOLE + if (cmd == F_FREESP) { + fpunchhole_t fpht; + fpht.fp_flags = 0; + fpht.fp_offset = fl->l_start; + fpht.fp_length = fl->l_len; + if (vnode_getwithref(vp) == 0) { + error = VNOP_IOCTL(vp, F_PUNCHHOLE, (caddr_t)&fpht, 0, + ctx); + (void) vnode_put(vp); + } + } +#endif + return (error); +} + +int +VOP_CLOSE(struct vnode *vp, int flag, int count, offset_t off, + void *cr, void *k) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_close(vp, flag & FWRITE, vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +int +VOP_FSYNC(struct vnode *vp, int flags, void* unused, void *uused2) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + error = VNOP_FSYNC(vp, (flags == FSYNC), vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +int +VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, void *x3, void *x4) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_getattr(vp, vap, vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +errno_t VNOP_LOOKUP(struct vnode *, struct vnode **, + struct componentname *, vfs_context_t); + +errno_t +VOP_LOOKUP(struct vnode *vp, struct vnode **vpp, + struct componentname *cn, vfs_context_t ct) +{ + return (VNOP_LOOKUP(vp, vpp, cn, ct)); +} + +#undef VFS_ROOT + +extern int VFS_ROOT(mount_t, struct vnode **, vfs_context_t); +int +spl_vfs_root(mount_t mount, struct vnode **vp) +{ + return (VFS_ROOT(mount, vp, vfs_context_current())); +} + +void +vfs_mountedfrom(struct mount *vfsp, char *osname) +{ + (void) copystr(osname, vfs_statfs(vfsp)->f_mntfromname, MNAMELEN - 1, + 0); +} + +static kmutex_t spl_getf_lock; +static list_t spl_getf_list; + +int +spl_vnode_init(void) +{ + mutex_init(&spl_getf_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&spl_getf_list, sizeof (struct spl_fileproc), + offsetof(struct spl_fileproc, f_next)); + return (0); +} + +void +spl_vnode_fini(void) +{ + mutex_destroy(&spl_getf_lock); + list_destroy(&spl_getf_list); +} + +#include +struct fileproc; + +extern int fp_drop(struct proc *p, int fd, struct fileproc *fp, int locked); +extern int fp_drop_written(struct proc *p, int fd, struct fileproc *fp, + int locked); +extern int fp_lookup(struct proc *p, int fd, struct fileproc **resultfp, + int locked); +extern int fo_read(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +extern int fo_write(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +extern int file_vnode_withvid(int, struct vnode **, uint32_t *); +extern int file_drop(int); + +/* + * getf(int fd) - hold a lock on a file descriptor, to be released by calling + * releasef(). On OSX we will also look up the vnode of the fd for calls + * to spl_vn_rdwr(). + */ +void * +getf(int fd) +{ + struct fileproc *fp = NULL; + struct spl_fileproc *sfp = NULL; + struct vnode *vp = NULL; + uint32_t vid; + + /* + * We keep the "fp" pointer as well, both for unlocking in releasef() + * and used in vn_rdwr(). 
+ */ + + sfp = kmem_alloc(sizeof (*sfp), KM_SLEEP); + if (!sfp) + return (NULL); + + if (fp_lookup(current_proc(), fd, &fp, 0 /* !locked */)) { + kmem_free(sfp, sizeof (*sfp)); + return (NULL); + } + + dprintf("current_proc %p: fd %d fp %p vp %p\n", current_proc(), + fd, fp, vp); + + sfp->f_vnode = vp; + sfp->f_fd = fd; + sfp->f_offset = 0; + sfp->f_proc = current_proc(); + sfp->f_fp = fp; + + /* Also grab vnode, so we can fish out the minor, for onexit */ + if (!file_vnode_withvid(fd, &vp, &vid)) { + sfp->f_vnode = vp; + if (vnode_vtype(vp) != VDIR) { + sfp->f_file = minor(vnode_specrdev(vp)); + } + file_drop(fd); + } + + mutex_enter(&spl_getf_lock); + list_insert_tail(&spl_getf_list, sfp); + mutex_exit(&spl_getf_lock); + + return (sfp); +} + +struct vnode * +getf_vnode(void *fp) +{ + struct spl_fileproc *sfp = (struct spl_fileproc *)fp; + struct vnode *vp = NULL; + uint32_t vid; + + if (!file_vnode_withvid(sfp->f_fd, &vp, &vid)) { + file_drop(sfp->f_fd); + } + + return (vp); +} + +void +releasef(int fd) +{ + struct spl_fileproc *fp = NULL; + struct proc *p; + + p = current_proc(); + mutex_enter(&spl_getf_lock); + for (fp = list_head(&spl_getf_list); fp != NULL; + fp = list_next(&spl_getf_list, fp)) { + if ((fp->f_proc == p) && fp->f_fd == fd) break; + } + mutex_exit(&spl_getf_lock); + if (!fp) + return; // Not found + + if (fp->f_writes) + fp_drop_written(p, fd, fp->f_fp, 0 /* !locked */); + else + fp_drop(p, fd, fp->f_fp, 0 /* !locked */); + + /* Remove node from the list */ + mutex_enter(&spl_getf_lock); + list_remove(&spl_getf_list, fp); + mutex_exit(&spl_getf_lock); + + /* Free the node */ + kmem_free(fp, sizeof (*fp)); +} + +/* + * getf()/releasef() IO handler. + */ +int spl_vn_rdwr(enum uio_rw rw, struct spl_fileproc *sfp, + caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, + int ioflag, rlim64_t ulimit, cred_t *cr, ssize_t *residp) +{ + uio_t *auio; + int spacetype; + int error = 0; + vfs_context_t vctx; + + spacetype = UIO_SEG_IS_USER_SPACE(seg) ? UIO_USERSPACE32 : UIO_SYSSPACE; + + vctx = vfs_context_create((vfs_context_t)0); + auio = uio_create(1, 0, spacetype, rw); + uio_reset(auio, offset, spacetype, rw); + uio_addiov(auio, (uint64_t)(uintptr_t)base, len); + + if (rw == UIO_READ) { + error = fo_read(sfp->f_fp, auio, ioflag, vctx); + } else { + error = fo_write(sfp->f_fp, auio, ioflag, vctx); + } + + if (residp) { + *residp = uio_resid(auio); + } else { + if (uio_resid(auio) && error == 0) + error = EIO; + } + + uio_free(auio); + vfs_context_rele(vctx); + + return (error); +} + +/* Regular vnode vn_rdwr */ +int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, + offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, + cred_t *cr, ssize_t *residp) +{ + uio_t *auio; + int spacetype; + int error = 0; + vfs_context_t vctx; + + spacetype = UIO_SEG_IS_USER_SPACE(seg) ? 
UIO_USERSPACE32 : UIO_SYSSPACE; + + vctx = vfs_context_create((vfs_context_t)0); + auio = uio_create(1, 0, spacetype, rw); + uio_reset(auio, offset, spacetype, rw); + uio_addiov(auio, (uint64_t)(uintptr_t)base, len); + + if (rw == UIO_READ) { + error = VNOP_READ(vp, auio, ioflag, vctx); + } else { + error = VNOP_WRITE(vp, auio, ioflag, vctx); + } + + if (residp) { + *residp = uio_resid(auio); + } else { + if (uio_resid(auio) && error == 0) + error = EIO; + } + + uio_free(auio); + vfs_context_rele(vctx); + + return (error); +} + +void +spl_rele_async(void *arg) +{ + struct vnode *vp = (struct vnode *)arg; + if (vp) vnode_put(vp); +} + +void +vn_rele_async(struct vnode *vp, void *taskq) +{ + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)spl_rele_async, vp, TQ_SLEEP) != 0); +} + +vfs_context_t +spl_vfs_context_kernel(void) +{ + return (vfs_context_kernel()); +} + +#undef build_path +extern int build_path(struct vnode *vp, char *buff, int buflen, int *outlen, + int flags, vfs_context_t ctx); + +int spl_build_path(struct vnode *vp, char *buff, int buflen, int *outlen, + int flags, vfs_context_t ctx) +{ + return (build_path(vp, buff, buflen, outlen, flags, ctx)); +} + +/* + * vnode_notify was moved from KERNEL_PRIVATE to KERNEL in 10.11, but to be + * backward compatible, we keep the wrapper for now. + */ +extern int vnode_notify(struct vnode *, uint32_t, struct vnode_attr *); +int +spl_vnode_notify(struct vnode *vp, uint32_t type, struct vnode_attr *vap) +{ + return (vnode_notify(vp, type, vap)); +} + +extern int vfs_get_notify_attributes(struct vnode_attr *vap); +int +spl_vfs_get_notify_attributes(struct vnode_attr *vap) +{ + return (vfs_get_notify_attributes(vap)); +} + +/* Root directory vnode for the system a.k.a. '/' */ +/* + * Must use vfs_rootvnode() to acquire a reference, and + * vnode_put() to release it + */ + +extern struct vnode *rootvnode; + +struct vnode * +getrootdir(void) +{ + struct vnode *rvnode; + + // Unfortunately, Apple's vfs_rootvnode() fails to check for + // NULL rootvp, and just panics. We aren't technically allowed to + // see rootvp, but in the interest of avoiding a panic... + if (rootvnode == NULL) + return (NULL); + + rvnode = vfs_rootvnode(); + if (rvnode) + vnode_put(rvnode); + return (rvnode); +} diff --git a/module/os/macos/spl/spl-xdr.c b/module/os/macos/spl/spl-xdr.c new file mode 100644 index 0000000000..4eb115017d --- /dev/null +++ b/module/os/macos/spl/spl-xdr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include + + +/* + * SPL's XDR mem implementation. 
+ * + * This is used by libnvpair to serialize/deserialize the name-value pair data + * structures into byte arrays in a well-defined and portable manner. + * + * These data structures are used by the DMU/ZFS to flexibly manipulate various + * information in memory and later serialize it/deserialize it to disk. + * Examples of usages include the pool configuration, lists of pool and dataset + * properties, etc. + * + * Reference documentation for the XDR representation and XDR operations can be + * found in RFC 1832 and xdr(3), respectively. + * + * === Implementation shortcomings === + * + * It is assumed that the following C types have the following sizes: + * + * char/unsigned char: 1 byte + * short/unsigned short: 2 bytes + * int/unsigned int: 4 bytes + * longlong_t/u_longlong_t: 8 bytes + * + * The C standard allows these types to be larger (and in the case of ints, + * shorter), so if that is the case on some compiler/architecture, the build + * will fail (on purpose). + * + * If someone wants to fix the code to work properly on such environments, then: + * + * 1) Preconditions should be added to xdrmem_enc functions to make sure the + * caller doesn't pass arguments which exceed the expected range. + * 2) Functions which take signed integers should be changed to properly do + * sign extension. + * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger + * problems than this implementation. + * + * It is also assumed that: + * + * 1) Chars have 8 bits. + * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned + * memcpy, memset and memcmp. + * 3) Arrays passed to xdr_array() are packed and the compiler/architecture + * supports element-sized-aligned memory accesses. + * 4) Negative integers are natively stored in two's complement binary + * representation. + * + * No checks are done for the 4 assumptions above, though. + * + * === Caller expectations === + * + * Existing documentation does not describe the semantics of XDR operations very + * well. Therefore, some assumptions about failure semantics will be made and + * will be described below: + * + * 1) If any encoding operation fails (e.g., due to lack of buffer space), the + * the stream should be considered valid only up to the encoding operation + * previous to the one that first failed. However, the stream size as returned + * by xdr_control() cannot be considered to be strictly correct (it may be + * bigger). + * + * Putting it another way, if there is an encoding failure it's undefined + * whether anything is added to the stream in that operation and therefore + * neither xdr_control() nor future encoding operations on the same stream can + * be relied upon to produce correct results. + * + * 2) If a decoding operation fails, it's undefined whether anything will be + * decoded into passed buffers/pointers during that operation, or what the + * values on those buffers will look like. + * + * Future decoding operations on the same stream will also have similar + * undefined behavior. + * + * 3) When the first decoding operation fails it is OK to trust the results of + * previous decoding operations on the same stream, as long as the caller + * expects a failure to be possible (e.g. due to end-of-stream). + * + * However, this is highly discouraged because the caller should know the + * stream size and should be coded to expect any decoding failure to be data + * corruption due to hardware, accidental or even malicious causes, which should + * be handled gracefully in all cases. 
+ * + * In very rare situations where there are strong reasons to believe the data + * can be trusted to be valid and non-tampered with, then the caller may assume + * a decoding failure to be a bug (e.g. due to mismatched data types) and may + * fail non-gracefully. + * + * 4) Non-zero padding bytes will cause the decoding operation to fail. + * + * 5) Zero bytes on string types will also cause the decoding operation to fail. + * + * 6) It is assumed that either the pointer to the stream buffer given by the + * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int + * memory accesses. + * + * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. + * + * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user + * space or MMIO space), the computer may explode. + */ + +static struct xdr_ops xdrmem_encode_ops; +static struct xdr_ops xdrmem_decode_ops; + +void +xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op) +{ + switch (op) { + case XDR_ENCODE: + xdrs->x_ops = &xdrmem_encode_ops; + break; + case XDR_DECODE: + xdrs->x_ops = &xdrmem_decode_ops; + break; + default: + printf("SPL: Invalid op value: %d\n", op); + xdrs->x_ops = NULL; /* Let the caller know we failed */ + return; + } + + xdrs->x_op = op; + xdrs->x_addr = addr; + xdrs->x_addr_end = addr + size; + + if (xdrs->x_addr_end < xdrs->x_addr) { + printf("SPL: Overflow while creating xdrmem: %p, %u\n", addr, + size); + xdrs->x_ops = NULL; + } +} +EXPORT_SYMBOL(xdrmem_create); + +static bool_t +xdrmem_control(XDR *xdrs, int req, void *info) +{ + struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info; + + if (req != XDR_GET_BYTES_AVAIL) { + printf("SPL: Called with unknown request: %d\n", req); + return (FALSE); + } + + rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ + rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; + + return (TRUE); +} + +static bool_t +xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(xdrs->x_addr, cp, cnt); + + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + memset(xdrs->x_addr, 0, pad); + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + static uint32_t zero = 0; + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(cp, xdrs->x_addr, cnt); + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + /* An inverted memchr() would be useful here... 
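+		 * Until then, memcmp() against a zero word enforces caller
+		 * expectation (4) above: non-zero padding bytes make the
+		 * decode fail.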
*/ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return (FALSE); + + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *((uint32_t *)xdrs->x_addr) = BE_32(val); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *val = BE_32(*((uint32_t *)xdrs->x_addr)); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + // BUILD_BUG_ON(sizeof(char) != 1); + val = *((unsigned char *) cp); + + return (xdrmem_enc_uint32(xdrs, val)); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + // BUILD_BUG_ON(sizeof(char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. + */ + if (val > 0xff) + return (FALSE); + + *((unsigned char *) cp) = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + // BUILD_BUG_ON(sizeof(unsigned short) != 2); + + return (xdrmem_enc_uint32(xdrs, *usp)); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + // BUILD_BUG_ON(sizeof(unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return (FALSE); + + *usp = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + // BUILD_BUG_ON(sizeof(unsigned) != 4); + + return (xdrmem_enc_uint32(xdrs, *up)); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + // BUILD_BUG_ON(sizeof(unsigned) != 4); + + return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + // BUILD_BUG_ON(sizeof(u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return (FALSE); + + return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + // BUILD_BUG_ON(sizeof(u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return (FALSE); + if (!xdrmem_dec_uint32(xdrs, &low)) + return (FALSE); + + *ullp = ((u_longlong_t)high << 32) | low; + + return (TRUE); +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return (FALSE); + + if (!xdrmem_enc_uint(xdrs, sizep)) + return (FALSE); + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return (FALSE); + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return (FALSE); + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return (FALSE); + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, + * 
xdr_array() allocates memory and *arrp points to it". + */ + if (*arrp == NULL) { + // BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return (FALSE); + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return (FALSE); + } + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return (FALSE); + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return (FALSE); + + return (xdrmem_enc_bytes(xdrs, *sp, len)); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return (FALSE); + + if (size > maxsize || size > UINT_MAX - 1) + return (FALSE); + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". + */ + if (*sp == NULL) { + // BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); + + *sp = kmem_alloc(size + 1, KM_NOSLEEP); + if (*sp == NULL) + return (FALSE); + + alloc = TRUE; + } + + if (!xdrmem_dec_bytes(xdrs, *sp, size)) + goto fail; + + if (kmemchr(*sp, 0, size) != NULL) + goto fail; + + (*sp)[size] = '\0'; + + return (TRUE); + +fail: + if (alloc) + kmem_free(*sp, size + 1); + + return (FALSE); +} + +static struct xdr_ops xdrmem_encode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_enc_char, + .xdr_u_short = xdrmem_enc_ushort, + .xdr_u_int = xdrmem_enc_uint, + .xdr_u_longlong_t = xdrmem_enc_ulonglong, + .xdr_opaque = xdrmem_enc_bytes, + .xdr_string = xdr_enc_string, + .xdr_array = xdr_enc_array +}; + +static struct xdr_ops xdrmem_decode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_dec_char, + .xdr_u_short = xdrmem_dec_ushort, + .xdr_u_int = xdrmem_dec_uint, + .xdr_u_longlong_t = xdrmem_dec_ulonglong, + .xdr_opaque = xdrmem_dec_bytes, + .xdr_string = xdr_dec_string, + .xdr_array = xdr_dec_array +}; diff --git a/module/os/macos/spl/spl-zlib.c b/module/os/macos/spl/spl-zlib.c new file mode 100644 index 0000000000..5aa92c324a --- /dev/null +++ b/module/os/macos/spl/spl-zlib.c @@ -0,0 +1,199 @@ +/* + * + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * + * Jean-loup Gailly + * Mark Adler + */ + +#include +#include +#include +#include + +#ifdef DEBUG_SUBSYSTEM +#undef DEBUG_SUBSYSTEM +#endif + +#define DEBUG_SUBSYSTEM SS_ZLIB + +static spl_kmem_cache_t *zlib_workspace_cache; + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * that improves the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). + */ +static void * +zlib_workspace_alloc(int flags) +{ + return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS))); +} + +static void +zlib_workspace_free(void *workspace) +{ + kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_compress_level); + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the compressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. 
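+ *
+ * Minimal usage sketch (the caller must already know an upper bound for
+ * the uncompressed size, since this wrapper will not grow the buffer;
+ * the names below are illustrative only):
+ *
+ *	size_t dlen = dst_size;
+ *	int err = z_uncompress(dst, &dlen, src, src_len);
+ *
+ * On success (Z_OK), dlen holds the actual uncompressed length.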
+ */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, + size_t sourceLen) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_uncompress); + +int +spl_zlib_init(void) +{ + int size; + SENTRY; + + size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_VMEM | KMC_NOEMERGENCY); + if (!zlib_workspace_cache) + SRETURN(1); + + SRETURN(0); +} + +void +spl_zlib_fini(void) +{ + SENTRY; + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; + SEXIT; +} diff --git a/module/os/macos/zfs/.gitignore b/module/os/macos/zfs/.gitignore new file mode 100644 index 0000000000..aaec2f8ea2 --- /dev/null +++ b/module/os/macos/zfs/.gitignore @@ -0,0 +1,2 @@ +zfs +zfs.kext diff --git a/module/os/macos/zfs/Info.plist b/module/os/macos/zfs/Info.plist new file mode 100644 index 0000000000..761b080738 --- /dev/null +++ b/module/os/macos/zfs/Info.plist @@ -0,0 +1,115 @@ + + + + + BuildMachineOSBuild + 14C1514 + CFBundleDevelopmentRegion + English + CFBundleExecutable + zfs + CFBundleIdentifier + net.lundman.zfs + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + zfs + CFBundlePackageType + KEXT + CFBundleShortVersionString + 1.0 + CFBundleSignature + ???? + CFBundleVersion + 1.0.0 + DTCompiler + com.apple.compilers.llvm.clang.1_0 + DTPlatformBuild + 6C131e + DTPlatformVersion + GM + DTSDKBuild + 12F37 + DTSDKName + macosx10.8 + DTXcode + 0620 + DTXcodeBuild + 6C131e + IOKitPersonalities + + net.lundman.zfs + + CFBundleIdentifier + net.lundman.zfs + IOClass + net_lundman_zfs_zvol + IOMatchCategory + net_lundman_zfs_zvol + IOMediaIcon + + CFBundleIdentifier + net.lundman.zfs + IOBundleResourceFile + VolumeIcon.icns + + IOProviderClass + IOResources + IOResourceMatch + IOBSD + + net.lundman.zfs.ZFSDatasetProxy + + CFBundleIdentifier + net.lundman.zfs + IOClass + ZFSDatasetProxy + IOProbeScore + 1000 + IOMatchCategory + ZFSPool + IOProviderClass + ZFSPool + + net.lundman.zfs.ZFSDatasetScheme + + CFBundleIdentifier + net.lundman.zfs + IOClass + ZFSDatasetScheme + IOProbeScore + 5000 + IOMatchCategory + IOStorage + IOPropertyMatch + + Whole + + + IOProviderClass + IOMedia + + + NSHumanReadableCopyright + CDDL (ZFS), BSD (FreeBSD), Copyright © 2012-2020 OpenZFS on OS X. All rights reserved. 
+ OSBundleCompatibleVersion + 1.0.0 + OSBundleLibraries + + com.apple.iokit.IOStorageFamily + 1.6 + com.apple.kpi.bsd + 8.0.0 + com.apple.kpi.iokit + 8.0.0 + com.apple.kpi.libkern + 10.0 + com.apple.kpi.mach + 8.0.0 + com.apple.kpi.unsupported + 8.0.0 + net.lundman.kernel.dependencies + 12.5.0 + + + diff --git a/module/os/macos/zfs/InfoPlist.strings b/module/os/macos/zfs/InfoPlist.strings new file mode 100644 index 0000000000..0c67376eba --- /dev/null +++ b/module/os/macos/zfs/InfoPlist.strings @@ -0,0 +1,5 @@ + + + + + diff --git a/module/os/macos/zfs/Makefile.am b/module/os/macos/zfs/Makefile.am new file mode 100644 index 0000000000..058a73ca56 --- /dev/null +++ b/module/os/macos/zfs/Makefile.am @@ -0,0 +1,353 @@ + +INFO_PLIST = Info.plist +PLIST_STRING = InfoPlist.strings + +ZFS_META_VERSION = @ZFS_META_VERSION@ +ZFS_DEBUG_STR = @ZFS_DEBUG_STR@ + +zfs_CPPFLAGS = \ + -Wall \ + -nostdinc \ + -mkernel \ + -fno-builtin-printf \ + -D__KERNEL__ \ + -D_KERNEL \ + -DKERNEL \ + -DKERNEL_PRIVATE \ + -DDRIVER_PRIVATE \ + -DNAMEDSTREAMS=1 \ + -DAPPLE \ + -DNeXT \ + -I$(top_srcdir)/include/os/macos/spl \ + -I$(top_srcdir)/include/os/macos/zfs \ + -I$(top_srcdir)/module/icp/include \ + -I$(top_srcdir)/include \ + -I@KERNEL_HEADERS@/Headers \ + -I@KERNEL_HEADERS@/PrivateHeaders \ + -I$(top_srcdir)/module/zstd/include + +zfs_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ + +zfs_CFLAGS = +zfs_CXXFLAGS = + +zfs_LDFLAGS = \ + -Xlinker \ + -kext \ + -nostdlib \ + -lkmodc++ \ + -lkmod \ + -lcc_kext + +zfs_LDADD = \ + $(top_builddir)/module/os/macos/spl/libspl.la + +zfs_LIBS = + +# If we don't set this to nothing, it adds "-lz -liconv" +LIBS = + +bin_PROGRAMS = zfs.kext +noinst_PROGRAMS = zfs + +zfs_kext_SOURCE = + +if TARGET_CPU_X86_64 +zfs_ASM_SOURCES_C = \ + ../../../icp/asm-x86_64/aes/aeskey.c \ + ../../../icp/algs/modes/gcm_pclmulqdq.c \ + ../../../zcommon/zfs_fletcher_intel.c \ + ../../../zcommon/zfs_fletcher_sse.c \ + ../../../zcommon/zfs_fletcher_avx512.c \ + ../../../zfs/vdev_raidz_math_sse2.c \ + ../../../zfs/vdev_raidz_math_ssse3.c \ + ../../../zfs/vdev_raidz_math_avx2.c \ + ../../../zfs/vdev_raidz_math_avx512f.c \ + ../../../zfs/vdev_raidz_math_avx512bw.c +zfs_ASM_SOURCES_AS = \ + ../../../icp/asm-x86_64/os/macos/aes/aes_amd64.S \ + ../../../icp/asm-x86_64/os/macos/aes/aes_aesni.S \ + ../../../icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S \ + ../../../icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S \ + ../../../icp/asm-x86_64/os/macos/sha2/sha256_impl.S \ + ../../../icp/asm-x86_64/os/macos/sha2/sha512_impl.S +else +zfs_ASM_SOURCES_C = +zfs_ASM_SOURCES_AS = +endif + +zfs_SOURCES = \ + ../../../zfs/abd.c \ + abd_os.c \ + ../../../zfs/aggsum.c \ + ../../../zfs/arc.c \ + arc_os.c \ + ../../../avl/avl.c \ + ../../../zfs/blkptr.c \ + ../../../zfs/bplist.c \ + ../../../zfs/bpobj.c \ + ../../../zfs/bptree.c \ + ../../../zfs/bqueue.c \ + ../../../zfs/btree.c \ + ../../../zcommon/cityhash.c \ + ../../../zfs/dbuf.c \ + ../../../zfs/dbuf_stats.c \ + ../../../zfs/ddt.c \ + ../../../zfs/ddt_zap.c \ + ../../../zfs/dmu.c \ + ../../../zfs/dmu_diff.c \ + ../../../zfs/dmu_object.c \ + ../../../zfs/dmu_objset.c \ + ../../../zfs/dmu_recv.c \ + ../../../zfs/dmu_redact.c \ + ../../../zfs/dmu_send.c \ + ../../../zfs/dmu_traverse.c \ + ../../../zfs/dmu_tx.c \ + ../../../zfs/dmu_zfetch.c \ + ../../../zfs/dnode.c \ + ../../../zfs/dnode_sync.c \ + ../../../zfs/dsl_bookmark.c \ + ../../../zfs/dsl_crypt.c \ + ../../../zfs/dsl_dataset.c \ + ../../../zfs/dsl_deadlist.c \ + ../../../zfs/dsl_deleg.c \ + ../../../zfs/dsl_destroy.c \ + 
../../../zfs/dsl_dir.c \ + ../../../zfs/dsl_pool.c \ + ../../../zfs/dsl_prop.c \ + ../../../zfs/dsl_scan.c \ + ../../../zfs/dsl_synctask.c \ + ../../../zfs/dsl_userhold.c \ + ../../../zfs/edonr_zfs.c \ + ../../../zfs/fm.c \ + ../../../zfs/gzip.c \ + ../../../zfs/hkdf.c \ + ldi_osx.c \ + ldi_vnode.c \ + ldi_iokit.cpp \ + ../../../zfs/lz4.c \ + ../../../zfs/lzjb.c \ + ../../../zfs/metaslab.c \ + ../../../zfs/mmp.c \ + ../../../zfs/multilist.c \ + ../../../zfs/objlist.c \ + ../../../zfs/pathname.c \ + ../../../zfs/range_tree.c \ + ../../../zfs/refcount.c \ + ../../../zfs/rrwlock.c \ + ../../../zfs/sa.c \ + ../../../zfs/sha256.c \ + ../../../zfs/skein_zfs.c \ + ../../../zfs/spa.c \ + ../../../zfs/spa_boot.c \ + ../../../zfs/spa_checkpoint.c \ + ../../../zfs/spa_config.c \ + ../../../zfs/spa_errlog.c \ + ../../../zfs/spa_history.c \ + ../../../zfs/spa_log_spacemap.c \ + ../../../zfs/spa_misc.c \ + spa_misc_os.c \ + ../../../zfs/spa_stats.c \ + ../../../zfs/space_map.c \ + ../../../zfs/space_reftree.c \ + ../../../zfs/txg.c \ + ../../../zfs/uberblock.c \ + ../../../zfs/unique.c \ + ../../../zfs/vdev.c \ + ../../../zfs/vdev_cache.c \ + vdev_disk.c \ + vdev_file.c \ + ../../../zfs/vdev_indirect.c \ + ../../../zfs/vdev_indirect_births.c \ + ../../../zfs/vdev_indirect_mapping.c \ + ../../../zfs/vdev_initialize.c \ + ../../../zfs/vdev_label.c \ + ../../../zfs/vdev_mirror.c \ + ../../../zfs/vdev_missing.c \ + ../../../zfs/vdev_queue.c \ + ../../../zfs/vdev_raidz.c \ + ../../../zfs/vdev_raidz_math.c \ + ../../../zfs/vdev_raidz_math_scalar.c \ + ../../../zfs/vdev_removal.c \ + ../../../zfs/vdev_root.c \ + ../../../zfs/vdev_trim.c \ + ../../../zfs/zap.c \ + ../../../zfs/zap_leaf.c \ + ../../../zfs/zap_micro.c \ + ../../../zfs/zcp.c \ + ../../../zfs/zcp_get.c \ + ../../../zfs/zcp_global.c \ + ../../../zfs/zcp_iter.c \ + ../../../zfs/zcp_set.c \ + ../../../zfs/zcp_synctask.c \ + ../../../zfs/zfeature.c \ + ../../../zcommon/zfeature_common.c \ + zfs_acl.c \ + zfs_boot.cpp \ + ../../../zfs/zfs_byteswap.c \ + zfs_ctldir.c \ + zfs_debug.c \ + zfs_dir.c \ + ../../../zfs/zfs_fm.c \ + zfs_file_os.c \ + ../../../zfs/zfs_fuid.c \ + zfs_fuid_os.c \ + ../../../zfs/zfs_ioctl.c \ + zfs_ioctl_os.c \ + zfs_kstat_osx.c \ + ../../../zfs/zfs_log.c \ + ../../../zfs/zfs_onexit.c \ + zfs_osx.cpp \ + ../../../zfs/zfs_quota.c \ + ../../../zfs/zfs_ratelimit.c \ + ../../../zfs/zfs_replay.c \ + ../../../zfs/zfs_rlock.c \ + ../../../zfs/zfs_sa.c \ + zfs_vfsops.c \ + zfs_vnops.c \ + zfs_vnops_osx.c \ + zfs_vnops_osx_lib.c \ + zfs_znode.c \ + ../../../zfs/zil.c \ + ../../../zfs/zio.c \ + ../../../zfs/zio_checksum.c \ + zio_crypt.c \ + ../../../zfs/zio_compress.c \ + ../../../zfs/zio_inject.c \ + ../../../zfs/zle.c \ + ../../../zfs/zrlock.c \ + ../../../zfs/zthr.c \ + ../../../zfs/zvol.c \ + zvol_os.c \ + zvolIO.cpp \ + ZFSDatasetProxy.cpp \ + ZFSDatasetScheme.cpp \ + ZFSDataset.cpp \ + ZFSPool.cpp \ + ../../../nvpair/fnvpair.c \ + ../../../nvpair/nvpair.c \ + ../../../nvpair/nvpair_alloc_fixed.c \ + ../../../nvpair/nvpair_alloc_spl.c \ + ../../../unicode/u8_textprep.c \ + ../../../unicode/uconv.c \ + ../../../zcommon/zfs_comutil.c \ + ../../../zcommon/zfs_deleg.c \ + ../../../zcommon/zfs_fletcher.c \ + ../../../zcommon/zfs_fletcher_superscalar.c \ + ../../../zcommon/zfs_fletcher_superscalar4.c \ + ../../../zcommon/zfs_namecheck.c \ + ../../../zcommon/zfs_prop.c \ + ../../../zcommon/zpool_prop.c \ + ../../../zcommon/zprop_common.c \ + ../../../icp/api/kcf_cipher.c \ + ../../../icp/api/kcf_digest.c \ + ../../../icp/api/kcf_mac.c \ + 
../../../icp/api/kcf_miscapi.c \ + ../../../icp/api/kcf_ctxops.c \ + ../../../icp/core/kcf_callprov.c \ + ../../../icp/core/kcf_prov_tabs.c \ + ../../../icp/core/kcf_sched.c \ + ../../../icp/core/kcf_mech_tabs.c \ + ../../../icp/core/kcf_prov_lib.c \ + ../../../icp/spi/kcf_spi.c \ + ../../../icp/io/aes.c \ + ../../../icp/io/edonr_mod.c \ + ../../../icp/io/sha2_mod.c \ + ../../../icp/io/sha1_mod.c \ + ../../../icp/io/skein_mod.c \ + ../../../icp/os/modhash.c \ + ../../../icp/os/modconf.c \ + ../../../icp/algs/edonr/edonr.c \ + ../../../icp/algs/modes/cbc.c \ + ../../../icp/algs/modes/ccm.c \ + ../../../icp/algs/modes/ctr.c \ + ../../../icp/algs/modes/ecb.c \ + ../../../icp/algs/modes/gcm_generic.c \ + ../../../icp/algs/modes/gcm.c \ + ../../../icp/algs/modes/modes.c \ + ../../../icp/algs/sha2/sha2.c \ + ../../../icp/algs/skein/skein.c \ + ../../../icp/algs/skein/skein_block.c \ + ../../../icp/algs/skein/skein_iv.c \ + ../../../icp/algs/aes/aes_impl_aesni.c \ + ../../../icp/algs/aes/aes_impl_generic.c \ + ../../../icp/algs/aes/aes_impl_x86-64.c \ + ../../../icp/algs/aes/aes_impl.c \ + ../../../icp/algs/aes/aes_modes.c \ + ../../../icp/illumos-crypto.c \ + ../../../lua/lapi.c \ + ../../../lua/lauxlib.c \ + ../../../lua/lbaselib.c \ + ../../../lua/lcode.c \ + ../../../lua/lcompat.c \ + ../../../lua/lcorolib.c \ + ../../../lua/lctype.c \ + ../../../lua/ldebug.c \ + ../../../lua/ldo.c \ + ../../../lua/lfunc.c \ + ../../../lua/lgc.c \ + ../../../lua/llex.c \ + ../../../lua/lmem.c \ + ../../../lua/lobject.c \ + ../../../lua/lopcodes.c \ + ../../../lua/lparser.c \ + ../../../lua/lstate.c \ + ../../../lua/lstring.c \ + ../../../lua/lstrlib.c \ + ../../../lua/ltable.c \ + ../../../lua/ltablib.c \ + ../../../lua/ltm.c \ + ../../../lua/lvm.c \ + ../../../lua/lzio.c \ + ../../../lua/setjmp/setjmp.S \ + ../../../zstd/lib/zstd.c \ + ../../../zstd/zfs_zstd.c \ + $(zfs_ASM_SOURCES_C) \ + $(zfs_ASM_SOURCES_AS) + +# Ensure these files are always built with -O2 to avoid stack overflow. 
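+# The target-specific assignments below rewrite any -O0 in CFLAGS (for
+# example from a debug configure run) to -O2 for just these objects; all
+# other files keep the flags chosen by configure.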
+../../../zfs/zfs-dsl_scan.$(OBJEXT): CFLAGS := $(CFLAGS:-O0%=-O2) +../../../lua/zfs-lvm.$(OBJEXT): CFLAGS := $(CFLAGS:-O0%=-O2) + +# Zstd uses -O3 by default, so we should follow +../../../zstd/lib/zfs-zstd.$(OBJEXT): CFLAGS := -fno-tree-vectorize -O3 + + +KERNEL_MODDIR= $(DESTDIR)@KERNEL_MODPREFIX@/zfs.kext + +dist_noinst_DATA = $(PLIST_STRING) $(INFO_PLIST) + +zfs.kext$(EXEEXT): zfs $(PLIST_STRING) $(INFO_PLIST) + @echo "" + @mkdir -p zfs.kext/Contents/Resources/English.lproj zfs.kext/Contents/MacOS + @cp -f $(INFO_PLIST) zfs.kext/Contents/ + /usr/libexec/PlistBuddy -c "Set :CFBundleVersion $(ZFS_META_VERSION)" zfs.kext/Contents/Info.plist + /usr/libexec/PlistBuddy -c "Set :CFBundleShortVersionString $(ZFS_META_VERSION)" zfs.kext/Contents/Info.plist + /usr/libexec/PlistBuddy -c "Delete :OSBundleLibraries:net.lundman.kernel.dependencies" zfs.kext/Contents/Info.plist + /usr/libexec/PlistBuddy -c "Add :OSBundleLibraries:net.lundman.kernel.dependencies.$(ZFS_META_VERSION) string 12.5.0" zfs.kext/Contents/Info.plist + @cp -f $(PLIST_STRING) zfs.kext/Contents/Resources/English.lproj/ + @cp -f zfs zfs.kext/Contents/MacOS/ + @mkdir -p zfs.kext/Contents/PlugIns/KernelExports.kext/ + @cp -f $(top_srcdir)/module/os/macos/kernel/kernelexports zfs.kext/Contents/PlugIns/KernelExports.kext/KernelExports + @cp -f $(top_srcdir)/module/os/macos/kernel/Info.plist zfs.kext/Contents/PlugIns/KernelExports.kext/ + /usr/libexec/PlistBuddy -c "Set :CFBundleIdentifier net.lundman.kernel.dependencies.$(ZFS_META_VERSION)" zfs.kext/Contents/PlugIns/KernelExports.kext/Info.plist + /usr/libexec/PlistBuddy -c "Add :OSBundleRequired string Root" zfs.kext/Contents/Info.plist + cp -f $(top_srcdir)/module/os/macos/kernel/version.plist zfs.kext/Contents/PlugIns/KernelExports.kext/ + @kextlibs -unsupported -undef-symbols -xml zfs.kext/ || echo "Ignoring errors..(Most of these are expected)" | grep -v -f $(top_srcdir)/module/os/macos/kernel/zfs.exports + +install-exec-local: zfs.kext + rm -rf $(KERNEL_MODDIR) + mkdir -p $(KERNEL_MODDIR) + rsync -r zfs.kext/ $(KERNEL_MODDIR) + chown -R root:wheel $(KERNEL_MODDIR) || echo "Unable to chown root:wheel $(KERNEL_MODDIR)" + @echo + @echo "To load module: kextload -v $(KERNEL_MODDIR)" + @echo "To uninstall module: rm -rf $(KERNEL_MODDIR)" + @echo + +uninstall-am: + rm -rf $(KERNEL_MODDIR) + +clean: + rm -rf zfs.kext/ + rm -f *.o *.lo zfs diff --git a/module/os/macos/zfs/ZFSDataset.cpp b/module/os/macos/zfs/ZFSDataset.cpp new file mode 100644 index 0000000000..ce598e35ac --- /dev/null +++ b/module/os/macos/zfs/ZFSDataset.cpp @@ -0,0 +1,854 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. 
+ */ +/* + * ZFSDataset - proxy disk for legacy and com.apple.devicenode mounts. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DPRINTF_FUNC() do { dprintf(""); } while (0); + +OSDefineMetaClassAndStructors(ZFSDataset, IOMedia); + +#if 0 +/* XXX Only for debug tracing */ +bool +ZFSDataset::open(IOService *client, + IOOptionBits options, IOStorageAccess access) +{ + bool ret; + DPRINTF_FUNC(); + + ret = IOMedia::open(client, options, access); + + dprintf("ZFSDataset %s ret %d", ret); + return (ret); +} + +bool +ZFSDataset::isOpen(const IOService *forClient) const +{ + DPRINTF_FUNC(); + return (false); +} + +void +ZFSDataset::close(IOService *client, + IOOptionBits options) +{ + DPRINTF_FUNC(); + IOMedia::close(client, options); +} + +bool +ZFSDataset::handleOpen(IOService *client, + IOOptionBits options, void *access) +{ + bool ret; + DPRINTF_FUNC(); + + ret = IOMedia::handleOpen(client, options, access); + + dprintf("ZFSDataset %s ret %d", ret); + return (ret); +} + +bool +ZFSDataset::handleIsOpen(const IOService *client) const +{ + bool ret; + DPRINTF_FUNC(); + + ret = IOMedia::handleIsOpen(client); + + dprintf("ZFSDataset %s ret %d", ret); + return (ret); +} + +void +ZFSDataset::handleClose(IOService *client, + IOOptionBits options) +{ + DPRINTF_FUNC(); + IOMedia::handleClose(client, options); +} + +bool +ZFSDataset::attach(IOService *provider) +{ + DPRINTF_FUNC(); + return (IOMedia::attach(provider)); +} + +void +ZFSDataset::detach(IOService *provider) +{ + DPRINTF_FUNC(); + IOMedia::detach(provider); +} + +bool +ZFSDataset::start(IOService *provider) +{ + DPRINTF_FUNC(); + return (IOMedia::start(provider)); +} + +void +ZFSDataset::stop(IOService *provider) +{ + DPRINTF_FUNC(); + IOMedia::stop(provider); +} +#endif + +/* XXX Only for debug tracing */ +void +ZFSDataset::free() +{ + DPRINTF_FUNC(); + IOMedia::free(); +} + +/* + * Override init to call IOMedia init then setup properties. 
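+ *
+ * The extra properties advertise a 4096-byte physical and 512-byte
+ * logical block size in the device-characteristics dictionary before the
+ * merged dictionary is handed to IOMedia::init().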
+ */ +bool +ZFSDataset::init(UInt64 base, UInt64 size, + UInt64 preferredBlockSize, + IOMediaAttributeMask attributes, + bool isWhole, bool isWritable, + const char *contentHint, + OSDictionary *properties) +{ + OSDictionary *newProps = NULL, *deviceDict; + OSNumber *physSize, *logSize; +#if 0 + OSDictionary *protocolDict; + const OSSymbol *virtualSymbol, *internalSymbol; +#endif + bool ret; + + DPRINTF_FUNC(); + + /* Clone or create new properties dictionary */ + if (properties) newProps = OSDictionary::withDictionary(properties); + if (!newProps) newProps = OSDictionary::withCapacity(2); + + /* Allocate dictionaries, numbers, and string symbols */ + deviceDict = OSDictionary::withCapacity(2); +#if 0 + protocolDict = OSDictionary::withCapacity(2); +#endif + + physSize = OSNumber::withNumber(4096, 32); + logSize = OSNumber::withNumber(512, 32); + +#if 0 + kIOPropertyPhysicalInterconnectTypeVirtual + kIOPropertyPhysicalInterconnectTypeKey + kIOPropertyInterconnectFileKey + kIOPropertyInternalKey + kIOPropertyPhysicalInterconnectLocationKey + + kIOPropertyProtocolCharacteristicsKey + kIOPropertyMediumTypeKey + kIOPropertyLogicalBlockSizeKey + kIOPropertyPhysicalBlockSizeKey + kIOPropertyBytesPerPhysicalSectorKey + kIOPropertyDeviceCharacteristicsKey + kIOBlockStorageDeviceTypeKey + kIOBlockStorageDeviceTypeGeneric +#endif + +#if 0 + virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + internalSymbol = OSSymbol::withCString( + kIOPropertyInternalKey); +#endif + + /* Validate allocations */ + if (!newProps || !deviceDict || !physSize || !logSize +#if 0 + // || !protocolDict || !virtualSymbol || !internalSymbol +#endif + ) { + dprintf("symbol allocation failed"); + OSSafeReleaseNULL(newProps); + OSSafeReleaseNULL(deviceDict); +#if 0 + OSSafeReleaseNULL(protocolDict); +#endif + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); +#if 0 + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); +#endif + return (false); + } + + /* Setup device characteristics */ + deviceDict->setObject(kIOPropertyPhysicalBlockSizeKey, physSize); + deviceDict->setObject(kIOPropertyLogicalBlockSizeKey, logSize); + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); + +#if 0 + /* Setup protocol characteristics */ + protocolDict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol); + protocolDict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + internalSymbol); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); +#endif + + /* XXX Setup required IOMedia props */ + + /* Set new device and protocol dictionaries */ + if (newProps->setObject(kIOPropertyDeviceCharacteristicsKey, + deviceDict) == false +#if 0 + // || + // newProps->setObject(kIOPropertyProtocolCharacteristicsKey, + // protocolDict) == false +#endif + ) { + dprintf("setup properties failed"); + OSSafeReleaseNULL(newProps); + OSSafeReleaseNULL(deviceDict); +#if 0 + OSSafeReleaseNULL(protocolDict); +#endif + return (false); + } + OSSafeReleaseNULL(deviceDict); +#if 0 + OSSafeReleaseNULL(protocolDict); +#endif + + /* Call IOMedia init with size and newProps */ + ret = IOMedia::init(base, size, preferredBlockSize, + attributes, isWhole, isWritable, contentHint, + newProps); + OSSafeReleaseNULL(newProps); + + if (!ret) dprintf("IOMedia init failed"); + + return (ret); + +#if 0 + /* Get current device and protocol dictionaries */ + lockForArbitration(); + oldDeviceDict = OSDynamicCast(OSDictionary, + getProperty(kIOStorageDeviceCharacteristicsKey)); + 
oldProtocolDict = OSDynamicCast(OSDictionary, + getProperty(kIOStorageProtocolCharacteristicsKey)); + if (oldDeviceDict) oldDeviceDict->retain(); + if (oldProtocolDict) oldProtocolDict->retain(); + unlockForArbitration(); + + /* Clone existing dictionaries */ + if (oldDeviceDict) { + newDeviceDict = OSDictionary::withDict(oldDeviceDict); + OSSafeReleaseNULL(oldDeviceDict); + } + if (oldProtocolDict) { + newProtocolDict = OSDictionary::withDict(oldProtocolDict); + OSSafeReleaseNULL(oldDeviceDict); + } + + /* Make new if missing */ + if (!newDeviceDict) + newDeviceDict = OSDictionary::withCapacity(2); + if (!newProtocolDict) + newProtocolDict = OSDictionary::withCapacity(2); + + /* Setup device characteristics */ + newDeviceDict->setObject(kIOStoragePhysicalBlocksizeKey, physSize); + newDeviceDict->setObject(kIOStorageLogicalBlocksizeKey, logSize); + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); + + /* Setup protocol characteristics */ + newProtocolDict->setObject(kIOStorageProtocolInterconnectTypeKey, + virtualSymbol); + newProtocolDict->setObject(kIOStorageProtocolInterconnectNameKey, + internalSymbol); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); + + /* XXX Setup required IOMedia props */ + + /* Set new device and protocol dictionaries */ + lockForArbitration(); + setProperty(kIOStorageDeviceCharacteristicsKey, newDeviceDict); + setProperty(kIOStorageProtocolCharacteristicsKey, newProtocolDict); + unlockForArbitration(); + + /* Cleanup and return success */ + OSSafeReleaseNULL(newDeviceDict); + OSSafeReleaseNULL(newProtocolDict); + return (true); +#endif +} + +/* + * Set both the IOService name and the ZFS Dataset property. + */ +bool +ZFSDataset::setDatasetName(const char *name) +{ + OSDictionary *prevDict, *newDict = NULL; + OSString *datasetString; + const char *newname; + + if (!name || name[0] == '\0') { + dprintf("missing name"); + return (false); + } + + if ((newname = strrchr(name, '/')) == NULL) { + newname = name; + } else { + /* Advance beyond slash */ + newname++; + } + +#if 0 + size_t len; + /* Length of IOMedia name plus null terminator */ + len = (strlen(kZFSIOMediaPrefix) + strlen(name) + + strlen(kZFSIOMediaSuffix) + 1); + // len = strlen("ZFS ") + strlen(name) + strlen(" Media") + 1; + + newname = (char *)kmem_alloc(len, KM_SLEEP); +#endif + datasetString = OSString::withCString(name); + +#if 0 + nameString = OSString::withCString(newname); + if (newname == NULL || nameString == NULL) { + dprintf("couldn't make name strings"); + OSSafeReleaseNULL(nameString); + if (newname) kmem_free(newname, len); + return (false); + } +#else + if (datasetString == NULL) { + dprintf("couldn't make name strings"); + return (false); + } +#endif + +#if 0 + bzero(newname, len); + snprintf(newname, len, "%s%s%s", kZFSIOMediaPrefix, + name, kZFSIOMediaSuffix); + + ASSERT3U(strlen(newname), ==, len-1); +#endif + + /* Lock IORegistryEntry and get current prop dict */ + lockForArbitration(); + if ((prevDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey))) == NULL) { + /* Unlock IORegistryEntry */ + unlockForArbitration(); + dprintf("couldn't get prop dict"); + } + prevDict->retain(); + unlockForArbitration(); + + /* Clone existing dictionary */ + if (prevDict) { + if ((newDict = OSDictionary::withDictionary(prevDict)) == + NULL) { + dprintf("couldn't clone prop dict"); + } + OSSafeReleaseNULL(prevDict); + /* Non-fatal at the moment */ + } + + /* If prevDict did not exist or couldn't be copied, make new */ + if (!newDict 
&& (newDict = OSDictionary::withCapacity(1)) == NULL) { + dprintf("couldn't make new prop dict"); + } + + /* If we have a new or copied dict at this point */ + if (newDict) { + /* Add or replace dictionary Product Name string */ + if (newDict->setObject(kIOPropertyProductNameKey, + datasetString) == false) { + dprintf("couldn't set name"); + OSSafeReleaseNULL(datasetString); + // OSSafeReleaseNULL(nameString); + // kmem_free(newname, len); + OSSafeReleaseNULL(newDict); + return (false); + } + + /* Lock IORegistryEntry and replace prop dict */ + lockForArbitration(); + if (setProperty(kIOPropertyDeviceCharacteristicsKey, + newDict) == false) { + unlockForArbitration(); + dprintf("couldn't set name"); + OSSafeReleaseNULL(datasetString); + // OSSafeReleaseNULL(nameString); + // kmem_free(newname, len); + OSSafeReleaseNULL(newDict); + return (false); + } + unlockForArbitration(); + OSSafeReleaseNULL(newDict); + } + + /* Lock IORegistryEntry to replace property and set name */ + lockForArbitration(); + /* Assign plain ZFS Dataset name */ + setProperty(kZFSDatasetNameKey, datasetString); + /* Assign IOMedia name */ + // setName(name); + setName(newname); + + /* Unlock IORegistryEntry and cleanup allocations */ + unlockForArbitration(); + // kmem_free(newname, len); + // OSSafeReleaseNULL(nameString); + return (true); +} + +#if 0 +static inline uint64_t +get_objnum(const char *name) +{ + objset_t *os = NULL; + uint64_t objnum; + int error; + + if (!name) + return (0); + + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, FTAG, &os); + if (error != 0) { + dprintf("couldn't open dataset %d", error); + return (0); + } + + objnum = dmu_objset_id(os); + + dmu_objset_disown(os, FTAG); + + return (objnum); +} +#endif + +/* + * Create a proxy device, name it appropriately, and return it. 
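+ *
+ * The dataset's objset is owned just long enough to read the "readonly"
+ * property (which decides whether the proxy media is writable), a UUID is
+ * generated from the dataset name, and that UUID is published as the
+ * media UUID of the new proxy.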
+ */ +ZFSDataset * +ZFSDataset::withDatasetNameAndSize(const char *name, uint64_t size) +{ + ZFSDataset *dataset = NULL; + objset_t *os = NULL; + OSString *uuidStr = NULL; + OSObject *property = NULL; + char uuid_cstr[37]; + uint64_t objnum, readonly, guid; +#if 0 + // uint64_t ref_size, avail_size, obj_count, obj_free; +#endif + uuid_t uuid; + int error; + bool isWritable; + + DPRINTF_FUNC(); + + if (!name || name[0] == '\0') { + dprintf("missing name"); + /* Nothing allocated or retained yet */ + return (NULL); + } + bzero(uuid_cstr, sizeof (uuid_cstr)); + +#if 0 + OSNumber *sizeNum = NULL; + property = copyProperty(kZFSPoolSizeKey, gIOServicePlane, + kIORegistryIterateRecursively|kIORegistryIterateParents); + if (!property) { + dprintf("couldn't get pool size"); + /* Nothing allocated or retained yet */ + return (NULL); + } + if ((sizeNum = OSDynamicCast(OSNumber, property)) == NULL) { + dprintf("couldn't cast pool size"); + goto error; + } + size = sizeNum->unsigned64BitValue(); + sizeNum = NULL; + OSSafeReleaseNULL(property); +#endif + + if (zfs_vfs_uuid_gen(name, uuid) != 0) { + dprintf("UUID gen failed"); + goto error; + } + // uuid_unparse(uuid, uuid_cstr); + zfs_vfs_uuid_unparse(uuid, uuid_cstr); + // snprintf(uuid_cstr, sizeof (uuid_cstr), ""); + + uuidStr = OSString::withCString(uuid_cstr); + if (!uuidStr) { + dprintf("uuidStr alloc failed"); + goto error; + } + + dataset = new ZFSDataset; + if (!dataset) { + dprintf("allocation failed"); + goto error; + } + + /* Own the dmu objset to get properties */ + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_FALSE, FTAG, &os); + if (error != 0) { + dprintf("couldn't open dataset %d", error); + goto error; + } + + /* Get the dsl_dir to lookup object number */ + objnum = dmu_objset_id(os); + +#if 0 + dmu_objset_space(os, &ref_size, &avail_size, &obj_count, &obj_free); +#endif + + // if (os->os_dsl_dataset) + // guid = dsl_dataset_phys(os->os_dsl_dataset)->ds_guid; + guid = dmu_objset_fsid_guid(os); + // dsl_prop_get_integer(name, "guid", &guid, NULL) != 0) { + + if (dsl_prop_get_integer(name, "readonly", &readonly, NULL) != 0) { + dmu_objset_disown(os, B_FALSE, FTAG); + dprintf("get readonly property failed"); + goto error; + } + // size = (1<<30); + // isWritable = true; + dmu_objset_disown(os, B_FALSE, FTAG); + +#if 0 + size = ref_size + avail_size; +#endif + + isWritable = (readonly == 0ULL); + + if (dataset->init(/* base */ 0, size, DEV_BSIZE, + /* attributes */ 0, /* isWhole */ false, isWritable, + kZFSContentHint, /* properties */ NULL) == false) { + dprintf("init failed"); + goto error; + } + + if (dataset->setDatasetName(name) == false) { + dprintf("invalid name"); + goto error; + } + + /* Set media UUID */ + dataset->setProperty(kIOMediaUUIDKey, uuidStr); + OSSafeReleaseNULL(uuidStr); + + return (dataset); + +error: + OSSafeReleaseNULL(property); + OSSafeReleaseNULL(uuidStr); + OSSafeReleaseNULL(dataset); + return (NULL); +} + +/* + * Compatibility method simulates a read but returns all zeros. 
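+ *
+ * The proxy media has no backing store, so each physical segment of the
+ * request buffer is zero-filled and the completion is fired with
+ * kIOReturnSuccess for the full length.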
+ */ +void +ZFSDataset::read(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOByteCount total, cur_len, done = 0; + addr64_t cur; + + DPRINTF_FUNC(); + if (!buffer) { + if (completion) complete(completion, kIOReturnInvalid, 0); + return; + } + + total = buffer->getLength(); + + /* XXX Get each physical segment of the buffer and zero it */ + while (done < total) { + cur_len = 0; + cur = buffer->getPhysicalSegment(done, &cur_len); + if (cur == 0) break; + if (cur_len != 0) bzero_phys(cur, cur_len); + done += cur_len; + ASSERT3U(done, <=, total); + } + ASSERT3U(done, ==, total); + + // if (!completion || !completion->action) { + if (!completion) { + dprintf("invalid completion"); + return; + } + +// (completion->action)(completion->target, completion->parameter, +// kIOReturnSuccess, total); + complete(completion, kIOReturnSuccess, total); +} + +/* + * Compatibility method simulates a write as a no-op. + */ +void +ZFSDataset::write(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOByteCount total; + DPRINTF_FUNC(); + + if (!buffer) { + if (completion) complete(completion, kIOReturnInvalid); + return; + } + + total = buffer->getLength(); + + // if (!completion || !completion->action) { + if (!completion) { + dprintf("invalid completion"); + return; + } + + /* XXX No-op, just return success */ +// (completion->action)(completion->target, completion->parameter, +// kIOReturnSuccess, total); + complete(completion, kIOReturnSuccess, total); +} + +#ifdef DEBUG +volatile SInt64 num_sync = 0; +#endif + +/* + * Compatibility method simulates a barrier sync as a no-op. + */ +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) +IOReturn +ZFSDataset::synchronize(IOService *client, + UInt64 byteStart, UInt64 byteCount, + IOStorageSynchronizeOptions options) +#else +IOReturn +ZFSDataset::synchronizeCache(IOService *client) +#endif +{ +#ifdef DEBUG + SInt64 cur_sync = 0; + DPRINTF_FUNC(); + cur_sync = OSIncrementAtomic64(&num_sync); + dprintf("sync called %lld times", cur_sync); +#endif + + /* XXX Needs to report success for mount_common() */ + return (kIOReturnSuccess); +} + +/* + * Compatibility method returns failure (unsupported). + */ +IOReturn +ZFSDataset::unmap(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options) +#else + UInt32 options) +#endif +{ + DPRINTF_FUNC(); + return (kIOReturnUnsupported); +} + +/* + * Compatibility method returns failure (no result). + */ +IOStorage * +ZFSDataset::copyPhysicalExtent(IOService *client, + UInt64 *byteStart, UInt64 *byteCount) +{ + DPRINTF_FUNC(); + return (0); + // return (IOMedia::copyPhysicalExtent(client, byteStart, byteCount)); +} + +/* + * Compatibility method simulates lock as a no-op. + */ +bool +ZFSDataset::lockPhysicalExtents(IOService *client) +{ + DPRINTF_FUNC(); + // return (IOMedia::unlockPhysicalExtents(client)); + return (true); +} + +/* + * Compatibility method simulates unlock as a no-op. + */ +void +ZFSDataset::unlockPhysicalExtents(IOService *client) +{ + DPRINTF_FUNC(); + // IOMedia::unlockPhysicalExtents(client); +} + +/* + * Compatibility method returns failure (unsupported). 
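+ *
+ * setPriority() is only compiled when targeting 10.10 or later; the proxy
+ * does not forward it to IOMedia and simply reports kIOReturnUnsupported.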
+ */ +#if defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) +IOReturn +ZFSDataset::setPriority(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, + IOStoragePriority priority) +{ + DPRINTF_FUNC(); + return (kIOReturnUnsupported); + // return (IOMedia::setPriority(client, extents, + // extentsCount, priority)); +} +#endif + +/* + * Compatibility method returns default system blocksize. + */ +UInt64 +ZFSDataset::getPreferredBlockSize() const +{ + DPRINTF_FUNC(); + return (DEV_BSIZE); + // return (IOMedia::getPreferredBlockSize()); +} + +/* XXX Only for debug tracing */ +UInt64 +ZFSDataset::getSize() const +{ + DPRINTF_FUNC(); + return (IOMedia::getSize()); +} + +/* XXX Only for debug tracing */ +UInt64 +ZFSDataset::getBase() const +{ + DPRINTF_FUNC(); + return (IOMedia::getBase()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isEjectable() const +{ + DPRINTF_FUNC(); + return (IOMedia::isEjectable()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isFormatted() const +{ + DPRINTF_FUNC(); + return (IOMedia::isFormatted()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isWhole() const +{ + DPRINTF_FUNC(); + return (IOMedia::isWhole()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isWritable() const +{ + DPRINTF_FUNC(); + return (IOMedia::isWritable()); +} + +/* XXX Only for debug tracing */ +const char * +ZFSDataset::getContent() const +{ + DPRINTF_FUNC(); + return (IOMedia::getContent()); +} + +/* XXX Only for debug tracing */ +const char * +ZFSDataset::getContentHint() const +{ + DPRINTF_FUNC(); + return (IOMedia::getContentHint()); +} + +/* XXX Only for debug tracing */ +IOMediaAttributeMask +ZFSDataset::getAttributes() const +{ + DPRINTF_FUNC(); + return (IOMedia::getAttributes()); +} diff --git a/module/os/macos/zfs/ZFSDatasetProxy.cpp b/module/os/macos/zfs/ZFSDatasetProxy.cpp new file mode 100644 index 0000000000..10d3dd13e9 --- /dev/null +++ b/module/os/macos/zfs/ZFSDatasetProxy.cpp @@ -0,0 +1,466 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include + +#define DPRINTF_FUNC() do { dprintf(""); } while (0); + +/* block size is 512 B, count is 512 M blocks */ +#define ZFS_PROXY_DEV_BSIZE (UInt64)(1<<9) +#define ZFS_PROXY_DEV_BCOUNT (UInt64)(2<<29) +#define kZFSProxyGUIDKey "ZFS Pool GUID" +#define kZFSProxyReadOnlyKey "ZFS Pool Read-Only" + +OSDefineMetaClassAndStructors(ZFSDatasetProxy, IOBlockStorageDevice); + +void +ZFSDatasetProxy::free() +{ + char *str; + + /* vendor, revision, and info share a null char */ + if (vendorString) { + str = (char *)vendorString; + vendorString = 0; + if (revisionString == str) revisionString = 0; + if (infoString == str) infoString = 0; + IOFree(str, strlen(str)+1); + } + + /* Product string contains pool name */ + if (productString) { + str = (char *)productString; + productString = 0; + IOFree(str, strlen(str)+1); + } + + IOBlockStorageDevice::free(); +} + +bool +ZFSDatasetProxy::init(OSDictionary *properties) +{ + char *str = (char *)IOMalloc(1); + + if (!str) { + dprintf("string allocation failed\n"); + return (false); + } + str[0] = '\0'; + vendorString = str; + revisionString = str; + infoString = str; + + if (IOBlockStorageDevice::init(properties) == false) { + dprintf("BlockStorageDevice start failed"); + goto error; + } + + return (true); + +error: + if (str) { + vendorString = 0; + revisionString = 0; + infoString = 0; + IOFree(str, 1); + } + return (false); +} + +bool +ZFSDatasetProxy::start(IOService *provider) +{ + OSObject *property = NULL, *size = NULL; + OSString *nameString = NULL; + OSNumber *sizeNum = NULL; + OSDictionary *deviceDict = NULL, *protocolDict = NULL; + const OSSymbol *virtualSymbol = NULL, *internalSymbol = NULL; + const char *cstr = NULL; + char *pstring = NULL; + int plen = 0; + bool started = false; + + size = copyProperty(kZFSPoolSizeKey, gIOServicePlane, + (kIORegistryIterateRecursively|kIORegistryIterateParents)); + property = copyProperty(kZFSPoolNameKey, gIOServicePlane, + (kIORegistryIterateRecursively|kIORegistryIterateParents)); + + if (!size || !property) { + dprintf("couldn't get pool name or size"); + goto error; + } + + nameString = OSDynamicCast(OSString, property); + if (!nameString) { + dprintf("missing pool name"); + goto error; + } +#if 0 + /* Try hard to get the name string */ + do { + nameString = OSDynamicCast(OSString, property); + + if (nameString) nameString->retain(); + + if (!nameString) { + OSSymbol *nameSymbol; + nameSymbol = OSDynamicCast(OSSymbol, property); + if (!nameSymbol) { + dprintf("couldn't get name"); + goto error; + } + nameString = OSString::withCString( + nameSymbol->getCStringNoCopy()); + } + } while (0); +#endif + + sizeNum = OSDynamicCast(OSNumber, size); + if (!sizeNum) { + dprintf("invalid size"); + goto error; + } + _pool_bcount = sizeNum->unsigned64BitValue() / DEV_BSIZE; + sizeNum = 0; + size->release(); + size = 0; + + cstr = nameString->getCStringNoCopy(); + if (!cstr || (plen = strlen(cstr) + 1) == 1) { + goto error; + } + pstring = (char *)IOMalloc(plen); + if (!pstring) { + goto error; + } + snprintf(pstring, plen, "%s", cstr); + productString = pstring; + pstring = 0; + + if (IOBlockStorageDevice::start(provider) == false) { + dprintf("BlockStorageDevice start failed"); + goto error; + } + started = true; + + deviceDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey)); + if (deviceDict) { + /* Clone a new dictionary */ + deviceDict = OSDictionary::withDictionary(deviceDict); + if (!deviceDict) { + dprintf("dict clone failed"); 
+ goto error; + } + } + + if (!deviceDict) { + dprintf("creating new device dict"); + deviceDict = OSDictionary::withCapacity(1); + } + + if (!deviceDict) { + dprintf("missing device dict"); + goto error; + } + + deviceDict->setObject(kIOPropertyProductNameKey, nameString); + OSSafeReleaseNULL(nameString); + + if (setProperty(kIOPropertyDeviceCharacteristicsKey, + deviceDict) == false) { + dprintf("device dict setProperty failed"); + goto error; + } + OSSafeReleaseNULL(deviceDict); + + protocolDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyProtocolCharacteristicsKey)); + if (protocolDict) { + /* Clone a new dictionary */ + protocolDict = OSDictionary::withDictionary(protocolDict); + if (!protocolDict) { + dprintf("dict clone failed"); + goto error; + } + } + + if (!protocolDict) { + dprintf("creating new protocol dict"); + protocolDict = OSDictionary::withCapacity(1); + } + + if (!protocolDict) { + dprintf("missing protocol dict"); + goto error; + } + + virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + internalSymbol = OSSymbol::withCString( + kIOPropertyInternalKey); + if (!virtualSymbol || !internalSymbol) { + dprintf("symbol alloc failed"); + goto error; + } + + protocolDict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol); + protocolDict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + internalSymbol); + + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); + + if (setProperty(kIOPropertyProtocolCharacteristicsKey, + protocolDict) == false) { + dprintf("protocol dict setProperty failed"); + goto error; + } + OSSafeReleaseNULL(protocolDict); + registerService(kIOServiceAsynchronous); + + return (true); + +error: + OSSafeReleaseNULL(size); + OSSafeReleaseNULL(property); + OSSafeReleaseNULL(deviceDict); + OSSafeReleaseNULL(protocolDict); + OSSafeReleaseNULL(nameString); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); + if (pstring) IOFree(pstring, plen); + if (started) IOBlockStorageDevice::stop(provider); + return (false); +} + +/* XXX IOBlockStorageDevice */ +IOReturn +ZFSDatasetProxy::doSynchronizeCache(void) +{ + DPRINTF_FUNC(); + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + char zero[ZFS_PROXY_DEV_BSIZE]; + size_t len, cur, off = 0; + + DPRINTF_FUNC(); + + if (!buffer) { + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* Read vs. write */ + if (buffer->getDirection() == kIODirectionIn) { + /* Zero the read buffer */ + bzero(zero, ZFS_PROXY_DEV_BSIZE); + len = buffer->getLength(); + while (len > 0) { + cur = (len > ZFS_PROXY_DEV_BSIZE ? + ZFS_PROXY_DEV_BSIZE : len); + buffer->writeBytes(/* offset */ off, + /* buf */ zero, /* length */ cur); + off += cur; + len -= cur; + } + // dprintf("%s: read: %llu %llu", + // __func__, block, nblks); + IOStorage::complete(completion, kIOReturnSuccess, + buffer->getLength()); + return (kIOReturnSuccess); + } + + if (buffer->getDirection() != kIODirectionOut) { + dprintf("invalid direction %d", buffer->getDirection()); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* + * XXX For now this just returns error for all writes. + * If it turns out that mountroot/bdevvp try to + * verify writable status by reading a block and writing + * it back to disk, lie and say it succeeded. 
+ */ + dprintf("write: %llu %llu", block, nblks); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::doEjectMedia() +{ + DPRINTF_FUNC(); + /* XXX Called at shutdown, maybe return success? */ + return (kIOReturnError); +} + +IOReturn +ZFSDatasetProxy::doFormatMedia(UInt64 byteCapacity) +{ + DPRINTF_FUNC(); + /* XXX shouldn't need it */ + return (kIOReturnError); + // return (kIOReturnSuccess); +} + +UInt32 +ZFSDatasetProxy::doGetFormatCapacities(UInt64 *capacities, + UInt32 capacitiesMaxCount) const +{ + DPRINTF_FUNC(); + if (capacities && capacitiesMaxCount > 0) { + capacities[0] = (ZFS_PROXY_DEV_BSIZE * ZFS_PROXY_DEV_BCOUNT); + dprintf("capacity %llu", capacities[0]); + } + + /* Always inform caller of capacity count */ + return (1); +} + +/* Returns full pool name from instance private var */ +char * +ZFSDatasetProxy::getProductString() +{ + if (productString) dprintf("[%s]", productString); + /* Return class private string */ + return ((char *)productString); +} + +/* Returns readonly status from instance private var */ +IOReturn +ZFSDatasetProxy::reportWriteProtection(bool *isWriteProtected) +{ + DPRINTF_FUNC(); + if (isWriteProtected) *isWriteProtected = isReadOnly; + return (kIOReturnSuccess); +} + +/* These return class static string for all instances */ +char * +ZFSDatasetProxy::getVendorString() +{ + dprintf("[%s]", vendorString); + /* Return class static string */ + return ((char *)vendorString); +} +char * +ZFSDatasetProxy::getRevisionString() +{ + dprintf("[%s]", revisionString); + /* Return class static string */ + return ((char *)revisionString); +} +char * +ZFSDatasetProxy::getAdditionalDeviceInfoString() +{ + dprintf("[%s]", infoString); + /* Return class static string */ + return ((char *)infoString); +} + +/* Always return media present and unchanged */ +IOReturn +ZFSDatasetProxy::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + DPRINTF_FUNC(); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +/* Always report nonremovable and nonejectable */ +IOReturn +ZFSDatasetProxy::reportRemovability(bool *isRemoveable) +{ + DPRINTF_FUNC(); + if (isRemoveable) *isRemoveable = false; + return (kIOReturnSuccess); +} +IOReturn +ZFSDatasetProxy::reportEjectability(bool *isEjectable) +{ + DPRINTF_FUNC(); + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* Always report 512b blocksize */ +IOReturn +ZFSDatasetProxy::reportBlockSize(UInt64 *blockSize) +{ + DPRINTF_FUNC(); + if (!blockSize) + return (kIOReturnError); + + *blockSize = ZFS_PROXY_DEV_BSIZE; + return (kIOReturnSuccess); +} + +/* XXX Calculate from dev_bcount, should get size from objset */ +/* XXX Can issue message kIOMessageMediaParametersHaveChanged to update */ +IOReturn +ZFSDatasetProxy::reportMaxValidBlock(UInt64 *maxBlock) +{ + DPRINTF_FUNC(); + if (!maxBlock) + return (kIOReturnError); + + // *maxBlock = 0; + // *maxBlock = ZFS_PROXY_DEV_BCOUNT - 1; + *maxBlock = _pool_bcount - 1; + dprintf("maxBlock %llu", *maxBlock); + + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::getWriteCacheState(bool *enabled) +{ + dprintf("getCacheState\n"); + if (enabled) *enabled = true; + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::setWriteCacheState(bool enabled) +{ + dprintf("setWriteCache\n"); + return (kIOReturnSuccess); +} diff --git a/module/os/macos/zfs/ZFSDatasetScheme.cpp b/module/os/macos/zfs/ZFSDatasetScheme.cpp new file mode 
100644 index 0000000000..1694357b92 --- /dev/null +++ b/module/os/macos/zfs/ZFSDatasetScheme.cpp @@ -0,0 +1,1108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + * Copyright (c) 2017, Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static ZFSDatasetScheme * +zfs_osx_proxy_scheme_by_osname(const char *osname) +{ + ZFSDatasetScheme *scheme = NULL; + OSDictionary *matching; + OSObject *object; + OSString *str; + OSIterator *iter; + char *pool_name, *slash; + size_t len; + + slash = strchr(osname, '/'); + if (slash) { + len = (slash - osname) + 1; + } else { + len = strlen(osname) + 1; + } + + pool_name = (char *)kmem_alloc(len, KM_SLEEP); + if (!pool_name) { + dprintf("string alloc failed"); + return (NULL); + } + snprintf(pool_name, len, "%s", osname); + dprintf("pool_name [%s] from %s", pool_name, osname); + + matching = IOService::serviceMatching(kZFSDatasetSchemeClass); + if (!matching) { + dprintf("couldn't get match dict"); + kmem_free(pool_name, len); + return (NULL); + } + + /* Add the pool name for exact match */ + str = OSString::withCString(pool_name); + matching->setObject(kZFSPoolNameKey, str); + OSSafeReleaseNULL(str); + + object = IOService::copyMatchingService(matching); + + if (object && (scheme = OSDynamicCast(ZFSDatasetScheme, + object)) == NULL) { + object->release(); + } + object = NULL; + + if (scheme && ((str = OSDynamicCast(OSString, + scheme->getProperty(kZFSPoolNameKey))) == NULL || + str->isEqualTo(pool_name) == false)) { + scheme->release(); + scheme = NULL; + } + + if (!scheme) { + int i; + for (i = 0; i < 12; i++) { // up to 6s + iter = IOService::getMatchingServices(matching); + if (iter) break; + IOSleep(500); + } + + if (i) dprintf("%s: tried %d times\n", __func__, i); + + if (!iter) { + dprintf("couldn't get iterator"); + kmem_free(pool_name, len); + OSSafeReleaseNULL(matching); + return (NULL); + } + + while ((object = iter->getNextObject())) { + if (iter->isValid() == false) { + iter->reset(); + continue; + } + scheme = OSDynamicCast(ZFSDatasetScheme, object); + if (!scheme) continue; + + object = scheme->getProperty(kZFSPoolNameKey, + gIOServicePlane, kIORegistryIterateParents | + kIORegistryIterateRecursively); + if (!object) continue; + + str = OSDynamicCast(OSString, object); + if (!str) continue; + + if (str->isEqualTo(pool_name)) break; + + str = NULL; + object = NULL; + scheme = NULL; + } + + if (scheme) scheme->retain(); + OSSafeReleaseNULL(iter); + } + OSSafeReleaseNULL(matching); + kmem_free(pool_name, len); + pool_name = 0; + + if (scheme == NULL) { + dprintf("no matching pool proxy"); + } + return 
(scheme); + +#if 0 + spa_t *spa; + ZFSPool *pool = 0; + + if (!osname || osname[0] == '\0') { + dprintf("missing dataset argument"); + return (EINVAL); + } + + /* Lookup the pool spa */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(osname); + if (spa && spa->spa_iokit_proxy) { + pool = spa->spa_iokit_proxy->proxy; + if (pool) pool->retain(); + } + mutex_exit(&spa_namespace_lock); + + /* Need a pool proxy to attach to */ + if (!pool) { + dprintf("couldn't get pool proxy"); + return (EINVAL); + } + return (0); +#endif +} + +/* + * Get the proxy device by matching a property name and value. + * + * Inputs: + * property: const char string. + * value: const char string. + * + * Return: + * Pointer to proxy on success, NULL on error or missing. + */ +static ZFSDataset * +zfs_osx_proxy_lookup(const char *property, OSObject *value) +{ + OSIterator *iter = NULL; + OSDictionary *matching = NULL; + OSObject *next = NULL, *prop = NULL; + ZFSDataset *dataset = NULL; + + /* Validate arguments */ + if (!property || !value || property[0] == '\0') { + dprintf("invalid argument"); + return (NULL); + } + + /* + * Create the matching dictionary for class. + * Add property and value to match dict. + */ + matching = IOService::serviceMatching(kZFSDatasetClassKey); + if ((matching) == NULL || + (matching->setObject(property, value) == false)) { + dprintf("match dictionary create failed"); + OSSafeReleaseNULL(matching); + return (NULL); + } + + /* Try to copy if there is only one match */ + next = IOService::copyMatchingService(matching); + if (next != NULL && ((dataset = OSDynamicCast(ZFSDataset, + next)) != NULL) && + (prop = dataset->getProperty(property)) != NULL && + (prop->isEqualTo(value))) { + dprintf("quick matched dataset"); + OSSafeReleaseNULL(matching); + /* Leave retain taken by copyMatching */ + return (dataset); + } + /* Unretained references */ + prop = NULL; + dataset = NULL; + /* If set, it was retained by copyMatchingService */ + OSSafeReleaseNULL(next); + + iter = IOService::getMatchingServices(matching); + OSSafeReleaseNULL(matching); + if (iter == NULL) { + dprintf("iterator failed"); + return (NULL); + } + + while ((next = iter->getNextObject())) { + dataset = OSDynamicCast(ZFSDataset, next); + if (!dataset) continue; + + if ((prop = dataset->getProperty(property)) == NULL) { + dataset = NULL; + continue; + } + + if (prop->isEqualTo(value)) { + /* Take a reference on the match */ + dprintf("found match"); + dataset->retain(); + prop = NULL; + break; + } + + prop = NULL; + dataset = NULL; + } + /* Release iterator */ + OSSafeReleaseNULL(iter); + + /* Leave retain */ + return (dataset); +#if 0 + /* + * Copy (first) matching service. + * Cast service to proxy class. + */ + if ((service = IOService::copyMatchingService(matching)) == NULL || + (dataset = OSDynamicCast(ZFSDataset, service)) == NULL) { + dprintf("matching failed"); + OSSafeReleaseNULL(service); + return (NULL); + } + + /* Leave retain from copyMatchingService */ + return (dataset); +#endif +} + +/* + * Get the proxy device for a given dataset name. + * + * Input: + * osname: dataset name e.g. pool/dataset + * + * Return: + * Valid ZFSDataset service, or NULL on error or missing. 
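+ * The returned service carries a retain taken by the lookup, so the
+ * caller is expected to release it (e.g. via OSSafeReleaseNULL) when done.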
+ */ +ZFSDataset * +zfs_osx_proxy_get(const char *osname) +{ + ZFSDataset *dataset; + OSString *osstr; + + /* Validate arguments, osname is limited to MAXNAMELEN */ + if (!osname || osname[0] == '\0' || osname[0] == '/' || + strnlen(osname, MAXNAMELEN+1) == (MAXNAMELEN+1)) { + dprintf("invalid argument"); + return (NULL); + } + + osstr = OSString::withCString(osname); + if (!osstr) { + dprintf("string alloc failed"); + return (NULL); + } + + dataset = zfs_osx_proxy_lookup(kZFSDatasetNameKey, osstr); + OSSafeReleaseNULL(osstr); + + if (!dataset) { + dprintf("lookup failed"); + return (NULL); + } + + return (dataset); +} + +/* + * Get the proxy device for a given a device name or path. + * + * Input: + * devpath: BSD name as const char* string, e.g. "/dev/diskN" or "diskN" + * must be null-terminated + * + * Return: + * Valid ZFSDataset service, or NULL on error or missing. + */ +static ZFSDataset * +zfs_osx_proxy_from_devpath(const char *devpath) +{ + /* XXX No need to init, will be assigned */ + ZFSDataset *dataset; + OSString *bsdstr; + const char *bsdname; + + /* Validate arguments, devpath is limited to MAXPATHLEN */ + if (!devpath || devpath[0] == '\0' || + strnlen(devpath, MAXPATHLEN+1) == (MAXPATHLEN+1)) { + dprintf("invalid argument"); + return (NULL); + } + + /* If we have a path, remove prefix */ + if (strncmp(devpath, "/dev/", 5) == 0) { + bsdname = devpath + 5; + } else { + bsdname = devpath; + } + + /* Make sure we have (at least) "diskN" at this point */ + if (strncmp(bsdname, "disk", 4) != 0 || bsdname[4] == '\0') { + dprintf("invalid bsdname %s from %s", bsdname, devpath); + return (NULL); + } + + bsdstr = OSString::withCString(bsdname); + if (!bsdstr) { + dprintf("string alloc failed"); + return (NULL); + } + + dataset = zfs_osx_proxy_lookup(kIOBSDNameKey, bsdstr); + OSSafeReleaseNULL(bsdstr); + + if (!dataset) { + dprintf("lookup with %s failed", bsdname); + return (NULL); + } + + return (dataset); +} + +/* + * Given a dataset, get the desired property and write its + * value to the caller-supplied buffer. + * + * Inputs: + * dataset: valid ZFSDataset object, should be retained by + * caller. + * property: const char* of the desired property name key. + * value: char* buffer which should be at least 'len' bytes. + * len: length of value buffer. + * + * Return: + * 0 on success, positive int on error. + */ +static int +zfs_osx_proxy_get_prop_string(ZFSDataset *dataset, + const char *property, char *value, int len) +{ + OSObject *obj; + OSString *valueString; + + /* Validate arguments */ + if (!dataset || !property || !value || len == 0) { + dprintf("invalid argument"); + return (EINVAL); + } + + /* Lock proxy while getting property */ + dataset->lockForArbitration(); + obj = dataset->copyProperty(property); + dataset->unlockForArbitration(); + + if (!obj) { + dprintf("no property %s", property); + return (ENXIO); + } + + valueString = OSDynamicCast(OSString, obj); + /* Validate property value */ + if (!valueString) { + dprintf("couldn't cast value for %s", property); + OSSafeReleaseNULL(obj); + return (ENXIO); + } + + /* Write up to len bytes */ + snprintf(value, len, "%s", valueString->getCStringNoCopy()); + + /* Release string and proxy */ + valueString = 0; + OSSafeReleaseNULL(obj); + + return (0); +} + +extern "C" { + +/* + * Given a ZFS dataset name, get the proxy device and write the + * BSD Name to the caller-supplied buffer. + * + * Inputs: + * osname: dataset name as char* string, e.g. 
"pool/dataset" + * must be null-terminated + * bsdname: char* string buffer where bsdname will be written + * len: length of bsdname buffer + * + * Return: + * 0 on success, positive int errno on failure. + */ +int +zfs_osx_proxy_get_bsdname(const char *osname, + char *bsdname, int len) +{ + /* XXX No need to init, will be assigned */ + ZFSDataset *dataset; + int ret; + + /* Validate arguments */ + if (!osname || !bsdname || len == 0) { + dprintf("invalid argument"); + return (EINVAL); + } + + /* Get dataset proxy (takes a retain) */ + dataset = zfs_osx_proxy_get(osname); + if (!dataset) { + dprintf("no proxy matching %s", osname); + return (ENOENT); + } + + /* Get BSD name property and write to bsdname buffer */ + ret = zfs_osx_proxy_get_prop_string(dataset, + kIOBSDNameKey, bsdname, len); + OSSafeReleaseNULL(dataset); + + if (ret != 0) { + dprintf("ret %d", ret); + } + + return (ret); +} + +/* + * Given a device name or path, get the proxy device and write the + * ZFS Dataset name to the caller-supplied buffer. + * + * Inputs: + * devpath: BSD name as const char* string, e.g. "/dev/diskN" or "diskN" + * must be null-terminated + * osname: char* string buffer where osname will be written + * len: length of osname buffer + * + * Return: + * 0 on success, positive int errno on failure. + */ +int +zfs_osx_proxy_get_osname(const char *devpath, char *osname, int len) +{ + /* XXX No need to init, will be assigned */ + ZFSDataset *dataset; + int ret; + + /* Validate arguments */ + if (!devpath || !osname || len == 0) { + dprintf("invalid argument"); + return (EINVAL); + } + + /* Get dataset proxy (takes a retain) */ + dataset = zfs_osx_proxy_from_devpath(devpath); + if (!dataset) { + dprintf("no proxy matching %s", devpath); + return (ENOENT); + } + + /* Get dataset name property and write to osname buffer */ + ret = zfs_osx_proxy_get_prop_string(dataset, + kZFSDatasetNameKey, osname, len); + OSSafeReleaseNULL(dataset); + + if (ret != 0) { + dprintf("ret %d", ret); + } + + return (ret); +} + +/* + * Check if a dataset has a proxy device. + * + * Input: + * osname: dataset name e.g. pool/dataset + * + * Return: + * 1 if exists, 0 on error or missing. + */ +int +zfs_osx_proxy_exists(const char *osname) +{ + ZFSDataset *dataset; + + /* Get dataset proxy (takes a retain) */ + if ((dataset = zfs_osx_proxy_get(osname)) != NULL) { + OSSafeReleaseNULL(dataset); + return (1); + } + + return (0); +} + +/* + * Remove the proxy device for a given dataset name. + * + * Input: + * osname: dataset name e.g. pool/dataset + */ +void +zfs_osx_proxy_remove(const char *osname) +{ + ZFSDataset *dataset; + ZFSDatasetScheme *provider; + + /* Get dataset proxy (takes a retain) */ + dataset = zfs_osx_proxy_get(osname); + if (dataset == NULL) { + dprintf("couldn't get dataset"); + return; + } +#if 0 + /* Terminate and release retain */ + dataset->terminate(kIOServiceSynchronous | kIOServiceRequired); + OSSafeReleaseNULL(dataset); +#endif + provider = OSDynamicCast(ZFSDatasetScheme, + dataset->getProvider()); + if (!provider) { + dprintf("invalid provider"); + return; + } + + OSSafeReleaseNULL(dataset); + dprintf("removing %s", osname); + provider->removeDataset(osname, /* force */ true); +} + +/* + * Create a proxy device for a given dataset name, unless one exists. + * + * Input: + * osname: dataset name e.g. pool/dataset + * + * Return: + * 0 on success, or positive int on error. 
+ */ +int +zfs_osx_proxy_create(const char *osname) +{ + ZFSDatasetScheme *provider = NULL; + + if (!osname || osname[0] == '\0') { + dprintf("missing dataset argument"); + return (EINVAL); + } + + provider = zfs_osx_proxy_scheme_by_osname(osname); + if (provider == NULL) { + dprintf("can't get pool proxy"); + return (ENOENT); + } + + if (provider->addDataset(osname) == false) { + dprintf("couldn't add dataset"); + provider->release(); + return (ENXIO); + } + + provider->release(); + return (0); +} + +} /* extern "C" */ + +static SInt32 +orderHoles(const OSMetaClassBase *obj1, const OSMetaClassBase *obj2, + __unused void *context) +{ + OSNumber *num1, *num2; + + if (obj1 == NULL || + (num1 = OSDynamicCast(OSNumber, obj1)) == NULL) { + /* Push invalid OSNumbers to end of list */ + return (-1); + } + if (obj2 == NULL || + (num2 = OSDynamicCast(OSNumber, obj2)) == NULL) { + /* If both are non-OSNumber, same ordering */ + if (num1 == NULL) + return (0); + /* If num1 is a valid OSNumber, push num2 to end */ + return (1); + } + + /* + * A comparison result of the object: + *
a negative value if obj2 should precede obj1, + * a positive value if obj1 should precede obj2, + * and 0 if obj1 and obj2 have an equivalent ordering.
+ */ + if (num1->isEqualTo(num2)) + return (0); + + if (num1->unsigned32BitValue() < num2->unsigned32BitValue()) { + return (1); + } else { + return (-1); + } +} + +OSDefineMetaClassAndStructors(ZFSDatasetScheme, IOPartitionScheme); + +void +ZFSDatasetScheme::free() +{ + OSSafeReleaseNULL(_datasets); + OSSafeReleaseNULL(_holes); + _max_id = 0; + + IOPartitionScheme::free(); +} + +bool +ZFSDatasetScheme::init(OSDictionary *properties) +{ + _datasets = OSSet::withCapacity(1); + _holes = OSOrderedSet::withCapacity(1, orderHoles); + _max_id = 0; + + if (!_datasets || !_holes) { + dprintf("OSSet allocation failed"); + OSSafeReleaseNULL(_datasets); + OSSafeReleaseNULL(_holes); + return (false); + } + + OSDictionary *newProps = NULL; + if (properties) newProps = OSDictionary::withDictionary(properties); + if (!newProps) newProps = OSDictionary::withCapacity(2); + OSString *str; + str = OSString::withCString("IOGUIDPartitionScheme"); + newProps->setObject("IOClass", str); + OSSafeReleaseNULL(str); + str = OSString::withCString("GUID_partition_scheme"); + newProps->setObject("Content Mask", str); + OSSafeReleaseNULL(str); + + if (IOPartitionScheme::init(newProps) == false) { + dprintf("IOPartitionScheme init failed"); + OSSafeReleaseNULL(newProps); + OSSafeReleaseNULL(_datasets); + OSSafeReleaseNULL(_holes); + return (false); + } + OSSafeReleaseNULL(newProps); + + return (true); +} + +bool +ZFSDatasetScheme::start(IOService *provider) +{ + OSObject *pool_name; + + if (IOPartitionScheme::start(provider) == false) { + dprintf("IOPartitionScheme start failed"); + return (false); + } + + pool_name = getProperty(kZFSPoolNameKey, + gIOServicePlane, kIORegistryIterateRecursively| + kIORegistryIterateParents); + if (pool_name) { + setProperty(kZFSPoolNameKey, pool_name); + } + + // registerService(kIOServiceAsynchronous); + registerService(kIOServiceSynchronous); + + return (true); +} + +IOService * +ZFSDatasetScheme::probe(IOService *provider, SInt32 *score) +{ + OSObject *property; + IOService *parent; + + /* First ask IOPartitionScheme to probe */ + if (IOPartitionScheme::probe(provider, score) == 0) { + dprintf("IOPartitionScheme probe failed"); + return (0); + } + + /* Check for ZFS Pool Name first */ + property = getProperty(kZFSPoolNameKey, gIOServicePlane, + kIORegistryIterateRecursively|kIORegistryIterateParents); + if (!property) { + dprintf("no pool name"); + return (0); + } + + /* Make sure we have a target, and valid provider below */ + if (provider == NULL || + OSDynamicCast(IOMedia, provider) == NULL || + (parent = provider->getProvider()) == NULL) { + dprintf("invalid provider"); + return (0); + } + + /* Make sure provider is driver, and has valid provider below */ + if (OSDynamicCast(IOBlockStorageDriver, parent) == NULL || + (parent = parent->getProvider()) == NULL) { + dprintf("invalid parent"); + return (0); + } + + /* Make sure the parent provider is a proxy */ + if (OSDynamicCast(ZFSDatasetProxy, parent) == NULL) { + dprintf("invalid grandparent"); + return (0); + } + + /* Successful match */ + dprintf("Match"); + // *score = 5000; + return (this); +} + +uint32_t +ZFSDatasetScheme::getNextPartitionID() +{ + uint32_t ret_id = 0ULL; + + /* Try to lock, unless service is terminated */ + if (lockForArbitration(false) == false) { + dprintf("service is terminated"); + return (0ULL); + } + + /* If the partiton list is sparse (has holes) */ + if (_holes->getCount() != 0) { + OSNumber *id_num = OSDynamicCast(OSNumber, + _holes->getFirstObject()); + + /* Just in case the list is invalid */ 
+#ifdef DEBUG + if (!id_num) panic("invalid hole list"); +#endif + + if (id_num) { + id_num->retain(); + _holes->removeObject(id_num); + ret_id = id_num->unsigned32BitValue(); + OSSafeReleaseNULL(id_num); + goto out; + } + } + + /* If no holes were found, just get next id */ + ret_id = (_max_id += 1); + +out: + unlockForArbitration(); + return (ret_id); +} + +void ZFSDatasetScheme::returnPartitionID(uint32_t part_id) +{ + OSNumber *id_num = OSNumber::withNumber(part_id, 32); + + if (!id_num) dprintf("alloc failed"); + /* XXX Continue and try to decrement max_id if possible */ + + if (lockForArbitration(false) == false) { + dprintf("service is terminated"); + OSSafeReleaseNULL(id_num); + return; + } + + /* Decrementing highest part id */ + if (part_id == _max_id) { + /* First, decrement max */ + _max_id--; + /* no longer needed */ + OSSafeReleaseNULL(id_num); + + /* Now iterate down the hole list */ + while ((id_num = OSDynamicCast(OSNumber, + _holes->getLastObject()))) { + /* Only need to remove consecutive matches */ + if (id_num->unsigned32BitValue() != (_max_id)) { + break; + } + + /* Remove this num from hole list */ + id_num->retain(); + _holes->removeObject(id_num); + OSSafeReleaseNULL(id_num); + /* Decrement max */ + _max_id--; + } + /* Creating a new 'hole' in the ID namespace */ + } else { + /* Better have been able to allocate OSNum */ + if (!id_num) { + unlockForArbitration(); +#ifdef DEBUG + panic("ZFSDatasetScheme %s failed to return partID", + __func__); +#endif + return; + } + + /* + * OSOrderedSet only enforces ordering when + * using setObject(anObject) interface. + * Therefore _holes must not use setFirstObject, + * setLastObject, setObject(index, anObject) + */ + + /* Add a new OSNum to hole list */ + _holes->setObject(id_num); + OSSafeReleaseNULL(id_num); + } + + unlockForArbitration(); +} + +bool +ZFSDatasetScheme::addDataset(const char *osname) +{ + ZFSDataset *dataset; + OSObject *obj; + OSNumber *sizeNum; + char location[24]; + uint64_t size; + uint32_t part_id; + + obj = copyProperty(kZFSPoolSizeKey, gIOServicePlane, + kIORegistryIterateRecursively|kIORegistryIterateParents); + if (!obj) { + dprintf("missing pool size"); + return (false); + } + sizeNum = OSDynamicCast(OSNumber, obj); + if (!sizeNum) { + dprintf("invalid pool size"); + return (false); + } + size = sizeNum->unsigned64BitValue(); + sizeNum = 0; + OSSafeReleaseNULL(obj); + + part_id = getNextPartitionID(); + /* Only using non-zero partition ids */ + if (part_id == 0) { + dprintf("invalid partition ID"); + return (false); + } + snprintf(location, sizeof (location), "%u", part_id); + +#if 0 + OSString *locationStr; + locationStr = OSString::withCString(location); + if (!locationStr) { + dprintf("location string alloc failed"); + return (false); + } + OSSafeReleaseNULL(locationStr); +#endif + + dataset = ZFSDataset::withDatasetNameAndSize(osname, size); + if (!dataset) { + dprintf("couldn't add %s", osname); + return (false); + } + + /* Set location in plane and partiton ID property */ + dataset->setLocation(location); +#ifdef kIOMediaBaseKey + dataset->setProperty(kIOMediaBaseKey, 0ULL, 64); +#endif + dataset->setProperty(kIOMediaPartitionIDKey, part_id, 32); + + // This sets the "diskutil list -> TYPE" field + dataset->setProperty("Content", "ZFS Dataset"); + // This matches with Info.plist, so it calls zfs.util for NAME + dataset->setProperty("Content Hint", + "6A898CC3-1DD2-11B2-99A6-080020736631"); + + if (dataset->attach(this) == false) { + dprintf("attach failed"); + OSSafeReleaseNULL(dataset); + 
return (false); + } + + if (dataset->start(this) == false) { + dprintf("start failed"); + dataset->detach(this); + OSSafeReleaseNULL(dataset); + return (false); + } + + /* Protect the OSSet by taking IOService lock */ + lockForArbitration(); + _datasets->setObject(dataset); + unlockForArbitration(); + + // dataset->registerService(kIOServiceAsynchronous); + dataset->registerService(kIOServiceSynchronous); + + /* Adding to OSSet takes a retain */ + OSSafeReleaseNULL(dataset); + + return (true); +} + +bool +ZFSDatasetScheme::removeDataset(const char *osname, bool force) +{ + OSCollectionIterator *iter; + ZFSDataset *dataset = NULL; + OSNumber *partNum; + uint32_t part_id = 0; + bool locked; + + if ((locked = lockForArbitration(false)) == false) { + dprintf("couldn't lock terminated service"); + } + + iter = OSCollectionIterator::withCollection(_datasets); + if (!iter) { + dprintf("couldn't get dataset iterator"); + return (false); + } + + while ((dataset = OSDynamicCast(ZFSDataset, + iter->getNextObject())) != NULL) { + OSObject *property; + OSString *str; + + property = dataset->getProperty(kZFSDatasetNameKey); + if (!property) continue; + + str = OSDynamicCast(OSString, property); + if (!str) continue; + + if (str->isEqualTo(osname)) { + _datasets->removeObject(dataset); + break; + } + } + + if (!dataset) { + dprintf("couldn't get dataset"); + iter->release(); + return (false); + } + + dataset->retain(); + iter->release(); + iter = 0; + + if (locked) unlockForArbitration(); + + partNum = OSDynamicCast(OSNumber, + dataset->getProperty(kIOMediaPartitionIDKey)); + if (!partNum) { + dprintf("couldn't get partition number"); + } else { + part_id = partNum->unsigned32BitValue(); + } + + if (force) { + dataset->terminate(kIOServiceSynchronous| + kIOServiceRequired); + } else { + dataset->terminate(kIOServiceSynchronous); + } + + dataset->release(); + dataset = 0; + + /* Only return non-zero partition ids */ + if (part_id != 0) { + dprintf("terminated partition %u", part_id); + returnPartitionID(part_id); + } + + return (true); +} + +/* Compatibility shims */ +void +ZFSDatasetScheme::read(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOStorage::complete(completion, kIOReturnError, 0); +} + +void +ZFSDatasetScheme::write(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOStorage::complete(completion, kIOReturnError, 0); +} + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) +IOReturn +ZFSDatasetScheme::synchronize(IOService *client, + UInt64 byteStart, + UInt64 byteCount, + IOStorageSynchronizeOptions options) +#else +IOReturn +ZFSDatasetScheme::synchronizeCache(IOService *client) +#endif +{ + return (kIOReturnUnsupported); +} + +IOReturn +ZFSDatasetScheme::unmap(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options) +#else + UInt32 options) +#endif +{ + return (kIOReturnUnsupported); +} + +bool +ZFSDatasetScheme::lockPhysicalExtents(IOService *client) +{ + return (false); +} + +IOStorage * +ZFSDatasetScheme::copyPhysicalExtent(IOService *client, + UInt64 * byteStart, + UInt64 * byteCount) +{ + return (NULL); +} + +void +ZFSDatasetScheme::unlockPhysicalExtents(IOService *client) +{ +} + +#if 
defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) +IOReturn +ZFSDatasetScheme::setPriority(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, + IOStoragePriority priority) +{ + return (kIOReturnUnsupported); +} +#endif diff --git a/module/os/macos/zfs/ZFSPool.cpp b/module/os/macos/zfs/ZFSPool.cpp new file mode 100644 index 0000000000..405d00c8f4 --- /dev/null +++ b/module/os/macos/zfs/ZFSPool.cpp @@ -0,0 +1,868 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#include +#include +#include +#include +#include + +extern "C" { +#include +#include +#include +} /* extern "C" */ + +#include + +#define DPRINTF_FUNC() do { dprintf("%s\n", __func__); } while (0); + +#if 0 +/* block size is 512 B, count is 512 M blocks */ +#define ZFS_POOL_DEV_BSIZE (UInt64)(1<<9) +#define ZFS_POOL_DEV_BCOUNT (UInt64)(2<<29) +#endif + +/* + * Returns handle to ZFS IOService, with a retain count. 
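+ * The caller is responsible for dropping that retain (e.g. with
+ * OSSafeReleaseNULL) once the handle is no longer needed.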
+ */ +static IOService * +copy_zfs_handle() +{ + /* Get the ZFS service handle the 'hard way' */ + OSDictionary *matching; + IOService *service = 0; + + matching = IOService::serviceMatching("net_lundman_zfs_zvol"); + if (matching) { + service = IOService::copyMatchingService(matching); + OSSafeReleaseNULL(matching); + } + + if (!service) { + dprintf("couldn't get zfs IOService"); + return (NULL); + } + + return (service); +#if 0 + /* Got service, make sure it casts */ + zfs_hl = OSDynamicCast(net_lundman_zfs_zvol, service); + if (zfs_hl == NULL) { + dprintf("couldn't get zfs_hl"); + /* Drop retain from copyMatchingService */ + OSSafeReleaseNULL(service); + return (NULL); + } + + return (zfs_hl); +#endif +} + +OSDefineMetaClassAndStructors(ZFSPool, IOService); + +#if 0 +bool +ZFSPool::open(IOService *client, IOOptionBits options, void *arg) +{ + bool ret; + + IOLog("ZFSPool %s\n", __func__); + + ret = IOService::open(client, options, arg); + + IOLog("ZFSPool %s ret %d\n", __func__, ret); + + return (ret); +} + +bool +ZFSPool::isOpen(const IOService *forClient) const +{ + IOLog("ZFSPool %s\n", __func__); + return (false); +} + +void +ZFSPool::close(IOService *client, IOOptionBits options) +{ + IOLog("ZFSPool %s\n", __func__); + IOService::close(client, options); +} +#endif + +bool +ZFSPool::handleOpen(IOService *client, + IOOptionBits options, void *arg) +{ + bool ret = true; + + dprintf(""); + // IOLog("ZFSPool %s\n", __func__); + + /* XXX IOService open() locks for arbitration around handleOpen */ + // lockForArbitration(); + _openClients->setObject(client); + ret = _openClients->containsObject(client); + // unlockForArbitration(); + + return (ret); +// return (IOService::handleOpen(client, options, NULL)); +} + +bool +ZFSPool::handleIsOpen(const IOService *client) const +{ + bool ret; + + dprintf(""); + // IOLog("ZFSPool %s\n", __func__); + + /* XXX IOService isOpen() locks for arbitration around handleIsOpen */ + // lockForArbitration(); + ret = _openClients->containsObject(client); + // unlockForArbitration(); + + return (ret); +// return (IOService::handleIsOpen(client)); +} + +void +ZFSPool::handleClose(IOService *client, + IOOptionBits options) +{ + dprintf(""); + // IOLog("ZFSPool %s\n", __func__); + + /* XXX IOService close() locks for arbitration around handleClose */ + // lockForArbitration(); + if (_openClients->containsObject(client) == false) { + dprintf("not open"); + } + /* Remove client from set */ + _openClients->removeObject(client); + // unlockForArbitration(); + +// IOService::handleClose(client, options); +} + +#if 0 +/* XXX IOBlockStorageDevice */ +void +ZFSPool::read(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion) +{ + IOLog("ZFSPool %s\n", __func__); + IOStorage::complete(completion, kIOReturnError, 0); +} + +void +ZFSPool::write(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion) +{ + IOLog("ZFSPool %s\n", __func__); + IOStorage::complete(completion, kIOReturnError, 0); +} +#endif + +bool +ZFSPool::setPoolName(const char *name) +{ +/* Assign dataset name from null-terminated string */ + OSString *dsstr; + // const OSSymbol *dsstr; +#if 0 + OSDictionary *dict; + char *newname, *oldname; +#else + char *newname; +#endif + size_t len; + + DPRINTF_FUNC(); + + /* Validate arguments */ + if (!name || (len = strnlen(name, + ZFS_MAX_DATASET_NAME_LEN)) == 0) { + dprintf("missing argument"); + return (false); + } + + /* 
Truncate too-long names (shouldn't happen) */ + if (len == ZFS_MAX_DATASET_NAME_LEN && + name[ZFS_MAX_DATASET_NAME_LEN] != '\0') { + dprintf("name too long [%s]", name); + /* XXX Just truncate the name */ + len--; + } + + /* Allocate room for name plus null char */ + newname = (char *)kmem_alloc(len+1, KM_SLEEP); + if (!newname) { + dprintf("string alloc failed"); + return (false); + } + snprintf(newname, len+1, "%s", name); + newname[len] = '\0'; /* just in case */ + + /* Save an OSString copy for IORegistry */ + dsstr = OSString::withCString(newname); + // dsstr = OSSymbol::withCString(newname); + + kmem_free(newname, len+1); + + if (!dsstr) { + dprintf("OSString failed"); + return (false); + } + +#if 0 + /* Swap into class private var */ + oldname = (char *)productString; + productString = newname; + newname = 0; + if (oldname) { + kmem_free(oldname, strlen(oldname)+1); + oldname = 0; + } + + /* Get and clone device characteristics prop dict */ + if ((dict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey))) == NULL || + (dict = OSDictionary::withDictionary(dict)) == NULL) { + dprintf("couldn't clone prop dict"); + /* Should only happen during initialization */ + } + + if (dict) { + /* Copy string, add to dictionary, and replace prop dict */ + if (dict->setObject(kIOPropertyProductNameKey, + dsstr) == false || + setProperty(kIOPropertyDeviceCharacteristicsKey, + dict) == false) { + dprintf("couldn't set name"); + OSSafeReleaseNULL(dsstr); + OSSafeReleaseNULL(dict); + return (false); + } + OSSafeReleaseNULL(dict); + } +#endif + + /* Set Pool name IOregistry property */ + setProperty(kZFSPoolNameKey, dsstr); + + /* Finally, set the IORegistryEntry/IOService name */ + setName(dsstr->getCStringNoCopy()); + OSSafeReleaseNULL(dsstr); + + return (true); +} + +bool +ZFSPool::init(OSDictionary *properties, spa_t *spa) +{ +#if 0 + /* Allocate dictionaries and symbols */ + OSDictionary *pdict = OSDictionary::withCapacity(2); + OSDictionary *ddict = OSDictionary::withCapacity(4); + const OSSymbol *virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + const OSSymbol *locationSymbol = OSSymbol::withCString( + kIOPropertyInternalExternalKey); + const OSSymbol *ssdSymbol = OSSymbol::withCString( + kIOPropertyMediumTypeSolidStateKey); + OSNumber *physSize = NULL, *logSize = NULL; + const OSSymbol *vendorSymbol = 0; + const OSSymbol *revisionSymbol = 0; + const OSSymbol *blankSymbol = 0; + OSBoolean *rdonly = 0; + UInt64 phys_bsize, log_bsize; + OSString *str = 0; + const char *cstr = 0; +#endif + uint64_t space; + bool ret = false; + + DPRINTF_FUNC(); + +#if 0 + physSize = OSNumber::withNumber((uint32_t)ZFS_POOL_DEV_BSIZE, 32); + logSize = OSNumber::withNumber((uint32_t)ZFS_POOL_DEV_BSIZE, 32); +#endif + if (!spa) { + dprintf("missing spa"); + goto error; + } + +#if 0 + /* Get physical and logical size from spa */ + phys_bsize = (1ULL<spa_max_ashift); + log_bsize = (1ULL<spa_min_ashift); +#endif + +#if 0 + /* Workaround glitchy behavior with large bsize in xnu */ + if (log_bsize > 8192) log_bsize = 8192; +#endif + +#if 0 + /* XXX Shouldn't be possible */ + if (log_bsize == 0) log_bsize = DEV_BSIZE; + + physSize = OSNumber::withNumber((uint32_t)phys_bsize, 32); + logSize = OSNumber::withNumber((uint32_t)log_bsize, 32); + + /* Validate allocations */ + if (!pdict || !ddict || !virtualSymbol || !locationSymbol || + !ssdSymbol || !physSize || !logSize) { + dprintf("allocation failed"); + goto error; + } +#endif + + /* Need an OSSet for open 
clients */ + _openClients = OSSet::withCapacity(1); + if (_openClients == NULL) { + dprintf("client OSSet failed"); + goto error; + } + + /* Set spa pointer and this Pool object's name to match */ + if (!spa) { + dprintf("missing spa"); + goto error; + } + _spa = spa; + // setName(spa_name(spa)); + +#if 0 + /* Init class statics every time an instance inits */ + /* Shared across instances, but doesn't hurt to reprint */ + if (vendorString == NULL) { + char *string; + int len = strlen("zpool")+1; + string = (char *)kmem_alloc(len, KM_SLEEP); + if (!string) goto error; + snprintf(string, len, "zpool"); + vendorString = string; + } + + if (revisionString == NULL) { + char *string; + int len = strlen("0.1")+1; + string = (char *)kmem_alloc(len, KM_SLEEP); + if (!string) goto error; + snprintf(string, len, "0.1"); + revisionString = string; + } + + if (revisionString == NULL) { + char *string; + int len = strlen("ZFS Pool")+1; + string = (char *)kmem_alloc(len, KM_SLEEP); + if (!string) goto error; + snprintf(string, len, "ZFS pool"); + infoString = string; + } + + /* For IORegistry keys, cache OSSymbols for class statics */ + /* Leverages OSSymbol cahce pool to reuse across instances */ + vendorSymbol = OSSymbol::withCString(vendorString); + revisionSymbol = OSSymbol::withCString(revisionString); + blankSymbol = OSSymbol::withCString(""); + if (!vendorSymbol || !revisionSymbol || !blankSymbol) { + dprintf("class symbols failed"); + goto error; + } +#endif + + /* Call super init */ + if (IOService::init(properties) == false) { + dprintf("device init failed"); + goto error; + } + +#if 0 + /* Set class private vars */ + productString = NULL; + isReadOnly = false; // XXX should really be true initially + + /* Set Protocol Characteristics */ + if (pdict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + locationSymbol) == false || + pdict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol) == false) { + dprintf("pdict set properties failed"); + goto error; + } + setProperty(kIOPropertyProtocolCharacteristicsKey, pdict); + + /* Set Device Characteristics */ + if (ddict->setObject(kIOPropertyVendorNameKey, + vendorSymbol) == false || + ddict->setObject(kIOPropertyProductRevisionLevelKey, + revisionSymbol) == false || + ddict->setObject(kIOPropertyProductSerialNumberKey, + blankSymbol) == false || + ddict->setObject(kIOPropertyPhysicalBlockSizeKey, + physSize) == false || + ddict->setObject(kIOPropertyLogicalBlockSizeKey, + logSize) == false || + ddict->setObject(kIOPropertyMediumTypeKey, + ssdSymbol) == false) { + dprintf("ddict set properties failed"); + goto error; + } + setProperty(kIOPropertyDeviceCharacteristicsKey, ddict); + + /* Check for passed in readonly status */ + if (properties && (rdonly = OSDynamicCast(OSBoolean, + properties->getObject(kZFSPoolReadOnlyKey))) != NULL) { + /* Got the boolean */ + isReadOnly = rdonly->getValue(); + dprintf("set %s", (isReadOnly ? 
"readonly" : "readwrite")); + } + + /* Check for passed in pool GUID */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(kZFSPoolGUIDKey))) != NULL) { + /* Got the string, try to set GUID */ + str->retain(); + if (ddict->setObject(kZFSPoolGUIDKey, str) == false) { + dprintf("couldn't set GUID"); + OSSafeReleaseNULL(str); + goto error; + } +#ifdef DEBUG + cstr = str->getCStringNoCopy(); + dprintf("set GUID"); + cstr = 0; +#endif + OSSafeReleaseNULL(str); + } +#endif + + if (setPoolName(spa_name(spa)) == false) { + dprintf("setPoolName failed"); + goto error; + } + + space = spa_get_dspace(spa); +dprintf("space %llu", space); + setProperty(kZFSPoolSizeKey, space, 64); + +#if 0 + /* Check for passed in pool name */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(kZFSPoolNameKey))) != NULL && + (cstr = str->getCStringNoCopy()) != NULL) { + /* Got the string, try to set name */ + str->retain(); + if (setPoolName(cstr) == false) { + /* Unlikely */ + dprintf("couldn't setup pool" + " name property [%s]", cstr); + OSSafeReleaseNULL(str); + goto error; + } + + dprintf("set pool name [%s]", cstr); + OSSafeReleaseNULL(str); + } else { + if (setPoolName("invalid") == false) { + dprintf("setPoolName failed"); + goto error; + } + dprintf("set name [invalid]"); + } +#endif + + /* Success */ + ret = true; + +error: +#if 0 + /* All of these will be released on error */ + OSSafeReleaseNULL(pdict); + OSSafeReleaseNULL(ddict); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(locationSymbol); + OSSafeReleaseNULL(ssdSymbol); + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); + OSSafeReleaseNULL(vendorSymbol); + OSSafeReleaseNULL(revisionSymbol); + OSSafeReleaseNULL(blankSymbol); + OSSafeReleaseNULL(str); +#endif + return (ret); +} + +void +ZFSPool::free() +{ + OSSet *oldSet; +#if 0 + char *pstring; +#endif + + if (_openClients) { + oldSet = _openClients; + _openClients = 0; + OSSafeReleaseNULL(oldSet); + } + _spa = 0; + +#if 0 + pstring = (char *)productString; + productString = 0; + if (pstring) kmem_free(pstring, strlen(pstring) + 1); +#endif + + IOService::free(); +} + +extern "C" { + +void +spa_iokit_pool_proxy_destroy(spa_t *spa) +{ + ZFSPool *proxy; + spa_iokit_t *wrapper; + + if (!spa) { + printf("missing spa"); + return; + } + + /* Get pool proxy */ + wrapper = spa->spa_iokit_proxy; + spa->spa_iokit_proxy = NULL; + + if (wrapper == NULL) { + printf("missing spa_iokit_proxy"); + return; + } + + proxy = wrapper->proxy; + + /* Free the struct */ + kmem_free(wrapper, sizeof (spa_iokit_t)); + if (!proxy) { + printf("missing proxy"); + return; + } + + if (proxy->terminate(kIOServiceSynchronous| + kIOServiceRequired) == false) { + dprintf("terminate failed"); + } + proxy->release(); + + /* + * IOService *provider; + * provider = proxy->getProvider(); + * + * proxy->detach(provider); + * proxy->stop(provider); + * + * proxy->release(); + */ +} + +int +spa_iokit_pool_proxy_create(spa_t *spa) +{ + IOService *zfs_hl; + ZFSPool *proxy; + spa_iokit_t *wrapper; + + if (!spa) { + dprintf("missing spa"); + return (EINVAL); + } + + /* Allocate C struct */ + if ((wrapper = (spa_iokit_t *)kmem_alloc(sizeof (spa_iokit_t), + KM_SLEEP)) == NULL) { + dprintf("couldn't allocate wrapper"); + return (ENOMEM); + } + + /* Get ZFS IOService */ + if ((zfs_hl = copy_zfs_handle()) == NULL) { + dprintf("couldn't get ZFS handle"); + kmem_free(wrapper, sizeof (spa_iokit_t)); + return (ENODEV); + } + + /* Allocate and init ZFS pool proxy */ + proxy = 
ZFSPool::withProviderAndPool(zfs_hl, spa); + if (!proxy) { + dprintf("Pool proxy creation failed"); + kmem_free(wrapper, sizeof (spa_iokit_t)); + OSSafeReleaseNULL(zfs_hl); + return (ENOMEM); + } + /* Drop retain from copy_zfs_handle */ + OSSafeReleaseNULL(zfs_hl); + + /* Set pool proxy */ + wrapper->proxy = proxy; + spa->spa_iokit_proxy = wrapper; + + return (0); +} + +} /* extern "C" */ + +ZFSPool * +ZFSPool::withProviderAndPool(IOService *zfs_hl, spa_t *spa) +{ + ZFSPool *proxy = new ZFSPool; + + if (!proxy) { + printf("allocation failed"); + return (0); + } + + if (proxy->init(0, spa) == false || + proxy->attach(zfs_hl) == false) { + printf("init/attach failed"); + OSSafeReleaseNULL(proxy); + return (0); + } + + if (proxy->start(zfs_hl) == false) { + printf("start failed"); + proxy->detach(zfs_hl); + OSSafeReleaseNULL(proxy); + return (0); + } + + /* Open zfs_hl, adding proxy to its open clients */ + // if (proxy->open(zfs_hl) == false) { + if (zfs_hl->open(proxy) == false) { + printf("open failed"); + proxy->stop(zfs_hl); + proxy->detach(zfs_hl); + OSSafeReleaseNULL(proxy); + return (0); + } + proxy->registerService(kIOServiceAsynchronous); + + return (proxy); +} + +#if 0 +/* XXX IOBlockStorageDevice */ +IOReturn +ZFSPool::doSynchronizeCache(void) +{ + dprintf(""); + return (kIOReturnSuccess); +} + +IOReturn +ZFSPool::doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + char zero[ZFS_POOL_DEV_BSIZE]; + size_t len, cur, off = 0; + + DPRINTF_FUNC(); + + if (!buffer) { + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* Read vs. write */ + if (buffer->getDirection() == kIODirectionIn) { + /* Zero the read buffer */ + bzero(zero, ZFS_POOL_DEV_BSIZE); + len = buffer->getLength(); + while (len > 0) { + cur = (len > ZFS_POOL_DEV_BSIZE ? + ZFS_POOL_DEV_BSIZE : len); + buffer->writeBytes(/* offset */ off, + /* buf */ zero, /* length */ cur); + off += cur; + len -= cur; + } + // dprintf("read: %llu %llu", block, nblks); + IOStorage::complete(completion, kIOReturnSuccess, + buffer->getLength()); + return (kIOReturnSuccess); + } + + if (buffer->getDirection() != kIODirectionOut) { + dprintf("invalid direction %d", buffer->getDirection()); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* + * XXX For now this just returns error for all writes. + * If it turns out that mountroot/bdevvp try to + * verify writable status by reading a block and writing + * it back to disk, lie and say it succeeded. + */ + dprintf("write: %llu %llu", block, nblks); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); +} + +IOReturn +ZFSPool::doEjectMedia() +{ + DPRINTF_FUNC(); + /* XXX Called at shutdown, maybe return success? 
*/ + return (kIOReturnError); +} + +IOReturn +ZFSPool::doFormatMedia(UInt64 byteCapacity) +{ + DPRINTF_FUNC(); + /* XXX shouldn't need it */ + return (kIOReturnError); + // return (kIOReturnSuccess); +} + +UInt32 +ZFSPool::doGetFormatCapacities(UInt64 *capacities, + UInt32 capacitiesMaxCount) const +{ + DPRINTF_FUNC(); + if (capacities && capacitiesMaxCount > 0) { + capacities[0] = (ZFS_POOL_DEV_BSIZE * ZFS_POOL_DEV_BCOUNT); + dprintf("capacity %llu", capacities[0]); + } + + /* Always inform caller of capacity count */ + return (1); +} + +/* Returns full pool name from instance private var */ +char * +ZFSPool::getProductString() +{ + if (productString) dprintf("[%s]", productString); + /* Return class private string */ + return ((char *)productString); +} + +/* Returns readonly status from instance private var */ +IOReturn +ZFSPool::reportWriteProtection(bool *isWriteProtected) +{ + DPRINTF_FUNC(); + if (isWriteProtected) *isWriteProtected = isReadOnly; + return (kIOReturnSuccess); +} + +/* These return class static string for all instances */ +char * +ZFSPool::getVendorString() +{ + dprintf("[%s]", vendorString); + /* Return class static string */ + return ((char *)vendorString); +} +char * +ZFSPool::getRevisionString() +{ + dprintf("[%s]", revisionString); + /* Return class static string */ + return ((char *)revisionString); +} +char * +ZFSPool::getAdditionalDeviceInfoString() +{ + dprintf("[%s]", infoString); + /* Return class static string */ + return ((char *)infoString); +} + +/* Always return media present and unchanged */ +IOReturn +ZFSPool::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + DPRINTF_FUNC(); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +/* Always report nonremovable and nonejectable */ +IOReturn +ZFSPool::reportRemovability(bool *isRemoveable) +{ + DPRINTF_FUNC(); + if (isRemoveable) *isRemoveable = false; + return (kIOReturnSuccess); +} +IOReturn +ZFSPool::reportEjectability(bool *isEjectable) +{ + DPRINTF_FUNC(); + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* Always report 512b blocksize */ +IOReturn +ZFSPool::reportBlockSize(UInt64 *blockSize) +{ + DPRINTF_FUNC(); + if (!blockSize) + return (kIOReturnError); + + *blockSize = ZFS_POOL_DEV_BSIZE; + return (kIOReturnSuccess); +} + +/* XXX Calculate from dev_bcount, should get size from objset */ +/* XXX Can issue message kIOMessageMediaParametersHaveChanged to update */ +IOReturn +ZFSPool::reportMaxValidBlock(UInt64 *maxBlock) +{ + DPRINTF_FUNC(); + if (!maxBlock) + return (kIOReturnError); + + // *maxBlock = 0; + *maxBlock = ZFS_POOL_DEV_BCOUNT - 1; + dprintf("maxBlock %llu", *maxBlock); + + return (kIOReturnSuccess); +} +#endif diff --git a/module/os/macos/zfs/abd_os.c b/module/os/macos/zfs/abd_os.c new file mode 100644 index 0000000000..4a702b1ab4 --- /dev/null +++ b/module/os/macos/zfs/abd_os.c @@ -0,0 +1,482 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2020 by Jorgen Lundman. 
All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled. + */ + +#include +#include +#include +#include +#include + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 4096; + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are + * just a single zero'd sized zfs_abd_chunk_size buffer. This + * allows us to conserve memory by only using a single zero buffer + * for the scatter chunks. + */ +abd_t *abd_zero_scatter = NULL; +static char *abd_zero_buf = NULL; + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +static size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + ABD_SCATTER(abd).abd_offset + abd->abd_size)); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size <= zfs_abd_chunk_size ? 
B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + size_t n = abd_scatter_chunkcnt(abd); + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + /* + * There is no scatter linear pages in FreeBSD so there is an + * if an error if the ABD has been marked as a linear page. + */ + VERIFY(!abd_is_linear_page(abd)); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + } +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + size_t n = abd_chunkcnt_for_bytes(size); + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; +} + +void +abd_free_chunks(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } +} + +abd_t * +abd_alloc_struct(size_t size) +{ + size_t chunkcnt = abd_chunkcnt_for_bytes(size); + size_t abd_size = offsetof(abd_t, + abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(MAX(abd_size, sizeof (abd_t)), KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + abd->abd_orig_size = MAX(abd_size, sizeof (abd_t)); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_gang_link)); + kmem_free(abd, MAX(size, sizeof(abd_t))); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where + * each chunk in the scatterlist will be set to abd_zero_buf. 
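+ *
+ * For example, assuming the default zfs_abd_chunk_size of 4096 and a
+ * SPA_MAXBLOCKSIZE of 16 MiB, abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE)
+ * yields 4096 chunk pointers, yet only one zeroed 4 KiB buffer is ever
+ * allocated, because every abd_chunks[i] is pointed at abd_zero_buf.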
+ */ +static void +abd_alloc_zero_scatter(void) +{ + size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + zfs_refcount_create(&abd_zero_scatter->abd_children); + + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_chunk_size = + zfs_abd_chunk_size; + + for (int i = 0; i < n; i++) { + ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = + abd_zero_buf; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size); +} + +static void +abd_free_zero_scatter(void) +{ + zfs_refcount_destroy(&abd_zero_scatter->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size); + + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; + kmem_free(abd_zero_buf, zfs_abd_chunk_size); +} + +void +abd_init(void) +{ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } + + abd_alloc_zero_scatter(); +} + +void +abd_fini(void) +{ + abd_free_zero_scatter(); + + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* + * FreeBSD does not have have scatter linear pages + * so there is an error. + */ + VERIFY(0); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * This is just a helper function to abd_get_offset_scatter() to alloc a + * scatter ABD using the calculated chunkcnt based on the offset within the + * parent ABD. 
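+ *
+ * As an illustrative example, assuming zfs_abd_chunk_size is 4096 and
+ * the parent ABD starts at abd_offset 0 with 16 chunks, an offset of
+ * 10000 gives:
+ *
+ *	new_offset = 0 + 10000
+ *	chunkcnt   = 16 - (10000 / 4096) = 14
+ *	abd_offset = 10000 % 4096 = 1808
+ *
+ * so the child shares the last 14 chunk pointers of its parent.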
+ */ +static abd_t * +abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt) +{ + size_t abd_size = offsetof(abd_t, + abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(MAX(abd_size, sizeof (abd_t)), KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + abd->abd_orig_size = MAX(abd_size, sizeof (abd_t)); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +abd_t * +abd_get_offset_scatter(abd_t *sabd, size_t off) +{ + abd_t *abd = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&ABD_SCATTER(abd).abd_chunks, + &ABD_SCATTER(sabd).abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + + return (abd); +} + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
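+ *
+ * For a scattered ABD the chunk index and intra-chunk offset follow
+ * from the iterator position; e.g. (illustrative values) with
+ * zfs_abd_chunk_size 4096, abd_offset 100 and iter_pos 5000:
+ *
+ *	index   = (100 + 5000) / 4096 = 1
+ *	offset  = (100 + 5000) % 4096 = 1004
+ *	mapsize = MIN(4096 - 1004, abd_size - iter_pos)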
+ */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + ABD_SCATTER(aiter->iter_abd).abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ + kmem_cache_reap_now(abd_chunk_cache); +} diff --git a/module/os/macos/zfs/arc_os.c b/module/os/macos/zfs/arc_os.c new file mode 100644 index 0000000000..23328a3780 --- /dev/null +++ b/module/os/macos/zfs/arc_os.c @@ -0,0 +1,883 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +extern arc_stats_t arc_stats; + +static kmutex_t arc_reclaim_lock; +static kcondvar_t arc_reclaim_thread_cv; +static boolean_t arc_reclaim_thread_exit; +static kcondvar_t arc_reclaim_waiters_cv; + +/* + * log2(fraction of ARC which must be free to allow growing). + * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. 
+ * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. + */ +extern int arc_no_grow_shift; + + +/* + * Return a default max arc size based on the amount of physical memory. + */ +uint64_t +arc_default_max(uint64_t min, uint64_t allmem) +{ + /* Default to 1/3 of all memory. */ + return (MAX(allmem / 3, min)); +} + +#ifdef _KERNEL + +/* Remove these uses of _Atomic */ +static _Atomic boolean_t arc_reclaim_in_loop = B_FALSE; +static _Atomic int64_t reclaim_shrink_target = 0; + +/* + * Return maximum amount of memory that we could possibly use. Reduced + * to half of all memory in user space which is primarily used for testing. + */ +uint64_t +arc_all_memory(void) +{ + return (kmem_size()); +} + +/* + * Return the amount of memory that is considered free. In user space + * which is primarily used for testing we pretend that free memory ranges + * from 0-20% of all memory. + */ +uint64_t +arc_free_memory(void) +{ + int64_t avail; + + avail = spl_free_wrapper(); + return (avail >= 0LL ? avail : 0LL); +} + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. + */ +int64_t +arc_available_memory(void) +{ + return (arc_free_memory() - arc_sys_free); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + int64_t available_memory = spl_free_wrapper(); + int64_t freemem = available_memory / PAGESIZE; + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + + if (freemem > physmem * arc_lotsfree_percent / 100) { + page_load = 0; + return (0); + } + + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. 
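+ *
+ * Roughly, the checks below either let the write proceed (return 0),
+ * ask the caller to throttle and retry (EAGAIN), or force the
+ * transaction to back off entirely (ERESTART) once page_load has
+ * outrun the memory that is still available.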
+ */ + + if (spl_free_manual_pressure_wrapper() != 0 && + arc_reclaim_in_loop == B_FALSE) { + cv_signal(&arc_reclaim_thread_cv); + kpreempt(KPREEMPT_SYNC); + page_load = 0; + } + + if (!spl_minimal_physmem_p() && page_load > 0) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + printf("ZFS: %s: !spl_minimal_physmem_p(), available_memory " + "== %lld, page_load = %llu, txg = %llu, reserve = %llu\n", + __func__, available_memory, page_load, txg, reserve); + if (arc_reclaim_in_loop == B_FALSE) + cv_signal(&arc_reclaim_thread_cv); + kpreempt(KPREEMPT_SYNC); + page_load = 0; + return (SET_ERROR(EAGAIN)); + } + + if (arc_reclaim_needed() && page_load > 0) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + printf("ZFS: %s: arc_reclaim_needed(), available_memory " + "== %lld, page_load = %llu, txg = %llu, reserve = %lld\n", + __func__, available_memory, page_load, txg, reserve); + if (arc_reclaim_in_loop == B_FALSE) + cv_signal(&arc_reclaim_thread_cv); + kpreempt(KPREEMPT_SYNC); + page_load = 0; + return (SET_ERROR(EAGAIN)); + } + + /* as with sun, assume we are reclaiming */ + if (available_memory <= 0 || page_load > available_memory / 4) { + return (SET_ERROR(ERESTART)); + } + + if (!spl_minimal_physmem_p()) { + page_load += reserve/8; + return (0); + } + + page_load = 0; + + return (0); +} + +int64_t +arc_shrink(int64_t to_free) +{ + int64_t shrank = 0; + int64_t arc_c_before = arc_c; + int64_t arc_adjust_evicted = 0; + + uint64_t asize = aggsum_value(&arc_size); + if (arc_c > arc_c_min) { + + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (asize < arc_c) + arc_c = MAX(asize, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + shrank = arc_c_before - arc_c; + + return (shrank + arc_adjust_evicted); +} + + +/* + * arc.c has a arc_reap_zthr we should probably use, instead of + * having our own legacy arc_reclaim_thread(). + */ +static void arc_kmem_reap_now(void) +{ + arc_wait_for_eviction(0); + + /* arc.c will do the heavy lifting */ + arc_kmem_reap_soon(); + + /* Now some OsX additionals */ + extern kmem_cache_t *abd_chunk_cache; + extern kmem_cache_t *znode_cache; + + kmem_cache_reap_now(abd_chunk_cache); + if (znode_cache) kmem_cache_reap_now(znode_cache); + + if (zio_arena_parent != NULL) { + /* + * Ask the vmem arena to reclaim unused memory from its + * quantum caches. + */ + vmem_qcache_reap(zio_arena_parent); + } +} + + + +/* + * Threads can block in arc_get_data_impl() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_impl() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). 
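+ *
+ * A minimal sketch of that pattern (as used by the eviction code in
+ * arc.c, not by the function below), assuming a per-header "hash_lock":
+ *
+ *	if (!mutex_tryenter(hash_lock))
+ *		continue;	(skip this header instead of blocking)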
+ */ +static void +arc_reclaim_thread(void *unused) +{ + hrtime_t growtime = 0; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_lock); + while (!arc_reclaim_thread_exit) { + arc_reclaim_in_loop = B_TRUE; + uint64_t evicted = 0; + + mutex_exit(&arc_reclaim_lock); + + if (reclaim_shrink_target > 0) { + int64_t t = reclaim_shrink_target; + reclaim_shrink_target = 0; + evicted = arc_shrink(t); + extern kmem_cache_t *abd_chunk_cache; + kmem_cache_reap_now(abd_chunk_cache); + IOSleep(1); + goto lock_and_sleep; + } + + int64_t pre_adjust_free_memory = MIN(spl_free_wrapper(), + arc_available_memory()); + + int64_t manual_pressure = spl_free_manual_pressure_wrapper(); + spl_free_set_pressure(0); // clears both spl pressure variables + + /* + * We call arc_adjust() before (possibly) calling + * arc_kmem_reap_now(), so that we can wake up + * arc_get_data_impl() sooner. + */ + arc_wait_for_eviction(0); + + int64_t free_memory = arc_available_memory(); + + int64_t post_adjust_manual_pressure = + spl_free_manual_pressure_wrapper(); + manual_pressure = MAX(manual_pressure, + post_adjust_manual_pressure); + spl_free_set_pressure(0); + + int64_t post_adjust_free_memory = + MIN(spl_free_wrapper(), arc_available_memory()); + + // if arc_adjust() evicted, we expect post_adjust_free_memory + // to be larger than pre_adjust_free_memory (as there should + // be more free memory). + int64_t d_adj = post_adjust_free_memory - + pre_adjust_free_memory; + + if (manual_pressure > 0 && post_adjust_manual_pressure == 0) { + // pressure did not get re-signalled during arc_adjust() + if (d_adj >= 0) { + manual_pressure -= MIN(evicted, d_adj); + } else { + manual_pressure -= evicted; + } + } else if (evicted > 0 && manual_pressure > 0 && + post_adjust_manual_pressure > 0) { + // otherwise use the most recent pressure value + manual_pressure = post_adjust_manual_pressure; + } + + free_memory = post_adjust_free_memory; + + if (free_memory >= 0 && manual_pressure <= 0 && evicted > 0) { + extern kmem_cache_t *abd_chunk_cache; + kmem_cache_reap_now(abd_chunk_cache); + } + + if (free_memory < 0 || manual_pressure > 0) { + + if (free_memory <= + (arc_c >> arc_no_grow_shift) + SPA_MAXBLOCKSIZE) { + arc_no_grow = B_TRUE; + + /* + * Absorb occasional low memory conditions, as they + * may be caused by a single sequentially writing thread + * pushing a lot of dirty data into the ARC. + * + * In particular, we want to quickly + * begin re-growing the ARC if we are + * not in chronic high pressure. + * However, if we're in chronic high + * pressure, we want to reduce reclaim + * thread work by keeping arc_no_grow set. + * + * If growtime is in the past, then set it to last + * half a second (which is the length of the + * cv_timedwait_hires() call below; if this works, + * that value should be a parameter, #defined or constified. + * + * If growtime is in the future, then make sure that it + * is no further than 60 seconds into the future. + * If it's in the nearer future, then grow growtime by + * an exponentially increasing value starting with 500msec. + * + */ + const hrtime_t curtime = gethrtime(); + const hrtime_t agr = SEC2NSEC(arc_grow_retry); + static int grow_pass = 0; + + if (growtime == 0) { + growtime = curtime + MSEC2NSEC(500); + grow_pass = 0; + } else { + // check for 500ms not being enough + ASSERT3U(growtime, >, curtime); + if (growtime <= curtime) + growtime = curtime + + MSEC2NSEC(500); + + // growtime is in the future! 
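+ // With grow_pass capped at 10, the successive
+ // increments are roughly 0.5s, 1s, 2s, ... up to
+ // 512s, and each step is additionally capped at
+ // half of arc_grow_retry.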
+ const hrtime_t difference = + growtime - curtime; + + if (difference >= agr) { + // cap arc_grow_retry secs now + growtime = curtime + agr - 1LL; + grow_pass = 0; + } else { + hrtime_t grow_by = + MSEC2NSEC(500) * + (1LL << grow_pass); + + if (grow_by > (agr >> 1)) + grow_by = agr >> 1; + + growtime += grow_by; + + // add 512 seconds maximum + if (grow_pass < 10) + grow_pass++; + } + } + } + + arc_warm = B_TRUE; + + arc_kmem_reap_now(); + + /* + * If we are still low on memory, shrink the ARC + * so that we have arc_shrink_min free space. + */ + free_memory = arc_available_memory(); + + static int64_t old_to_free = 0; + + int64_t to_free = + (arc_c >> arc_shrink_shift) - free_memory; + + if (to_free > 0 || manual_pressure != 0) { + // 2 * SPA_MAXBLOCKSIZE + const int64_t large_amount = + 32LL * 1024LL * 1024LL; + const int64_t huge_amount = + 128LL * 1024LL * 1024LL; + + if (to_free > large_amount || + evicted > huge_amount) + dprintf("SPL: %s: post-reap %lld " + "post-evict %lld adjusted %lld " + "pre-adjust %lld to-free %lld" + " pressure %lld\n", + __func__, free_memory, d_adj, + evicted, pre_adjust_free_memory, + to_free, manual_pressure); + to_free = MAX(to_free, manual_pressure); + + int64_t old_arc_size = + (int64_t)aggsum_value(&arc_size); + (void) arc_shrink(to_free); + int64_t new_arc_size = + (int64_t)aggsum_value(&arc_size); + int64_t arc_shrink_freed = + old_arc_size - new_arc_size; + int64_t left_to_free = + to_free - arc_shrink_freed; + if (left_to_free <= 0) { + if (arc_shrink_freed > large_amount) { + printf("ZFS: %s, arc_shrink " + "freed %lld, zeroing " + "old_to_free from %lld\n", + __func__, arc_shrink_freed, + old_to_free); + } + old_to_free = 0; + } else if (arc_shrink_freed > 2LL * + (int64_t)SPA_MAXBLOCKSIZE) { + printf("ZFS: %s, arc_shrink freed " + "%lld, setting old_to_free to " + "%lld from %lld\n", + __func__, arc_shrink_freed, + left_to_free, old_to_free); + old_to_free = left_to_free; + } else { + old_to_free = left_to_free; + } + + // If we have reduced ARC by a lot before + // this point, try to give memory back to + // lower arenas (and possibly xnu). + + int64_t total_freed = + arc_shrink_freed + evicted; + if (total_freed >= huge_amount) { + if (zio_arena_parent != NULL) + vmem_qcache_reap( + zio_arena_parent); + } + if (arc_shrink_freed > 0) + evicted += arc_shrink_freed; + } else if (old_to_free > 0) { + printf("ZFS: %s, (old_)to_free has " + "returned to zero from %lld\n", + __func__, old_to_free); + old_to_free = 0; + } + + } else if (free_memory < (arc_c >> arc_no_grow_shift) && + aggsum_value(&arc_size) > + arc_c_min + SPA_MAXBLOCKSIZE) { + // relatively low memory and arc is above arc_c_min + arc_no_grow = B_TRUE; + growtime = gethrtime() + SEC2NSEC(1); + } + + if (growtime > 0 && gethrtime() >= growtime) { + if (arc_no_grow == B_TRUE) + dprintf("ZFS: arc growtime expired\n"); + growtime = 0; + arc_no_grow = B_FALSE; + } + +lock_and_sleep: + + mutex_enter(&arc_reclaim_lock); + + /* + * If evicted is zero, we couldn't evict anything via + * arc_adjust(). This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. + */ + if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) { + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. 
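+ *
+ * Note that cv_timedwait_hires() below takes its timeout and
+ * resolution as hrtime_t nanosecond values, hence the MSEC2NSEC()
+ * conversions (500ms timeout, 1ms resolution).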
+ */ + cv_broadcast(&arc_reclaim_waiters_cv); + + arc_reclaim_in_loop = B_FALSE; + /* + * Block until signaled, or after one second (we + * might need to perform arc_kmem_reap_now() + * even if we aren't being signalled) + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&arc_reclaim_thread_cv, + &arc_reclaim_lock, MSEC2NSEC(500), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); + + } else if (evicted >= SPA_MAXBLOCKSIZE * 3) { + // we evicted plenty of buffers, so let's wake up + // all the waiters rather than having them stall + cv_broadcast(&arc_reclaim_waiters_cv); + } else { + // we evicted some buffers but are still overflowing, + // so wake up only one waiter + cv_signal(&arc_reclaim_waiters_cv); + } + } + + arc_reclaim_thread_exit = B_FALSE; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + thread_exit(); +} + +uint64_t +isqrt(uint64_t n) +{ + int i; + uint64_t r, tmp; + r = 0; + for (i = 64/2-1; i >= 0; i--) { + tmp = r | (1 << i); + if (tmp*tmp <= n) + r = tmp; + } + return (r); +} + +/* This is called before arc is initialized, and threads are not running */ +void +arc_lowmem_init(void) +{ + /* + * The ARC tries to keep at least this much memory available for the + * system. This gives the ARC time to shrink in response to memory + * pressure, before running completely out of memory and invoking the + * direct-reclaim ARC shrinker. + * + * This should be more than twice high_wmark_pages(), so that + * arc_wait_for_eviction() will wait until at least the + * high_wmark_pages() are free (see arc_evict_state_impl()). + * + * Note: Even when the system is very low on memory, the kernel's + * shrinker code may only ask for one "batch" of pages (512KB) to be + * evicted. If concurrent allocations consume these pages, there may + * still be insufficient free pages, and the OOM killer takes action. + * + * By setting arc_sys_free large enough, and having + * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 + * free memory, it is much less likely that concurrent allocations can + * consume all the memory that was evicted before checking for + * OOM. + * + * It's hard to iterate the zones from a linux kernel module, which + * makes it difficult to determine the watermark dynamically. Instead + * we compute the maximum high watermark for this system, based + * on the amount of memory, assuming default parameters on Linux kernel + * 5.3. + */ + /* + * Base wmark_low is 4 * the square root of Kbytes of RAM. + */ + uint64_t allmem = kmem_size(); + long wmark = 4 * (long)isqrt(allmem/1024) * 1024; + + /* + * Clamp to between 128K and 64MB. + */ + wmark = MAX(wmark, 128 * 1024); + wmark = MIN(wmark, 64 * 1024 * 1024); + + /* + * watermark_boost can increase the wmark by up to 150%. + */ + wmark += wmark * 150 / 100; + + /* + * arc_sys_free needs to be more than 2x the watermark, because + * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up + * to 3x to ensure we're above it. 
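+ *
+ * As a worked example (illustrative only), on a machine with 16 GiB
+ * of RAM: isqrt(16 GiB / 1024) = 4096, so wmark = 4 * 4096 * 1024 =
+ * 16 MiB; the boost raises it to 40 MiB, and arc_sys_free becomes
+ * 3 * 40 MiB + 16 GiB / 32 = 632 MiB.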
+ */ + arc_sys_free = wmark * 3 + allmem / 32; + +} + +/* This is called after arc is initialized, and thread are running */ +void +arc_os_init(void) +{ + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + + arc_reclaim_thread_exit = B_FALSE; + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + arc_warm = B_FALSE; + +} + +void +arc_lowmem_fini(void) +{ +} + +void +arc_os_fini(void) +{ + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = B_TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * B_FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); + + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); +} + +/* + * Uses ARC static variables in logic. + */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +/* max size for dnodes */ +#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ + +/* So close, they made arc_min_prefetch_ms be static, but no others */ + +int +arc_kstat_update_osx(kstat_t *ksp, int rw) +{ + osx_kstat_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + /* Did we change the value ? */ + if (ks->arc_zfs_arc_max.value.ui64 != zfs_arc_max) { + + /* Assign new value */ + zfs_arc_max = ks->arc_zfs_arc_max.value.ui64; + + /* Update ARC with new value */ + if (zfs_arc_max > 64<<20 && zfs_arc_max < + physmem * PAGESIZE) + arc_c_max = zfs_arc_max; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* If meta_limit is not set, adjust it automatically */ + if (!zfs_arc_meta_limit) + arc_meta_limit = arc_c_max / 4; + } + + if (ks->arc_zfs_arc_min.value.ui64 != zfs_arc_min) { + zfs_arc_min = ks->arc_zfs_arc_min.value.ui64; + if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) { + arc_c_min = zfs_arc_min; + printf("ZFS: set arc_c_min %llu, arc_meta_min " + "%llu, zfs_arc_meta_min %llu\n", + arc_c_min, arc_meta_min, zfs_arc_meta_min); + if (arc_c < arc_c_min) { + printf("ZFS: raise arc_c %llu to " + "arc_c_min %llu\n", arc_c, + arc_c_min); + arc_c = arc_c_min; + if (arc_p < (arc_c >> 1)) { + printf("ZFS: raise arc_p %llu " + "to %llu\n", + arc_p, (arc_c >> 1)); + arc_p = (arc_c >> 1); + } + } + } + } + + if (ks->arc_zfs_arc_meta_limit.value.ui64 != + zfs_arc_meta_limit) { + zfs_arc_meta_limit = + ks->arc_zfs_arc_meta_limit.value.ui64; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && + zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && + zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + + printf("ZFS: set arc_meta_limit %llu, arc_c_min %llu," + "zfs_arc_meta_limit %lu\n", + arc_meta_limit, arc_c_min, zfs_arc_meta_limit); + } + + if (ks->arc_zfs_arc_meta_min.value.ui64 != zfs_arc_meta_min) { + zfs_arc_meta_min = ks->arc_zfs_arc_meta_min.value.ui64; + if (zfs_arc_meta_min >= arc_c_min) { + printf("ZFS: probable error, zfs_arc_meta_min " + "%llu >= arc_c_min %llu\n", + zfs_arc_meta_min, arc_c_min); + } + if (zfs_arc_meta_min > 0 && + zfs_arc_meta_min <= 
arc_meta_limit) + arc_meta_min = zfs_arc_meta_min; + printf("ZFS: set arc_meta_min %llu\n", arc_meta_min); + } + + zfs_arc_grow_retry = ks->arc_zfs_arc_grow_retry.value.ui64; + arc_grow_retry = zfs_arc_grow_retry; + zfs_arc_shrink_shift = ks->arc_zfs_arc_shrink_shift.value.ui64; + zfs_arc_p_min_shift = ks->arc_zfs_arc_p_min_shift.value.ui64; + zfs_arc_average_blocksize = + ks->arc_zfs_arc_average_blocksize.value.ui64; + + } else { + + ks->arc_zfs_arc_max.value.ui64 = zfs_arc_max; + ks->arc_zfs_arc_min.value.ui64 = zfs_arc_min; + + ks->arc_zfs_arc_meta_limit.value.ui64 = zfs_arc_meta_limit; + ks->arc_zfs_arc_meta_min.value.ui64 = zfs_arc_meta_min; + + ks->arc_zfs_arc_grow_retry.value.ui64 = + zfs_arc_grow_retry ? zfs_arc_grow_retry : arc_grow_retry; + ks->arc_zfs_arc_shrink_shift.value.ui64 = zfs_arc_shrink_shift; + ks->arc_zfs_arc_p_min_shift.value.ui64 = zfs_arc_p_min_shift; + ks->arc_zfs_arc_average_blocksize.value.ui64 = + zfs_arc_average_blocksize; + } + return (0); +} + +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffered they reference. This provides a mechanism to ensure the ARC can + * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * is analogous to dnlc_reduce_cache() but more generic. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +void +arc_prune_async(int64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + +#else /* KERNEL */ + +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + + /* Every 100 calls, free a small amount */ + if (spa_get_random(100) == 0) + lowest = -1024; + + return (lowest); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + return (0); +} + +uint64_t +arc_all_memory(void) +{ + return (ptob(physmem) / 2); +} + +uint64_t +arc_free_memory(void) +{ + return (spa_get_random(arc_all_memory() * 20 / 100)); +} + +#endif /* KERNEL */ diff --git a/module/os/macos/zfs/ldi_iokit.cpp b/module/os/macos/zfs/ldi_iokit.cpp new file mode 100644 index 0000000000..28009f9d73 --- /dev/null +++ b/module/os/macos/zfs/ldi_iokit.cpp @@ -0,0 +1,1990 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +/* + * Apple IOKit (c++) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS internal + */ +#include + +/* + * LDI Includes + */ +#include + +/* Debug prints */ + +/* Attach created IOService objects to the IORegistry under ZFS. */ +// #define LDI_IOREGISTRY_ATTACH + +/* + * Globals + */ +static IOService *ldi_zfs_handle; + +/* Exposed to c callers */ +extern "C" { + +struct _handle_iokit { + IOMedia *media; + IOService *client; +}; /* 16b */ + +struct _handle_notifier { + IONotifier *obj; +}; /* 8b */ + +#define LH_MEDIA(lhp) lhp->lh_tsd.iokit_tsd->media +#define LH_CLIENT(lhp) lhp->lh_tsd.iokit_tsd->client +#define LH_NOTIFIER(lhp) lhp->lh_notifier->obj + +void +handle_free_iokit(struct ldi_handle *lhp) { + if (!lhp) { + dprintf("%s missing lhp\n", __func__); + return; + } + + if (!lhp->lh_tsd.iokit_tsd) { + dprintf("%s missing iokit_tsd\n", __func__); + return; + } + + /* Free IOService client */ + if (handle_free_ioservice(lhp) != 0) { + dprintf("%s lhp %p client %s\n", + __func__, lhp, "couldn't be removed"); + } + + kmem_free(lhp->lh_tsd.iokit_tsd, sizeof (struct _handle_iokit)); + lhp->lh_tsd.iokit_tsd = 0; +} + +/* Returns handle with lock still held */ +struct ldi_handle * +handle_alloc_iokit(dev_t device, int fmode) +{ + struct ldi_handle *lhp, *retlhp; + + /* Search for existing handle */ + if ((retlhp = handle_find(device, fmode, B_TRUE)) != NULL) { + dprintf("%s found handle before alloc\n", __func__); + return (retlhp); + } + + /* Allocate an LDI IOKit handle */ + if ((lhp = handle_alloc_common(LDI_TYPE_IOKIT, device, + fmode)) == NULL) { + dprintf("%s couldn't allocate handle\n", __func__); + return (NULL); + } + + /* Allocate and clear type-specific device data */ + lhp->lh_tsd.iokit_tsd = (struct _handle_iokit *)kmem_alloc( + sizeof (struct _handle_iokit), KM_SLEEP); + LH_MEDIA(lhp) = 0; + LH_CLIENT(lhp) = 0; + + /* Allocate an IOService client for open/close */ + if (handle_alloc_ioservice(lhp) != 0) { + dprintf("%s couldn't allocate IOService client\n", __func__); + handle_release(lhp); + return (NULL); + } + + /* Add the handle to the list, or return match */ + if ((retlhp = handle_add(lhp)) == NULL) { + dprintf("%s handle_add failed\n", __func__); + handle_release(lhp); + return (NULL); + } + + /* Check if new or found handle was returned */ + if (retlhp != lhp) { + dprintf("%s found handle after alloc\n", 
__func__); + handle_release(lhp); + lhp = 0; + } + + return (retlhp); +} + +int +handle_free_ioservice(struct ldi_handle *lhp) +{ + /* Validate handle pointer */ + ASSERT3U(lhp, !=, NULL); +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + if (!LH_CLIENT(lhp)) { + dprintf("%s missing client\n", __func__); + return (ENODEV); + } +#endif + +#ifdef LDI_IOREGISTRY_ATTACH + /* Detach client from ZFS in IORegistry */ + LH_CLIENT(lhp)->detach(ldi_zfs_handle); +#endif + + LH_CLIENT(lhp)->stop(ldi_zfs_handle); + LH_CLIENT(lhp)->release(); + LH_CLIENT(lhp) = 0; + + return (0); +} + +int +handle_alloc_ioservice(struct ldi_handle *lhp) +{ + IOService *client; + + /* Validate handle pointer */ + ASSERT3U(lhp, !=, NULL); + if (lhp == NULL) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + + /* Allocate and init an IOService client for open/close */ + if ((client = new IOService) == NULL) { + dprintf("%s couldn't allocate new IOService\n", __func__); + return (ENOMEM); + } + if (client->init(0) != true) { + dprintf("%s IOService init failed\n", __func__); + client->release(); + return (ENOMEM); + } + +#ifdef LDI_IOREGISTRY_ATTACH + /* Attach client to ZFS in IORegistry */ + if (client->attach(ldi_zfs_handle) != true) { + dprintf("%s IOService attach failed\n", __func__); + client->release(); + return (ENOMEM); + } +#endif + + /* Start service */ + if (client->start(ldi_zfs_handle) != true) { + dprintf("%s IOService attach failed\n", __func__); + /* Detach client from ZFS in IORegistry */ +#ifdef LDI_IOREGISTRY_ATTACH + client->detach(ldi_zfs_handle); +#endif + client->release(); + return (ENOMEM); + } + + LH_CLIENT(lhp) = client; + return (0); +} + +/* Set status to Offline and post event */ +static bool +handle_media_terminate_cb(void* target, void* refCon, + IOService* newService, IONotifier* notifier) +{ + struct ldi_handle *lhp = (struct ldi_handle *)refCon; + +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing refCon ldi_handle\n", __func__); + return (false); + } +#endif + + /* Take hold on handle to prevent removal */ + handle_hold(lhp); + + dprintf("%s setting lhp %p to Offline status\n", __func__, lhp); + if (handle_status_change(lhp, LDI_STATUS_OFFLINE) != 0) { + dprintf("%s handle_status_change failed\n", __func__); + handle_release(lhp); + return (false); + } + + handle_release(lhp); + return (true); +} + +int +handle_close_iokit(struct ldi_handle *lhp) +{ +#ifdef DEBUG + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_IOKIT); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_CLOSING); + + /* Validate IOMedia and IOService client */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp)) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif /* DEBUG */ + + LH_MEDIA(lhp)->close(LH_CLIENT(lhp)); + LH_MEDIA(lhp) = 0; + return (0); +} + +static int +handle_open_iokit(struct ldi_handle *lhp, IOMedia *media) +{ +#ifdef DEBUG + ASSERT3U(lhp, !=, NULL); + ASSERT3U(media, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_IOKIT); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_OPENING); + + /* Validate IOMedia and IOService client */ + if (!OSDynamicCast(IOMedia, media) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif /* DEBUG */ + /* Retain until open or error */ + media->retain(); + + /* + * If read/write mode is requested, check that the + * device is actually writeable. 
+ */ + if (lhp->lh_fmode & FWRITE && media->isWritable() == false) { + dprintf("%s read-write requested on %s\n", + __func__, "read-only IOMedia"); + media->release(); + return (EPERM); + } + + /* Call open with the IOService client handle */ + if (media->IOMedia::open(LH_CLIENT(lhp), 0, + (lhp->lh_fmode & FWRITE ? kIOStorageAccessReaderWriter : + kIOStorageAccessReader)) == false) { + dprintf("%s IOMedia->open failed\n", __func__); + media->release(); + return (EIO); + } + media->release(); + + /* Assign IOMedia device */ + LH_MEDIA(lhp) = media; + return (0); +} + +int +handle_get_size_iokit(struct ldi_handle *lhp, uint64_t *dev_size) +{ + if (!lhp || !dev_size) { + dprintf("%s missing lhp or dev_size\n", __func__); + return (EINVAL); + } + +#ifdef DEBUG + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s no IOMedia\n", __func__); + return (ENODEV); + } +#endif + + *dev_size = LH_MEDIA(lhp)->getSize(); + if (*dev_size == 0) { + dprintf("%s %s\n", __func__, + "IOMedia getSize returned 0"); + return (EINVAL); + } + + return (0); +} + +int +handle_get_dev_path_iokit(struct ldi_handle *lhp, + char *path, int len) +{ + int retlen = len; + + if (!lhp || !path || len == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + +#ifdef DEBUG + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s no IOMedia\n", __func__); + return (ENODEV); + } +#endif + + if (LH_MEDIA(lhp)->getPath(path, &retlen, gIODTPlane) == false) { + dprintf("%s getPath failed\n", __func__); + return (EIO); + } + +dprintf("%s got path [%s]\n", __func__, path); + return (0); +} + +int handle_get_bootinfo_iokit(struct ldi_handle *lhp, + struct io_bootinfo *bootinfo) +{ + int error = 0; + + if (!lhp || !bootinfo) { + dprintf("%s missing argument\n", __func__); +printf("%s missing argument\n", __func__); + return (EINVAL); + } + + if ((error = handle_get_size_iokit(lhp, + &bootinfo->dev_size)) != 0 || + (error = handle_get_dev_path_iokit(lhp, bootinfo->dev_path, + sizeof (bootinfo->dev_path))) != 0) { + dprintf("%s get size or dev_path error %d\n", + __func__, error); + } + + return (error); +} + +int +handle_sync_iokit(struct ldi_handle *lhp) +{ +#ifdef DEBUG + /* Validate IOMedia and client */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp)) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + /* Issue device sync */ + if (LH_MEDIA(lhp)->synchronize(LH_CLIENT(lhp), 0, 0, 0) != + kIOReturnSuccess) { + dprintf("%s %s\n", __func__, + "IOMedia synchronizeCache failed"); + return (ENOTSUP); + } +#else + /* Issue device sync */ + if (LH_MEDIA(lhp)->synchronizeCache(LH_CLIENT(lhp)) != + kIOReturnSuccess) { + dprintf("%s %s\n", __func__, + "IOMedia synchronizeCache failed"); + return (ENOTSUP); + } +#endif + + /* Success */ + return (0); +} + +static dev_t +dev_from_media(IOMedia *media) +{ + OSObject *property; + OSNumber *number; + uint32_t major, minor; + dev_t device = 0; + + /* Validate media */ + if (!media || !OSDynamicCast(IOMedia, media)) { + dprintf("%s no device\n", __func__); + return (0); + } + media->retain(); + + /* Get device major */ + if (NULL == (property = media->getProperty(kIOBSDMajorKey, + gIOServicePlane, kIORegistryIterateRecursively)) || + NULL == (number = OSDynamicCast(OSNumber, property))) { + dprintf("%s couldn't get BSD major\n", 
__func__); + media->release(); + return (0); + } + major = number->unsigned32BitValue(); + number = NULL; + property = NULL; + + /* Get device minor */ + if (NULL == (property = media->getProperty(kIOBSDMinorKey, + gIOServicePlane, kIORegistryIterateRecursively)) || + NULL == (number = OSDynamicCast(OSNumber, property))) { + dprintf("%s couldn't get BSD major\n", __func__); + media->release(); + return (0); + } + minor = number->unsigned32BitValue(); + number = NULL; + property = NULL; + + /* Cleanup */ + media->release(); + media = NULL; + + device = makedev(major, minor); + + /* Return 0 or valid dev_t */ + return (device); +} + +/* Returns NULL or dictionary with a retain count */ +static OSDictionary * +media_matchdict_from_dev(dev_t device) +{ + OSDictionary *matchDict; + OSNumber *majorNum, *minorNum; + + /* Validate dev_t */ + if (device == 0) { + dprintf("%s no dev_t provided\n", __func__); + return (NULL); + } + + /* Allocate OSNumbers for BSD major and minor (32-bit) */ + if (NULL == (majorNum = OSNumber::withNumber(major(device), 32)) || + NULL == (minorNum = OSNumber::withNumber(minor(device), 32))) { + dprintf("%s couldn't alloc major/minor as OSNumber\n", + __func__); + if (majorNum) { + majorNum->release(); + } + return (NULL); + } + + /* Match on IOMedia */ + if (NULL == (matchDict = IOService::serviceMatching("IOMedia")) || + !(matchDict->setObject(kIOBSDMajorKey, majorNum)) || + !(matchDict->setObject(kIOBSDMinorKey, minorNum))) { + dprintf("%s couldn't get matching dictionary\n", __func__); + if (matchDict) { + matchDict->release(); + } + majorNum->release(); + minorNum->release(); + return (NULL); + } + majorNum->release(); + minorNum->release(); + + /* Return NULL or valid OSDictionary with retain count */ + return (matchDict); +} + +/* Returns NULL or dictionary with a retain count */ +/* + * media_matchdict_from_path + * translate from paths of the form /dev/diskNsN + * or /private/var/run/disk/by-id/media- to a matching + * dictionary. 
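+ *
+ * For example (hypothetical names), "/dev/disk2" and "/dev/rdisk2s1"
+ * match on the BSD name (the leading 'r' is dropped), while
+ * "/private/var/run/disk/by-id/media-<UUID>" matches on the IOMedia
+ * UUID property; volume- and device- prefixed links are not supported.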
+ */ +static OSDictionary * +media_matchdict_from_path(const char *path) +{ + OSDictionary *matchDict = 0; + OSString *bsdName = NULL; + OSString *uuid = NULL; + const char *substr = 0; + bool ret; + + /* Validate path */ + if (path == 0 || strlen(path) <= 1) { + dprintf("%s no path provided\n", __func__); + return (NULL); + } + /* Translate /dev/diskN and InvariantDisks paths */ + if (strncmp(path, "/dev/", 5) != 0 && + strncmp(path, "/var/run/disk/by-id/", 20) != 0 && + strncmp(path, "/private/var/run/disk/by-id/", 28) != 0) { + dprintf("%s Unrecognized path %s\n", __func__, path); + return (NULL); + } + + /* Validate path and alloc bsdName */ + if (strncmp(path, "/dev/", 5) == 0) { + + /* substr starts after '/dev/' */ + substr = path + 5; + /* Get diskN from /dev/diskN or /dev/rdiskN */ + if (strncmp(substr, "disk", 4) == 0) { + bsdName = OSString::withCString(substr); + } else if (strncmp(substr, "rdisk", 5) == 0) { + bsdName = OSString::withCString(substr + 1); + } + } else if (strncmp(path, "/var/run/disk/by-id/", 20) == 0 || + strncmp(path, "/private/var/run/disk/by-id/", 28) == 0) { + /* InvariantDisks paths */ + + /* substr starts after '/by-id/' */ + substr = path + 20; + if (strncmp(path, "/private", 8) == 0) substr += 8; + + /* Handle media UUID, skip volume UUID or device GUID */ + if (strncmp(substr, "media-", 6) == 0) { + /* Lookup IOMedia with UUID */ + uuid = OSString::withCString(substr+strlen("media-")); + } else if (strncmp(substr, "volume-", 7) == 0) { + /* + * volume-UUID is specified by DiskArbitration + * when a Filesystem bundle is able to probe + * the media and retrieve/generate a UUID for + * it's contents. + * So while we could use this and have zfs.util + * probe for vdev GUID (and pool GUID) and + * generate a UUID, we would need to do the same + * here to find the disk, possibly probing + * devices to get the vdev GUID in the process. + */ + dprintf("%s Unsupported volume-UUID path %s\n", + __func__, path); + } else if (strncmp(substr, "device-", 7) == 0) { + /* Lookup IOMedia with device GUID */ + /* + * XXX Not sure when this is used, no devices + * seem to be presented this way. 
+ */ + dprintf("%s Unsupported device-GUID path %s\n", + __func__, path); + } else { + dprintf("%s unrecognized path %s\n", __func__, path); + } + /* by-path and by-serial are handled separately */ + } + + if (!bsdName && !uuid) { + dprintf("%s Invalid path %s\n", __func__, path); + return (NULL); + } + + /* Match on IOMedia by BSD disk name */ + matchDict = IOService::serviceMatching("IOMedia"); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + if (bsdName) bsdName->release(); + if (uuid) uuid->release(); + return (NULL); + } + if (bsdName) { + ret = matchDict->setObject(kIOBSDNameKey, bsdName); + bsdName->release(); + + if (!ret) { + dprintf("%s couldn't setup bsd name matching" + " dictionary\n", __func__); + matchDict->release(); + matchDict = 0; + } + if (uuid) uuid->release(); + } else if (uuid) { + if (matchDict->setObject(kIOMediaUUIDKey, uuid) == false) { + dprintf("%s couldn't setup UUID matching" + " dictionary\n", __func__); + uuid->release(); + matchDict->release(); + matchDict = 0; + } + } else { + dprintf("%s missing matching property\n", __func__); + matchDict->release(); + matchDict = 0; + } + + /* Return NULL or valid OSDictionary with retain count */ + return (matchDict); +} + +/* Returns NULL or matched IOMedia with a retain count */ +static IOMedia * +media_from_matchdict(OSDictionary *matchDict) +{ + OSIterator *iter = 0; + OSObject *obj = 0; + IOMedia *media = 0; + + if (!matchDict) { + dprintf("%s missing matching dictionary\n", __func__); + return (NULL); + } + + /* + * We could instead use copyMatchingService, since + * there should only be one match. + */ + iter = IOService::getMatchingServices(matchDict); + if (!iter) { + dprintf("%s No iterator from getMatchingServices\n", + __func__); + return (NULL); + } + + /* Get first object from iterator */ + while ((obj = iter->getNextObject()) != NULL) { + if ((media = OSDynamicCast(IOMedia, obj)) == NULL) { + obj = 0; + continue; + } + if (media->isFormatted() == false) { + obj = 0; + media = 0; + continue; + } + + media->retain(); + break; + } + + if (!media) { + dprintf("%s no match found\n", __func__); + iter->release(); + return (NULL); + } + +#ifdef DEBUG + /* Report if there were additional matches */ + if (iter->getNextObject() != NULL) { + dprintf("%s Had more potential matches\n", __func__); + } +#endif + iter->release(); + iter = 0; + + /* Return valid IOMedia with retain count */ + return (media); +} + +/* + * media_from_dev is intended to be called by ldi_open_by_name + * and ldi_open_by_dev with a dev_t, and returns NULL or an IOMedia + * device with a retain count that should be released on open. + */ +static IOMedia * +media_from_dev(dev_t device = 0) +{ + IOMedia *media; + OSDictionary *matchDict; + + /* Get matchDict, will need to be released */ + matchDict = media_matchdict_from_dev(device); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (NULL); + } + + /* Get first matching IOMedia */ + media = media_from_matchdict(matchDict); + matchDict->release(); + matchDict = 0; + + if (!media) { + dprintf("%s no IOMedia found for dev_t %d\n", __func__, + device); + } + + /* Return NULL or valid media with retain count */ + return (media); +} + +/* + * media_from_device_path + * + * translate /private/var/run/disk/by-path/ to an IOMedia + * handle. The remainder of the path should be a valid + * path in the IORegistry IODTPlane device tree. 
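+ *
+ * For example, a (hypothetical) link such as
+ * /private/var/run/disk/by-path/IODeviceTree:-PCI0@0-SATA@1F,2-@0:1
+ * is trimmed to keep the leading slash and has its dashes converted
+ * back to slashes, giving /IODeviceTree:/PCI0@0/SATA@1F,2/@0:1 for
+ * IORegistryEntry::fromPath().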
+ */ +static IOMedia * +media_from_device_path(const char *path = 0) +{ + IORegistryEntry *entry = 0; + IOMedia *media = 0; + OSString *osstr; + const char *string, *dash; + + /* Must be /var/run/disk/by-path/, but may have /private prefix */ + if (!path || path[0] == 0 || + (strncmp(path, "/var/run/disk/by-path/", 22) != 0 && + strncmp(path, "/private/var/run/disk/by-path/", 30) != 0)) { + dprintf("%s invalid path [%s]\n", __func__, + (path && path[0] != '\0' ? path : "")); + return (NULL); + } + + /* We need the leading slash in the string, so trim 21 or 29 */ + if (strncmp(path, "/private", 8) == 0) { + osstr = OSString::withCString(path+29); + } else { + osstr = OSString::withCString(path+21); + } + if (!osstr) { + dprintf("%s couldn't get string from path\n", __func__); + return (NULL); + } + + string = osstr->getCStringNoCopy(); + ASSERT(string); + + /* Convert dashes to slashes */ + while ((dash = strchr(string, '-')) != NULL) { + osstr->setChar('/', dash - string); + } + dprintf("%s string [%s]\n", __func__, string); + + entry = IORegistryEntry::fromPath(string, gIODTPlane); + string = 0; + osstr->release(); + osstr = 0; + + if (!entry) { + dprintf("%s IORegistryEntry::fromPath failed\n", __func__); + return (NULL); + } + + if ((media = OSDynamicCast(IOMedia, entry)) == NULL) { + entry->release(); + return (0); + } + + /* Leave a retain count on the media */ + return (media); +} + +/* + * media_from_serial + * + * translate /private/var/run/disk/by-serial/model-serial[:location] + * to an IOMedia handle. The path format is determined by + * InvariantDisks logic in IDSerialLinker.cpp. + */ +static IOMedia * +media_from_serial(const char *path = 0) +{ + IORegistryEntry *entry = 0; + IOMedia *media = 0; + OSDictionary *matching = 0; + OSDictionary *deviceCharacteristics = 0; + OSIterator *iter = 0; + OSString *osstr = 0; + OSString *model = 0; + OSString *serial = 0; + OSNumber *bsdUnit = 0; + OSObject *property = 0; + OSObject *propDict = 0; + OSObject *obj = 0; + const char *substr = 0; + const char *sep1 = 0, *sep2 = 0; + const char *string = 0, *space = 0; + const char *location = 0, *entryLocation = 0; + int newlen = 0, soff = 0; + bool matched = false; + + /* Must be /var/run/disk/by-serial/, but may have /private prefix */ + if (!path || path[0] == 0 || + (strncmp(path, "/var/run/disk/by-serial/", 24) != 0 && + strncmp(path, "/private/var/run/disk/by-serial/", 32) != 0)) { + dprintf("%s invalid path [%s]\n", __func__, + (path && path[0] != '\0' ? path : "")); + return (NULL); + } + + /* substr starts after '/by-serial/' */ + substr = path + 24; + if (strncmp(path, "/private", 8) == 0) substr += 8; + + /* + * For each whole-disk IOMedia: + * Search parents for deviceCharacteristics, or skip. + * Check for Model and Serial Number properties, or skip. + * Trim trailing space and swap underscores within string. + * If "model-serial" matches path so far: + * Match whole-disk IOMedia if no slice specified. + * Or get child IOMedia with matching Location property. 
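+ *
+ * For example (hypothetical device), a link such as
+ * /private/var/run/disk/by-serial/Samsung_SSD_860-S3Z8NB0K123456X:2
+ * is parsed as model "Samsung_SSD_860", serial "S3Z8NB0K123456X" and
+ * slice location "2"; spaces in the IOKit model and serial strings
+ * are converted to underscores before comparing.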
+ */ + + sep1 = strchr(substr, '-'); + sep2 = strrchr(substr, ':'); + if (sep1 == 0) { + dprintf("%s invalid by-serial path [%s]\n", __func__, substr); + return (NULL); + } + if (sep2 == 0) { + dprintf("%s no slice, whole disk [%s]\n", __func__, substr); + sep2 = substr + (strlen(substr)); + } + + if ((matching = IOService::serviceMatching("IOMedia")) == NULL) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (NULL); + } + + if ((matching->setObject(kIOMediaWholeKey, kOSBooleanTrue) == false) || + (iter = IOService::getMatchingServices(matching)) == NULL) { + dprintf("%s couldn't get IOMedia iterator\n", __func__); + matching->release(); + return (NULL); + } + matching->release(); + matching = 0; + + while ((obj = iter->getNextObject()) != NULL) { + if ((entry = OSDynamicCast(IORegistryEntry, obj)) == NULL || + (media = OSDynamicCast(IOMedia, entry)) == NULL || + media->isFormatted() == false) { + // media->isWhole() == false) { + continue; + } + + propDict = media->getProperty( + kIOPropertyDeviceCharacteristicsKey, gIOServicePlane, + (kIORegistryIterateRecursively | + kIORegistryIterateParents)); + if ((deviceCharacteristics = OSDynamicCast(OSDictionary, + propDict)) == NULL) { + dprintf("%s no device characteristics, skipping\n", + __func__); + continue; + } + + /* + * Get each property, cast as OSString, then copy + * to a new OSString. + */ + if ((property = deviceCharacteristics->getObject( + kIOPropertyProductNameKey)) == NULL || + (osstr = OSDynamicCast(OSString, property)) == NULL || + (model = OSString::withString(osstr)) == NULL) { + dprintf("%s no product name, skipping\n", __func__); + continue; + } + if ((property = deviceCharacteristics->getObject( + kIOPropertyProductSerialNumberKey)) == NULL || + (osstr = OSDynamicCast(OSString, property)) == NULL || + (serial = OSString::withString(osstr)) == NULL) { + dprintf("%s no serial number, skipping\n", __func__); + model->release(); + model = 0; + continue; + } + + string = model->getCStringNoCopy(); + if (!string) { + model->release(); + model = 0; + serial->release(); + serial = 0; + continue; + } + /* Trim trailing whitespace */ + for (newlen = strlen(string); newlen > 0; newlen--) { + if (string[newlen-1] != ' ') { + model->setChar('\0', newlen); + break; + } + } + + /* + * sep1 is the location of the first '-' in the path. + * even if there is a '-' in the model name, we can skip + * media with model names shorter than that. + */ + if (newlen == 0 || + (newlen < (sep1 - substr)) || + (substr[newlen] != '-')) { + model->release(); + model = 0; + serial->release(); + serial = 0; + continue; + } + + /* Convert spaces to underscores */ + while ((space = strchr(string, ' ')) != NULL) { + model->setChar('_', space - string); + } + + /* Compare the model string with the path */ + if (strncmp(substr, string, newlen) != 0) { + model->release(); + model = 0; + serial->release(); + serial = 0; + continue; + } + dprintf("%s model string matched [%s]\n", + __func__, model->getCStringNoCopy()); + model->release(); + model = 0; + + soff = newlen + 1; + + string = serial->getCStringNoCopy(); + if (!string) { + serial->release(); + serial = 0; + continue; + } + /* Trim trailing whitespace */ + for (newlen = strlen(string); newlen > 0; newlen--) { + if (string[newlen-1] != ' ') { + serial->setChar('\0', newlen); + break; + } + } + /* + * sep2 is the location of the last ':' in the path, or + * the end of the string if there is none. 
+ * even if there is a ':' in the serial number, we can skip + * media with serial number strings shorter than that. + */ + if (newlen == 0 || + (newlen < (sep2 - sep1 - 1)) || + (substr[soff+newlen] != '\0' && + substr[soff+newlen] != ':')) { + serial->release(); + serial = 0; + continue; + } + + /* Convert spaces to underscores */ + while ((space = strchr(string, ' ')) != NULL) { + serial->setChar('_', space - string); + } + + /* Compare the serial string with the path */ + if (strncmp(substr+soff, string, newlen) != 0) { + serial->release(); + serial = 0; + continue; + } + dprintf("%s serial string matched [%s]\n", + __func__, serial->getCStringNoCopy()); + serial->release(); + serial = 0; + + /* + * Still need to get the slice - the component + * after an optional ':' at the end of the + * string, by searching for IOMedia with that + * location string below the whole-disk IOMedia. + */ + /* Set new location of ':' */ + sep2 = substr + (soff + newlen); + /* Found match */ + matched = true; + media->retain(); + break; + } + iter->release(); + iter = 0; + + if (!matched || !media) { + dprintf("%s no matching devices found\n", __func__); + return (NULL); + } + + /* Whole disk path will not end with ':' */ + if (sep2[0] != ':') { + dprintf("%s Found whole disk [%s]\n", __func__, path); + /* Leave a retain count on the media */ + return (media); + } + + /* Remainder of string is location */ + location = sep2 + 1; + dprintf("%s location string [%s]\n", __func__, location); + + if ((bsdUnit = OSDynamicCast(OSNumber, + media->getProperty(kIOBSDUnitKey))) == NULL) { + dprintf("%s couldn't get BSD unit number\n", __func__); + media->release(); + return (NULL); + } + if ((matching = IOService::serviceMatching("IOMedia")) == NULL || + (matching->setObject(kIOMediaWholeKey, kOSBooleanFalse)) == false || + (matching->setObject(kIOBSDUnitKey, bsdUnit)) == false || + (iter = IOService::getMatchingServices(matching)) == NULL) { + dprintf("%s iterator for location failed\n", + __func__); + + if (matching) matching->release(); + /* We had a candidate, but couldn't get the location */ + media->release(); + return (NULL); + } + matching->release(); + matching = 0; + + /* Iterate over children checking for matching location */ + matched = false; + entry = 0; + while ((obj = iter->getNextObject()) != NULL) { + if ((entry = OSDynamicCast(IORegistryEntry, obj)) == NULL || + (OSDynamicCast(IOMedia, entry)) == NULL) { + entry = 0; + continue; + } + + if ((entryLocation = entry->getLocation()) == NULL || + (strlen(entryLocation) != strlen(location)) || + strcmp(entryLocation, location) != 0) { + entry = 0; + continue; + } + + dprintf("%s found match\n", __func__); + matched = true; + entry->retain(); + break; + } + iter->release(); + iter = 0; + + /* Drop the whole-disk media */ + media->release(); + media = 0; + + /* Cast the new entry, if there is one */ + if (!entry || (media = OSDynamicCast(IOMedia, entry)) == NULL) { +if (entry) dprintf("%s had entry but couldn't cast\n", __func__); + dprintf("%s no media found for path %s\n", + __func__, path); + if (entry) entry->release(); + return (NULL); + } + + dprintf("%s media from serial number succeeded\n", __func__); + + /* Leave a retain count on the media */ + return (matched ? media : NULL); +} + +/* + * media_from_path is intended to be called by ldi_open_by_name + * with a char* path, and returns NULL or an IOMedia device with a + * retain count that should be released on open. 
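+ *
+ * Minimal usage sketch (illustrative only; the path is hypothetical
+ * and the caller must drop the retain left by a successful lookup):
+ *
+ *   IOMedia *media = media_from_path("/dev/disk2");
+ *   if (media != NULL) {
+ *           // ... use the IOMedia ...
+ *           media->release();
+ *   }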
+ */ +static IOMedia * +media_from_path(const char *path = 0) +{ + IOMedia *media; + OSDictionary *matchDict; + + /* Validate path */ + if (path == 0 || strlen(path) <= 1) { + dprintf("%s no path provided\n", __func__); + return (NULL); + } + + if (strncmp(path, "/var/run/disk/by-path/", 22) == 0 || + strncmp(path, "/private/var/run/disk/by-path/", 30) == 0) { + media = media_from_device_path(path); + dprintf("%s media_from_device_path %s\n", __func__, + (media ? "succeeded" : "failed")); + return (media); + } + + if (strncmp(path, "/var/run/disk/by-serial/", 24) == 0 || + strncmp(path, "/private/var/run/disk/by-serial/", 32) == 0) { + media = media_from_serial(path); + dprintf("%s media_from_serial %s\n", __func__, + (media ? "succeeded" : "failed")); + return (media); + } + + /* Try to get /dev/disk or /private/var/run/disk/by-id path */ + matchDict = media_matchdict_from_path(path); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (NULL); + } + + media = media_from_matchdict(matchDict); + matchDict->release(); + matchDict = 0; + + if (!media) { + dprintf("%s no IOMedia found for path %s\n", __func__, path); + } + + /* Return NULL or valid media with retain count */ + return (media); +} + +/* Define an IOKit buffer for buf_strategy_iokit */ +typedef struct ldi_iokit_buf { + IOMemoryDescriptor *iomem; + IOStorageCompletion iocompletion; + IOStorageAttributes ioattr; +} ldi_iokit_buf_t; /* XXX Currently 64b */ + +/* Completion handler for IOKit strategy */ +static void +ldi_iokit_io_intr(void *target, void *parameter, + IOReturn status, UInt64 actualByteCount) +{ + ldi_iokit_buf_t *iobp = (ldi_iokit_buf_t *)target; + ldi_buf_t *lbp = (ldi_buf_t *)parameter; + +#ifdef DEBUG + /* In debug builds, verify buffer pointers */ + ASSERT3U(lbp, !=, 0); + ASSERT3U(iobp, !=, 0); + + if (!iobp || !lbp) { + printf("%s missing a buffer\n", __func__); + return; + } + + ASSERT3U(iobp->iomem, !=, 0); + + if (!iobp->iomem) { + printf("%s missing iobp->iomem\n", __func__); + return; + } + + // this is very very very noisy in --enable-boot + // ASSERT3U(ldi_zfs_handle, !=, 0); + + if (actualByteCount == 0 || + actualByteCount != lbp->b_bcount || + status != kIOReturnSuccess) { + printf("%s %s %llx / %llx\n", __func__, + "actualByteCount != lbp->b_bcount", + actualByteCount, lbp->b_bcount); + if (ldi_zfs_handle) + printf("%s status %d %d %s\n", __func__, status, + ldi_zfs_handle->errnoFromReturn(status), + ldi_zfs_handle->stringFromReturn(status)); + else + printf("%s status %d ldi_zfs_handle is NULL\n", + __func__, status); + } +#endif + + /* Complete and release IOMemoryDescriptor */ + iobp->iomem->complete(); + iobp->iomem->release(); + iobp->iomem = 0; + + /* Compute resid */ + ASSERT3U(lbp->b_bcount, >=, actualByteCount); + lbp->b_resid = (lbp->b_bcount - actualByteCount); + + /* Set error status */ + if (status == kIOReturnSuccess && + actualByteCount != 0 && lbp->b_resid == 0) { + lbp->b_error = 0; + } else { + lbp->b_error = EIO; + } + + /* Free IOKit buffer */ + kmem_free(iobp, sizeof (ldi_iokit_buf_t)); + + /* Call original completion function */ + if (lbp->b_iodone) { + (void) lbp->b_iodone(lbp); + } +} + +/* Synchronous IO, called by buf_strategy_iokit */ +static int +buf_sync_strategy_iokit(ldi_buf_t *lbp, ldi_iokit_buf_t *iobp, + struct ldi_handle *lhp) +{ + UInt64 actualByteCount = 0; + IOReturn result; + + /* Read or write */ + if (lbp->b_flags & B_READ) { + result = LH_MEDIA(lhp)->IOStorage::read(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + 
&iobp->ioattr, &actualByteCount); + } else { + result = LH_MEDIA(lhp)->IOStorage::write(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &actualByteCount); + } + + /* Call completion */ + ldi_iokit_io_intr((void *)iobp, (void *)lbp, + result, actualByteCount); + + /* Return success based on result */ + return (result == kIOReturnSuccess ? 0 : EIO); +} + +/* + * Uses IOMedia::read asynchronously or IOStorage::read synchronously. + * virtual void read(IOService * client, + * UInt64 byteStart, + * IOMemoryDescriptor * buffer, + * IOStorageAttributes * attributes, + * IOStorageCompletion * completion); + * virtual IOReturn read(IOService * client, + * UInt64 byteStart, + * IOMemoryDescriptor * buffer, + * IOStorageAttributes * attributes = 0, + * UInt64 * actualByteCount = 0); + */ +int +buf_strategy_iokit(ldi_buf_t *lbp, struct ldi_handle *lhp) +{ + ldi_iokit_buf_t *iobp = 0; + + ASSERT3U(lbp, !=, NULL); + ASSERT3U(lhp, !=, NULL); + +#ifdef DEBUG + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp)) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif /* DEBUG */ + + /* Allocate an IOKit buffer */ + iobp = (ldi_iokit_buf_t *)kmem_alloc(sizeof (ldi_iokit_buf_t), + KM_SLEEP); + if (!iobp) { + dprintf("%s couldn't allocate buf_iokit_t\n", __func__); + return (ENOMEM); + } +#ifdef LDI_ZERO + /* Zero the new buffer struct */ + bzero(iobp, sizeof (ldi_iokit_buf_t)); +#endif + + /* Set completion and attributes for async IO */ + if (lbp->b_iodone != NULL) { + iobp->iocompletion.target = iobp; + iobp->iocompletion.parameter = lbp; + iobp->iocompletion.action = &ldi_iokit_io_intr; + } + +/* XXX Zeroed above if LDI_ZERO, otherwise here */ +#ifndef LDI_ZERO + /* XXX Zero the ioattr struct */ + bzero(&iobp->ioattr, sizeof (IOStorageAttributes)); +#endif + + /* Allocate a memory descriptor pointing to the data address */ + iobp->iomem = IOMemoryDescriptor::withAddress( + lbp->b_un.b_addr, lbp->b_bcount, + (lbp->b_flags & B_READ ? 
kIODirectionIn : kIODirectionOut)); + + /* Verify the buffer */ + if (!iobp->iomem || iobp->iomem->getLength() != lbp->b_bcount || + iobp->iomem->prepare() != kIOReturnSuccess) { + dprintf("%s couldn't allocate IO buffer\n", + __func__); + if (iobp->iomem) { + iobp->iomem->release(); + } + kmem_free(iobp, sizeof (ldi_iokit_buf_t)); + return (ENOMEM); + } + + /* Recheck instantaneous value of handle status */ + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + iobp->iomem->complete(); + iobp->iomem->release(); + kmem_free(iobp, sizeof (ldi_iokit_buf_t)); + return (ENODEV); + } + + /* Synchronous or async */ + if (lbp->b_iodone == NULL) { + return (buf_sync_strategy_iokit(lbp, iobp, lhp)); + } + + /* Read or write */ + if (lbp->b_flags & B_READ) { + LH_MEDIA(lhp)->IOMedia::read(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &iobp->iocompletion); + } else { + LH_MEDIA(lhp)->IOMedia::write(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &iobp->iocompletion); + } + + /* Return success, will call io_intr when done */ + return (0); +} + +/* Client interface, alloc and open IOKit handle */ +int +ldi_open_by_media(IOMedia *media = 0, dev_t device = 0, + int fmode = 0, ldi_handle_t *lhp = 0) +{ + struct ldi_handle *retlhp; + ldi_status_t status; + int error; + + /* Validate IOMedia */ + if (!media || !lhp) { + dprintf("%s invalid argument %p or %p\n", + __func__, media, lhp); + return (EINVAL); + } + + /* Retain for duration of open */ + media->retain(); + + /* Get dev_t if not supplied */ + if (device == 0 && (device = dev_from_media(media)) == 0) { + dprintf("%s dev_from_media failed: %p %d\n", __func__, + media, device); + media->release(); + return (ENODEV); + } + + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*(struct ldi_handle **)lhp, ==, NULL); + + /* Allocate IOKit handle */ + retlhp = handle_alloc_iokit(device, fmode); + if (retlhp == NULL) { + dprintf("%s couldn't allocate IOKit handle\n", __func__); + media->release(); + return (ENOMEM); + } + + /* Try to open device with IOMedia */ + status = handle_open_start(retlhp); + if (status == LDI_STATUS_ONLINE) { + dprintf("%s already online, refs %d, openrefs %d\n", __func__, + retlhp->lh_ref, retlhp->lh_openref); + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + media->release(); + /* Successfully incremented open ref */ + return (0); + } + if (status != LDI_STATUS_OPENING) { + dprintf("%s invalid status %d\n", __func__, status); + handle_release(retlhp); + retlhp = 0; + media->release(); + return (ENODEV); + } + + error = handle_open_iokit(retlhp, media); + media->release(); + + if (error) { + dprintf("%s Couldn't open handle\n", __func__); + handle_open_done(retlhp, LDI_STATUS_CLOSED); + handle_release(retlhp); + retlhp = 0; + return (EIO); + } + handle_open_done(retlhp, LDI_STATUS_ONLINE); + + /* Register for disk notifications */ + handle_register_notifier(retlhp); + + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + /* Pass error from open */ + return (error); +} + +/* Client interface, find IOMedia from dev_t, alloc and open handle */ +int +ldi_open_media_by_dev(dev_t device = 0, int fmode = 0, + ldi_handle_t *lhp = 0) +{ + IOMedia *media = 0; + int error = EINVAL; + + /* Validate arguments */ + if (!lhp || device == 0) { + dprintf("%s missing argument %p %d\n", + __func__, lhp, device); + return (EINVAL); + } + /* In debug build, be loud if we potentially 
leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* Get IOMedia from major/minor */ + if ((media = media_from_dev(device)) == NULL) { + dprintf("%s media_from_dev error %d\n", + __func__, error); + return (ENODEV); + } + + /* Try to open by media */ + error = ldi_open_by_media(media, device, fmode, lhp); + + /* Release IOMedia and clear */ + media->release(); + media = 0; + + /* Pass error from open */ + return (error); +} + +/* Client interface, find dev_t and IOMedia/vnode, alloc and open handle */ +int +ldi_open_media_by_path(char *path = 0, int fmode = 0, + ldi_handle_t *lhp = 0) +{ + IOMedia *media = 0; + dev_t device = 0; + int error = EINVAL; + + /* Validate arguments */ + if (!lhp || !path) { + dprintf("%s %s %p %s %d\n", __func__, + "missing lhp or path", lhp, path, fmode); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* For /dev/disk*, and InvariantDisk paths */ + if ((media = media_from_path(path)) == NULL) { + dprintf("%s media_from_path failed\n", __func__); + return (ENODEV); + } + + error = ldi_open_by_media(media, device, fmode, lhp); + + /* Release IOMedia and clear */ + media->release(); + media = 0; + + /* Error check open */ + if (error) { + dprintf("%s ldi_open_by_media failed %d\n", + __func__, error); + } + + return (error); +} + +int +handle_remove_notifier(struct ldi_handle *lhp) +{ + handle_notifier_t notifier; + +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } +#endif + + if (lhp->lh_notifier == 0) { + dprintf("%s no notifier installed\n", __func__); + return (0); + } + + /* First clear notifier pointer */ + notifier = lhp->lh_notifier; + lhp->lh_notifier = 0; + +#ifdef DEBUG + /* Validate IONotifier object */ + if (!OSDynamicCast(IONotifier, notifier->obj)) { + dprintf("%s %p is not an IONotifier\n", __func__, + notifier->obj); + return (EINVAL); + } +#endif + + notifier->obj->remove(); + kmem_free(notifier, sizeof (handle_notifier_t)); + return (0); +} + +int +handle_register_notifier(struct ldi_handle *lhp) +{ + OSDictionary *matchDict; + handle_notifier_t notifier; + + /* Make sure we have a handle and dev_t */ + if (!lhp || lhp->lh_dev == 0) { + dprintf("%s no handle or missing dev_t\n", __func__); + return (EINVAL); + } + + notifier = (handle_notifier_t)kmem_alloc( + sizeof (struct _handle_notifier), KM_SLEEP); + if (!notifier) { + dprintf("%s couldn't alloc notifier struct\n", __func__); + return (ENOMEM); + } + + /* Get matchDict, will need to be released */ + matchDict = media_matchdict_from_dev(lhp->lh_dev); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + kmem_free(notifier, sizeof (handle_notifier_t)); + return (EINVAL); + } + + /* Register IOMedia termination notification */ + notifier->obj = IOService::addMatchingNotification( + gIOTerminatedNotification, matchDict, + handle_media_terminate_cb, /* target */ 0, + /* refCon */ (void *)lhp, /* priority */ 0); + matchDict->release(); + + /* Error check notifier */ + if (!notifier->obj) { + dprintf("%s addMatchingNotification failed\n", + __func__); + kmem_free(notifier, sizeof (handle_notifier_t)); + return (ENOMEM); + } + + /* Assign notifier to handle */ + lhp->lh_notifier = notifier; + return (0); +} + +/* Supports both IOKit and vnode handles by finding IOMedia from dev_t */ +int +handle_set_wce_iokit(struct ldi_handle *lhp, int *wce) +{ + IOMedia *media; + IORegistryEntry *parent; + 
IOBlockStorageDevice *device; + IOReturn result; + bool value; + + if (!lhp || !wce) { + return (EINVAL); + } + + switch (lhp->lh_type) { + case LDI_TYPE_IOKIT: + if ((media = LH_MEDIA(lhp)) == NULL) { + dprintf("%s couldn't get IOMedia\n", __func__); + return (ENODEV); + } + /* Add a retain count */ + media->retain(); + break; + case LDI_TYPE_VNODE: + if (lhp->lh_dev == 0 || + (media = media_from_dev(lhp->lh_dev)) == 0) { + dprintf("%s couldn't find IOMedia for dev_t %d\n", + __func__, lhp->lh_dev); + return (ENODEV); + } + /* Returned media has a retain count */ + break; + default: + dprintf("%s invalid handle\n", __func__); + return (EINVAL); + } + + /* Walk the parents of this media */ + for (parent = media->getParentEntry(gIOServicePlane); + parent != NULL; + parent = parent->getParentEntry(gIOServicePlane)) { + /* Until a valid device is found */ + device = OSDynamicCast(IOBlockStorageDevice, parent); + if (device != NULL) { + device->retain(); + break; + } + /* Next parent */ + } + media->release(); + media = 0; + + /* If no matching device was found */ + if (!device) { + dprintf("%s no IOBlockStorageDevice found\n", __func__); + return (ENODEV); + } + + result = device->getWriteCacheState(&value); + if (result != kIOReturnSuccess) { + // dprintf("%s couldn't get current write cache state %d\n", + // __func__, ldi_zfs_handle->errnoFromReturn(result)); + return (ENXIO); + } + + /* If requested value does not match current */ + if (value != *wce) { + value = (*wce == 1); + /* Attempt to change the value */ + result = device->setWriteCacheState(value); + } + + /* Set error and wce to return */ + if (result != kIOReturnSuccess) { + // dprintf("%s couldn't set write cache %d\n", + // __func__, ldi_zfs_handle->errnoFromReturn(result)); + /* Flip wce to indicate current status */ + *wce = !(*wce); + return (ENXIO); + } + + return (0); +} + +int +handle_get_media_info_iokit(struct ldi_handle *lhp, + struct dk_minfo *dkm) +{ + uint32_t blksize; + uint64_t blkcount; + + if (!lhp || !dkm) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + if ((blksize = LH_MEDIA(lhp)->getPreferredBlockSize()) == 0) { + dprintf("%s invalid blocksize\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + if ((blkcount = LH_MEDIA(lhp)->getSize() / blksize) == 0) { + dprintf("%s invalid block count\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + LH_MEDIA(lhp)->release(); + + /* Set the return values */ + dkm->dki_capacity = blkcount; + dkm->dki_lbsize = blksize; + + return (0); +} + +int +handle_get_media_info_ext_iokit(struct ldi_handle *lhp, + struct dk_minfo_ext *dkmext) +{ + OSObject *prop; + OSNumber *number; + uint32_t blksize, pblksize; + uint64_t blkcount; + + if (!lhp || !dkmext) { + dprintf("%s missing lhp or dkmext\n", __func__); + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + prop = LH_MEDIA(lhp)->getProperty(kIOPropertyPhysicalBlockSizeKey, + gIOServicePlane, kIORegistryIterateRecursively | + kIORegistryIterateParents); + + number = OSDynamicCast(OSNumber, prop); + if (!prop || !number) { + dprintf("%s couldn't get physical blocksize\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + pblksize = number->unsigned32BitValue(); + number = 0; + prop 
= 0; + + if ((blksize = LH_MEDIA(lhp)->getPreferredBlockSize()) == 0) { + dprintf("%s invalid blocksize\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + if ((blkcount = LH_MEDIA(lhp)->getSize() / blksize) == 0) { + dprintf("%s invalid block count\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + LH_MEDIA(lhp)->release(); + +#ifdef DEBUG + dprintf("%s phys blksize %u, logical blksize %u, blockcount %llu\n", + __func__, pblksize, blksize, blkcount); +#endif + + /* Set the return values */ + dkmext->dki_capacity = blkcount; + dkmext->dki_lbsize = blksize; + dkmext->dki_pbsize = pblksize; + + return (0); +} + +int +handle_check_media_iokit(struct ldi_handle *lhp, int *status) +{ + /* Validate arguments */ + if (!lhp || !status) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + /* Validate device size */ + if (LH_MEDIA(lhp)->getSize() == 0) { + dprintf("%s media reported 0 size\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + /* Validate write status if handle fmode is read-write */ + if ((lhp->lh_fmode & FWRITE) && + LH_MEDIA(lhp)->isWritable() == false) { + dprintf("%s media is not writeable\n", __func__); + LH_MEDIA(lhp)->release(); + return (EPERM); + } + + LH_MEDIA(lhp)->release(); + + /* Success */ + *status = 0; + return (0); +} + +int +handle_is_solidstate_iokit(struct ldi_handle *lhp, int *isssd) +{ + OSDictionary *propDict = 0; + OSString *property = 0; + + /* Validate arguments */ + if (!lhp || !isssd) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + propDict = OSDynamicCast(OSDictionary, LH_MEDIA(lhp)->getProperty( + kIOPropertyDeviceCharacteristicsKey, gIOServicePlane)); + + if (propDict != 0) { + property = OSDynamicCast(OSString, + propDict->getObject(kIOPropertyMediumTypeKey)); + propDict = 0; + } + + if (property != 0 && + property->isEqualTo(kIOPropertyMediumTypeSolidStateKey)) { + *isssd = 1; + } + property = 0; + + LH_MEDIA(lhp)->release(); + + return (0); +} + +int +handle_features_iokit(struct ldi_handle *lhp, + uint32_t *data) +{ + if (!lhp || !data) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + OSDictionary *dictionary = OSDynamicCast( + /* class */ OSDictionary, + /* object */ LH_MEDIA(lhp)->getProperty( + /* key */ kIOStorageFeaturesKey, + /* plane */ gIOServicePlane)); + + *data = 0; + + if (dictionary) { + OSBoolean *boolean; + +#ifdef DK_FEATURE_BARRIER + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeatureBarrier)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_BARRIER; +#endif + + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeatureForceUnitAccess)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_FORCE_UNIT_ACCESS; + +#ifdef DK_FEATURE_PRIORITY + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeaturePriority)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_PRIORITY; +#endif + + boolean 
= OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeatureUnmap)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_UNMAP; + } + + LH_MEDIA(lhp)->release(); + return (0); +} + +int +handle_unmap_iokit(struct ldi_handle *lhp, + dkioc_free_list_ext_t *dkm) +{ + int error = 0; + + if (!lhp || !dkm) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + /* We need to convert illumos' dkioc_free_list_t to dk_unmap_t */ + IOStorageExtent *extents; + extents = IONew(IOStorageExtent, 1); + extents[0].byteStart = dkm->dfle_start; + extents[0].byteCount = dkm->dfle_length; + + /* + * dkm->dfl_flags vs IOStorageUnmapOptions + * #define DF_WAIT_SYNC 0x00000001 + * Wait for full write-out of free. + * IOStorageUnmapOptions is only 0 + */ + + /* issue unmap */ + error = LH_MEDIA(lhp)->unmap(LH_CLIENT(lhp), + extents, 1, 0); + + if (error != 0) { + dprintf("%s unmap: 0x%x\n", __func__, error); + // Convert IOReturn to errno + error = LH_MEDIA(lhp)->errnoFromReturn(error); + } + + IODelete(extents, IOStorageExtent, 1); + LH_MEDIA(lhp)->release(); + + return (error); +} + + +} /* extern "C" */ diff --git a/module/os/macos/zfs/ldi_osx.c b/module/os/macos/zfs/ldi_osx.c new file mode 100644 index 0000000000..0c07cec85e --- /dev/null +++ b/module/os/macos/zfs/ldi_osx.c @@ -0,0 +1,2432 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +/* + * LDI Subsystem on OS X: + * + * Designed as a drop-in replacement for sunldi.h and driver_lyr.c, + * LDI abstracts away platform-specific device handling. This allows + * vdev_disk.c to more closely match 'upstream' illumos/OpenZFS. + * + * LDI handles may use IOKit or vnode ops to locate and use devices. + * - This reduces the call stack and work needed for almost all IO. + * - Allows for vdev discovery and use during early boot, before the + * root device is mounted. + * - Having both types allows use of non-standard kexts which publish + * bdevsw block devices (without IOMedia). + * + * XXX Review correct call stack using dtrace, annotate stack size. 
+ * Previously, vnode_open and VNOP_STRATEGY were used, which required + * allocating buf_t for IO. This meant translating byte offsets to + * block numbers for every IO. Once issued, dtrace showed that a very + * large stack was required: + * VNOP_STRATEGY macro performs work then calls + * spec_strategy (vop->vop_strategy) which performs work then calls + * dkiostrategy (syscall) which passes the IO to an IOMediaBSDClient + * IOMediaBSDClient performed work and passes to its IOMedia provider + * + * Beyond that is a common path shared by vnode and IOMedia: + * IOMedia performs work, then does prepareRequest, breakUpRequest, + * deBlockRequest, and executeRequest. + * Potentially passed down the provider stack through IOPartitionMap + * then to the whole-disk IOMedia, with more work + * Passed down through IOBlockStorageDriver, with more work + * Passed down through IOBlockStorageDevice, with more work + * Finally passed to Family-specific driver (AHCI, diskimage, etc.) + * + * By directly accessing IOMedia, the stack is reduced, and byte + * offsets are passed to read()/write() via ldi_strategy. + * We still need to allocate an IOMemoryDescriptor for the data buf, + * however only an IOMemoryDescriptor::withAddress() reference is + * required, similar to buf_setdataptr. + */ + +/* + * LDI Handle hash lists: + * + * During ldi_init, LH_HASH_SZ lists and locks are allocated. New handles + * will be added to the list indexed by the hash of the dev_t number. + * + * The hash function simply performs a modulus on the dev_t number based on + * the LH_HASH_SZ, as opposed to illumos which hashes based on the vnode + * pointer. + * This has been tested by hashing disk0, disk0s1, disk0s2, disk1, disk1s1, + * etc. to verify results were distributed across hash range. + * + * OS X dev_t numbers should be unique unless a new device claims the same + * dev_t as a removed/failed device. This would only be a collision if we + * still have a handle for the failed device (notification/event handlers + * should remove these before that occurs). + * Since Offline status is a dead-end and the handle cannot be dereferenced + * or freed while iterating the hash list, it is safe to check the status + * and skip a handle if the status is Offline (without taking handle lock). + * + * XXX On illumos the hash function uses the vnode's pointer address as the + * unique key. Since vnode addresses are aligned to the size of the vnode + * struct, the hash function shifts the pointer address to the right in order + * to hash the unique bits of the address. OS X dev_t use all the bits of + * an unsigned 32-bit int. + */ + +/* + * LDI Handle locks: + * + * Handle references and list membership are protected by the hash list + * locks. + * Handle status and other fields are protected by a per-handle mutex. + * + * To prevent deadlocks and artificial delays, the hash list locks should + * be held only for handle hold/release and handle_add/remove (list + * iterate/insert/remove). Those functions avoid blocking. + * Use the handle mutex to change state, and avoid blocking there, too. + * + * XXX Right now handle_status_change does allocate for taskq_dispatch + * with the handle lock held, but uses TQ_NOSLEEP and verifies result. + * + * Non-locking ops such as ldi_strategy, ldi_get_size, and ldi_sync will + * check the instantaneous status/refs before attempting to proceed, and + * can only perform IO while the device is Online. 
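+ *
+ * Illustrative locking pattern (a sketch of the intended usage, not a
+ * verbatim excerpt from the code below):
+ *
+ *   index = LH_HASH(lhp->lh_dev);
+ *   mutex_enter(&ldi_handle_hash_lock[index]);
+ *   handle_hold_locked(lhp);                // refs and list membership
+ *   mutex_exit(&ldi_handle_hash_lock[index]);
+ *
+ *   mutex_enter(&lhp->lh_lock);
+ *   handle_status_change_locked(lhp, ...);  // status and open refs
+ *   mutex_exit(&lhp->lh_lock);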
+ */ + +/* + * LDI Handle allocation: + * + * ldi_open_by_name and ldi_open_by_dev locate the device and call + * ldi_open_media_by_path, ldi_open_media_by_dev, or ldi_open_vnode_by_path. + * + * From ldi_open_by_media and _by_vnode, we call handle_alloc_{type}. Both + * call handle_alloc_common to allocate and configure the handle. + * + * A handle is allocated in the Closed state with 1 reference. The handle + * is added to the hash list on allocation, unless a duplicate handle exists + * (same dev_t as well as fmode, not in Offline status). If an existing + * handle is found, the newly allocated handle is freed. + * + * handle_open_start is called, which takes the handle lock to check current + * status. Each of these states is possible: + * Offline: device has disappeared between allocation and now (unlikely). + * Closed: new or recently closed handle, changes status to Opening. + * Closing: already in progress. Sleeps on lock and rechecks the status. + * Opening: already in progress. Sleeps on lock and rechecks the status. + * Online: no need to open device, just increment openref count. + * + * If handle_open_start changes the status to Opening, the device is opened + * by calling handle_open_iokit or handle_open_vnode. + * + * This differs from illumos driver_lyr.c where handle_alloc first opens a + * vnode for the device, allocates a handle by vnode, and finally checks for + * a duplicate handle in the list (open, alloc, find vs. alloc, open, find). + * To do so, illumos has a VOP_OPEN that is aware of layered-driver opens. + */ + +/* + * LDI Handle list membership: + * + * Allocate with one reference, to be used or released by the caller. + * Call handle_hold if additional references are needed. + * + * Call handle_release to drop reference. On last release, this calls + * handle_free (but does not remove the handle from the list, see below). + * + * Call handle_add to determine if this handle is a duplicate, inserting + * handle into list or returning an existing handle with a hold. + * Check the result and call handle_release on the new handle if another + * handle was returned (new handle is not added to list). + * + * Each call to handle_find will take optionally take a hold, which should + * be released when no longer needed (used by handle_add). + * + * Calling handle_open increments lh_openref but does not change lh_ref. + * Caller should already have called handle_hold to get a reference. + * + * If lh_ref is 1, call handle_remove_locked (with list lock) to remove the + * handle from the list, then call handle_release_locked to remove last ref + * and free. + * A handle shouldn't remain in the list in Closed status with no refs. + * + * Calling handle_close with the last openref will automatically take list + * lock, call handle_remove_locked, and then handle_release_locked. + */ + +/* + * LDI Handle device objects: + * + * Multiple read-only opens share one read-only handle. + * Multiple read-write opens share one read-write handle. + * + * IOKit handles are allocated with the dev_t number and fmode. + * handle_open_iokit is passed an IOMedia object (which should have a + * retain held). + * Once handle_open returns, the IOMedia can be released by the caller. + * + * Vnode handles are allocated with the dev_t number and fmode. + * handle_open_vnode is passed a path (null-terminated C string). + * vnode_open increments both iocount and refcount, vnode_ref increments + * usecount, vnode_put drops iocount between ops. 
+ * vnode_getwithref takes an iocount, and vnode_rele drops usecount + * before vnode_close decrements iocount and refcount. + */ + +/* + * LDI Handle status: + * + * #define LDI_STATUS_OFFLINE 0x0 + * #define LDI_STATUS_CLOSED 0x1 + * #define LDI_STATUS_CLOSING 0x2 + * #define LDI_STATUS_OPENING 0x3 + * #define LDI_STATUS_ONLINE 0x4 + * + * The handle lock will be taken to change status. + * + * Handle state can only progress from Closed to Opening status, and must + * have a reference held to do so. The lock is dropped for open and close + * ops while the handle is in Opening or Closing status. + * + * If the open is successful, the state is set to Online (with handle lock + * held). This state is required for IO operations to be started. The state + * may have changed by the time an IO completes. + * + * For IOKit devices, and vnode devices that have an IOMedia, a callback is + * registered for IOMedia termination which changes the state to Offline and + * posts event callbacks. + * + * Closing a handle, by the user or as a result of an event, sets the state + * to Closing. Once device close is issued, the state changes from Closing + * to Closed (even if close returned failure). + * + * A handle that still has refs and openrefs will remain in the Online + * state, dropping refs and openrefs each time ldi_close is called. + * + * If there are refs but no openrefs, it remains in the Closed state, and + * drops refs each time handle_release is called. + * This allows clients to call ldi_open_by_* to reopen the handle, in the + * case where one client is opening the handle at the same time another is + * closing it. + * + * If the device has gone missing (IOMedia terminated), the handle will + * change to Offline status. This is a dead-end which issues Offline Notify + * and Finalize events, then cleans up the handle once all clients have + * called ldi_close. + * + * Once all references have been dropped, the handle is removed from the + * hash list with the hash list lock held, then freed. + */ + +/* + * LDI Events: + * + * XXX Degrade event is not implemented, doubt it will be useful. Intended + * to be set when a vdev that is backed by RAID becomes degraded. This is + * not a recommended use case for ZFS, and on OS X we only have AppleRAID + * or custom hardware or software RAID. Also per the comments, the vdev + * would be marked Degraded only to inform the user via zpool status. + * + * XXX Tested in VirtualBox by hotplugging a SATA device, have yet to + * test with USB removal, etc. + * + * ldi_register_ev_callback can be used to add a struct to the event + * callback list containing the handle pointer, a notify callback, and + * a finalize callback. + * + * Supported events are Offline Notify/Finalize, which will be + * posted when the device enters the Offline state (IOMedia terminated). + * + * The event callback functions should be non-blocking. It is recommended + * to update a flag that can be checked prior to calling ldi_strategy. + */ + +/* + * LDI client interfaces: + * + * ldi_open_by_name + * ldi_open_by_dev + * ldi_close + * + * ldi_register_ev_callback + * ldi_unregister_ev_callback + * + * ldi_get_size + * ldi_sync + * ldi_ioctl + * ldi_strategy + * + * ldi_bioinit + * ldi_biofini + */ + +/* + * LDI Buffers: + * + * ldi_strategy uses an abstract buffer for IO, so clients do not need to + * be concerned with type-specific buf_t and IOMemoryDescriptor handling. 
+ * + * Allocate and free ldi_buf_t manually, calling ldi_bioinit after alloc + * and ldi_biofini prior to free. + * + * Synchronous IO can be performed by setting b_iodone to NULL. + * + * Allocate and use a buffer like this: + * + * ldi_buf_t *bp = (ldi_buf_t *)kmem_alloc(sizeof (ldi_buf_t), KM_SLEEP); + * // Verify allocation before proceeding + * error = ldi_bioinit(bp); + * bp->b_bcount = size; + * bp->b_bufsize = size; + * bp->b_offset = offset; + * bp->b_data = data_ptr; + * bp->b_flags = B_BUSY | B_NOCACHE | B_READ; // For example + * bp->b_iodone = &io_intr_func; // For async IO, omit for sync IO + * ldi_strategy(handle, bp); // Issue IO + * + * With an async callback function such as: + * void io_intr_func(ldi_buf_t bp, void *param) + * { + * // Check/copyout bp->b_error and bp->b_resid + * ldi_biofini(bp); + * kmem_free(bp, sizeof (ldi_buf_t)); + * } + */ + +/* + * XXX LDI TO DO + * + * LDI handle stats. In debug builds, we have IO counters - number of IOs, + * number of bytes in/out. + * kstats for handle counts and sysctls for vnode/IOKit modes also implemented. + * + * To implement events, both vnode and IOKit handles register for matching + * notifications from the IOMedia object (if found). + * Using subclassed IOService can also receive IOMessage events, which + * would be issued earlier. + * + * Vnode handles with no IOMedia could post events on (multiple) IO failures. + */ + +/* + * ZFS internal + */ +#include +#include +#include +#include +#include + +/* + * LDI Includes + */ +#include + +/* Debug prints */ +#ifdef DEBUG +#define LDI_EVDBG(args) cmn_err args +#define LDI_EVTRC(args) cmn_err args +#else +#define LDI_EVDBG(args) do {} while (0) +#define LDI_EVTRC(args) do {} while (0) +#endif + +#define ldi_log(fmt, ...) do { \ + dprintf(fmt, __VA_ARGS__); \ + /* delay(hz>>1); */ \ +_NOTE(CONSTCOND) } while (0) + +/* + * Defines + * comment out defines to alter behavior. + */ +// #define LDI_ZERO /* For debugging, zero allocations */ + +/* Find IOMedia by matching on the BSD disk name. */ +static boolean_t ldi_use_iokit_from_path = 1; + +/* Find IOMedia by matching on the BSD major/minor (dev_t) number. */ +static boolean_t ldi_use_iokit_from_dev = 1; + +/* + * Find dev_t by vnode_lookup. + * Resolves symlinks to block devices, symlinks, InvariantDisk links. + */ +static boolean_t ldi_use_dev_from_path = 1; + +/* + * Open device by vnode if all else fails. + * Not intented to be a fallback for unsuccessful IOMedia open, but rather + * for bdev devices that do not have an IOMedia (published by other KEXTs). 
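+ *
+ * This and the tunables above are exported as sysctls below; for
+ * example (assuming the ldi.debug node declared in this file), the
+ * vnode fallback can be disabled at runtime with:
+ *   sysctl ldi.debug.use_vnode_from_path=0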
+ */ +static boolean_t ldi_use_vnode_from_path = 1; + +/* + * Sysctls + */ +#include +SYSCTL_DECL(_ldi); +SYSCTL_NODE(, OID_AUTO, ldi, CTLFLAG_RD | CTLFLAG_LOCKED, 0, ""); +SYSCTL_NODE(_ldi, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_LOCKED, 0, ""); +SYSCTL_UINT(_ldi_debug, OID_AUTO, use_iokit_from_dev, + CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_iokit_from_dev, 0, + "ZFS LDI use iokit_from_path"); +SYSCTL_UINT(_ldi_debug, OID_AUTO, use_iokit_from_path, + CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_iokit_from_path, 0, + "ZFS LDI use iokit_from_dev"); +SYSCTL_UINT(_ldi_debug, OID_AUTO, use_dev_from_path, + CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_dev_from_path, 0, + "ZFS LDI use dev_from_path"); +SYSCTL_UINT(_ldi_debug, OID_AUTO, use_vnode_from_path, + CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_vnode_from_path, 0, + "ZFS LDI use vnode_from_path"); + +/* + * Globals + */ +static volatile int64_t ldi_handle_hash_count; + +static list_t ldi_handle_hash_list[LH_HASH_SZ]; +static kmutex_t ldi_handle_hash_lock[LH_HASH_SZ]; + +/* + * Use of "ldi_ev_callback_list" must be protected by ldi_ev_lock() + * and ldi_ev_unlock(). + */ +static struct ldi_ev_callback_list ldi_ev_callback_list; + +static uint32_t ldi_ev_id_pool = 0; + +struct ldi_ev_cookie { + char *ck_evname; + uint_t ck_sync; + uint_t ck_ctype; +}; + +#define CT_DEV_EV_OFFLINE 0x1 +#define CT_DEV_EV_DEGRADED 0x2 +static struct ldi_ev_cookie ldi_ev_cookies[] = { + {LDI_EV_OFFLINE, 1, CT_DEV_EV_OFFLINE}, + {LDI_EV_DEGRADE, 0, CT_DEV_EV_DEGRADED}, + {LDI_EV_DEVICE_REMOVE, 0, 0}, + {NULL} /* must terminate list */ +}; + +/* + * kstats + */ +static kstat_t *ldi_ksp; + +typedef struct ldi_stats { + kstat_named_t handle_count; + kstat_named_t handle_count_iokit; + kstat_named_t handle_count_vnode; + kstat_named_t handle_refs; + kstat_named_t handle_open_rw; + kstat_named_t handle_open_ro; +} ldi_stats_t; + +static ldi_stats_t ldi_stats = { + { "handle_count", KSTAT_DATA_UINT64 }, + { "handle_count_iokit", KSTAT_DATA_UINT64 }, + { "handle_count_vnode", KSTAT_DATA_UINT64 }, + { "handle_refs", KSTAT_DATA_UINT64 }, + { "handle_open_rw", KSTAT_DATA_UINT64 }, + { "handle_open_ro", KSTAT_DATA_UINT64 } +}; + +#define LDISTAT(stat) (ldi_stats.stat.value.ui64) +#define LDISTAT_INCR(stat, val) \ +atomic_add_64(&ldi_stats.stat.value.ui64, (val)) +#define LDISTAT_BUMP(stat) LDISTAT_INCR(stat, 1) +#define LDISTAT_BUMPDOWN(stat) LDISTAT_INCR(stat, -1) + +/* + * Define macros for accessing layered driver hash structures + */ +#define LH_HASH(dev) handle_hash_func(dev) + +static inline uint_t +handle_hash_func(dev_t device) +{ + /* Just cast, macro does modulus to hash value */ + return ((uint_t)device % LH_HASH_SZ); +} + +typedef struct status_change_args { + struct ldi_handle *lhp; + int new_status; +} status_change_args_t; + +static void +handle_status_change_callback(void *arg) +{ + status_change_args_t *sc = (status_change_args_t *)arg; + + /* Validate arg struct */ + if (!sc || !sc->lhp) { + dprintf("%s missing callback struct %p or lh\n", + __func__, sc); + return; + } + if (sc->new_status > LDI_STATUS_ONLINE) { + dprintf("%s invalid status %d\n", + __func__, sc->new_status); + return; + } + + dprintf("%s Invoking notify for handle %p status %d\n", + __func__, sc->lhp, sc->new_status); + ldi_invoke_notify(0 /* dip */, sc->lhp->lh_dev, S_IFBLK, + LDI_EV_OFFLINE, sc->lhp); + + dprintf("%s Invoking finalize for handle %p status %d\n", + __func__, sc->lhp, sc->new_status); + ldi_invoke_finalize(0 /* dip */, sc->lhp->lh_dev, S_IFBLK, + LDI_EV_OFFLINE, LDI_EV_SUCCESS, sc->lhp); + + /* 
Free callback struct */ + kmem_free(sc, sizeof (status_change_args_t)); +} + +/* Protected by handle lock */ +static int +handle_status_change_locked(struct ldi_handle *lhp, int new_status) +{ + status_change_args_t *sc = 0; + + /* Validate lhp */ + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + if (new_status > LDI_STATUS_ONLINE) { + dprintf("%s invalid status %d\n", __func__, new_status); + return (EINVAL); + } + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + ASSERT(MUTEX_HELD(&lhp->lh_lock)); + + /* Set the status first */ + lhp->lh_status = new_status; + + /* Only Offline needs an event */ + if (new_status != LDI_STATUS_OFFLINE) { + dprintf("%s skipping status %d\n", __func__, new_status); + return (0); + } + + dprintf("%s new_status is Offline %d\n", __func__, new_status); + + /* Allocate struct to pass to event callback */ + /* Allocating with lock held, use KM_NOSLEEP */ + sc = (status_change_args_t *)kmem_alloc(sizeof (status_change_args_t), + KM_NOSLEEP); + if (!sc) { + dprintf("%s couldn't allocate callback struct\n", + __func__); + return (ENOMEM); + } + sc->lhp = lhp; + sc->new_status = new_status; + + mutex_exit(&lhp->lh_lock); /* Currently needs to drop lock */ + handle_status_change_callback((void *)sc); + mutex_enter(&lhp->lh_lock); /* Retake before return */ + + return (0); +} + +/* Protected by handle lock */ +int +handle_status_change(struct ldi_handle *lhp, int new_status) +{ + int error; + + /* Validate lh and new_status */ + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + if (new_status > LDI_STATUS_ONLINE) { + dprintf("%s invalid state %d\n", __func__, new_status); + return (EINVAL); + } + + mutex_enter(&lhp->lh_lock); + error = handle_status_change_locked(lhp, new_status); + mutex_exit(&lhp->lh_lock); + + return (error); +} + +/* Protected by hash list lock */ +void +handle_hold_locked(struct ldi_handle *lhp) +{ +#ifdef DEBUG + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); +#endif + + /* Increment ref count and kstat */ + lhp->lh_ref++; + LDISTAT_BUMP(handle_refs); +} + +/* Protected by hash list lock */ +void +handle_hold(struct ldi_handle *lhp) +{ + int index; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + + index = LH_HASH(lhp->lh_dev); + mutex_enter(&ldi_handle_hash_lock[index]); + handle_hold_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); +} + +/* + * Locate existing handle in linked list, may return NULL. Optionally places a + * hold on found handle. 
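+ *
+ * For example, handle_find_locked(device, 0, B_FALSE) returns the
+ * first handle with a matching dev_t regardless of open mode, without
+ * taking a hold, whereas handle_find_locked(device, FWRITE, B_TRUE)
+ * matches only a read-write handle and places a hold on it for the
+ * caller.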
+ */ +static struct ldi_handle * +handle_find_locked(dev_t device, int fmode, boolean_t hold) +{ + struct ldi_handle *retlhp = NULL, *lhp; + int index = LH_HASH(device); + + /* Validate device */ + if (device == 0) { + dprintf("%s invalid device\n", __func__); + return (NULL); + } + /* If fmode is 0, find any handle with matching dev_t */ + + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); + + /* Iterate over handle hash list */ + for (lhp = list_head(&ldi_handle_hash_list[index]); + lhp != NULL; + lhp = list_next(&ldi_handle_hash_list[index], lhp)) { + /* Check for matching dev_t and fmode (if set) */ + if (lhp->lh_dev != device) { + continue; + } + + /* Special case for find any */ + if (fmode == 0) { + /* Found a match */ + retlhp = lhp; + break; + } + + /* fmode must match write level */ + if (((lhp->lh_fmode & FWRITE) && !(fmode & FWRITE)) || + (!(lhp->lh_fmode & FWRITE) && (fmode & FWRITE))) { + continue; + } + + /* Found a match */ + retlhp = lhp; + break; + } + + /* Take hold, if requested */ + if (hold && retlhp) { + /* Caller asked for hold on found handle */ + handle_hold_locked(retlhp); + } + + return (retlhp); +} + +/* + * Call without lock held to find a handle by dev_t, + * optionally placing a hold on the found handle. + */ +struct ldi_handle * +handle_find(dev_t device, int fmode, boolean_t hold) +{ + struct ldi_handle *lhp; + int index = LH_HASH(device); + + if (device == 0) { + dprintf("%s invalid device\n", __func__); + return (NULL); + } + + /* Lock for duration of find */ + mutex_enter(&ldi_handle_hash_lock[index]); + + /* Find handle by dev_t (with hold) */ + lhp = handle_find_locked(device, fmode, hold); + + /* Unlock and return handle (could be NULL) */ + mutex_exit(&ldi_handle_hash_lock[index]); + return (lhp); +} + +static void +handle_free(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + + /* Validate lhp, references, and status */ + if (lhp->lh_ref != 0 || + lhp->lh_status != LDI_STATUS_CLOSED) { + dprintf("%s ref %d status %d\n", __func__, lhp->lh_ref, + lhp->lh_status); + } + + /* Remove notification handler */ + if (handle_remove_notifier(lhp) != 0) { + dprintf("%s lhp %p notifier %s\n", + __func__, lhp, "couldn't be removed"); + } + + /* Destroy condvar and mutex */ + cv_destroy(&lhp->lh_cv); + mutex_destroy(&lhp->lh_lock); + + /* Decrement kstat handle count */ + LDISTAT_BUMPDOWN(handle_count); + /* IOKit or vnode */ + switch (lhp->lh_type) { + case LDI_TYPE_IOKIT: + /* Decrement kstat handle count and free iokit_tsd */ + LDISTAT_BUMPDOWN(handle_count_iokit); + handle_free_iokit(lhp); + break; + + case LDI_TYPE_VNODE: + /* Decrement kstat handle count and free vnode_tsd */ + LDISTAT_BUMPDOWN(handle_count_vnode); + handle_free_vnode(lhp); + break; + default: + dprintf("%s invalid handle type\n", __func__); + break; + } + + /* Deallocate handle */ + dprintf("%s freeing %p\n", __func__, lhp); + kmem_free(lhp, sizeof (struct ldi_handle)); + lhp = 0; +} + +/* + * Remove handle from list, decrementing counters + */ +static void +handle_remove_locked(struct ldi_handle *lhp) +{ + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); + + /* Remove from list, update handle count */ + list_remove(&ldi_handle_hash_list[index], lhp); + OSDecrementAtomic(&ldi_handle_hash_count); +} + +void +handle_remove(struct ldi_handle *lhp) +{ + int index = LH_HASH(lhp->lh_dev); + + mutex_enter(&ldi_handle_hash_lock[index]); + handle_remove_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); +} + +/* 
Protected by hash list lock */ +static void +handle_release_locked(struct ldi_handle *lhp) +{ + boolean_t lastrelease = B_FALSE; + +#ifdef DEBUG + ASSERT3U(lhp, !=, NULL); + int index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); +#endif + + if (lhp->lh_ref != 0) { + lhp->lh_ref--; + LDISTAT_BUMPDOWN(handle_refs); + } else { + dprintf("%s with 0 refs\n", __func__); + } + + dprintf("%s %x remaining holds\n", __func__, lhp->lh_ref); + + /* If last open ref was dropped */ + lastrelease = (lhp->lh_ref == 0); + + if (lastrelease) { + dprintf("%s removing handle %p from list\n", __func__, lhp); + handle_remove_locked(lhp); + dprintf("%s freeing handle %p\n", __func__, lhp); + handle_free(lhp); + } +} + +/* Protected by hash list lock */ +void +handle_release(struct ldi_handle *lhp) +{ + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + + mutex_enter(&ldi_handle_hash_lock[index]); + handle_release_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); +} + +/* + * Add new handle to list. + */ +static struct ldi_handle * +handle_add_locked(struct ldi_handle *lhp) +{ + struct ldi_handle *retlhp; + int index = 0; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + + /* Lock should be held */ + index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); + + /* Search for existing handle */ + if ((retlhp = handle_find_locked(lhp->lh_dev, lhp->lh_fmode, + B_TRUE)) != NULL) { + dprintf("%s found handle %p\n", __func__, retlhp); + return (retlhp); + } + + /* Insert into list */ + list_insert_head(&ldi_handle_hash_list[index], lhp); + + /* Update handle count */ + OSIncrementAtomic(&ldi_handle_hash_count); + + /* Return success */ + return (lhp); +} + +/* + * Caller should check if returned handle is the same and free new + * handle if an existing handle was returned + */ +struct ldi_handle * +handle_add(struct ldi_handle *lhp) +{ + struct ldi_handle *retlhp; + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + + mutex_enter(&ldi_handle_hash_lock[index]); + retlhp = handle_add_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); + + return (retlhp); +} + +/* + * Returns a handle with 1 reference and status Closed + */ +#ifdef illumos +static struct ldi_handle * +handle_alloc(vnode_t *vp, struct ldi_ident_t *li) +#else /* illumos */ +struct ldi_handle * +handle_alloc_common(uint_t type, dev_t device, int fmode) +#endif /* !illumos */ +{ + struct ldi_handle *new_lh; + size_t len; + + /* Validate arguments */ + if ((type != LDI_TYPE_IOKIT && type != LDI_TYPE_VNODE) || + device == 0 || fmode == 0) { + dprintf("%s Invalid type %d, device %d, or fmode %d\n", + __func__, type, device, fmode); + return (NULL); + } + + /* Allocate and verify */ + len = sizeof (struct ldi_handle); + if (NULL == (new_lh = (struct ldi_handle *)kmem_alloc(len, + KM_SLEEP))) { + dprintf("%s couldn't allocate ldi_handle\n", __func__); + return (NULL); + } +#ifdef LDI_ZERO + /* Clear the struct for safety */ + bzero(new_lh, len); +#endif + + /* Create handle lock */ + mutex_init(&new_lh->lh_lock, NULL, MUTEX_DEFAULT, NULL); + /* And condvar */ + cv_init(&new_lh->lh_cv, NULL, CV_DEFAULT, NULL); + + /* + * Set the handle type, which dictates the type of device pointer + * and buffers used for the lifetime of the ldi_handle + */ + new_lh->lh_type = type; + /* Set dev_t (major/minor) device number */ + new_lh->lh_dev = device; + + /* Clear list head */ + new_lh->lh_node.list_next = NULL; + new_lh->lh_node.list_prev = NULL; + + 
/* Initialize with 1 handle ref and 0 open refs */ + new_lh->lh_ref = 1; + new_lh->lh_openref = 0; + + /* Clear type-specific device data */ + new_lh->lh_tsd.iokit_tsd = 0; + /* No need to clear vnode_tsd in union */ + new_lh->lh_notifier = 0; + + /* Assign fmode */ + new_lh->lh_fmode = fmode; + + /* Alloc in status Closed */ + new_lh->lh_status = LDI_STATUS_CLOSED; + + /* Increment kstats */ + LDISTAT_BUMP(handle_count); + LDISTAT_BUMP(handle_refs); + if (type == LDI_TYPE_IOKIT) { + LDISTAT_BUMP(handle_count_iokit); + } else if (type == LDI_TYPE_VNODE) { + LDISTAT_BUMP(handle_count_vnode); + } + + return (new_lh); +} + +static void +handle_set_open_locked(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT(MUTEX_HELD(&lhp->lh_lock)); + + /* Increment number of open clients */ + lhp->lh_openref++; + + /* Increment kstats */ + if (lhp->lh_fmode & FWRITE) { + LDISTAT_BUMP(handle_open_rw); + } else { + LDISTAT_BUMP(handle_open_ro); + } +} + +#if 0 +static void +handle_set_open(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + + mutex_enter(&lhp->lh_lock); + handle_set_open_locked(lhp); + mutex_exit(&lhp->lh_lock); +} +#endif + +static void +handle_clear_open_locked(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT(MUTEX_HELD(&lhp->lh_lock)); + + /* Decrement number of open clients */ + if (lhp->lh_openref == 0) { + dprintf("%s with 0 open refs\n", __func__); + return; + } + + /* Decrement kstats */ + lhp->lh_openref--; + if (lhp->lh_fmode & FWRITE) { + LDISTAT_BUMPDOWN(handle_open_rw); + } else { + LDISTAT_BUMPDOWN(handle_open_ro); + } +} + +#if 0 +static inline void +handle_clear_open(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + ASSERT3U(lhp->lh_openref, !=, 0); + + mutex_enter(&lhp->lh_lock); + handle_clear_open_locked(lhp, lhp->lh_fmode); + mutex_exit(&lhp->lh_lock); +} +#endif + +static int +handle_close(struct ldi_handle *lhp) +{ +#ifdef DEBUG + int openrefs; +#endif + int error = EINVAL; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_ref, !=, 0); + ASSERT3U(lhp->lh_openref, !=, 0); + ASSERT(lhp->lh_type == LDI_TYPE_IOKIT || + lhp->lh_type == LDI_TYPE_VNODE); + + /* Take lock */ + mutex_enter(&lhp->lh_lock); + + /* + * Possible statuses: + * Online with one or more openref + * Offline due to IOMedia termination, one or more openref remain + * Impossible or programming error: + * Closing and Closed should only be set with 0 openref + * Opening should have 0 openref so far, and clients should not be + * calling ldi_close + */ + switch (lhp->lh_status) { + case LDI_STATUS_ONLINE: + if (lhp->lh_openref == 0) { + /* Unlock and return error */ + mutex_exit(&lhp->lh_lock); + /* Shouldn't happen */ + dprintf("%s status Online with 0 openrefs\n", + __func__); + return (ENXIO); + } + + /* If multiple open refs are held */ + if (lhp->lh_openref > 1) { + goto drop_openref; + } + + /* Otherwise open with last open ref */ + /* change status to closing and proceed */ + handle_status_change_locked(lhp, LDI_STATUS_CLOSING); + /* Unlock and exit loop */ + mutex_exit(&lhp->lh_lock); + goto do_close; + + case LDI_STATUS_OFFLINE: + if (lhp->lh_openref == 0) { + /* Unlock and return error */ + mutex_exit(&lhp->lh_lock); + /* Shouldn't happen */ + dprintf("%s status Offline with 0 openrefs\n", + __func__); + return (ENXIO); + } + + /* + * Otherwise the device was marked missing and clients need + * to drop openrefs until it can be released. 
+ */ + goto drop_openref; + + default: + mutex_exit(&lhp->lh_lock); + dprintf("%s invalid handle status %d\n", + __func__, lhp->lh_status); + return (ENXIO); + } + +drop_openref: + /* Just decrement open refs/stats */ + handle_clear_open_locked(lhp); +#ifdef DEBUG + /* Save openrefs to report after unlock */ + openrefs = lhp->lh_openref; +#endif + mutex_exit(&lhp->lh_lock); + +#ifdef DEBUG + dprintf("%s has %d remaining openrefs\n", __func__, openrefs); +#endif + return (0); + +do_close: + /* Remove notification handler */ + if (lhp->lh_notifier) { + error = handle_remove_notifier(lhp); + if (error) { + dprintf("%s lhp %p notifier %p error %d %s\n", + __func__, lhp, lhp->lh_notifier, error, + "couldn't be removed"); + /* Proceeds with close */ + } + } + + /* IOMedia or vnode */ + switch (lhp->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_close_iokit(lhp); + /* Preserve error for return */ + break; + case LDI_TYPE_VNODE: + error = handle_close_vnode(lhp); + /* Preserve error for return */ + break; + } + +#ifdef DEBUG + if (error != 0) { + /* We will still set the handle to Closed status */ + dprintf("%s error %d from handle_close_{type}\n", + __func__, error); + } +#endif + + /* Take lock to drop openref and set status */ + mutex_enter(&lhp->lh_lock); + handle_clear_open_locked(lhp); + handle_status_change_locked(lhp, LDI_STATUS_CLOSED); + + /* Wake any waiting opens and unlock */ + cv_signal(&lhp->lh_cv); + mutex_exit(&lhp->lh_lock); + +dprintf("%s returning %d\n", __func__, error); + return (error); +} + +ldi_status_t +handle_open_start(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_ref, !=, 0); + + /* Take lock */ + mutex_enter(&lhp->lh_lock); + /* Loop if the handle is in opening or closing status */ + do { + /* XXX Needs sleep timeout */ + switch (lhp->lh_status) { + case LDI_STATUS_ONLINE: + /* Increment readonly / readwrite count */ + handle_set_open_locked(lhp); + mutex_exit(&lhp->lh_lock); + + /* Success */ + return (LDI_STATUS_ONLINE); + + case LDI_STATUS_CLOSED: + /* Not yet open, change status to opening and proceed */ + handle_status_change_locked(lhp, LDI_STATUS_OPENING); + + /* Unlock and exit loop */ + mutex_exit(&lhp->lh_lock); + /* Return success */ + return (LDI_STATUS_OPENING); + + case LDI_STATUS_OPENING: + case LDI_STATUS_CLOSING: + /* Open or close in progress, sleep until signaled */ + dprintf("%s sleeping on lock\n", __func__); + cv_wait(&lhp->lh_cv, &lhp->lh_lock); + continue; + default: + mutex_exit(&lhp->lh_lock); + dprintf("%s invalid handle status %d\n", + __func__, lhp->lh_status); + return (LDI_STATUS_OFFLINE); + } + } while (1); + + /* Shouldn't reach this */ + return (LDI_STATUS_CLOSED); +} + +void +handle_open_done(struct ldi_handle *lhp, ldi_status_t new_status) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_OPENING); + + /* Lock to change status */ + mutex_enter(&lhp->lh_lock); + + if (new_status != LDI_STATUS_ONLINE) { + /* Set status, issues event */ + handle_status_change_locked(lhp, LDI_STATUS_CLOSED); + } else { + /* Increment open count and fmode */ + handle_set_open_locked(lhp); + /* Set status, issues event */ + handle_status_change_locked(lhp, LDI_STATUS_ONLINE); + } + + /* Wake any waiting opens and unlock */ + cv_signal(&lhp->lh_cv); + mutex_exit(&lhp->lh_lock); + + /* + * Flush out any old buffers remaining from + * a previous use, only if opening read-write. 
+ */ + if (new_status == LDI_STATUS_ONLINE && + (lhp->lh_fmode & FWRITE) && + ldi_sync((ldi_handle_t)lhp) != 0) { + dprintf("%s ldi_sync failed\n", __func__); + } +} + +/* + * Release all remaining handles (during ldi_fini) + * Unless something went wrong, all handles should + * be closed and have zero references. + */ +static void +handle_hash_release() +{ + struct ldi_handle *lhp; + int index, refs, j; + + for (index = 0; index < LH_HASH_SZ; index++) { + mutex_enter(&ldi_handle_hash_lock[index]); + if (!list_empty(&ldi_handle_hash_list[index])) { + dprintf("%s still have LDI handle(s) in list %d\n", + __func__, index); + } + + /* Iterate over the list */ + while ((lhp = list_head(&ldi_handle_hash_list[index]))) { + /* remove from list to deallocate */ + list_remove(&ldi_handle_hash_list[index], lhp); + + /* Update handle count */ + OSDecrementAtomic(&ldi_handle_hash_count); + + dprintf("%s releasing %p with %u refs and status %d\n", + __func__, lhp, lhp->lh_ref, lhp->lh_status); + /* release holds */ + refs = lhp->lh_ref; + for (j = 0; j < refs; j++) { + handle_release_locked(lhp); + } + lhp = 0; + } + + list_destroy(&ldi_handle_hash_list[index]); + mutex_exit(&ldi_handle_hash_lock[index]); + mutex_destroy(&ldi_handle_hash_lock[index]); + } +} + +/* + * LDI Event functions + */ +char * +ldi_ev_get_type(ldi_ev_cookie_t cookie) +{ + int i; + struct ldi_ev_cookie *cookie_impl = (struct ldi_ev_cookie *)cookie; + + for (i = 0; ldi_ev_cookies[i].ck_evname != NULL; i++) { + if (&ldi_ev_cookies[i] == cookie_impl) { + LDI_EVTRC((CE_NOTE, "ldi_ev_get_type: LDI: %s", + ldi_ev_cookies[i].ck_evname)); + return (ldi_ev_cookies[i].ck_evname); + } + } + + return ("UNKNOWN EVENT"); +} + +static int +ldi_native_cookie(ldi_ev_cookie_t cookie) +{ + int i; + struct ldi_ev_cookie *cookie_impl = (struct ldi_ev_cookie *)cookie; + + for (i = 0; ldi_ev_cookies[i].ck_evname != NULL; i++) { + if (&ldi_ev_cookies[i] == cookie_impl) { + LDI_EVTRC((CE_NOTE, "ldi_native_cookie: native LDI")); + return (1); + } + } + + LDI_EVTRC((CE_NOTE, "ldi_native_cookie: is NDI")); + return (0); +} + +static ldi_ev_cookie_t +ldi_get_native_cookie(const char *evname) +{ + int i; + + for (i = 0; ldi_ev_cookies[i].ck_evname != NULL; i++) { + if (strcmp(ldi_ev_cookies[i].ck_evname, evname) == 0) { + LDI_EVTRC((CE_NOTE, "ldi_get_native_cookie: found")); + return ((ldi_ev_cookie_t)&ldi_ev_cookies[i]); + } + } + + LDI_EVTRC((CE_NOTE, "ldi_get_native_cookie: NOT found")); + return (NULL); +} + +/* + * ldi_ev_lock() needs to be recursive, since layered drivers may call + * other LDI interfaces (such as ldi_close() from within the context of + * a notify callback. Since the notify callback is called with the + * ldi_ev_lock() held and ldi_close() also grabs ldi_ev_lock, the lock needs + * to be recursive. 
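+ * Recursion is emulated here by recording the owning thread in le_thread
+ * and counting nested entries in le_busy, rather than by using a
+ * recursive mutex primitive.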
+ */ +static void +ldi_ev_lock(void) +{ + LDI_EVTRC((CE_NOTE, "ldi_ev_lock: entered")); + + mutex_enter(&ldi_ev_callback_list.le_lock); + if (ldi_ev_callback_list.le_thread == curthread) { + ASSERT(ldi_ev_callback_list.le_busy >= 1); + ldi_ev_callback_list.le_busy++; + } else { + while (ldi_ev_callback_list.le_busy) + cv_wait(&ldi_ev_callback_list.le_cv, + &ldi_ev_callback_list.le_lock); + ASSERT(ldi_ev_callback_list.le_thread == NULL); + ldi_ev_callback_list.le_busy = 1; + ldi_ev_callback_list.le_thread = curthread; + } + mutex_exit(&ldi_ev_callback_list.le_lock); + + LDI_EVTRC((CE_NOTE, "ldi_ev_lock: exit")); +} + +static void +ldi_ev_unlock(void) +{ + LDI_EVTRC((CE_NOTE, "ldi_ev_unlock: entered")); + mutex_enter(&ldi_ev_callback_list.le_lock); + ASSERT(ldi_ev_callback_list.le_thread == curthread); + ASSERT(ldi_ev_callback_list.le_busy >= 1); + + ldi_ev_callback_list.le_busy--; + if (ldi_ev_callback_list.le_busy == 0) { + ldi_ev_callback_list.le_thread = NULL; + cv_signal(&ldi_ev_callback_list.le_cv); + } + mutex_exit(&ldi_ev_callback_list.le_lock); + LDI_EVTRC((CE_NOTE, "ldi_ev_unlock: exit")); +} + +int +ldi_ev_get_cookie(ldi_handle_t lh, char *evname, ldi_ev_cookie_t *cookiep) +{ + ldi_ev_cookie_t tcookie; + + LDI_EVDBG((CE_NOTE, "ldi_ev_get_cookie: entered: evname=%s", + evname ? evname : "")); + + if (lh == NULL || evname == NULL || + strlen(evname) == 0 || cookiep == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_ev_get_cookie: invalid args")); + return (LDI_EV_FAILURE); + } + + *cookiep = NULL; + + /* + * First check if it is a LDI native event + */ + tcookie = ldi_get_native_cookie(evname); + if (tcookie) { + LDI_EVDBG((CE_NOTE, "ldi_ev_get_cookie: got native cookie")); + *cookiep = tcookie; + return (LDI_EV_SUCCESS); + } + + return (LDI_EV_FAILURE); +} + +int +ldi_ev_register_callbacks(ldi_handle_t lh, ldi_ev_cookie_t cookie, + ldi_ev_callback_t *callb, void *arg, ldi_callback_id_t *id) +{ + struct ldi_handle *lhp = (struct ldi_handle *)lh; + ldi_ev_callback_impl_t *lecp; + + if (lh == NULL || cookie == NULL || callb == NULL || id == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: Invalid args")); + return (LDI_EV_FAILURE); + } + + if (callb->cb_vers != LDI_EV_CB_VERS) { + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: Invalid vers")); + return (LDI_EV_FAILURE); + } + + if (callb->cb_notify == NULL && callb->cb_finalize == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: NULL callb")); + return (LDI_EV_FAILURE); + } + + *id = 0; + + lecp = kmem_zalloc(sizeof (ldi_ev_callback_impl_t), KM_SLEEP); + + ldi_ev_lock(); + + /* + * Add the notify/finalize callback to the LDI's list of callbacks. 
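+	 * The callback id handed back to the caller is drawn from the
+	 * monotonically increasing ldi_ev_id_pool.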
+ */ + lecp->lec_lhp = lhp; + + lecp->lec_dev = lhp->lh_dev; + lecp->lec_spec = S_IFBLK; + + lecp->lec_notify = callb->cb_notify; + lecp->lec_finalize = callb->cb_finalize; + lecp->lec_arg = arg; + lecp->lec_cookie = cookie; + + lecp->lec_id = (void *)(uintptr_t)(++ldi_ev_id_pool); + + list_insert_tail(&ldi_ev_callback_list.le_head, lecp); + + *id = (ldi_callback_id_t)lecp->lec_id; + + ldi_ev_unlock(); + + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: registered " + "notify/finalize")); + + return (LDI_EV_SUCCESS); +} + +static int +ldi_ev_device_match(ldi_ev_callback_impl_t *lecp, __unused dev_info_t *dip, + dev_t dev, int spec_type) +{ + ASSERT(lecp); + ASSERT(dev != DDI_DEV_T_NONE); + ASSERT(dev != NODEV); + ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) || + (spec_type == S_IFCHR || spec_type == S_IFBLK)); + ASSERT(lecp->lec_spec == S_IFCHR || lecp->lec_spec == S_IFBLK); + ASSERT(lecp->lec_dev != DDI_DEV_T_ANY); + ASSERT(lecp->lec_dev != DDI_DEV_T_NONE); + ASSERT(lecp->lec_dev != NODEV); + + if (dev != DDI_DEV_T_ANY) { + if (dev != lecp->lec_dev || spec_type != lecp->lec_spec) + return (0); + } + + LDI_EVTRC((CE_NOTE, "ldi_ev_device_match: MATCH dev=%d", + (uint32_t)dev)); + + return (1); +} + +/* + * LDI framework function to post a "notify" event to all layered drivers + * that have registered for that event + * + * Returns: + * LDI_EV_SUCCESS - registered callbacks allow event + * LDI_EV_FAILURE - registered callbacks block event + * LDI_EV_NONE - No matching LDI callbacks + * + * This function is *not* to be called by layered drivers. It is for I/O + * framework code in Solaris, such as the I/O retire code and DR code + * to call while servicing a device event such as offline or degraded. + */ +int +ldi_invoke_notify(__unused dev_info_t *dip, dev_t dev, int spec_type, + char *event, void *ev_data) +{ + ldi_ev_callback_impl_t *lecp; + list_t *listp; + int ret; + char *lec_event; + + ASSERT(dev != DDI_DEV_T_NONE); + ASSERT(dev != NODEV); + ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) || + (spec_type == S_IFCHR || spec_type == S_IFBLK)); + ASSERT(event); + + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): entered: dip=%p, ev=%s", + (void *)dip, event)); + + ret = LDI_EV_NONE; + ldi_ev_lock(); + + VERIFY(ldi_ev_callback_list.le_walker_next == NULL); + listp = &ldi_ev_callback_list.le_head; + for (lecp = list_head(listp); lecp; lecp = + ldi_ev_callback_list.le_walker_next) { + ldi_ev_callback_list.le_walker_next = list_next(listp, lecp); + + /* Check if matching device */ + if (!ldi_ev_device_match(lecp, dip, dev, spec_type)) + continue; + + if (lecp->lec_lhp == NULL) { + /* + * Consumer has unregistered the handle and so + * is no longer interested in notify events. + */ + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): No LDI " + "handle, skipping")); + continue; + } + + if (lecp->lec_notify == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): No notify " + "callback. skipping")); + continue; /* not interested in notify */ + } + + /* + * Check if matching event + */ + lec_event = ldi_ev_get_type(lecp->lec_cookie); + if (strcmp(event, lec_event) != 0) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): Not matching" + " event {%s,%s}. 
skipping", event, lec_event)); + continue; + } + + lecp->lec_lhp->lh_flags |= LH_FLAGS_NOTIFY; + if (lecp->lec_notify((ldi_handle_t)lecp->lec_lhp, + lecp->lec_cookie, lecp->lec_arg, ev_data) != + LDI_EV_SUCCESS) { + ret = LDI_EV_FAILURE; + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): notify" + " FAILURE")); + break; + } + + /* We have a matching callback that allows the event to occur */ + ret = LDI_EV_SUCCESS; + + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): 1 consumer success")); + } + + if (ret != LDI_EV_FAILURE) + goto out; + +#ifdef __APPLE__ + dprintf("%s offline notify failed, shouldn't happen\n", __func__); + goto out; +#endif +#ifdef illumos + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): undoing notify")); + + /* + * Undo notifies already sent + */ + lecp = list_prev(listp, lecp); + VERIFY(ldi_ev_callback_list.le_walker_prev == NULL); + for (; lecp; lecp = ldi_ev_callback_list.le_walker_prev) { + ldi_ev_callback_list.le_walker_prev = list_prev(listp, lecp); + + /* + * Check if matching device + */ + if (!ldi_ev_device_match(lecp, dip, dev, spec_type)) + continue; + + if (lecp->lec_finalize == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): no finalize, " + "skipping")); + continue; /* not interested in finalize */ + } + + /* + * it is possible that in response to a notify event a + * layered driver closed its LDI handle so it is ok + * to have a NULL LDI handle for finalize. The layered + * driver is expected to maintain state in its "arg" + * parameter to keep track of the closed device. + */ + + /* Check if matching event */ + lec_event = ldi_ev_get_type(lecp->lec_cookie); + if (strcmp(event, lec_event) != 0) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): not matching " + "event: %s,%s, skipping", event, lec_event)); + continue; + } + + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): calling finalize")); + + lecp->lec_finalize(lecp->lec_lhp, lecp->lec_cookie, + LDI_EV_FAILURE, lecp->lec_arg, ev_data); + + /* + * If LDI native event and LDI handle closed in context + * of notify, NULL out the finalize callback as we have + * already called the 1 finalize above allowed in this situation + */ + if (lecp->lec_lhp == NULL && + ldi_native_cookie(lecp->lec_cookie)) { + LDI_EVDBG((CE_NOTE, + "ldi_invoke_notify(): NULL-ing finalize after " + "calling 1 finalize following ldi_close")); + lecp->lec_finalize = NULL; + } + } +#endif /* illumos */ + +out: + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_callback_list.le_walker_prev = NULL; + ldi_ev_unlock(); + + if (ret == LDI_EV_NONE) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): no matching " + "LDI callbacks")); + } + + return (ret); +} + +/* + * LDI framework function to invoke "finalize" callbacks for all layered + * drivers that have registered callbacks for that event. + * + * This function is *not* to be called by layered drivers. It is for I/O + * framework code in Solaris, such as the I/O retire code and DR code + * to call while servicing a device event such as offline or degraded. 
+ */ +void +ldi_invoke_finalize(__unused dev_info_t *dip, dev_t dev, int spec_type, + char *event, int ldi_result, void *ev_data) +{ + ldi_ev_callback_impl_t *lecp; + list_t *listp; + char *lec_event; + int found = 0; + + ASSERT(dev != DDI_DEV_T_NONE); + ASSERT(dev != NODEV); + ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) || + (spec_type == S_IFCHR || spec_type == S_IFBLK)); + ASSERT(event); + ASSERT(ldi_result == LDI_EV_SUCCESS || ldi_result == LDI_EV_FAILURE); + + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): entered: dip=%p, result=%d" + " event=%s", (void *)dip, ldi_result, event)); + + ldi_ev_lock(); + VERIFY(ldi_ev_callback_list.le_walker_next == NULL); + listp = &ldi_ev_callback_list.le_head; + for (lecp = list_head(listp); lecp; lecp = + ldi_ev_callback_list.le_walker_next) { + ldi_ev_callback_list.le_walker_next = list_next(listp, lecp); + + if (lecp->lec_finalize == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): No " + "finalize. Skipping")); + continue; /* Not interested in finalize */ + } + + /* + * Check if matching device + */ + if (!ldi_ev_device_match(lecp, dip, dev, spec_type)) + continue; + + /* + * It is valid for the LDI handle to be NULL during finalize. + * The layered driver may have done an LDI close in the notify + * callback. + */ + + /* + * Check if matching event + */ + lec_event = ldi_ev_get_type(lecp->lec_cookie); + if (strcmp(event, lec_event) != 0) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): Not " + "matching event {%s,%s}. Skipping", + event, lec_event)); + continue; + } + + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): calling finalize")); + + found = 1; + + lecp->lec_finalize((ldi_handle_t)lecp->lec_lhp, + lecp->lec_cookie, ldi_result, lecp->lec_arg, + ev_data); + + /* + * If LDI native event and LDI handle closed in context + * of notify, NULL out the finalize callback as we have + * already called the 1 finalize above allowed in this situation + */ + if (lecp->lec_lhp == NULL && + ldi_native_cookie(lecp->lec_cookie)) { + LDI_EVDBG((CE_NOTE, + "ldi_invoke_finalize(): NULLing finalize after " + "calling 1 finalize following ldi_close")); + lecp->lec_finalize = NULL; + } + } + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_unlock(); + + if (found) + return; + + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): no matching callbacks")); +} + +int +ldi_ev_remove_callbacks(ldi_callback_id_t id) +{ + ldi_ev_callback_impl_t *lecp; + ldi_ev_callback_impl_t *next; + ldi_ev_callback_impl_t *found; + list_t *listp; + + if (id == 0) { + cmn_err(CE_WARN, "ldi_ev_remove_callbacks: Invalid ID 0"); + return (LDI_EV_FAILURE); + } + + LDI_EVDBG((CE_NOTE, "ldi_ev_remove_callbacks: entered: id=%p", + (void *)id)); + + ldi_ev_lock(); + + listp = &ldi_ev_callback_list.le_head; + next = found = NULL; + for (lecp = list_head(listp); lecp; lecp = next) { + next = list_next(listp, lecp); + if (lecp->lec_id == id) { + VERIFY(found == NULL); + + /* + * If there is a walk in progress, shift that walk + * along to the next element so that we can remove + * this one. This allows us to unregister an arbitrary + * number of callbacks from within a callback. + * + * See the struct definition (in sunldi_impl.h) for + * more information. 
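+			 * Both le_walker_next and le_walker_prev are
+			 * adjusted below, so removal is safe from within
+			 * either walk direction.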
+ */ + if (ldi_ev_callback_list.le_walker_next == lecp) + ldi_ev_callback_list.le_walker_next = next; + if (ldi_ev_callback_list.le_walker_prev == lecp) + ldi_ev_callback_list.le_walker_prev = list_prev( + listp, ldi_ev_callback_list.le_walker_prev); + + list_remove(listp, lecp); + found = lecp; + } + } + ldi_ev_unlock(); + + if (found == NULL) { + cmn_err(CE_WARN, "No LDI event handler for id (%p)", + (void *)id); + return (LDI_EV_SUCCESS); + } + + LDI_EVDBG((CE_NOTE, "ldi_ev_remove_callbacks: removed " + "LDI native callbacks")); + kmem_free(found, sizeof (ldi_ev_callback_impl_t)); + + return (LDI_EV_SUCCESS); +} +/* + * XXX End LDI Events + */ + +/* Client interface, find IOMedia from dev_t, alloc and open handle */ +int +ldi_open_by_dev(dev_t device, __unused int otyp, int fmode, + __unused cred_t *cred, ldi_handle_t *lhp, + __unused ldi_ident_t ident) +{ + int error = EINVAL; + + dprintf("%s dev_t %d fmode %d\n", __func__, device, fmode); + + /* Validate arguments */ + if (!lhp || device == 0) { + dprintf("%s missing argument %p %d\n", + __func__, lhp, device); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* Try to open by media */ + error = ldi_open_media_by_dev(device, fmode, lhp); + + /* Pass error from open */ + return (error); +} + +/* Client interface, find dev_t and IOMedia/vnode, alloc and open handle */ +int +ldi_open_by_name(char *path, int fmode, __unused cred_t *cred, + ldi_handle_t *lhp, __unused ldi_ident_t li) +{ + dev_t device = 0; + int error = EINVAL; + + dprintf("%s dev_t %d fmode %d\n", __func__, device, fmode); + + /* Validate arguments */ + if (!lhp || !path) { + dprintf("%s %s %p %s %d\n", __func__, + "missing lhp or path", lhp, path, fmode); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* Validate active open modes */ + if (!ldi_use_iokit_from_path && !ldi_use_dev_from_path && + !ldi_use_vnode_from_path) { + dprintf("%s no valid modes to open device\n", __func__); + return (EINVAL); + } + + /* Try to open IOMedia by path */ + if (ldi_use_iokit_from_path) { + error = ldi_open_media_by_path(path, fmode, lhp); + + /* Error check open */ + if (!error) { + return (0); + } else { + dprintf("%s ldi_open_media_by_path failed\n", + __func__); + /* Not fatal, retry by dev_t or vnode */ + } + } + + /* Get dev_t from path, try to open IOMedia by dev */ + if (ldi_use_dev_from_path) { + /* Uses vnode_lookup */ + device = dev_from_path(path); + if (device == 0) { + dprintf("%s dev_from_path failed %s\n", + __func__, path); + /* + * Both media_from_dev and vnode_from_path will fail + * if dev_from_path fails, since it uses vnode_lookup. 
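+			 * There is no fallback worth attempting, so fail
+			 * the open with ENODEV.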
+ */ + return (ENODEV); + } + + if (ldi_use_iokit_from_dev) { + /* Searches for matching IOMedia */ + error = ldi_open_media_by_dev(device, fmode, lhp); + if (!error) { + return (0); + } else { + dprintf("%s ldi_open_media_by_dev failed %d\n", + __func__, device); + /* Not fatal, retry as vnode */ + } + } + } + + if (!ldi_use_vnode_from_path) { + return (EINVAL); + } + + /* Try to open vnode by path */ + error = ldi_open_vnode_by_path(path, device, fmode, lhp); + if (error) { + dprintf("%s ldi_open_vnode_by_path failed %d\n", __func__, + error); + } + + return (error); +} + +/* Client interface, wrapper for handle_close */ +int +ldi_close(ldi_handle_t lh, int fmode, __unused cred_t *cred) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error = EINVAL; + + ASSERT3U(handlep, !=, NULL); + ASSERT3U(handlep->lh_ref, !=, 0); + ASSERT3U(handlep->lh_fmode, ==, fmode); + + dprintf("%s dev_t %d fmode %d\n", __func__, handlep->lh_dev, fmode); + + /* Remove event callbacks */ + boolean_t notify = B_FALSE; + list_t *listp; + ldi_ev_callback_impl_t *lecp; + + /* + * Search the event callback list for callbacks with this + * handle. There are 2 cases + * 1. Called in the context of a notify. The handle consumer + * is releasing its hold on the device to allow a reconfiguration + * of the device. Simply NULL out the handle and the notify callback. + * The finalize callback is still available so that the consumer + * knows of the final disposition of the device. + * 2. Not called in the context of notify. NULL out the handle as well + * as the notify and finalize callbacks. Since the consumer has + * closed the handle, we assume it is not interested in the + * notify and finalize callbacks. + */ + ldi_ev_lock(); + + if (handlep->lh_flags & LH_FLAGS_NOTIFY) + notify = B_TRUE; + listp = &ldi_ev_callback_list.le_head; + for (lecp = list_head(listp); lecp; lecp = list_next(listp, lecp)) { + if (lecp->lec_lhp != handlep) + continue; + lecp->lec_lhp = NULL; + lecp->lec_notify = NULL; + LDI_EVDBG((CE_NOTE, "ldi_close: NULLed lh and notify")); + if (!notify) { + LDI_EVDBG((CE_NOTE, "ldi_close: NULLed finalize")); + lecp->lec_finalize = NULL; + } + } + + if (notify) + handlep->lh_flags &= ~LH_FLAGS_NOTIFY; + ldi_ev_unlock(); + + /* Close device if only one openref, or just decrement openrefs */ + if ((error = handle_close(handlep)) != 0) { + dprintf("%s error from handle_close: %d\n", + __func__, error); + } + + /* Decrement lh_ref, if last ref then remove and free */ + handle_release(handlep); + handlep = 0; + + /* XXX clear pointer arg, and return success? */ + lh = (ldi_handle_t)0; + return (0); + // return (error); +} + +/* + * Client interface, must be in LDI_STATUS_ONLINE + */ +int +ldi_get_size(ldi_handle_t lh, uint64_t *dev_size) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error; + + /* + * Ensure we have an LDI handle, and a valid dev_size and/or + * blocksize pointer. Caller must pass at least one of these. + */ + if (!handlep || !dev_size) { + dprintf("%s handle %p\n", __func__, handlep); + dprintf("%s dev_size %p\n", __func__, dev_size); + return (EINVAL); + } + + /* + * Must be in LDI_STATUS_ONLINE + * IOMedia can return getSize without being opened, but vnode + * devices must be opened first. + * Rather than have support differing behaviors, require that + * handle is open to retrieve the size. 
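+	 * (lh_status must be LDI_STATUS_ONLINE, checked below.)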
+ */ + if (handlep->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + return (ENODEV); + } + + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_get_size_iokit(handlep, dev_size); + return (error); + + case LDI_TYPE_VNODE: + error = handle_get_size_vnode(handlep, dev_size); + return (error); + } + + /* Default case, shouldn't reach this */ + dprintf("%s invalid lh_type %d\n", __func__, + handlep->lh_type); + return (EINVAL); +} + +/* + * Must be in LDI_STATUS_ONLINE + * XXX Needs async callback + */ +int +ldi_sync(ldi_handle_t lh) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error; + + /* Ensure we have an LDI handle */ + if (!handlep) { + dprintf("%s no handle\n", __func__); + return (EINVAL); + } + + /* Must be in LDI_STATUS_ONLINE */ + if (handlep->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + return (ENODEV); + } + + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_sync_iokit(handlep); + return (error); + + case LDI_TYPE_VNODE: + error = handle_sync_vnode(handlep); + return (error); + } + + /* Default case, shouldn't reach this */ + dprintf("%s invalid lh_type %d\n", __func__, + handlep->lh_type); + return (EINVAL); +} + +int +ldi_ioctl(ldi_handle_t lh, int cmd, intptr_t arg, + __unused int mode, __unused cred_t *cr, __unused int *rvalp) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error = EINVAL; + struct dk_callback *dkc; + + switch (cmd) { + /* Flush write cache */ + case DKIOCFLUSHWRITECACHE: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_sync_iokit(handlep); + break; + + case LDI_TYPE_VNODE: + error = handle_sync_vnode(handlep); + break; + + default: + error = ENOTSUP; + } + + if (!arg) { + return (error); + } + + dkc = (struct dk_callback *)arg; + /* Issue completion callback if set */ + if (dkc->dkc_callback) { + (*dkc->dkc_callback)(dkc->dkc_cookie, error); + } + + return (error); + + /* Set or clear write cache enabled */ + case DKIOCSETWCE: + /* + * There doesn't seem to be a way to do this by vnode, + * so we need to be able to locate an IOMedia and an + * IOBlockStorageDevice provider. 
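+		 * handle_set_wce_iokit() is therefore used for both
+		 * handle types.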
+ */ + return (handle_set_wce_iokit(handlep, (int *)arg)); + + /* Get media blocksize and block count */ + case DKIOCGMEDIAINFO: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_get_media_info_iokit(handlep, + (struct dk_minfo *)arg)); + + case LDI_TYPE_VNODE: + return (handle_get_media_info_vnode(handlep, + (struct dk_minfo *)arg)); + + default: + return (ENOTSUP); + } + + /* Get media logical/physical blocksize and block count */ + case DKIOCGMEDIAINFOEXT: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_get_media_info_ext_iokit(handlep, + (struct dk_minfo_ext *)arg)); + + case LDI_TYPE_VNODE: + return (handle_get_media_info_ext_vnode(handlep, + (struct dk_minfo_ext *)arg)); + + default: + return (ENOTSUP); + } + + /* Check device status */ + case DKIOCSTATE: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_check_media_iokit(handlep, + (int *)arg)); + + case LDI_TYPE_VNODE: + return (handle_check_media_vnode(handlep, + (int *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCISSOLIDSTATE: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_is_solidstate_iokit(handlep, + (int *)arg)); + + case LDI_TYPE_VNODE: + return (handle_is_solidstate_vnode(handlep, + (int *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCGETBOOTINFO: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_get_bootinfo_iokit(handlep, + (struct io_bootinfo *)arg)); + + case LDI_TYPE_VNODE: + return (handle_get_bootinfo_vnode(handlep, + (struct io_bootinfo *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCGETFEATURES: /* UNMAP? */ + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_features_iokit(handlep, + (uint32_t *)arg)); + + case LDI_TYPE_VNODE: + return (handle_features_vnode(handlep, + (uint32_t *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCFREE: /* UNMAP */ + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_unmap_iokit(handlep, + (dkioc_free_list_ext_t *)arg)); + + case LDI_TYPE_VNODE: + return (handle_unmap_vnode(handlep, + (dkioc_free_list_ext_t *)arg)); + + default: + return (ENOTSUP); + } + + default: + return (ENOTSUP); + } +} + +/* + * Must already have handle_open called on lh. 
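+ *
+ * An illustrative (not prescriptive) synchronous read using the helpers
+ * defined below (ldi_getrbuf, ldi_bioinit, ldi_freerbuf) might look like
+ * this, with lh, buf, size and blkno supplied by the caller:
+ *
+ *	ldi_buf_t *bp = ldi_getrbuf(KM_SLEEP);
+ *	bp->b_flags = B_READ;
+ *	bp->b_bcount = size;
+ *	bp->b_bufsize = size;
+ *	bp->b_un.b_addr = buf;
+ *	bp->b_lblkno = blkno;
+ *	bp->b_iodone = NULL;
+ *	error = ldi_strategy(lh, bp);
+ *	ldi_freerbuf(bp);
+ *
+ * For the vnode path in this file, a NULL b_iodone causes
+ * buf_strategy_vnode() to call the completion handler inline once
+ * VNOP_STRATEGY() returns.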
+ */ +int +ldi_strategy(ldi_handle_t lh, ldi_buf_t *lbp) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error = EINVAL; + + /* Verify arguments */ + if (!handlep || !lbp || lbp->b_bcount == 0) { + dprintf("%s missing something...\n", __func__); + dprintf("handlep [%p]\n", handlep); + dprintf("lbp [%p]\n", lbp); + if (lbp) { + dprintf("lbp->b_bcount %llu\n", + lbp->b_bcount); + } + return (EINVAL); + } + + /* Check instantaneous value of handle status */ + if (handlep->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + return (ENODEV); + } + + /* IOMedia or vnode */ + /* Issue type-specific buf_strategy, preserve error */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = buf_strategy_iokit(lbp, handlep); + break; + case LDI_TYPE_VNODE: + error = buf_strategy_vnode(lbp, handlep); + break; + default: + dprintf("%s invalid lh_type %d\n", __func__, handlep->lh_type); + return (EINVAL); + } + + return (error); +} + +/* Client interface to get an LDI buffer */ +ldi_buf_t * +ldi_getrbuf(int flags) +{ +/* Example: bp = getrbuf(KM_SLEEP); */ + ldi_buf_t *lbp; + + /* Allocate with requested flags */ + lbp = kmem_alloc(sizeof (ldi_buf_t), flags); + /* Verify allocation */ + if (!lbp) { + return (NULL); + } + + ldi_bioinit(lbp); + + return (lbp); +} + +/* Client interface to release an LDI buffer */ +void +ldi_freerbuf(ldi_buf_t *lbp) +{ + if (!lbp) { + return; + } + + /* Deallocate */ + kmem_free(lbp, sizeof (ldi_buf_t)); +} + +void +ldi_bioinit(ldi_buf_t *lbp) +{ +#ifdef LDI_ZERO + /* Zero the new buffer struct */ + bzero(lbp, sizeof (ldi_buf_t)); +#endif + + /* Initialize defaults */ + lbp->b_un.b_addr = 0; + lbp->b_flags = 0; + lbp->b_bcount = 0; + lbp->b_bufsize = 0; + lbp->b_lblkno = 0; + lbp->b_resid = 0; + lbp->b_error = 0; +} + +/* + * IOKit C++ functions + */ +int +ldi_init(void *provider) +{ + int index; + + /* Allocate kstat pointer */ + ldi_ksp = kstat_create("zfs", 0, "ldi", "darwin", KSTAT_TYPE_NAMED, + sizeof (ldi_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (ldi_ksp == NULL) { + dprintf("%s couldn't register kstats\n", __func__); + return (ENOMEM); + } + + /* Register kstats */ + ldi_ksp->ks_data = &ldi_stats; + kstat_install(ldi_ksp); + + /* Register sysctls */ + sysctl_register_oid(&sysctl__ldi); + sysctl_register_oid(&sysctl__ldi_debug); + sysctl_register_oid(&sysctl__ldi_debug_use_iokit_from_path); + sysctl_register_oid(&sysctl__ldi_debug_use_iokit_from_dev); + sysctl_register_oid(&sysctl__ldi_debug_use_dev_from_path); + sysctl_register_oid(&sysctl__ldi_debug_use_vnode_from_path); + + /* Create handle hash lists and locks */ + ldi_handle_hash_count = 0; + for (index = 0; index < LH_HASH_SZ; index++) { + mutex_init(&ldi_handle_hash_lock[index], NULL, + MUTEX_DEFAULT, NULL); + list_create(&ldi_handle_hash_list[index], + sizeof (struct ldi_handle), + offsetof(struct ldi_handle, lh_node)); + } + + /* + * Initialize the LDI event subsystem + */ + mutex_init(&ldi_ev_callback_list.le_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ldi_ev_callback_list.le_cv, NULL, CV_DEFAULT, NULL); + ldi_ev_callback_list.le_busy = 0; + ldi_ev_callback_list.le_thread = NULL; + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_callback_list.le_walker_prev = NULL; + list_create(&ldi_ev_callback_list.le_head, + sizeof (ldi_ev_callback_impl_t), + offsetof(ldi_ev_callback_impl_t, lec_list)); + + return (0); +} + +void +ldi_fini() +{ + /* + * Teardown the LDI event subsystem + */ + ldi_ev_lock(); +#ifdef DEBUG + if 
(ldi_ev_callback_list.le_busy != 1 || + ldi_ev_callback_list.le_thread != curthread || + ldi_ev_callback_list.le_walker_next != NULL || + ldi_ev_callback_list.le_walker_prev != NULL) { + dprintf("%s still has %s %llu %s %p %s %p %s %p\n", __func__, + "le_busy", ldi_ev_callback_list.le_busy, + "le_thread", ldi_ev_callback_list.le_thread, + "le_walker_next", ldi_ev_callback_list.le_walker_next, + "le_walker_prev", ldi_ev_callback_list.le_walker_prev); + } +#endif + list_destroy(&ldi_ev_callback_list.le_head); + ldi_ev_unlock(); +#ifdef DEBUG + ldi_ev_callback_list.le_busy = 0; + ldi_ev_callback_list.le_thread = NULL; + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_callback_list.le_walker_prev = NULL; +#endif + + cv_destroy(&ldi_ev_callback_list.le_cv); + mutex_destroy(&ldi_ev_callback_list.le_lock); + + if (ldi_handle_hash_count != 0) { + dprintf("%s ldi_handle_hash_count %llu\n", __func__, + ldi_handle_hash_count); + } + + /* Destroy handle hash lists and locks */ + handle_hash_release(); + + /* Unregister sysctls */ + sysctl_unregister_oid(&sysctl__ldi_debug_use_iokit_from_path); + sysctl_unregister_oid(&sysctl__ldi_debug_use_iokit_from_dev); + sysctl_unregister_oid(&sysctl__ldi_debug_use_dev_from_path); + sysctl_unregister_oid(&sysctl__ldi_debug_use_vnode_from_path); + sysctl_unregister_oid(&sysctl__ldi_debug); + sysctl_unregister_oid(&sysctl__ldi); + + /* Unregister kstats */ + if (ldi_ksp != NULL) { + kstat_delete(ldi_ksp); + ldi_ksp = NULL; + } + + if (ldi_handle_hash_count != 0) { + dprintf("%s handle_hash_count still %llu\n", __func__, + ldi_handle_hash_count); + } +} diff --git a/module/os/macos/zfs/ldi_vnode.c b/module/os/macos/zfs/ldi_vnode.c new file mode 100644 index 0000000000..1c7b64b94b --- /dev/null +++ b/module/os/macos/zfs/ldi_vnode.c @@ -0,0 +1,1020 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +/* + * ZFS internal + */ +#include + +/* + * LDI Includes + */ +#include + +/* Debug prints */ + +#define ldi_log(fmt, ...) 
do { \ + dprintf(fmt, __VA_ARGS__); \ + /* delay(hz>>1); */ \ +_NOTE(CONSTCOND) } while (0) + +struct _handle_vnode { + vnode_t *devvp; + char *vd_readlinkname; +}; /* 16b */ + +#define LH_VNODE(lhp) lhp->lh_tsd.vnode_tsd->devvp + +void +handle_free_vnode(struct ldi_handle *lhp) +{ + if (!lhp) { + dprintf("%s missing lhp\n", __func__); + return; + } + + if (!lhp->lh_tsd.vnode_tsd) { + dprintf("%s missing vnode_tsd\n", __func__); + return; + } + + kmem_free(lhp->lh_tsd.vnode_tsd, sizeof (struct _handle_vnode)); + lhp->lh_tsd.vnode_tsd = 0; +} + + +/* Returns handle with lock still held */ +struct ldi_handle * +handle_alloc_vnode(dev_t device, int fmode) +{ + struct ldi_handle *lhp, *retlhp; + + /* Search for existing handle */ + if ((retlhp = handle_find(device, fmode, B_TRUE)) != NULL) { + dprintf("%s found handle before alloc\n", __func__); + return (retlhp); + } + + /* Validate arguments */ + if (device == 0 || fmode == 0) { + dprintf("%s missing dev_t %d or fmode %d\n", + __func__, device, fmode); + return (NULL); + } + + /* Allocate LDI vnode handle */ + if ((lhp = handle_alloc_common(LDI_TYPE_VNODE, device, + fmode)) == NULL) { + dprintf("%s couldn't allocate lhp\n", __func__); + return (NULL); + } + + /* Allocate and clear type-specific device data */ + lhp->lh_tsd.vnode_tsd = (struct _handle_vnode *)kmem_alloc( + sizeof (struct _handle_vnode), KM_SLEEP); + LH_VNODE(lhp) = 0; + + /* Add the handle to the list, or return match */ + if ((retlhp = handle_add(lhp)) == NULL) { + dprintf("%s handle_add failed\n", __func__); + handle_release(lhp); + return (NULL); + } + + /* Check if new or found handle was returned */ + if (retlhp != lhp) { + dprintf("%s found handle after alloc\n", __func__); + handle_release(lhp); + lhp = 0; + } + + return (retlhp); +} + +int +handle_close_vnode(struct ldi_handle *lhp) +{ + vfs_context_t context; + int error = EINVAL; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_VNODE); + ASSERT3U(LH_VNODE(lhp), !=, NULL); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_CLOSING); + +#ifdef DEBUG + /* Validate vnode and context */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + /* If getwithref failed, we can't call vnode_close. */ + LH_VNODE(lhp) = NULLVP; + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* For read-write, clear mountedon flag and wait for writes */ + if (lhp->lh_fmode & FWRITE) { + /* Wait for writes to complete */ + error = vnode_waitforwrites(LH_VNODE(lhp), 0, 0, 0, + "ldi::handle_close_vnode"); + if (error != 0) { + dprintf("%s waitforwrites returned %d\n", + __func__, error); + } + } + + /* Drop usecount */ + vnode_rele(LH_VNODE(lhp)); + + /* Drop iocount and refcount */ + error = vnode_close(LH_VNODE(lhp), + (lhp->lh_fmode & FWRITE ? 
FWASWRITTEN : 0), + context); + /* Preserve error from vnode_close */ + + /* Clear handle devvp vnode pointer */ + LH_VNODE(lhp) = NULLVP; + /* Drop VFS context */ + vfs_context_rele(context); + + if (error) { + dprintf("%s vnode_close error %d\n", + __func__, error); + } + /* Return error from close */ + return (error); +} + +static int +handle_open_vnode(struct ldi_handle *lhp, char *path) +{ + vfs_context_t context; + int error = EINVAL; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(path, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_VNODE); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_OPENING); + + /* Validate path string */ + if (!path || strlen(path) <= 1) { + dprintf("%s missing path\n", __func__); + return (EINVAL); + } + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Try to open the device by path (takes iocount) */ + error = vnode_open(path, lhp->lh_fmode, 0, 0, + &(LH_VNODE(lhp)), context); + + if (error) { + dprintf("%s vnode_open error %d\n", __func__, error); + /* Return error from vnode_open */ + return (error); + } + + /* Increase usecount, saving error. */ + error = vnode_ref(LH_VNODE(lhp)); + if (error != 0) { + dprintf("%s couldn't vnode_ref\n", __func__); + vnode_close(LH_VNODE(lhp), lhp->lh_fmode, context); + /* Return error from vnode_ref */ + return (error); + } + + /* Verify vnode refers to a block device */ + if (!vnode_isblk(LH_VNODE(lhp))) { + dprintf("%s %s is not a block device\n", + __func__, path); + vnode_rele(LH_VNODE(lhp)); + vnode_close(LH_VNODE(lhp), lhp->lh_fmode, context); + return (ENOTBLK); + } + + /* Drop iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + + return (0); +} + +int +handle_get_size_vnode(struct ldi_handle *lhp, uint64_t *dev_size) +{ + vfs_context_t context; + uint64_t blkcnt = 0; + uint32_t blksize = 0; + int error = EINVAL; + +#ifdef DEBUG + if (!lhp || !dev_size) { + dprintf("%s missing lhp or dev_size\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* Fetch the blocksize */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, context); + error = (blksize == 0 ? ENODEV : error); + + /* Fetch the block count */ + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, + 0, context)); + error = (blkcnt == 0 ? 
ENODEV : error); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + + /* Cast both to 64-bit then multiply */ + *dev_size = ((uint64_t)blksize * (uint64_t)blkcnt); + if (*dev_size == 0) { + dprintf("%s invalid blksize %u or blkcnt %llu\n", + __func__, blksize, blkcnt); + return (ENODEV); + } + return (0); +} + +int +handle_get_dev_path_vnode(struct ldi_handle *lhp, char *path, int len) +{ + vfs_context_t context; + int error; + + if (!lhp || !path || len == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + if ((error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETFIRMWAREPATH, + (caddr_t)path, len, context)) != 0) { + dprintf("%s VNOP_IOCTL error %d\n", __func__, error); + /* Preserve error to return */ + } + + /* Drop iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + +if (error == 0) dprintf("%s got device path [%s]\n", __func__, path); + return (error); +} + +int +handle_get_bootinfo_vnode(struct ldi_handle *lhp, + struct io_bootinfo *bootinfo) +{ + int error; + + if (!lhp || !bootinfo) { + dprintf("%s missing argument\n", __func__); +printf("%s missing argument\n", __func__); + return (EINVAL); + } + + if ((error = handle_get_size_vnode(lhp, + &bootinfo->dev_size)) != 0 || + (error = handle_get_dev_path_vnode(lhp, bootinfo->dev_path, + sizeof (bootinfo->dev_path))) != 0) { + dprintf("%s get size or dev_path error %d\n", + __func__, error); + } + + return (error); +} + +int +handle_sync_vnode(struct ldi_handle *lhp) +{ + vfs_context_t context; + int error = EINVAL; + +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing lhp\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* + * Flush out any old buffers remaining from a previous use. + * buf_invalidateblks flushes UPL buffers, VNOP_FSYNC informs + * the disk device to flush write buffers to disk. + */ + error = buf_invalidateblks(LH_VNODE(lhp), BUF_WRITE_DATA, 0, 0); + + error = (error ? 
error : VNOP_FSYNC(LH_VNODE(lhp), + MNT_WAIT, context)); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + + if (error) { + dprintf("%s buf_invalidateblks or VNOP_FSYNC error %d\n", + __func__, error); + return (ENOTSUP); + } + return (0); +} + +/* vnode_lookup, find dev_t info */ +dev_t +dev_from_path(char *path) +{ + vfs_context_t context; + vnode_t *devvp = NULLVP; + dev_t device; + int error = EINVAL; + +#ifdef DEBUG + /* Validate path */ + if (path == 0 || strlen(path) <= 1 || path[0] != '/') { + dprintf("%s invalid path provided\n", __func__); + return (0); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Try to lookup the vnode by path */ + error = vnode_lookup(path, 0, &devvp, context); + if (error || devvp == NULLVP) { + dprintf("%s vnode_lookup failed %d\n", __func__, error); + vfs_context_rele(context); + return (0); + } + + /* Get the rdev of this vnode */ + device = vnode_specrdev(devvp); + + /* Drop iocount on devvp */ + vnode_put(devvp); + /* Drop vfs_context */ + vfs_context_rele(context); + +#ifdef DEBUG + /* Validate dev_t */ + if (device == 0) { + dprintf("%s invalid device\n", __func__); + } +#endif + + /* Return 0 or valid dev_t */ + return (device); +} + +/* Completion handler for vnode strategy */ +static void +ldi_vnode_io_intr(buf_t bp, void *arg) +{ + ldi_buf_t *lbp = (ldi_buf_t *)arg; + + ASSERT3U(bp, !=, NULL); + ASSERT3U(lbp, !=, NULL); + + /* Copyout error and resid */ + lbp->b_error = buf_error(bp); + lbp->b_resid = buf_resid(bp); + +#ifdef DEBUG + if (lbp->b_error || lbp->b_resid != 0) { + dprintf("%s io error %d resid %llu\n", __func__, + lbp->b_error, lbp->b_resid); + } +#endif + + /* Teardown */ + buf_free(bp); + + /* Call original completion function */ + if (lbp->b_iodone) { + lbp->b_iodone(lbp); + } +} + +int +buf_strategy_vnode(ldi_buf_t *lbp, struct ldi_handle *lhp) +{ + buf_t bp = 0; + int error = EINVAL; + + ASSERT3U(lbp, !=, NULL); + ASSERT3U(lhp, !=, NULL); + +#ifdef DEBUG + if (!lbp || !lhp) { + dprintf("%s missing lbp or lhp\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and verify buf_t */ + if (NULL == (bp = buf_alloc(LH_VNODE(lhp)))) { + dprintf("%s couldn't allocate buf_t\n", __func__); + return (ENOMEM); + } + + /* Setup buffer */ + buf_setflags(bp, B_NOCACHE | (lbp->b_flags & B_READ ? + B_READ : B_WRITE)); + buf_setcount(bp, lbp->b_bcount); + buf_setdataptr(bp, (uintptr_t)lbp->b_un.b_addr); + buf_setblkno(bp, lbp->b_lblkno); + buf_setlblkno(bp, lbp->b_lblkno); + buf_setsize(bp, lbp->b_bufsize); + + /* For asynchronous IO */ + if (lbp->b_iodone != NULL) { + buf_setcallback(bp, &ldi_vnode_io_intr, lbp); + } + + /* Recheck instantaneous value of handle status */ + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + buf_free(bp); + return (ENODEV); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + buf_free(bp); + return (ENODEV); + } + /* All code paths from here must vnode_put. 
*/ + + if (!(lbp->b_flags & B_READ)) { + /* Does not return an error status */ + vnode_startwrite(LH_VNODE(lhp)); + } + + + + /* Issue the IO, preserving error */ + error = VNOP_STRATEGY(bp); + + if (error) { + dprintf("%s VNOP_STRATEGY error %d\n", + __func__, error); + /* Reclaim write count on vnode */ + if (!(lbp->b_flags & B_READ)) { + vnode_writedone(LH_VNODE(lhp)); + } + vnode_put(LH_VNODE(lhp)); + buf_free(bp); + return (EIO); + } + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + + /* For synchronous IO, call completion */ + if (lbp->b_iodone == NULL) { + ldi_vnode_io_intr(bp, (void*)lbp); + } + + /* Pass error from VNOP_STRATEGY */ + return (error); +} + +/* Client interface, alloc and open vnode handle by pathname */ +int +ldi_open_vnode_by_path(char *path, dev_t device, + int fmode, ldi_handle_t *lhp) +{ + struct ldi_handle *retlhp; + ldi_status_t status; + int error = EIO; + + /* Validate arguments */ + if (!path || strlen(path) <= 1 || device == 0 || !lhp) { + dprintf("%s invalid argument %p %d %p\n", __func__, + path, device, lhp); + if (path) { + dprintf("*path string is %s\n", path); + } + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*(struct ldi_handle **)lhp, ==, NULL); + + /* Allocate handle with path */ + retlhp = handle_alloc_vnode(device, fmode); + if (retlhp == NULL) { + dprintf("%s couldn't allocate vnode handle\n", __func__); + return (ENOMEM); + } + + /* Mark the handle as Opening, or increment openref */ + status = handle_open_start(retlhp); + if (status == LDI_STATUS_ONLINE) { + dprintf("%s already online, refs %d, openrefs %d\n", __func__, + retlhp->lh_ref, retlhp->lh_openref); + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + /* Successfully incremented open ref in open_start */ + return (0); + } + + /* If state is now Opening, try to open device by vnode */ + if (status != LDI_STATUS_OPENING || + (error = handle_open_vnode(retlhp, path)) != 0) { + dprintf("%s Couldn't open handle\n", __func__); + handle_open_done(retlhp, LDI_STATUS_CLOSED); + handle_release(retlhp); + retlhp = 0; + return ((error == EACCES) ? EROFS:EIO); + } + handle_open_done(retlhp, LDI_STATUS_ONLINE); + + /* Register for disk notifications */ + handle_register_notifier(retlhp); + + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + /* Pass error from open */ + return (error); +} + +int +handle_get_media_info_vnode(struct ldi_handle *lhp, + struct dk_minfo *dkm) +{ + vfs_context_t context; + uint32_t blksize; + uint64_t blkcount; + int error; + +#ifdef DEBUG + if (!lhp || !dkm) { + dprintf("%s missing lhp or dkm\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. 
*/ + + /* Get the blocksize and block count */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, context); + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETBLOCKCOUNT, (caddr_t)&blkcount, + 0, context)); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + if (error) { + dkm->dki_capacity = 0; + dkm->dki_lbsize = 0; + return (error); + } + + /* If successful, set return values */ + dkm->dki_capacity = blkcount; + dkm->dki_lbsize = blksize; + return (0); +} + +int +handle_get_media_info_ext_vnode(struct ldi_handle *lhp, + struct dk_minfo_ext *dkmext) +{ + vfs_context_t context; + uint32_t blksize, pblksize; + uint64_t blkcount; + int error; + +#ifdef DEBUG + if (!lhp || !dkmext) { + dprintf("%s missing lhp or dkmext\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode and context */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode or context\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* Get the blocksize, physical blocksize, and block count */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, context); + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&pblksize, + 0, context)); + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETBLOCKCOUNT, (caddr_t)&blkcount, + 0, context)); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + if (error) { + dkmext->dki_capacity = 0; + dkmext->dki_lbsize = 0; + dkmext->dki_pbsize = 0; + return (error); + } + + /* If successful, set return values */ + dkmext->dki_capacity = blkcount; + dkmext->dki_lbsize = blksize; + dkmext->dki_pbsize = pblksize; + return (0); +} + +int +handle_check_media_vnode(struct ldi_handle *lhp, int *status) +{ + if (!lhp || !status) { + dprintf("%s missing lhp or invalid status\n", __func__); + return (EINVAL); + } + + /* Validate vnode and context */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } + + /* XXX As yet unsupported */ + return (ENOTSUP); + + /* Check if the device is available and responding */ + return (0); +} + +int +handle_is_solidstate_vnode(struct ldi_handle *lhp, int *isssd) +{ + vfs_context_t context; + int error; + + if (!lhp || !isssd) { + dprintf("%s missing lhp or invalid status\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. 
*/ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCISSOLIDSTATE, + (caddr_t)isssd, 0, context); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + return (error); +} + +int +handle_features_vnode(struct ldi_handle *lhp, + uint32_t *features) +{ + vfs_context_t context; + int error; + +#ifdef DEBUG + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + + /* All code paths from here must vnode_put. */ + + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETFEATURES, + (caddr_t)features, 0, context); + + if (error) { + printf("%s: 0x%x\n", __func__, error); + } + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + return (error); +} + +int +handle_unmap_vnode(struct ldi_handle *lhp, + dkioc_free_list_ext_t *dkm) +{ + vfs_context_t context; + int error; + +#ifdef DEBUG + if (!lhp || !dkm) { + dprintf("%s missing lhp or dkm\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* We need to convert illumos' dkioc_free_list_t to dk_unmap_t */ + /* We only support 1 entry now */ + dk_unmap_t dkun = { 0 }; + dk_extent_t ext; + dkun.extentsCount = 1; + dkun.extents = &ext; + ext.offset = dkm->dfle_start; + ext.length = dkm->dfle_length; + + /* + * dkm->dfl_flags vs dkun.options + * #define DF_WAIT_SYNC 0x00000001 Wait for full write-out of free. 
+ * #define _DK_UNMAP_INITIALIZE 0x00000100 + */ + + /* issue unmap */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCUNMAP, + (caddr_t)&dkun, 0, context); + + if (error) { + dprintf("%s unmap: 0x%x for off %llx size %llx\n", __func__, + error, ext.offset, ext.length); + } + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + return (error); +} diff --git a/module/os/macos/zfs/policy.c b/module/os/macos/zfs/policy.c new file mode 100644 index 0000000000..5525302266 --- /dev/null +++ b/module/os/macos/zfs/policy.c @@ -0,0 +1,354 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright (C) 2016 Lawrence Livermore National Security, LLC. + * + * For Linux the vast majority of this enforcement is already handled via + * the standard Linux VFS permission checks. However certain administrative + * commands which bypass the standard mechanisms may need to make use of + * this functionality. + */ + +#include +#include +#include + +/* + * The passed credentials cannot be directly verified because Linux only + * provides and interface to check the *current* process credentials. In + * order to handle this the capable() test is only run when the passed + * credentials match the current process credentials or the kcred. In + * all other cases this function must fail and return the passed err. + */ +static int +priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err, + struct user_namespace *ns) +{ + ASSERT3S(all, ==, B_FALSE); + + if (cr != CRED() && (cr != kcred)) + return (err); + +#if defined(CONFIG_USER_NS) + if (!(ns ? ns_capable(ns, capability) : capable(capability))) +#else + if (!capable(capability)) +#endif + return (err); + + return (0); +} + +static int +priv_policy(const cred_t *cr, int capability, boolean_t all, int err) +{ + return (priv_policy_ns(cr, capability, all, err, NULL)); +} + +static int +priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err) +{ + /* + * All priv_policy_user checks are preceded by kuid/kgid_has_mapping() + * checks. If we cannot do them, we shouldn't be using ns_capable() + * since we don't know whether the affected files are valid in our + * namespace. + */ +#if defined(CONFIG_USER_NS) + return (priv_policy_ns(cr, capability, all, err, cr->user_ns)); +#else + return (priv_policy_ns(cr, capability, all, err, NULL)); +#endif +} + +/* + * Checks for operations that are either client-only or are used by + * both clients and servers. 
+ */ +int +secpolicy_nfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); +} + +/* + * Catch all system configuration. + */ +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); +} + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + return (0); +} + +/* + * This is a special routine for ZFS; it is used to determine whether + * any of the privileges in effect allow any form of access to the + * file. There's no reason to audit this or any reason to record + * this. More work is needed to do the "KPLD" stuff. + */ +int +secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + + if (inode_owner_or_capable(ip)) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0) + return (0); + + if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0) + return (0); + + return (EPERM); +} + +/* + * Determine if subject can chown owner of a file. + */ +int +secpolicy_vnode_chown(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Determine if subject can change group ownership of a file. + */ +int +secpolicy_vnode_create_gid(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM)); +} + +/* + * Policy determines whether we can remove an entry from a directory, + * regardless of permission bits. + */ +int +secpolicy_vnode_remove(const cred_t *cr) +{ + return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Determine that subject can modify the mode of a file. allzone privilege + * needed when modifying root owned object. + */ +int +secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Are we allowed to retain the set-uid/set-gid bits when + * changing ownership or when writing to a file? + * "issuid" should be true when set-uid; only in that case + * root ownership is checked (setgid is assumed). + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +{ + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); +} + +/* + * Determine that subject can set the file setgid flag. + */ +int +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) +{ +#if defined(CONFIG_USER_NS) + if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) + return (EPERM); +#endif + if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); + + return (0); +} + +/* + * Determine if the subject can inject faults in the ZFS fault injection + * framework. Requires all privileges. 
+ */ +int +secpolicy_zinject(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); +} + +/* + * Determine if the subject has permission to manipulate ZFS datasets + * (not pools). Equivalent to the SYS_MOUNT privilege. + */ +int +secpolicy_zfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); +} + +void +secpolicy_setid_clear(vattr_t *vap, cred_t *cr) +{ + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (vap->va_mode & S_ISUID) != 0 && + (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } +} + +/* + * Determine that subject can set the file setid flags. + */ +static int +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); +} + +/* + * Determine that subject can make a file a "sticky". + * + * Enforced in the Linux VFS. + */ +static int +secpolicy_vnode_stky_modify(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, + const vattr_t *ovap, cred_t *cr) +{ + int error; + + if ((vap->va_mode & S_ISUID) != 0 && + (error = secpolicy_vnode_setid_modify(cr, + ovap->va_uid)) != 0) { + return (error); + } + + /* + * Check privilege if attempting to set the + * sticky bit on a non-directory. + */ + if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && + secpolicy_vnode_stky_modify(cr) != 0) { + vap->va_mode &= ~S_ISVTX; + } + + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0 && + secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type) +{ + return (secpolicy_vnode_chown(cr, owner)); +} + +/* + * Check privileges for setattr attributes. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + return (0); +} + +/* + * Check privileges for links. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_basic_link(const cred_t *cr) +{ + return (0); +} diff --git a/module/os/macos/zfs/qat.c b/module/os/macos/zfs/qat.c new file mode 100644 index 0000000000..08613b3a20 --- /dev/null +++ b/module/os/macos/zfs/qat.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include + +qat_stats_t qat_stats = { + { "comp_requests", KSTAT_DATA_UINT64 }, + { "comp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "comp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decomp_requests", KSTAT_DATA_UINT64 }, + { "decomp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decomp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "dc_fails", KSTAT_DATA_UINT64 }, + { "encrypt_requests", KSTAT_DATA_UINT64 }, + { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_requests", KSTAT_DATA_UINT64 }, + { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "crypt_fails", KSTAT_DATA_UINT64 }, + { "cksum_requests", KSTAT_DATA_UINT64 }, + { "cksum_total_in_bytes", KSTAT_DATA_UINT64 }, + { "cksum_fails", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *qat_ksp = NULL; + +CpaStatus +qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes) +{ + *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL); + if (*pp_mem_addr == NULL) + return (CPA_STATUS_RESOURCE); + return (CPA_STATUS_SUCCESS); +} + +void +qat_mem_free_contig(void **pp_mem_addr) +{ + if (*pp_mem_addr != NULL) { + kfree(*pp_mem_addr); + *pp_mem_addr = NULL; + } +} + +int +qat_init(void) +{ + qat_ksp = kstat_create("zfs", 0, "qat", "misc", + KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (qat_ksp != NULL) { + qat_ksp->ks_data = &qat_stats; + kstat_install(qat_ksp); + } + + /* + * Just set the disable flag when qat init failed, qat can be + * turned on again in post-process after zfs module is loaded, e.g.: + * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable + */ + if (qat_dc_init() != 0) + zfs_qat_compress_disable = 1; + + if (qat_cy_init() != 0) { + zfs_qat_checksum_disable = 1; + zfs_qat_encrypt_disable = 1; + } + + return (0); +} + +void +qat_fini(void) +{ + if (qat_ksp != NULL) { + kstat_delete(qat_ksp); + qat_ksp = NULL; + } + + qat_cy_fini(); + qat_dc_fini(); +} + +#endif diff --git a/module/os/macos/zfs/qat_compress.c b/module/os/macos/zfs/qat_compress.c new file mode 100644 index 0000000000..ad3ead3b16 --- /dev/null +++ b/module/os/macos/zfs/qat_compress.c @@ -0,0 +1,569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instance and + * session arrays; the actual number of instances are defined in the + * QAT driver's configuration file. + */ +#define QAT_DC_MAX_INSTANCES 48 + +/* + * ZLIB head and foot size + */ +#define ZLIB_HEAD_SZ 2 +#define ZLIB_FOOT_SZ 4 + +static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES]; +static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES]; +static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES]; +static Cpa16U num_inst = 0; +static Cpa32U inst_num = 0; +static boolean_t qat_dc_init_done = B_FALSE; +int zfs_qat_compress_disable = 0; + +boolean_t +qat_dc_use_accel(size_t s_len) +{ + return (!zfs_qat_compress_disable && + qat_dc_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +static void +qat_dc_callback(void *p_callback, CpaStatus status) +{ + if (p_callback != NULL) + complete((struct completion *)p_callback); +} + +static void +qat_dc_clean(void) +{ + Cpa16U buff_num = 0; + Cpa16U num_inter_buff_lists = 0; + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcStopInstance(dc_inst_handles[i]); + QAT_PHYS_CONTIG_FREE(session_handles[i]); + /* free intermediate buffers */ + if (buffer_array[i] != NULL) { + cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + CpaBufferList *buffer_inter = + buffer_array[i][buff_num]; + if (buffer_inter->pBuffers) { + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers->pData); + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers); + } + QAT_PHYS_CONTIG_FREE( + buffer_inter->pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(buffer_inter); + } + } + } + + num_inst = 0; + qat_dc_init_done = B_FALSE; +} + +int +qat_dc_init(void) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U sess_size = 0; + Cpa32U ctx_size = 0; + Cpa16U num_inter_buff_lists = 0; + Cpa16U buff_num = 0; + Cpa32U buff_meta_size = 0; + CpaDcSessionSetupData sd = {0}; + + if (qat_dc_init_done) + return (0); + + status = cpaDcGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT compression units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_DC_MAX_INSTANCES) + num_inst = QAT_DC_MAX_INSTANCES; + + status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcSetAddressTranslation(dc_inst_handles[i], + (void*)virt_to_phys); + + status = cpaDcBufferListGetMetaSize(dc_inst_handles[i], + 1, &buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + + if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0) + status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i], + num_inter_buff_lists * + sizeof (CpaBufferList *)); + + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num], + 
sizeof (CpaBufferList)); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]-> + pPrivateMetaData, + buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers, + sizeof (CpaFlatBuffer)); + + if (status == CPA_STATUS_SUCCESS) { + /* + * implementation requires an intermediate + * buffer approximately twice the size of + * output buffer, which is 2x max buffer + * size here. + */ + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers-> + pData, 2 * QAT_MAX_BUF_SIZE); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + buffer_array[i][buff_num]->numBuffers = 1; + buffer_array[i][buff_num]->pBuffers-> + dataLenInBytes = 2 * QAT_MAX_BUF_SIZE; + } + } + + status = cpaDcStartInstance(dc_inst_handles[i], + num_inter_buff_lists, buffer_array[i]); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + sd.compLevel = CPA_DC_L1; + sd.compType = CPA_DC_DEFLATE; + sd.huffType = CPA_DC_HT_FULL_DYNAMIC; + sd.sessDirection = CPA_DC_DIR_COMBINED; + sd.sessState = CPA_DC_STATELESS; + sd.deflateWindowSize = 7; + sd.checksum = CPA_DC_ADLER32; + status = cpaDcGetSessionSize(dc_inst_handles[i], + &sd, &sess_size, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size); + if (session_handles[i] == NULL) + goto fail; + + status = cpaDcInitSession(dc_inst_handles[i], + session_handles[i], + &sd, NULL, qat_dc_callback); + if (status != CPA_STATUS_SUCCESS) + goto fail; + } + + qat_dc_init_done = B_TRUE; + return (0); +fail: + qat_dc_clean(); + return (-1); +} + +void +qat_dc_fini(void) +{ + if (!qat_dc_init_done) + return; + + qat_dc_clean(); +} + +/* + * The "add" parameter is an additional buffer which is passed + * to QAT as a scratch buffer alongside the destination buffer + * in case the "compressed" data ends up being larger than the + * original source data. This is necessary to prevent QAT from + * generating buffer overflow warnings for incompressible data. + */ +static int +qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, char *add, int add_len, size_t *c_len) +{ + CpaInstanceHandle dc_inst_handle; + CpaDcSessionHandle session_handle; + CpaBufferList *buf_list_src = NULL; + CpaBufferList *buf_list_dst = NULL; + CpaFlatBuffer *flat_buf_src = NULL; + CpaFlatBuffer *flat_buf_dst = NULL; + Cpa8U *buffer_meta_src = NULL; + Cpa8U *buffer_meta_dst = NULL; + Cpa32U buffer_meta_size = 0; + CpaDcRqResults dc_results; + CpaStatus status = CPA_STATUS_FAIL; + Cpa32U hdr_sz = 0; + Cpa32U compressed_sz; + Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2; + Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2; + Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left; + Cpa32U dst_pages = 0; + Cpa32U adler32 = 0; + char *data; + struct page *page; + struct page **in_pages = NULL; + struct page **out_pages = NULL; + struct page **add_pages = NULL; + Cpa32U page_off = 0; + struct completion complete; + Cpa32U page_num = 0; + Cpa16U i; + + /* + * We increment num_src_buf and num_dst_buf by 2 to allow + * us to handle non page-aligned buffer addresses and buffers + * whose sizes are not divisible by PAGE_SIZE. 
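 *
 * For example, with 4 KiB pages a 128 KiB source buffer that starts at
 * an unaligned address can straddle at most (128K >> 12) + 1 = 33
 * pages, and (src_len >> PAGE_SHIFT) + 2 = 34 flat buffers is always
 * enough to describe it; the same headroom covers the destination and
 * scratch buffers below.
 *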
+ */ + Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) + + (num_src_buf * sizeof (CpaFlatBuffer)); + Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) + + ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer)); + + status = QAT_PHYS_CONTIG_ALLOC(&in_pages, + num_src_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&out_pages, + num_dst_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&add_pages, + num_add_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + dc_inst_handle = dc_inst_handles[i]; + session_handle = session_handles[i]; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf, + &buffer_meta_size); + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf, + &buffer_meta_size); + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* build source buffer list */ + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1); + + buf_list_src->pBuffers = flat_buf_src; /* always point to first one */ + + /* build destination buffer list */ + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + + buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */ + + buf_list_src->numBuffers = 0; + buf_list_src->pPrivateMetaData = buffer_meta_src; + bytes_left = src_len; + data = src; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + in_pages[page_num] = page; + flat_buf_src->pData = kmap(page) + page_off; + flat_buf_src->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_src->dataLenInBytes; + data += flat_buf_src->dataLenInBytes; + flat_buf_src++; + buf_list_src->numBuffers++; + page_num++; + } + + buf_list_dst->numBuffers = 0; + buf_list_dst->pPrivateMetaData = buffer_meta_dst; + bytes_left = dst_len; + data = dst; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + out_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + dst_pages++; + } + + /* map additional scratch pages into the destination buffer list */ + bytes_left = add_len; + data = add; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + add_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + } + + init_completion(&complete); + + if (dir == QAT_COMPRESS) { + QAT_STAT_BUMP(comp_requests); + 
QAT_STAT_INCR(comp_total_in_bytes, src_len); + + cpaDcGenerateHeader(session_handle, + buf_list_dst->pBuffers, &hdr_sz); + buf_list_dst->pBuffers->pData += hdr_sz; + buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz; + status = cpaDcCompressData( + dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, + &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + if (status != CPA_STATUS_SUCCESS) { + goto fail; + } + + /* we now wait until the completion of the operation. */ + wait_for_completion(&complete); + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + compressed_sz = dc_results.produced; + if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + /* move to the last page */ + flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT; + + /* no space for gzip footer in the last page */ + if (((compressed_sz + hdr_sz) % PAGE_SIZE) + + ZLIB_FOOT_SZ > PAGE_SIZE) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + /* jump to the end of the buffer and append footer */ + flat_buf_dst->pData = + (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK) + + ((compressed_sz + hdr_sz) % PAGE_SIZE); + flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ; + + dc_results.produced = 0; + status = cpaDcGenerateFooter(session_handle, + flat_buf_dst, &dc_results); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + *c_len = compressed_sz + dc_results.produced + hdr_sz; + QAT_STAT_INCR(comp_total_out_bytes, *c_len); + } else { + ASSERT3U(dir, ==, QAT_DECOMPRESS); + QAT_STAT_BUMP(decomp_requests); + QAT_STAT_INCR(decomp_total_in_bytes, src_len); + + buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ; + buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ; + status = cpaDcDecompressData(dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + + if (CPA_STATUS_SUCCESS != status) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* we now wait until the completion of the operation. */ + wait_for_completion(&complete); + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* verify adler checksum */ + adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ); + if (adler32 != BSWAP_32(dc_results.checksum)) { + status = CPA_STATUS_FAIL; + goto fail; + } + *c_len = dc_results.produced; + QAT_STAT_INCR(decomp_total_out_bytes, *c_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE) + QAT_STAT_BUMP(dc_fails); + + if (in_pages) { + for (page_num = 0; + page_num < buf_list_src->numBuffers; + page_num++) { + kunmap(in_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(in_pages); + } + + if (out_pages) { + for (page_num = 0; page_num < dst_pages; page_num++) { + kunmap(out_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(out_pages); + } + + if (add_pages) { + for (page_num = 0; + page_num < buf_list_dst->numBuffers - dst_pages; + page_num++) { + kunmap(add_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(add_pages); + } + + QAT_PHYS_CONTIG_FREE(buffer_meta_src); + QAT_PHYS_CONTIG_FREE(buffer_meta_dst); + QAT_PHYS_CONTIG_FREE(buf_list_src); + QAT_PHYS_CONTIG_FREE(buf_list_dst); + + return (status); +} + +/* + * Entry point for QAT accelerated compression / decompression. 
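 *
 * A minimal caller sketch (buffer names and sizes are illustrative,
 * not taken from this patch):
 *
 *	size_t c_len;
 *	if (qat_dc_use_accel(s_len) &&
 *	    qat_compress(QAT_COMPRESS, src, s_len, dst, d_len, &c_len) ==
 *	    CPA_STATUS_SUCCESS)
 *		return (c_len);	/* deflate stream plus zlib head/foot */
 *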
+ */ +int +qat_compress(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, size_t *c_len) +{ + int ret; + size_t add_len = 0; + void *add = NULL; + + if (dir == QAT_COMPRESS) { + add_len = dst_len; + add = zio_data_buf_alloc(add_len); + } + + ret = qat_compress_impl(dir, src, src_len, dst, + dst_len, add, add_len, c_len); + + if (dir == QAT_COMPRESS) + zio_data_buf_free(add, add_len); + + return (ret); +} + +static int +param_set_qat_compress(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_compress_disable = 0: enable qat compress + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_dc_init_done) { + ret = qat_dc_init(); + if (ret != 0) { + zfs_qat_compress_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_compress_disable, param_set_qat_compress, + param_get_int, &zfs_qat_compress_disable, 0644); +MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression"); + +#endif diff --git a/module/os/macos/zfs/qat_crypt.c b/module/os/macos/zfs/qat_crypt.c new file mode 100644 index 0000000000..4771b2f3be --- /dev/null +++ b/module/os/macos/zfs/qat_crypt.c @@ -0,0 +1,630 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This file represents the QAT implementation of checksums and encryption. + * Internally, QAT shares the same cryptographic instances for both of these + * operations, so the code has been combined here. QAT data compression uses + * compression instances, so that code is separated into qat_compress.c + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include +#include +#include +#include +#include +#include "lac/cpa_cy_im.h" +#include "lac/cpa_cy_common.h" +#include + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instances + * and session arrays; the actual number of instances are defined in + * the QAT driver's configure file. 
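 *
 * Requests are spread across the started instances round-robin, as in
 * the pattern used later in this file:
 *
 *	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
 *	cy_inst_handle = cy_inst_handles[i];
 *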
+ */ +#define QAT_CRYPT_MAX_INSTANCES 48 + +#define MAX_PAGE_NUM 1024 + +static Cpa32U inst_num = 0; +static Cpa16U num_inst = 0; +static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES]; +static boolean_t qat_cy_init_done = B_FALSE; +int zfs_qat_encrypt_disable = 0; +int zfs_qat_checksum_disable = 0; + +typedef struct cy_callback { + CpaBoolean verify_result; + struct completion complete; +} cy_callback_t; + +static void +symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation, + void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify) +{ + cy_callback_t *cb = p_callback; + + if (cb != NULL) { + /* indicate that the function has been called */ + cb->verify_result = verify; + complete(&cb->complete); + } +} + +boolean_t +qat_crypt_use_accel(size_t s_len) +{ + return (!zfs_qat_encrypt_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +boolean_t +qat_checksum_use_accel(size_t s_len) +{ + return (!zfs_qat_checksum_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +void +qat_cy_clean(void) +{ + for (Cpa16U i = 0; i < num_inst; i++) + cpaCyStopInstance(cy_inst_handles[i]); + + num_inst = 0; + qat_cy_init_done = B_FALSE; +} + +int +qat_cy_init(void) +{ + CpaStatus status = CPA_STATUS_FAIL; + + if (qat_cy_init_done) + return (0); + + status = cpaCyGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT encryption units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_CRYPT_MAX_INSTANCES) + num_inst = QAT_CRYPT_MAX_INSTANCES; + + status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + status = cpaCySetAddressTranslation(cy_inst_handles[i], + (void *)virt_to_phys); + if (status != CPA_STATUS_SUCCESS) + goto error; + + status = cpaCyStartInstance(cy_inst_handles[i]); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + qat_cy_init_done = B_TRUE; + return (0); + +error: + qat_cy_clean(); + return (-1); +} + +void +qat_cy_fini(void) +{ + if (!qat_cy_init_done) + return; + + qat_cy_clean(); +} + +static CpaStatus +qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key, + Cpa64U crypt, Cpa32U aad_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U ciper_algorithm; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) { + return (CPA_STATUS_FAIL); + } else { + ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM; + hash_algorithm = CPA_CY_SYM_HASH_AES_GCM; + } + + sd.cipherSetupData.cipherAlgorithm = ciper_algorithm; + sd.cipherSetupData.pCipherKey = key->ck_data; + sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH; + sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN; + sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len; + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING; + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + if (dir == QAT_ENCRYPT) { + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER; + } else { + ASSERT3U(dir, 
==, QAT_DECRYPT); + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH; + } + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + /* + * ZFS's SHA512 checksum is actually SHA512/256, which uses + * a different IV from standard SHA512. QAT does not support + * SHA512/256, so we can only support SHA256. + */ + if (cksum == ZIO_CHECKSUM_SHA256) + hash_algorithm = CPA_CY_SYM_HASH_SHA256; + else + return (CPA_STATUS_FAIL); + + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_HASH; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN; + sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t); + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs, + CpaBufferList *src, CpaBufferList *dst) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U meta_size = 0; + + status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + + if (src != dst) { + status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData, + meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + return (CPA_STATUS_SUCCESS); + +error: + QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData); + if (src != dst) + QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData); + + return (status); +} + +int +qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, + uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, + crypto_key_t *key, uint64_t crypt, uint32_t enc_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaBufferList dst_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + CpaFlatBuffer *flat_dst_buf_array = NULL; + CpaFlatBuffer *flat_dst_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + struct 
page *out_pages[MAX_PAGE_NUM]; + Cpa32U in_page_num = 0; + Cpa32U out_page_num = 0; + Cpa32U in_page_off = 0; + Cpa32U out_page_off = 0; + + if (dir == QAT_ENCRYPT) { + QAT_STAT_BUMP(encrypt_requests); + QAT_STAT_INCR(encrypt_total_in_bytes, enc_len); + } else { + QAT_STAT_BUMP(decrypt_requests); + QAT_STAT_INCR(decrypt_total_in_bytes, enc_len); + } + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_crypt_session_ctx(dir, cy_inst_handle, + &cy_session_ctx, key, crypt, aad_len); + if (status != CPA_STATUS_SUCCESS) { + /* don't count CCM as a failure since it's not supported */ + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM) + QAT_STAT_BUMP(crypt_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. + */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &dst_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult, + ZIO_DATA_MAC_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv, + ZIO_DATA_IV_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + if (aad_len > 0) { + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData, + aad_len); + if (status != CPA_STATUS_SUCCESS) + goto fail; + bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len); + } + + bytes_left = enc_len; + data = src_buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + in_page_off = ((long)data & ~PAGE_MASK); + in_pages[in_page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - in_page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + in_page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = in_page_num; + + bytes_left = enc_len; + data = dst_buf; + flat_dst_buf = flat_dst_buf_array; + while (bytes_left > 0) { + out_page_off = ((long)data & ~PAGE_MASK); + out_pages[out_page_num] = qat_mem_to_page(data); + flat_dst_buf->pData = kmap(out_pages[out_page_num]) + + out_page_off; + flat_dst_buf->dataLenInBytes = + min((long)PAGE_SIZE - out_page_off, (long)bytes_left); + data += flat_dst_buf->dataLenInBytes; + bytes_left -= flat_dst_buf->dataLenInBytes; + flat_dst_buf++; + out_page_num++; + } + dst_buffer_list.pBuffers = flat_dst_buf_array; + dst_buffer_list.numBuffers = out_page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.cryptoStartSrcOffsetInBytes = 0; + op_data.messageLenToCipherInBytes = 0; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = 0; + op_data.messageLenToCipherInBytes = enc_len; + op_data.ivLenInBytes = ZIO_DATA_IV_LEN; + bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); + /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */ + if (dir == QAT_DECRYPT) + bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN); + + cb.verify_result = CPA_FALSE; + 
init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &dst_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + if (dir == QAT_ENCRYPT) { + /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */ + bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); + QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); + } else { + QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(crypt_fails); + + for (i = 0; i < in_page_num; i++) + kunmap(in_pages[i]); + for (i = 0; i < out_page_num; i++) + kunmap(out_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + if (aad_len > 0) + QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData); + QAT_PHYS_CONTIG_FREE(op_data.pIv); + QAT_PHYS_CONTIG_FREE(op_data.pDigestResult); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + QAT_PHYS_CONTIG_FREE(flat_dst_buf_array); + + return (status); +} + +int +qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) +{ + CpaStatus status; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + Cpa8U *digest_buffer = NULL; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + Cpa32U page_num = 0; + Cpa32U page_off = 0; + + QAT_STAT_BUMP(cksum_requests); + QAT_STAT_INCR(cksum_total_in_bytes, size); + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_checksum_session_ctx(cy_inst_handle, + &cy_session_ctx, cksum); + if (status != CPA_STATUS_SUCCESS) { + /* don't count unsupported checksums as a failure */ + if (cksum == ZIO_CHECKSUM_SHA256 || + cksum == ZIO_CHECKSUM_SHA512) + QAT_STAT_BUMP(cksum_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. 
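 *
 * For reference, a caller is expected to drive this roughly as follows
 * (illustrative only; only SHA-256 is accelerated here):
 *
 *	zio_cksum_t zc;
 *	if (qat_checksum_use_accel(size) &&
 *	    qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, &zc) ==
 *	    CPA_STATUS_SUCCESS)
 *		... use zc, otherwise fall back to software ...
 *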
+ */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &src_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer, + sizeof (zio_cksum_t)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + bytes_left = size; + data = buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + in_pages[page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[page_num]) + page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = size; + op_data.pDigestResult = digest_buffer; + + cb.verify_result = CPA_FALSE; + init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &src_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + bcopy(digest_buffer, zcp, sizeof (zio_cksum_t)); + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(cksum_fails); + + for (i = 0; i < page_num; i++) + kunmap(in_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + QAT_PHYS_CONTIG_FREE(digest_buffer); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + + return (status); +} + +static int +param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_encrypt_disable = 0: enable qat encrypt + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_encrypt_disable = 1; + return (ret); + } + } + return (ret); +} + +static int +param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * set_checksum_param_ops = 0: enable qat checksum + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_checksum_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt, + param_get_int, &zfs_qat_encrypt_disable, 0644); +MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption"); + +module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum, + param_get_int, &zfs_qat_checksum_disable, 0644); +MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming"); + +#endif diff --git a/module/os/macos/zfs/spa_misc_os.c b/module/os/macos/zfs/spa_misc_os.c new file mode 100644 index 0000000000..aa546edd1b --- /dev/null +++ b/module/os/macos/zfs/spa_misc_os.c @@ -0,0 
+1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" + +const char * +spa_history_zone(void) +{ + return ("macos"); +} + +void +spa_create_os(void *arg) +{ + spa_t *spa = (spa_t *)arg; + int haslock = 0; + int error; + + haslock = mutex_owned(&spa_namespace_lock); + + /* Increase open refcount */ + spa_open_ref(spa, FTAG); + + if (haslock) { + mutex_exit(&spa_namespace_lock); + } + + /* Create IOKit pool proxy */ + if ((error = spa_iokit_pool_proxy_create(spa)) != 0) { + printf("%s spa_iokit_pool_proxy_create error %d\n", + __func__, error); + /* spa_create succeeded, ignore proxy error */ + } + + /* Cache vdev info, needs open ref above, and pool proxy */ + + if (error == 0 && (error = zfs_boot_update_bootinfo(spa)) != 0) { + printf("%s update_bootinfo error %d\n", __func__, error); + /* create succeeded, ignore error from bootinfo */ + } + + /* Drop open refcount */ + if (haslock) { + mutex_enter(&spa_namespace_lock); + } + + spa_close(spa, FTAG); +} + +void +spa_export_os(void *arg) +{ + spa_t *spa = (spa_t *)arg; + + /* Remove IOKit pool proxy */ + spa_iokit_pool_proxy_destroy(spa); +} + +void +spa_activate_os(void *arg) +{ + /* spa_t *spa = (spa_t *)arg; */ + /* Lock kext in kernel while mounted */ + OSKextRetainKextWithLoadTag(OSKextGetCurrentLoadTag()); +} + +void +spa_deactivate_os(void *arg) +{ + /* spa_t *spa = (spa_t *)arg; */ + OSKextReleaseKextWithLoadTag(OSKextGetCurrentLoadTag()); +} diff --git a/module/os/macos/zfs/trace.c b/module/os/macos/zfs/trace.c new file mode 100644 index 0000000000..0c9990e854 --- /dev/null +++ b/module/os/macos/zfs/trace.c @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Each Linux tracepoints subsystem must define CREATE_TRACE_POINTS in one + * (and only one) C file, so this dummy file exists for that purpose. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/module/os/macos/zfs/vdev_disk.c b/module/os/macos/zfs/vdev_disk.c new file mode 100644 index 0000000000..2b4ea93884 --- /dev/null +++ b/module/os/macos/zfs/vdev_disk.c @@ -0,0 +1,787 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Based on Apple MacZFS source code + * Copyright (c) 2014,2016 by Jorgen Lundman. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for disks. + */ + +/* XXX leave extern if declared elsewhere - originally was in zfs_ioctl.c */ +ldi_ident_t zfs_li; + +static void vdev_disk_close(vdev_t *); + +typedef struct vdev_disk_ldi_cb { + list_node_t lcb_next; + ldi_callback_id_t lcb_id; +} vdev_disk_ldi_cb_t; + +static void +vdev_disk_alloc(vdev_t *vd) +{ + vdev_disk_t *dvd; + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + + /* + * Create the LDI event callback list. + */ + list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), + offsetof(vdev_disk_ldi_cb_t, lcb_next)); +} + +static void +vdev_disk_free(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; + + if (dvd == NULL) + return; + + /* + * We have already closed the LDI handle. Clean up the LDI event + * callbacks and free vd->vdev_tsd. 
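 *
 * Each node on vd_ldi_cbs was added in vdev_disk_open() along the
 * lines of:
 *
 *	ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie);
 *	lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
 *	list_insert_tail(&dvd->vd_ldi_cbs, lcb);
 *	ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
 *	    &vdev_disk_off_callb, vd, &lcb->lcb_id);
 *
 * so teardown below must both unregister the callback id and free the
 * list node.
 *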
+ */ + while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { + list_remove(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_remove_callbacks(lcb->lcb_id); + kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); + } + list_destroy(&dvd->vd_ldi_cbs); + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +static int +vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, + void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * Ignore events other than offline. + */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) + return (LDI_EV_SUCCESS); + + /* + * All LDI handles must be closed for the state change to succeed, so + * call on vdev_disk_close() to do this. + * + * We inform vdev_disk_close that it is being called from offline + * notify context so it will defer cleanup of LDI event callbacks and + * freeing of vd->vdev_tsd to the offline finalize or a reopen. + */ + dvd->vd_ldi_offline = B_TRUE; + vdev_disk_close(vd); + + /* + * Now that the device is closed, request that the spa_async_thread + * mark the device as REMOVED and notify FMA of the removal. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + + return (LDI_EV_SUCCESS); +} + +/* ARGSUSED */ +static void +vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, + int ldi_result, void *arg, void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + + /* + * Ignore events other than offline. + */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) + return; + + /* + * We have already closed the LDI handle in notify. + * Clean up the LDI event callbacks and free vd->vdev_tsd. + */ + vdev_disk_free(vd); + /* + * Request that the vdev be reopened if the offline state change was + * unsuccessful. + */ + if (ldi_result != LDI_EV_SUCCESS) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); + } +} + +static ldi_ev_callback_t vdev_disk_off_callb = { + .cb_vers = LDI_EV_CB_VERS, + .cb_notify = vdev_disk_off_notify, + .cb_finalize = vdev_disk_off_finalize +}; + +/* + * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when + * even a fallback to DKIOCGMEDIAINFO fails. + */ +#ifdef DEBUG +#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) +#else +#define VDEV_DEBUG(...) /* Nothing... */ +#endif + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift, uint64_t *physical_ashift) +{ + spa_t *spa = vd->vdev_spa; + vdev_disk_t *dvd = vd->vdev_tsd; + ldi_ev_cookie_t ecookie; + vdev_disk_ldi_cb_t *lcb; + union { + struct dk_minfo_ext ude; + struct dk_minfo ud; + } dks; + struct dk_minfo_ext *dkmext = &dks.ude; + struct dk_minfo *dkm = &dks.ud; + int error; + uint64_t capacity = 0, blksz = 0, pbsize; + int isssd; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (dvd != NULL) { + if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { + /* + * If we are opening a device in its offline notify + * context, the LDI handle was just closed. Clean + * up the LDI event callbacks and free vd->vdev_tsd. + */ + vdev_disk_free(vd); + } else { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + } + + /* + * Create vd->vdev_tsd. 
+ */ + vdev_disk_alloc(vd); + dvd = vd->vdev_tsd; + + /* + * When opening a disk device, we want to preserve the user's original + * intent. We always want to open the device by the path the user gave + * us, even if it is one of multiple paths to the same device. But we + * also want to be able to survive disks being removed/recabled. + * Therefore the sequence of opening devices is: + * + * 1. Try opening the device by path. For legacy pools without the + * 'whole_disk' property, attempt to fix the path by appending 's0'. + * + * 2. If the devid of the device matches the stored value, return + * success. + * + * 3. Otherwise, the device may have moved. Try opening the device + * by the devid instead. + */ + + error = EINVAL; /* presume failure */ + + if (vd->vdev_path != NULL) { + + /* + * If we have not yet opened the device, try to open it by the + * specified path. + */ + if (error != 0) { + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); + } + + /* + * If we succeeded in opening the device, but 'vdev_wholedisk' + * is not yet set, then this must be a slice. + */ + if (error == 0 && vd->vdev_wholedisk == -1ULL) + vd->vdev_wholedisk = 0; + } + + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. + */ + if (error) { + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); + } + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", + error); + return (error); + } + + /* + * Register callbacks for the LDI offline event. + */ + if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == + LDI_EV_SUCCESS) { + lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); + list_insert_tail(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, + &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); + } + +skip_open: + /* + * Determine the actual size of the device. + */ + if (ldi_get_size(dvd->vd_lh, psize) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); + return (SET_ERROR(EINVAL)); + } + + *max_psize = *psize; + + /* + * Determine the device's minimum transfer size. + * If the ioctl isn't supported, assume DEV_BSIZE. 
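 *
 * The physical block size reported here feeds the ashift computation
 * below: for example a 4096-byte pbsize gives
 * highbit64(MAX(4096, SPA_MINBLOCKSIZE)) - 1 = 12, while a 512-byte
 * device (or the DEV_BSIZE fallback) yields ashift 9.
 *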
+ */ + if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, + (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { + capacity = dkmext->dki_capacity - 1; + blksz = dkmext->dki_lbsize; + pbsize = dkmext->dki_pbsize; + } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, + (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { + VDEV_DEBUG( + "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", + vd->vdev_path); + capacity = dkm->dki_capacity - 1; + blksz = dkm->dki_lbsize; + pbsize = blksz; + } else { + VDEV_DEBUG("vdev_disk_open(\"%s\"): " + "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", + vd->vdev_path, error); + pbsize = DEV_BSIZE; + } + + *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; + + if (vd->vdev_wholedisk == 1) { + int wce = 1; + + /* + * Since we own the whole disk, try to enable disk write + * caching. We ignore errors because it's OK if we can't do it. + */ + (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, + FKIOCTL, kcred, NULL); + } + + /* + * Clear the nowritecache bit, so that on a vdev_reopen() we will + * try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Inform the ZIO pipeline that we are non-rotational */ + vd->vdev_nonrot = B_FALSE; + if (ldi_ioctl(dvd->vd_lh, DKIOCISSOLIDSTATE, (intptr_t)&isssd, + FKIOCTL, kcred, NULL) == 0) { + vd->vdev_nonrot = (isssd ? B_TRUE : B_FALSE); + } + + // Assume no TRIM + vd->vdev_has_trim = B_FALSE; + uint32_t features; + if (ldi_ioctl(dvd->vd_lh, DKIOCGETFEATURES, (intptr_t)&features, + FKIOCTL, kcred, NULL) == 0) { + if (features & DK_FEATURE_UNMAP) + vd->vdev_has_trim = B_TRUE; + } + + /* Set when device reports it supports secure TRIM. */ + // No secure trim in Apple yet. + vd->vdev_has_securetrim = B_FALSE; + + return (0); +} + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (vd->vdev_reopening || dvd == NULL) + return; + + if (dvd->vd_lh != NULL) { + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); + dvd->vd_lh = NULL; + } + + vd->vdev_delayed_close = B_FALSE; + /* + * If we closed the LDI handle due to an offline notify from LDI, + * don't free vd->vdev_tsd or unregister the callbacks here; + * the offline finalize callback or a reopen will take care of it. + */ + if (dvd->vd_ldi_offline) + return; + + vdev_disk_free(vd); +} + +int +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags, boolean_t isdump) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. 
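 *
 * A synchronous label-style read through this path would look roughly
 * like (buffer, size and offset are illustrative):
 *
 *	error = vdev_disk_physio(vd, (caddr_t)buf, size, offset,
 *	    B_READ, B_FALSE);
 *
 * The call only returns once biowait() has completed the underlying
 * ldi_strategy() request.
 *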
+ */ + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +} + +int +vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, + size_t size, uint64_t offset, int flags) +{ + ldi_buf_t *bp; + int error = 0; + + if (vd_lh == NULL) + return (SET_ERROR(EINVAL)); + + ASSERT(flags & B_READ || flags & B_WRITE); + + bp = getrbuf(KM_SLEEP); + bp->b_flags = flags | B_BUSY | B_NOCACHE; + bp->b_bcount = size; + bp->b_un.b_addr = (void *)data; + bp->b_lblkno = lbtodb(offset); + bp->b_bufsize = size; + + error = ldi_strategy(vd_lh, bp); + ASSERT(error == 0); + + if ((error = biowait(bp)) == 0 && bp->b_resid != 0) + error = SET_ERROR(EIO); + freerbuf(bp); + + return (error); +} + +static void +vdev_disk_io_intr(ldi_buf_t *bp) +{ + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; + + /* + * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. + * Rather than teach the rest of the stack about other error + * possibilities (EFAULT, etc), we normalize the error value here. + */ + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = SET_ERROR(EIO); + + if (zio->io_type == ZIO_TYPE_READ) { + abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, + zio->io_size); + } else { + abd_return_buf(zio->io_abd, bp->b_un.b_addr, + zio->io_size); + } + + kmem_free(vb, sizeof (vdev_buf_t)); + + zio_delay_interrupt(zio); +} + +static void +vdev_disk_ioctl_free(zio_t *zio) +{ + kmem_free(zio->io_vsd, sizeof (struct dk_callback)); +} + +static const zio_vsd_ops_t vdev_disk_vsd_ops = { + vdev_disk_ioctl_free, + zio_vsd_default_cksum_report +}; + +static void +vdev_disk_ioctl_done(void *zio_arg, int error) +{ + zio_t *zio = zio_arg; + + zio->io_error = error; + + zio_interrupt(zio); +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_buf_t *vb; + struct dk_callback *dkc; + ldi_buf_t *bp = 0; + int flags, error = 0; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); + zio->io_vsd_ops = &vdev_disk_vsd_ops; + + dkc->dkc_callback = vdev_disk_ioctl_done; + dkc->dkc_flag = FLUSH_VOLATILE; + dkc->dkc_cookie = zio; + + error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, + (uintptr_t)dkc, FKIOCTL, kcred, NULL); + + if (error == 0) { + /* + * The ioctl will be done asynchronously, + * and will call vdev_disk_ioctl_done() + * upon completion.
+ */ + return; + } + + zio->io_error = error; + + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + } /* io_cmd */ + + zio_execute(zio); + return; + + case ZIO_TYPE_WRITE: + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) + flags = B_WRITE; + else + flags = B_WRITE | B_ASYNC; + break; + + case ZIO_TYPE_READ: + if (zio->io_priority == ZIO_PRIORITY_SYNC_READ) + flags = B_READ; + else + flags = B_READ | B_ASYNC; + break; + + case ZIO_TYPE_TRIM: + { + dkioc_free_list_ext_t dfle; + dfle.dfle_start = zio->io_offset; + dfle.dfle_length = zio->io_size; + zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE, + (uintptr_t)&dfle, FKIOCTL, kcred, NULL); + zio_interrupt(zio); + return; + } + + default: + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } /* io_type */ + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + /* Stop OSX from also caching our data */ + flags |= B_NOCACHE | B_PASSIVE; + + zio->io_target_timestamp = zio_handle_io_delay(zio); + + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); + + vb->vb_io = zio; + bp = &vb->vb_buf; + + ASSERT(bp != NULL); + ASSERT(zio->io_abd != NULL); + ASSERT(zio->io_size != 0); + + bioinit(bp); + bp->b_flags = B_BUSY | flags; + bp->b_bcount = zio->io_size; + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT3S(zio->io_abd->abd_size, >=, zio->io_size); + bp->b_un.b_addr = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + ASSERT3S(zio->io_abd->abd_size, >=, zio->io_size); + bp->b_un.b_addr = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_iodone = (int (*)(struct ldi_buf *))vdev_disk_io_intr; + + error = ldi_strategy(dvd->vd_lh, bp); + if (error != 0) { + dprintf("%s error from ldi_strategy %d\n", __func__, error); + zio->io_error = EIO; + kmem_free(vb, sizeof (vdev_buf_t)); + zio_execute(zio); + // zio_interrupt(zio); + } +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { + vdev_disk_t *dvd = vd->vdev_tsd; + int state = DKIO_NONE; + + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. + */ + zfs_post_remove(zio->io_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } +} + +static void +vdev_disk_hold(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* We must have a pathname, and it must be absolute. */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. 
+ */ + if (vd->vdev_tsd != NULL) + return; + +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* XXX: Implement me as a vnode rele for the device */ +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + NULL, + vdev_disk_hold, + vdev_disk_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + ldi_handle_t vd_lh; + vdev_label_t *label; + uint64_t s, size; + int l; + int error = -1; + + /* + * Read the device label and build the nvlist. + */ + + /* Apple: Error will be -1 at this point, allowing open_by_name */ + error = -1; + vd_lh = 0; /* Dismiss compiler warning */ + + if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, + zfs_li))) + return (error); + + if (ldi_get_size(vd_lh, &s)) { + (void) ldi_close(vd_lh, FREAD, kcred); + return (SET_ERROR(EIO)); + } + + size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); + label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + + *config = NULL; + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, l, 0); + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + kmem_free(label, sizeof (vdev_label_t)); + (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = SET_ERROR(EIDRM); + + return (error); +} diff --git a/module/os/macos/zfs/vdev_file.c b/module/os/macos/zfs/vdev_file.c new file mode 100644 index 0000000000..284b23c61b --- /dev/null +++ b/module/os/macos/zfs/vdev_file.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift, uint64_t *physical_ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int error = 0; + + dprintf("vdev_file_open %p\n", vd->vdev_tsd); + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ +#ifdef _KERNEL + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } +#endif + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + + error = zfs_file_open(vd->vdev_path + 1, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + + /* + * Make sure it's a regular file. + */ + if (zfs_file_getattr(fp, &zfa)) { + return (SET_ERROR(ENODEV)); + } + +skip_open: + /* + * Determine the physical size of the file. 
+ */ + error = zfs_file_getattr(vf->vf_file, &zfa); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = zfa.zfa_size; + *ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_file != NULL) { + zfs_file_close(vf->vf_file); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + loff_t off; + void *data; + ssize_t size; + int err; + + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + if (zio->io_type == ZIO_TYPE_READ) { + data = + abd_borrow_buf(zio->io_abd, size); + err = zfs_file_pread(vf->vf_file, data, size, off, &resid); + abd_return_buf_copy(zio->io_abd, data, size); + } else { + data = + abd_borrow_buf_copy(zio->io_abd, size); + err = zfs_file_pwrite(vf->vf_file, data, size, off, &resid); + abd_return_buf(zio->io_abd, data, size); + } + + zio->io_error = (err != 0 ? EIO : 0); + + if (zio->io_error == 0 && resid != 0) + zio->io_error = SET_ERROR(ENOSPC); + + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC|O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); + + /* XXX FreeBSD has no fallocate routine in file ops */ + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("vdev_file_taskq", 100, minclsyspri, + max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +/* + * From userland we access disks just like files. 
+ */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/macos/zfs/zfs_acl.c b/module/os/macos/zfs/zfs_acl.c new file mode 100644 index 0000000000..63cd233e10 --- /dev/null +++ b/module/os/macos/zfs/zfs_acl.c @@ -0,0 +1,2983 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS 
(V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return 
(sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. 
+ */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (S_ISDIR(obj_mode) && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) + return (B_FALSE); + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if 
(aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +#if 0 // unused function +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} +#endif + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +#if 0 // unused function +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) + continue; + + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} +#endif + +static int +zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). 
+ * + * We only convert an ACL once, so this won't happen + * every time. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, zp->z_mode, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL.
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + + /* + * Apple has unusual expectations to emulate hfs in that the + * mode is not updated: + * -rw-r--r-- 1 root wheel 0 Nov 12 12:39 file.txt + * chmod +a "root allow execute" file.txt + * ZFS: -rwxr--r--+ 1 root wheel 0 Nov 12 12:39 file.txt + * HFS: -rw-r--r--+ 1 root wheel 0 Nov 12 12:39 file.txt + * 0: user:root allow execute + */ + if (entry_type == ACE_OWNER) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. 
+ */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. + */ +int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize = 0; + int acl_count = 0; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + boolean_t drop_lock = B_FALSE; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + /* + * close race where znode could be upgrade while trying to + * read the znode attributes. + * + * But this could only happen if the file isn't already an SA + * znode + */ + if (!zp->z_is_sa && !have_lock) { + mutex_enter(&zp->z_lock); + drop_lock = B_TRUE; + } + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + if (drop_lock) + mutex_exit(&zp->z_lock); + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACLTYPE_POSIX) + return (0); + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error == 0 && aclp->z_acl_count > 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + + /* + * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL + * nor a DACL_ACES SA in which case ENOENT is returned from + * zfs_acl_node_read() when the SA can't be located. + * Allow chown/chgrp to succeed in these cases rather than + * returning an error that makes no sense in the context of + * the caller. + */ + if (error == ENOENT) + return (0); + + return (error); +} + +/* + * common code for setting ACLs. 
+ * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. 
+ */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(umode_t umode, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = S_ISDIR(umode); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". 
+ */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_mode, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(umode_t umode, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if (S_ISDIR(umode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!(S_ISDIR(umode) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, zfs_acl_t *paclp, + uint64_t umode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = S_ISDIR(umode); + boolean_t islnk = S_ISLNK(umode); + boolean_t isreg = S_ISREG(umode); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || islnk) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(umode, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed.
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((umode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. 
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & ATTR_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); +#ifdef __FreeBSD__ + gid = acl_ids->z_fgid = dzp->z_gid; +#else + gid = crgetgid(cr); +#endif + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, 
zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclcheck, + cred_t *cr) +{ + struct kauth_acl **aclpp = (struct kauth_acl **)vsecp; + zfs_acl_t *aclp; + kauth_acl_t k_acl; + u_int32_t ace_flags = 0; + kauth_ace_rights_t rights = 0; + guid_t *guidp; + uint64_t who; + uint32_t access_mask; + uint16_t flags; + uint16_t type; + int i; + int error; + void *zacep = NULL; + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_TRUE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + if ((k_acl = kauth_acl_alloc(aclp->z_acl_count)) == NULL) { + mutex_exit(&zp->z_acl_lock); + *aclpp = (kauth_acl_t)KAUTH_FILESEC_NONE; + return (ENOMEM); + } + + dprintf("acl_count %d\n", aclp->z_acl_count); + + k_acl->acl_entrycount = aclp->z_acl_count; + k_acl->acl_flags = 0; + *aclpp = k_acl; + + /* + * Translate Open Solaris ACEs to Mac OS X ACLs + */ + i = 0; + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &flags, &type))) { + rights = 0; + ace_flags = 0; + + guidp = &k_acl->acl_ace[i].ace_applicable; + + if (flags & ACE_OWNER) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + nfsacl_set_wellknown(KAUTH_WKG_OWNER, guidp); + } else if ((flags & OWNING_GROUP) == OWNING_GROUP) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + nfsacl_set_wellknown(KAUTH_WKG_GROUP, guidp); + } else if (flags & ACE_EVERYONE) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + nfsacl_set_wellknown(KAUTH_WKG_EVERYBODY, guidp); + /* Try to get a guid from our uid */ + } else { + + dprintf("ZFS: trying to map uid %d flags %x type %x\n", + who, flags, type); + + if (flags & OWNING_GROUP) { + if (kauth_cred_gid2guid(who, guidp) == 0) { + dprintf("ZFS: appears to be a group\n"); + } + } else if (kauth_cred_uid2guid(who, guidp) == 0) { + dprintf("ZFS: appears to be a user\n"); + } else { + dprintf("ZFS: Unable to map\n"); + bzero(guidp, sizeof (guid_t)); + } + } + + // access_mask = aclp->z_acl[i].a_access_mask; + if (access_mask & ACE_READ_DATA) + rights |= KAUTH_VNODE_READ_DATA; + if (access_mask & ACE_WRITE_DATA) + rights |= KAUTH_VNODE_WRITE_DATA; + if (access_mask & ACE_APPEND_DATA) + rights |= KAUTH_VNODE_APPEND_DATA; + if (access_mask & ACE_READ_NAMED_ATTRS) + rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; + if (access_mask & ACE_WRITE_NAMED_ATTRS) + rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + if (access_mask & ACE_EXECUTE) + rights |= KAUTH_VNODE_EXECUTE; + if (access_mask & ACE_DELETE_CHILD) + rights |= KAUTH_VNODE_DELETE_CHILD; + if (access_mask & ACE_READ_ATTRIBUTES) + rights |= KAUTH_VNODE_READ_ATTRIBUTES; + if (access_mask & ACE_WRITE_ATTRIBUTES) + rights |= KAUTH_VNODE_WRITE_ATTRIBUTES; + if (access_mask & ACE_DELETE) + rights |= KAUTH_VNODE_DELETE; + if (access_mask & ACE_READ_ACL) + rights |= KAUTH_VNODE_READ_SECURITY; + if (access_mask & ACE_WRITE_ACL) + rights |= KAUTH_VNODE_WRITE_SECURITY; + if (access_mask & ACE_WRITE_OWNER) + rights |= KAUTH_VNODE_TAKE_OWNERSHIP; + if (access_mask & ACE_SYNCHRONIZE) + rights |= KAUTH_VNODE_SYNCHRONIZE; + k_acl->acl_ace[i].ace_rights = rights; + + // flags = aclp->z_acl[i].a_flags; + if (flags & ACE_FILE_INHERIT_ACE) + ace_flags |= 
KAUTH_ACE_FILE_INHERIT; + if (flags & ACE_DIRECTORY_INHERIT_ACE) + ace_flags |= KAUTH_ACE_DIRECTORY_INHERIT; + if (flags & ACE_NO_PROPAGATE_INHERIT_ACE) + ace_flags |= KAUTH_ACE_LIMIT_INHERIT; + if (flags & ACE_INHERIT_ONLY_ACE) + ace_flags |= KAUTH_ACE_ONLY_INHERIT; + + // type = aclp->z_acl[i].a_type; + switch (type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + ace_flags |= KAUTH_ACE_PERMIT; + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + ace_flags |= KAUTH_ACE_DENY; + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + ace_flags |= KAUTH_ACE_AUDIT; + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + ace_flags |= KAUTH_ACE_ALARM; + break; + } + k_acl->acl_ace[i].ace_flags = ace_flags; + i++; + } + k_acl->acl_entrycount = i; + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + +int +zfs_addacl_trivial(znode_t *zp, ace_t *aces, int *nentries, int seen_type) +{ + zfs_acl_t *aclp; + uint64_t who; + uint32_t access_mask; + uint16_t flags; + uint16_t type; + int i; + int error; + void *zacep = NULL; + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_TRUE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + dprintf("ondisk acl_count %d\n", aclp->z_acl_count); + + // Start at the end + i = *nentries; + + /* + * Translate Open Solaris ACEs to Mac OS X ACLs + */ + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &flags, &type))) { + + if (flags & ACE_OWNER) { + if (seen_type & ACE_OWNER) continue; + seen_type |= ACE_OWNER; + who = -1; + } else if ((flags & OWNING_GROUP) == OWNING_GROUP) { + if (seen_type & ACE_GROUP) continue; + seen_type |= ACE_GROUP; + who = -1; + } else if (flags & ACE_EVERYONE) { + if (seen_type & ACE_EVERYONE) continue; + seen_type |= ACE_EVERYONE; + who = -1; + /* Try to get a guid from our uid */ + } else { + // Only deal with the trivials + continue; + } + + aces[i].a_who = who; + aces[i].a_access_mask = access_mask; + aces[i].a_flags = flags; + aces[i].a_type = type; + + dprintf("zfs: adding entry %d for type %x sizeof %d\n", i, type, + sizeof (aces[i])); + i++; + } + + *nentries = i; + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & 
ACL_DEFAULTED)
+			aclp->z_hints |= ZFS_ACL_DEFAULTED;
+		if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+			aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+	}
+
+	*zaclp = aclp;
+
+	return (0);
+}
+
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	dmu_tx_t *tx;
+	int error;
+	zfs_acl_t *aclp;
+	zfs_fuid_info_t *fuidp = NULL;
+	boolean_t fuid_dirtied;
+	uint64_t acl_obj;
+
+	if (zp->z_pflags & ZFS_IMMUTABLE)
+		return (SET_ERROR(EPERM));
+
+	if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+		return (error);
+
+	error = zfs_vsec_2_aclp(zfsvfs, vnode_vtype(ZTOV(zp)), vsecp, cr,
+	    &fuidp, &aclp);
+	if (error)
+		return (error);
+
+	/*
+	 * If ACL wide flags aren't being set then preserve any
+	 * existing flags.
+	 */
+
+top:
+	mutex_enter(&zp->z_acl_lock);
+	mutex_enter(&zp->z_lock);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	/*
+	 * If old version and ACL won't fit in bonus and we aren't
+	 * upgrading then take out necessary DMU holds
+	 */
+
+	if ((acl_obj = zfs_external_acl(zp)) != 0) {
+		if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+		    zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+			dmu_tx_hold_free(tx, acl_obj, 0,
+			    DMU_OBJECT_END);
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+			    aclp->z_acl_bytes);
+		} else {
+			dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+		}
+	} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+	}
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (error) {
+		mutex_exit(&zp->z_acl_lock);
+		mutex_exit(&zp->z_lock);
+
+		if (error == ERESTART) {
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zfs_acl_free(aclp);
+		return (error);
+	}
+
+	error = zfs_aclset_common(zp, aclp, cr, tx);
+	ASSERT(error == 0);
+	ASSERT(zp->z_acl_cached == NULL);
+	zp->z_acl_cached = aclp;
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+	if (fuidp)
+		zfs_fuid_info_free(fuidp);
+	dmu_tx_commit(tx);
+
+	mutex_exit(&zp->z_lock);
+	mutex_exit(&zp->z_acl_lock);
+
+	return (error);
+}
+
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+	if ((v4_mode & WRITE_MASK) &&
+	    (vfs_isrdonly(zp->z_zfsvfs->z_vfs)) &&
+	    (!IS_DEVVP(ZTOV(zp)) ||
+	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common().
+	 */
+	if ((v4_mode & WRITE_MASK_DATA) &&
+	    (zp->z_pflags & ZFS_IMMUTABLE)) {
+		return (EPERM);
+	}
+#ifdef sun
+	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (EPERM);
+	}
+#else
+	/*
+	 * In FreeBSD we allow modification of a directory's contents if
+	 * ZFS_NOUNLINK (sunlnk) is set. We just don't allow directory
+	 * removal, which is handled in zfs_zaccess_delete().
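+	 * This branch is also the one compiled on macOS, so only ACE_DELETE
+	 * on the object itself is refused here.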
+ */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (SET_ERROR(EPERM)); + } +#endif + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCES if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (vnode_isdir(ZTOV(zp)) && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, 
mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + !vnode_isdir(ZTOV(zp)) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (vnode_isdir(ZTOV(zdp)))); + if (is_attr) + goto slow; + + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + if (uid == zdp->z_uid) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(zdp->z_gid, cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsytem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp = NULL; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (vnode_isdir(ZTOV(zp)))); + +#ifdef __APPLE__ + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. + */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + /* + * Cache the lookup on the parent file znode as + * zp->z_xattr_parent and hold a reference. This + * effectively pins the parent in memory until all + * child xattr znodes have been destroyed and + * release their references in zfs_inode_destroy(). 
+ */ + error = zfs_zget(zp->z_zfsvfs, parent, &check_zp); + if (error) + return (error); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_parent == NULL) + zp->z_xattr_parent = check_zp; + rw_exit(&zp->z_xattr_lock); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, + owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, + owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, + owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +/* See zfs_zaccess_delete() */ +uint64_t zfs_write_implies_delete_child = 1; + +/* + * Determine whether delete access should be granted. + * + * The following chart outlines how we handle delete permissions which is + * how recent versions of windows (Windows 2008) handles it. The efficiency + * comes from not having to check the parent ACL where the object itself grants + * delete: + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Deny * | Permit | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Deny * | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * Re. execute permission on the directory: if that's missing, + * the vnode lookup of the target will fail before we get here. + * + * Re [*] in the table above: NFSv4 would normally Permit delete for + * these two cells of the matrix. + * See acl.h for notes on which ACE_... flags should be checked for which + * operations. Specifically, the NFSv4 committee recommendation is in + * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs + * should take precedence ahead of ALLOW ACEs. + * + * This implementation always consults the target object's ACL first. + * If a DENY ACE is present on the target object that specifies ACE_DELETE, + * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on + * the target object, access is allowed. If and only if no entries with + * ACE_DELETE are present in the object's ACL, check the container's ACL + * for entries with ACE_DELETE_CHILD. + * + * A summary of the logic implemented from the table above is as follows: + * + * First check for DENY ACEs that apply. + * If either target or container has a deny, EACCES. + * + * Delete access can then be summarized as follows: + * 1: The object to be deleted grants ACE_DELETE, or + * 2: The containing directory grants ACE_DELETE_CHILD. + * In a Windows system, that would be the end of the story. + * In this system, (2) has some complications... 
+ * 2a: "sticky" bit on a directory adds restrictions, and + * 2b: existing ACEs from previous versions of ZFS may + * not carry ACE_DELETE_CHILD where they should, so we + * also allow delete when ACE_WRITE_DATA is granted. + * + * Note: 2b is technically a work-around for a prior bug, + * which hopefully can go away some day. For those who + * no longer need the work around, and for testing, this + * work-around is made conditional via the tunable: + * zfs_write_implies_delete_child + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t wanted_dirperms; + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + boolean_t dzpcheck_privs; + boolean_t zpcheck_privs; + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * Case 1: + * If target object grants ACE_DELETE then we are done. This is + * indicated by a return value of 0. For this case we don't worry + * about the sticky bit because sticky only applies to the parent + * directory and this is the child access result. + * + * If we encounter a DENY ACE here, we're also done (EACCES). + * Note that if we hit a DENY ACE here (on the target) it should + * take precedence over a DENY ACE on the container, so that when + * we have more complete auditing support we will be able to + * report an access failure against the specific target. + * (This is part of why we're checking the target first.) + */ + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr); + if (zp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!zpcheck_privs) + return (SET_ERROR(zp_error)); + + return (secpolicy_vnode_remove(ZTOV(zp), cr)); + } + if (zp_error == 0) + return (0); + + /* + * Case 2: + * If the containing directory grants ACE_DELETE_CHILD, + * or we're in backward compatibility mode and the + * containing directory has ACE_WRITE_DATA, allow. + * Case 2b is handled with wanted_dirperms. + */ + wanted_dirperms = ACE_DELETE_CHILD; + if (zfs_write_implies_delete_child) + wanted_dirperms |= ACE_WRITE_DATA; + dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + if (dzp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!dzpcheck_privs) + return (SET_ERROR(dzp_error)); + return (secpolicy_vnode_remove(ZTOV(zp), cr)); + } + + /* + * Cases 2a, 2b (continued) + * + * Note: dzp_working_mode now contains any permissions + * that were NOT granted. Therefore, if any of the + * wanted_dirperms WERE granted, we will have: + * dzp_working_mode != wanted_dirperms + * We're really asking if ANY of those permissions + * were granted, and if so, grant delete access. + */ + if (dzp_working_mode != wanted_dirperms) + dzp_error = 0; + + /* + * dzp_error is 0 if the container granted us permissions to "modify". + * If we do not have permission via one or more ACEs, our current + * privileges may still permit us to modify the container. + * + * dzpcheck_privs is false when i.e. the FS is read-only. + * Otherwise, do privilege checks for the container. + */ + if (dzp_error != 0 && dzpcheck_privs) { + uid_t owner; + /* + * The secpolicy call needs the requested access and + * the current access mode of the container, but it + * only knows about Unix-style modes (VEXEC, VWRITE), + * so this must condense the fine-grained ACE bits into + * Unix modes. 
+ * + * The VEXEC flag is easy, because we know that has + * always been checked before we get here (during the + * lookup of the target vnode). The container has not + * granted us permissions to "modify", so we do not set + * the VWRITE flag in the current access mode. + */ + owner = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, + ZFS_OWNER); + dzp_error = secpolicy_vnode_access2(cr, ZTOV(dzp), + owner, VEXEC, VWRITE|VEXEC); + } + if (dzp_error != 0) { + /* + * Note: We may have dzp_error = -1 here (from + * zfs_zacess_common). Don't return that. + */ + return (SET_ERROR(EACCES)); + } + + /* + * At this point, we know that the directory permissions allow + * us to modify, but we still need to check for the additional + * restrictions that apply when the "sticky bit" is set. + * + * Yes, zfs_sticky_remove_access() also checks this bit, but + * checking it here and skipping the call below is nice when + * you're watching all of this with dtrace. + */ + if ((dzp->z_mode & S_ISVTX) == 0) + return (0); + /* + * zfs_sticky_remove_access will succeed if: + * 1. The sticky bit is absent. + * 2. We pass the sticky bit restrictions. + * 3. We have privileges that always allow file removal. + */ + return (zfs_sticky_remove_access(dzp, zp, cr)); +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (vnode_isdir(ZTOV(szp))) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. + */ + if (vnode_isdir(ZTOV(szp)) && ZTOV(sdzp) != ZTOV(tdzp)) { + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + } + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp) { + if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + } + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/macos/zfs/zfs_boot.cpp b/module/os/macos/zfs/zfs_boot.cpp new file mode 100644 index 0000000000..12303b0118 --- /dev/null +++ b/module/os/macos/zfs/zfs_boot.cpp @@ -0,0 +1,2962 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. 
+ */ +/* + * ZFS boot utils + * + * While loading the kext, check if early boot and zfs-boot + * kernel flag. + * Allocate pool_list (and lock). + * Register matching notification zfs_boot_probe_disk to check + * IOMediaBSDClient devices as they are published (or matched?), + * passing pool_list (automatically calls handler for all + * existing devices). + * Dispatch zfs_boot_import_thread on system_taskq. + * + * In notification handler zfs_boot_probe_disk: + * Check provider IOMedia for: + * 1 Leaf node and whole disk. + * 2 Leaf node and type ZFS. + * 3 Leaf node and type FreeBSD-ZFS. + * Check IOMedia meets minimum size or bail. + * Allocate char* buffer. + * Call vdev_disk_read_rootlabel. + * XXX Alternately: + * Alloc and prep IOMemoryDescriptor. + * Open IOMedia device (read-only). + * Try to read vdev label from device. + * Close IOMedia device. + * Release IOMemoryDescriptor (data is in buffer). + * XXX + * If label was read, try to generate a config from label. + * Check pool name matches zfs-boot or bail. + * Check pool status. + * Update this vdev's path and set status. + * Set other vdevs to missing status. + * Check-in config in thread-safe manner: + * Take pool_list lock. + * If config not found, insert new config, or update existing. + * Unlock pool_list. + * If found config is complete, wake import thread. + * + * In vdev_disk_read_rootlabel: + * Use vdev_disk_physio to read label. + * If label was read, try to unpack. + * Return label or failure. + * + * In vdev_disk_physio: + * Open device (read-only) using vnop/VOP. + * Try to read vdev label from device. + * Close device using vnop/VOP. + * + * In zfs_boot_import_thread: + * Loop checking for work and sleeping on lock between loops. + * Take pool_list lock and check for work. + * Attempt to import root pool using spa_import_rootpool. + * If successful, remove notification handler (waits for + * all tasks). + * Empty and deallocate pool_list (and lock). + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +} /* extern "C" */ + +#include +#include +#include +#include + +#if defined(DEBUG) || defined(ZFS_DEBUG) +#define DSTATIC +#else +#define DSTATIC static +#endif + +#ifndef verify +#define verify(EX) (void)((EX) || \ + (printf("%s, %s, %d, %s\n", #EX, __FILE__, __LINE__, __func__), 0)) +#endif /* verify */ + +/* Most of this is only built when configured with --enable-boot */ + +/* block size is 512 B, count is 512 M blocks */ +#define ZFS_BOOT_DEV_BSIZE (UInt64)(1<<9) +#define ZFS_BOOT_DEV_BCOUNT (UInt64)(2<<29) +#define ZFS_BOOT_DATASET_NAME_KEY "zfs_dataset_name" +#define ZFS_BOOT_DATASET_UUID_KEY "zfs_dataset_uuid" +#define ZFS_BOOT_DATASET_RDONLY_KEY "zfs_dataset_rdonly" +#define ZFS_MOUNTROOT_RETRIES 50 +#define ZFS_BOOTLOG_DELAY 100 + +/* + * C functions for boot-time vdev discovery + */ + +/* + * Intermediate structures used to gather configuration information. 
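+ * Each pool_entry_t collects the top-level vdevs seen for one pool guid,
+ * each vdev_entry_t keeps one config_entry_t per transaction group read
+ * from a label, and name_entry_t maps vdev guids back to device paths.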
+ */ +typedef struct config_entry { + uint64_t ce_txg; + nvlist_t *ce_config; + struct config_entry *ce_next; +} config_entry_t; + +typedef struct vdev_entry { + uint64_t ve_guid; + config_entry_t *ve_configs; + struct vdev_entry *ve_next; +} vdev_entry_t; + +typedef struct pool_entry { + uint64_t pe_guid; + vdev_entry_t *pe_vdevs; + struct pool_entry *pe_next; + uint64_t complete; +} pool_entry_t; + +typedef struct name_entry { + char *ne_name; + uint64_t ne_guid; + uint64_t ne_order; + uint64_t ne_num_labels; + struct name_entry *ne_next; +} name_entry_t; + +typedef struct pool_list { + pool_entry_t *pools; + name_entry_t *names; + uint64_t pool_guid; + char *pool_name; + OSSet *new_disks; + OSSet *disks; + kmutex_t lock; + kcondvar_t cv; + IOService *zfs_hl; + IONotifier *notifier; + volatile UInt64 terminating; +} pool_list_t; + +#define ZFS_BOOT_ACTIVE 0x1 +#define ZFS_BOOT_TERMINATING 0x2 +#define ZFS_BOOT_INVALID 0x99 + +#define ZFS_BOOT_PREALLOC_SET 5 + +#if 0 +static ZFSBootDevice *bootdev = 0; +#endif +static pool_list_t *zfs_boot_pool_list = 0; + +DSTATIC char * +zfs_boot_get_devid(const char *path) +{ + /* + * XXX Unavailable interface + * + * If we implement one in spl, it could + * simplify import when device paths + * have changed (e.g. USB pools). + * + * Could use ldi DeviceTree path, or + * IOService path if not in DTPlane. + */ + return (NULL); +} + +/* + * Go through and fix up any path and/or devid information for the given vdev + * configuration. + * + * Copied from libzfs_import.c + */ +DSTATIC int +zfs_boot_fix_paths(nvlist_t *nv, name_entry_t *names) +{ + nvlist_t **child; + uint_t c, children; + uint64_t guid; + name_entry_t *ne, *best; + char *path, *devid; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (zfs_boot_fix_paths(child[c], names) != 0) + return (-1); + return (0); + } + + /* + * This is a leaf (file or disk) vdev. In either case, go through + * the name list and see if we find a matching guid. If so, replace + * the path and see if we can calculate a new devid. + * + * There may be multiple names associated with a particular guid, in + * which case we have overlapping partitions or multiple paths to the + * same disk. In this case we prefer to use the path name which + * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we + * use the lowest order device which corresponds to the first match + * while traversing the ZPOOL_IMPORT_PATH search path. + */ + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + path = NULL; + + best = NULL; + for (ne = names; ne != NULL; ne = ne->ne_next) { + if (ne->ne_guid == guid) { + + if (path == NULL) { + best = ne; + break; + } + + if ((strlen(path) == strlen(ne->ne_name)) && + strncmp(path, ne->ne_name, strlen(path)) == 0) { + best = ne; + break; + } + + if (best == NULL) { + best = ne; + continue; + } + + /* Prefer paths with more vdev labels. */ + if (ne->ne_num_labels > best->ne_num_labels) { + best = ne; + continue; + } + + /* Prefer paths earlier in the search order. 
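Lower ne_order wins when the label counts are equal. 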
*/ + if (ne->ne_num_labels == best->ne_num_labels && + ne->ne_order < best->ne_order) { + best = ne; + continue; + } + } + } + + if (best == NULL) + return (0); + + if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) + return (-1); + + if ((devid = zfs_boot_get_devid(best->ne_name)) == NULL) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + } else { + if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) { + spa_strfree(devid); + return (-1); + } + spa_strfree(devid); + } + + return (0); +} + +/* + * Add the given configuration to the list of known devices. + * + * Copied from libzfs_import.c + * diffs: kmem_alloc, kmem_free with size + */ +DSTATIC int +zfs_boot_add_config(pool_list_t *pl, const char *path, + int order, int num_labels, nvlist_t *config) +{ + uint64_t pool_guid, vdev_guid, top_guid, txg, state; + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + name_entry_t *ne; + + dprintf("%s %p [%s] %d %d %p\n", __func__, + pl, path, order, num_labels, config); + + /* + * If this is a hot spare not currently in use or level 2 cache + * device, add it to the list of names to translate, but don't do + * anything else. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) == 0 && + (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { + if ((ne = (name_entry_t *) kmem_alloc( + sizeof (name_entry_t), KM_SLEEP)) == NULL) { + return (-1); + } + bzero(ne, sizeof (name_entry_t)); + + if ((ne->ne_name = spa_strdup(path)) == NULL) { + kmem_free(ne, sizeof (name_entry_t)); + return (-1); + } + ne->ne_guid = vdev_guid; + ne->ne_order = order; + ne->ne_num_labels = num_labels; + ne->ne_next = pl->names; + pl->names = ne; + return (0); + } + + /* + * If we have a valid config but cannot read any of these fields, then + * it means we have a half-initialized label. In vdev_label_init() + * we write a label with txg == 0 so that we can identify the device + * in case the user refers to the same disk later on. If we fail to + * create the pool, we'll be left with a label in this state + * which should not be considered part of a valid pool. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, + &vdev_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(config); + return (0); + } + + /* + * First, see if we know about this pool. If not, then add it to the + * list of known pools. + */ + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + if (pe->pe_guid == pool_guid) + break; + } + + if (pe == NULL) { + if ((pe = (pool_entry_t *) kmem_alloc( + sizeof (pool_entry_t), KM_SLEEP)) == NULL) { + nvlist_free(config); + return (-1); + } + bzero(pe, sizeof (pool_entry_t)); + pe->pe_guid = pool_guid; + pe->pe_next = pl->pools; + pl->pools = pe; + } + + /* + * Second, see if we know about this toplevel vdev. Add it if its + * missing. 
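+	 * Top-level vdevs are tracked per pool, keyed by their top_guid.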
+ */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + if (ve->ve_guid == top_guid) + break; + } + + if (ve == NULL) { + if ((ve = (vdev_entry_t *) kmem_alloc( + sizeof (vdev_entry_t), KM_SLEEP)) == NULL) { + nvlist_free(config); + return (-1); + } + bzero(ve, sizeof (vdev_entry_t)); + ve->ve_guid = top_guid; + ve->ve_next = pe->pe_vdevs; + pe->pe_vdevs = ve; + } + + /* + * Third, see if we have a config with a matching transaction group. If + * so, then we do nothing. Otherwise, add it to the list of known + * configs. + */ + for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { + if (ce->ce_txg == txg) + break; + } + + if (ce == NULL) { + if ((ce = (config_entry_t *) kmem_alloc( + sizeof (config_entry_t), KM_SLEEP)) == NULL) { + nvlist_free(config); + return (-1); + } + bzero(ce, sizeof (config_entry_t)); + ce->ce_txg = txg; + ce->ce_config = config; + ce->ce_next = ve->ve_configs; + ve->ve_configs = ce; + } else { + nvlist_free(config); + } + + /* + * At this point we've successfully added our config to the list of + * known configs. The last thing to do is add the vdev guid -> path + * mappings so that we can fix up the configuration as necessary before + * doing the import. + */ + if ((ne = (name_entry_t *) kmem_alloc( + sizeof (name_entry_t), KM_SLEEP)) == NULL) { + return (-1); + } + bzero(ne, sizeof (name_entry_t)); + + if ((ne->ne_name = spa_strdup(path)) == NULL) { + kmem_free(ne, sizeof (name_entry_t)); + return (-1); + } + + ne->ne_guid = vdev_guid; + ne->ne_order = order; + ne->ne_num_labels = num_labels; + ne->ne_next = pl->names; + pl->names = ne; + + return (0); +} + +/* + * libzfs_import used the libzfs handle and a zfs + * command to issue tryimport in-kernel via ioctl. + * This should leave config as-is, and return nvl. + * Since zfs_boot is already in-kernel, duplicate + * config into nvl, and call spa_tryimport on it. + */ +DSTATIC nvlist_t * +zfs_boot_refresh_config(nvlist_t *config) +{ + nvlist_t *nvl = 0; + + /* tryimport does not free config, and returns new nvl or null */ + nvl = spa_tryimport(config); + return (nvl); +} + +/* + * Determine if the vdev id is a hole in the namespace. + */ +DSTATIC boolean_t +zfs_boot_vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) +{ + int c; + + for (c = 0; c < holes; c++) { + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Convert our list of pools into the definitive set of configurations. We + * start by picking the best config for each toplevel vdev. Once that's done, + * we assemble the toplevel vdevs into a full config for the pool. We make a + * pass to fix up any incorrect paths, and then add it to the main list to + * return to the user. 
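+ * Holes left by device removal become "hole" placeholder vdevs and any
+ * still-missing top-levels become "missing" placeholders, so each child
+ * keeps the vdev ID recorded in its label.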
+ */ +DSTATIC nvlist_t * +zfs_boot_get_configs(pool_list_t *pl, boolean_t active_ok) +{ + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; + boolean_t config_seen; + uint64_t best_txg; + char *name, *hostname = NULL; + uint64_t guid; + uint_t children = 0; + nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; + uint_t c; +#if 0 + boolean_t isactive; +#endif + uint64_t hostid; + nvlist_t *nvl; + boolean_t valid_top_config = B_FALSE; + + if (nvlist_alloc(&ret, 0, 0) != 0) + goto nomem; + + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + uint64_t id, max_txg = 0; + + if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + config_seen = B_FALSE; + + /* + * Iterate over all toplevel vdevs. Grab the pool configuration + * from the first one we find, and then go through the rest and + * add them as necessary to the 'vdevs' member of the config. + */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + + /* + * Determine the best configuration for this vdev by + * selecting the config with the latest transaction + * group. + */ + best_txg = 0; + for (ce = ve->ve_configs; ce != NULL; + ce = ce->ce_next) { + + if (ce->ce_txg > best_txg) { + tmp = ce->ce_config; + best_txg = ce->ce_txg; + } + } + + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. + */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + + if (!config_seen) { + /* + * Copy the relevant pieces of data to the pool + * configuration: + * + * version + * pool guid + * name + * pool txg (if available) + * comment (if available) + * pool state + * hostid (if available) + * hostname (if available) + */ + uint64_t state, version, pool_txg; + char *comment = NULL; + + version = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_VERSION, version); + guid = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid); + name = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name); + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_TXG, &pool_txg) == 0) + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_TXG, pool_txg); + + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMMENT, &comment) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment); + + state = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state); + + hostid = 0; + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid); + hostname = 
fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, hostname); + } + + config_seen = B_TRUE; + } + + /* + * Add this top-level vdev to the child array. + */ + verify(nvlist_lookup_nvlist(tmp, + ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); + verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, + &id) == 0); + + if (id >= children) { + nvlist_t **newchild; + + newchild = (nvlist_t **) kmem_alloc((id + 1) * + sizeof (nvlist_t *), KM_SLEEP); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + kmem_free(child, children * + sizeof (nvlist_t *)); + child = newchild; + children = id + 1; + } + if (nvlist_dup(nvtop, &child[id], 0) != 0) + goto nomem; + + } + + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. + */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = (nvlist_t **) kmem_alloc((max_id) * + sizeof (nvlist_t *), KM_SLEEP); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + kmem_free(child, children * + sizeof (nvlist_t *)); + child = newchild; + children = max_id; + } + } + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !zfs_boot_vdev_is_hole(hole_array, holes, + c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. + */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(holey); + goto nomem; + } + child[c] = holey; + } + } + + /* + * Look for any missing top-level vdevs. If this is the case, + * create a faked up 'missing' vdev as a placeholder. We cannot + * simply compress the child array, because the kernel performs + * certain checks to make sure the vdev IDs match their location + * in the configuration. + */ + for (c = 0; c < children; c++) { + if (child[c] == NULL) { + nvlist_t *missing; + if (nvlist_alloc(&missing, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + if (nvlist_add_string(missing, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(missing); + goto nomem; + } + child[c] = missing; + } + } + + /* + * Put all of this pool's top-level vdevs into a root vdev. 
+ */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + child, children) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + kmem_free(child, children * sizeof (nvlist_t *)); + children = 0; + child = NULL; + + /* + * Go through and fix up any paths and/or devids based on our + * known list of vdev GUID -> path mappings. + */ + if (zfs_boot_fix_paths(nvroot, pl->names) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + /* + * Add the root vdev to this pool's configuration. + */ + if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + nvroot) != 0) { + nvlist_free(nvroot); + goto nomem; + } + nvlist_free(nvroot); + + /* + * zdb uses this path to report on active pools that were + * imported or created using -R. + */ + if (active_ok) + goto add_pool; + +#if 0 +/* + * For root-pool import, no pools are active yet. + * Pool name and guid were looked up from the config and only used here. + * (Later we lookup the pool name for a separate test). + */ + /* + * Determine if this pool is currently active, in which case we + * can't actually import it. + */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + if (zfs_boot_pool_active(name, guid, &isactive) != 0) + goto error; + + if (isactive) { + nvlist_free(config); + config = NULL; + continue; + } +#endif + + if ((nvl = zfs_boot_refresh_config(config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } + + nvlist_free(config); + config = nvl; + + /* + * Go through and update the paths for spares, now that we have + * them. + */ + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + if (zfs_boot_fix_paths(spares[i], pl->names) != + 0) + goto nomem; + } + } + + /* + * Update the paths for l2cache devices. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + for (i = 0; i < nl2cache; i++) { + if (zfs_boot_fix_paths(l2cache[i], pl->names) != + 0) + goto nomem; + } + } + + /* + * Restore the original information read from the actual label. + */ + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, + DATA_TYPE_STRING); + if (hostid != 0) { + verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, + hostname) == 0); + } + +add_pool: + /* + * Add this pool to the list of configs. 
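+		 * The returned nvlist is keyed by pool name, one config
+		 * per pool.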
+ */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + if (nvlist_add_nvlist(ret, name, config) != 0) + goto nomem; + + nvlist_free(config); + config = NULL; + } + + return (ret); + +nomem: +#ifdef DEBUG + printf("zfs_boot_get_configs failed to allocate memory\n"); +#endif + if (config) nvlist_free(config); + if (ret) nvlist_free(ret); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + if (children > 0) { + kmem_free(child, children * sizeof (nvlist_t *)); + } + /* + * libzfs_import simply calls free(child), we need to + * pass kmem_free the size of the array. Array is + * allocated above as (children * sizeof nvlist_t*). + */ + + return (NULL); +} + +/* + * Return the offset of the given label. + */ +DSTATIC uint64_t +zfs_boot_label_offset(uint64_t size, int l) +{ + ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Given an IOMedia, read the label information and return an nvlist + * describing the configuration, if there is one. The number of valid + * labels found will be returned in num_labels when non-NULL. + */ +DSTATIC int +zfs_boot_read_label(IOService *zfs_hl, IOMedia *media, + nvlist_t **config, int *num_labels) +{ + IOMemoryDescriptor *buffer = NULL; + uint64_t mediaSize; + uint64_t nread = 0; + vdev_label_t *label; + nvlist_t *expected_config = NULL; + uint64_t expected_guid = 0, size, labelsize; + int l, count = 0; + IOReturn ret; + + *config = NULL; + + /* Verify IOMedia pointer and device size */ + if (!media || (mediaSize = media->getSize()) == 0) { + dprintf("%s couldn't get media or size\n", __func__); + return (-1); + } + + /* Determine vdev label size and aligned vdev size */ + labelsize = sizeof (vdev_label_t); + size = P2ALIGN_TYPED(mediaSize, labelsize, uint64_t); + + /* Allocate a buffer to read labels into */ + label = (vdev_label_t *) kmem_alloc(labelsize, KM_SLEEP); + if (!label) { + dprintf("%s couldn't allocate label for read\n", __func__); + return (-1); + } + + /* Allocate a memory descriptor with the label pointer */ + buffer = IOMemoryDescriptor::withAddress((void*)label, labelsize, + kIODirectionIn); + + /* Verify buffer was allocated */ + if (!buffer || (buffer->getLength() != labelsize)) { + dprintf("%s couldn't allocate buffer for read\n", __func__); + goto error; + } + + /* Open the device for reads */ + if (false == media->IOMedia::open(zfs_hl, 0, + kIOStorageAccessReader)) { + dprintf("%s media open failed\n", __func__); + goto error; + } + + /* Read all four vdev labels */ + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, txg; + + /* Zero the label buffer */ + bzero(label, labelsize); + + /* Prepare the buffer for IO */ + buffer->prepare(kIODirectionIn); + + /* Read a label from the specified offset */ + ret = media->IOMedia::read(zfs_hl, + zfs_boot_label_offset(size, l), + buffer, 0, &nread); + + /* Call the buffer completion */ + buffer->complete(); + + /* Skip failed reads, try next label */ + if (ret != kIOReturnSuccess) { + dprintf("%s media->read failed\n", __func__); + continue; + } + + /* Skip incomplete reads, try next label */ + if (nread < labelsize) { + dprintf("%s nread %llu / %llu\n", + __func__, nread, labelsize); + continue; + } + + /* Skip invalid labels that can't be unpacked */ + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) + continue; + + /* Verify GUID */ + if 
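zfs_boot_label_offset() above mirrors the usual ZFS layout: four labels per vdev, two packed at the front of the device and two at the tail of the label-aligned size. A self-contained sketch of the same arithmetic, assuming the standard 256 KiB vdev_label_t and VDEV_LABELS == 4:

#include <cstdint>
#include <cstdio>

static const uint64_t LABEL_SIZE = 256ULL * 1024;   // sizeof (vdev_label_t)
static const int VDEV_LABELS = 4;

static uint64_t
label_offset(uint64_t size, int l)
{
    // size must already be aligned down to a multiple of LABEL_SIZE
    return (l * LABEL_SIZE +
        (l < VDEV_LABELS / 2 ? 0 : size - VDEV_LABELS * LABEL_SIZE));
}

int
main(void)
{
    uint64_t media = 100ULL * 1024 * 1024;              // 100 MiB device
    uint64_t size = media & ~(LABEL_SIZE - 1);          // P2ALIGN_TYPED equivalent
    for (int l = 0; l < VDEV_LABELS; l++)
        printf("label %d at offset %llu\n", l,
            (unsigned long long)label_offset(size, l));
    return (0);
}

Labels 0 and 1 land at offsets 0 and 256 KiB; labels 2 and 3 land 512 KiB and 256 KiB before the end of the aligned size.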
(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + dprintf("%s nvlist_lookup guid failed %llu\n", + __func__, guid); + nvlist_free(*config); + continue; + } + + /* Verify vdev state */ + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + dprintf("%s nvlist_lookup state failed %llu\n", + __func__, state); + nvlist_free(*config); + continue; + } + + /* Verify txg number */ + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + dprintf("%s nvlist_lookup txg failed %llu\n", + __func__, txg); + nvlist_free(*config); + continue; + } + + /* Increment count for first match, or if guid matches */ + if (expected_guid) { + if (expected_guid == guid) + count++; + + nvlist_free(*config); + } else { + expected_config = *config; + expected_guid = guid; + count++; + } + } + + /* Close IOMedia */ + media->close(zfs_hl); + + /* Copy out the config and number of labels */ + if (num_labels != NULL) + *num_labels = count; + + kmem_free(label, labelsize); + buffer->release(); + *config = expected_config; + + return (0); + + +error: + /* Clean up */ + if (buffer) { + buffer->release(); + buffer = 0; + } + if (label) { + kmem_free(label, labelsize); + label = 0; + } + + return (-1); +} + +DSTATIC bool +zfs_boot_probe_media(void* target, void* refCon, + IOService* newService, __unused IONotifier* notifier) +{ + IOMedia *media = 0; + OSObject *isLeaf = 0; + OSString *ospath = 0; + uint64_t mediaSize = 0; + pool_list_t *pools = (pool_list_t *) refCon; + + /* Verify pool list can be cast */ + if (!pools) { + dprintf("%s invalid refCon\n", __func__); + return (false); + } + /* Should never happen */ + if (!newService) { + printf("%s %s\n", "zfs_boot_probe_media", + "called with null newService"); + return (false); + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 1\n", __func__); + return (false); + } + + /* Validate pool name */ + if (!pools->pool_name || strlen(pools->pool_name) == 0) { + dprintf("%s no pool name specified\n", __func__); + return (false); + } + + /* Get the parent IOMedia device */ + media = OSDynamicCast(IOMedia, newService->getProvider()); + + if (!media) { + dprintf("%s couldn't be cast as IOMedia\n", + __func__); + return (false); + } + + isLeaf = media->getProperty(kIOMediaLeafKey); + if (!isLeaf) { + dprintf("%s skipping non-leaf\n", __func__); + goto out; + } + + mediaSize = media->getSize(); + if (mediaSize < SPA_MINDEVSIZE) { + dprintf("%s skipping device with size %llu\n", + __func__, mediaSize); + goto out; + } + + ospath = OSDynamicCast(OSString, media->getProperty( + kIOBSDNameKey, gIOServicePlane, + kIORegistryIterateRecursively)); + if (!ospath || (ospath->getLength() == 0)) { + dprintf("%s skipping device with no bsd disk node\n", + __func__); + goto out; + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 2\n", __func__); + goto out; + } + + + /* Take pool_list lock */ + mutex_enter(&pools->lock); + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 3\n", __func__); + /* Unlock the pool list lock */ + mutex_exit(&pools->lock); + goto out; + } + + /* Add this IOMedia to the disk set */ + pools->disks->setObject(media); + + /* Unlock the pool list lock */ + mutex_exit(&pools->lock); + + /* Wakeup zfs_boot_import_thread */ + cv_signal(&pools->cv); + +out: + media = 
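A label read above only counts when it carries a nonzero GUID, a pool state no greater than POOL_STATE_L2CACHE, and (for anything other than spares and cache devices) a nonzero txg; labels whose GUID matches the first accepted one increment the count reported through num_labels. A hedged sketch of that filter, with a plain struct standing in for the unpacked nvlist:

#include <cstdint>
#include <cstdio>

// Pool states, mirroring the relevant ZFS values in order.
enum pool_state { ACTIVE = 0, EXPORTED, DESTROYED, SPARE, L2CACHE };

struct label_info { uint64_t guid, state, txg; };

static int
count_valid_labels(const label_info *labels, int n)
{
    uint64_t expected_guid = 0;
    int count = 0;

    for (int i = 0; i < n; i++) {
        const label_info &l = labels[i];
        if (l.guid == 0 || l.state > L2CACHE)
            continue;                        // unreadable or bogus label
        if (l.state != SPARE && l.state != L2CACHE && l.txg == 0)
            continue;                        // never-synced regular pool
        if (expected_guid == 0)
            expected_guid = l.guid;          // first good label wins
        if (l.guid == expected_guid)
            count++;
    }
    return (count);
}

int
main(void)
{
    label_info labels[4] = {
        { 0xabc, ACTIVE, 10 }, { 0xabc, ACTIVE, 10 },
        { 0, 0, 0 },           { 0xabc, ACTIVE, 10 },
    };
    printf("%d of 4 labels valid\n", count_valid_labels(labels, 4));
    return (0);
}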
0; + return (true); +} + +DSTATIC bool +zfs_boot_probe_disk(pool_list_t *pools, IOMedia *media) +{ + OSString *ospath, *uuid; + char *path = 0, *pname; + const char prefix[] = "/private/var/run/disk/by-id/media-"; + uint64_t this_guid; + int num_labels, err, len = 0; + nvlist_t *config; + boolean_t matched = B_FALSE; + + dprintf("%s: with %s media\n", __func__, + (media ? "valid" : "missing")); + ASSERT3U(media, !=, NULL); + + /* Verify pool list can be cast */ + if (!pools) { + dprintf("%s missing pool_list\n", __func__); + return (false); + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 1\n", __func__); + return (false); + } + + /* Validate pool name */ + if (!pools->pool_name || strlen(pools->pool_name) == 0) { + dprintf("%s no pool name specified\n", __func__); + return (false); + } + + /* Try to get a UUID from the media */ + uuid = OSDynamicCast(OSString, media->getProperty(kIOMediaUUIDKey)); + if (uuid && uuid->getLength() != 0) { + /* Allocate room for prefix, UUID, and null terminator */ + len = (strlen(prefix) + uuid->getLength()) + 1; + + path = (char *) kmem_alloc(len, KM_SLEEP); + if (!path) { + dprintf("%s couldn't allocate path\n", __func__); + return (false); + } + + snprintf(path, len, "%s%s", prefix, uuid->getCStringNoCopy()); + uuid = 0; + } else { + /* Get the BSD name as a C string */ + ospath = OSDynamicCast(OSString, media->getProperty( + kIOBSDNameKey, gIOServicePlane, + kIORegistryIterateRecursively)); + if (!ospath || (ospath->getLength() == 0)) { + dprintf("%s skipping device with no bsd disk node\n", + __func__); + return (false); + } + + /* Allocate room for "/dev/" + "diskNsN" + '\0' */ + len = (strlen("/dev/") + ospath->getLength() + 1); + path = (char *) kmem_alloc(len, KM_SLEEP); + if (!path) { + dprintf("%s couldn't allocate path\n", __func__); + return (false); + } + + /* "/dev/" is 5 characters, plus null character */ + snprintf(path, len, "/dev/%s", ospath->getCStringNoCopy()); + ospath = 0; + } + dprintf("%s path [%s]\n", __func__, (path ? path : "")); + + /* Read vdev labels, if any */ + err = zfs_boot_read_label(pools->zfs_hl, media, + &config, &num_labels); + + /* Skip disks with no labels */ + if (err != 0 || num_labels == 0 || !config) { + goto out; + } + + /* Lookup pool name */ + if (pools->pool_name != NULL && + (nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0)) { + /* Compare with pool_name */ + if (strlen(pools->pool_name) == strlen(pname) && + strncmp(pools->pool_name, pname, + strlen(pname)) == 0) { + printf("%s matched pool %s\n", + __func__, pname); + matched = B_TRUE; + } + /* Compare with pool_guid */ + } else if (pools->pool_guid != 0) { + matched = nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, + &this_guid) == 0 && + pools->pool_guid == this_guid; + } + + /* Skip non-matches */ + if (!matched) { + nvlist_free(config); + config = NULL; + goto out; + } + + /* + * Add this config to the pool list. + * Always assigns order 1 since all disks are + * referenced by /private/var/run/disk/by-id/ paths. 
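zfs_boot_probe_disk() prefers a stable /private/var/run/disk/by-id/media-<UUID> path when the IOMedia publishes a UUID and falls back to /dev/<bsdname>, then accepts the disk only if the label's pool name (or, failing that, its pool GUID) matches the boot arguments. The same path and match logic in a small userspace sketch (std::string instead of OSString and nvlists):

#include <cstdint>
#include <cstdio>
#include <string>

static std::string
device_path(const std::string &uuid, const std::string &bsdname)
{
    if (!uuid.empty())
        return ("/private/var/run/disk/by-id/media-" + uuid);
    return ("/dev/" + bsdname);
}

static bool
pool_matches(const std::string &want_name, uint64_t want_guid,
    const std::string &label_name, uint64_t label_guid)
{
    if (!want_name.empty())
        return (want_name == label_name);   // name match preferred
    if (want_guid != 0)
        return (want_guid == label_guid);   // otherwise fall back to guid
    return (false);
}

int
main(void)
{
    printf("%s\n", device_path("0ABC-UUID", "disk2s1").c_str());
    printf("%s\n", device_path("", "disk2s1").c_str());
    printf("match: %d\n", pool_matches("tank", 0, "tank", 0x1234));
    return (0);
}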
+ */ + dprintf("%s: add_config %s\n", __func__, path); + if (zfs_boot_add_config(pools, path, 1, + num_labels, config) != 0) { + printf("%s couldn't add config to pool list\n", + __func__); + } + +out: + /* Clean up */ + if (path && len > 0) { + kmem_free(path, len); + } + return (true); +} + +DSTATIC void +zfs_boot_free() +{ + pool_entry_t *pe, *penext; + vdev_entry_t *ve, *venext; + config_entry_t *ce, *cenext; + name_entry_t *ne, *nenext; + pool_list_t *pools = zfs_boot_pool_list; + + /* Verify pool list can be cast */ + if (!pools) { + dprintf("%s: no pool_list to clear\n", __func__); + return; + } + + /* Clear global ptr */ + zfs_boot_pool_list = 0; + + pools->terminating = ZFS_BOOT_TERMINATING; + + /* Remove IONotifier (waits for tasks to complete) */ + if (pools->notifier) { + pools->notifier->remove(); + pools->notifier = 0; + } + + /* Release the lock */ + mutex_destroy(&pools->lock); + + /* Release the disk set */ + if (pools->disks) { + pools->disks->flushCollection(); + pools->disks->release(); + pools->disks = 0; + } + + /* Clear the zfs IOService handle */ + if (pools->zfs_hl) { + pools->zfs_hl = 0; + } + + /* Free the pool_name string */ + if (pools->pool_name) { + kmem_free(pools->pool_name, strlen(pools->pool_name) + 1); + pools->pool_name = 0; + } + + /* Clear the pool config list */ + for (pe = pools->pools; pe != NULL; pe = penext) { + /* Clear the vdev list */ + penext = pe->pe_next; + for (ve = pe->pe_vdevs; ve != NULL; ve = venext) { + /* Clear the vdev config list */ + venext = ve->ve_next; + for (ce = ve->ve_configs; ce != NULL; ce = cenext) { + cenext = ce->ce_next; + if (ce->ce_config) + nvlist_free(ce->ce_config); + kmem_free(ce, sizeof (config_entry_t)); + } + kmem_free(ve, sizeof (vdev_entry_t)); + } + kmem_free(pe, sizeof (pool_entry_t)); + } + pools->pools = 0; + + /* Clear the vdev name list */ + for (ne = pools->names; ne != NULL; ne = nenext) { + nenext = ne->ne_next; + if (ne->ne_name) + spa_strfree(ne->ne_name); + kmem_free(ne, sizeof (name_entry_t)); + } + pools->names = 0; + + /* Finally, free the pool list struct */ + kmem_free(pools, sizeof (pool_list_t)); + pools = 0; +} + +void +zfs_boot_fini() +{ + pool_list_t *pools = zfs_boot_pool_list; + + if (!pools) { + printf("%s no pool_list to clear\n", __func__); + return; + } + + /* Set terminating flag */ + if (false == OSCompareAndSwap64(ZFS_BOOT_ACTIVE, + ZFS_BOOT_TERMINATING, &(pools->terminating))) { + printf("%s already terminating? 
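zfs_boot_fini() moves the terminating flag from ZFS_BOOT_ACTIVE to ZFS_BOOT_TERMINATING with a single compare-and-swap so only one caller performs the transition, then signals the import thread, which notices the flag on its next pass and tears everything down through zfs_boot_free(). The same pattern sketched with std::atomic and std::condition_variable standing in for OSCompareAndSwap64 and the kernel condvar (values and names below are placeholders):

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <cstdio>

enum { BOOT_ACTIVE = 1, BOOT_TERMINATING = 2 };

static std::atomic<uint64_t> terminating{BOOT_ACTIVE};
static std::condition_variable cv;

static void
boot_fini(void)
{
    uint64_t expected = BOOT_ACTIVE;
    // Only the caller that wins the CAS performs the transition.
    if (!terminating.compare_exchange_strong(expected, BOOT_TERMINATING))
        printf("already terminating? %llu\n", (unsigned long long)expected);
    cv.notify_one();    // wake the import thread so it re-checks the flag
}

int
main(void)
{
    boot_fini();
    boot_fini();        // second call only reports and re-signals
    return (0);
}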
%llu\n", + __func__, pools->terminating); + } + + /* Wakeup zfs_boot_import_thread */ + cv_signal(&pools->cv); + + /* Clean up */ + pools = 0; +} + +#define kBootUUIDKey "boot-uuid" +#define kBootUUIDMediaKey "boot-uuid-media" + +DSTATIC int +zfs_boot_publish_bootfs(IOService *zfs_hl, pool_list_t *pools) +{ + ZFSDataset *dataset = NULL; + IOMedia *media; + IOService *resourceService = NULL; + OSDictionary *properties = NULL; + spa_t *spa = NULL; + char *zfs_bootfs = NULL; + uint64_t bootfs = 0ULL; + int error, len = ZFS_MAX_DATASET_NAME_LEN; + + dprintf("%s\n", __func__); + if (!zfs_hl || !pools) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + +#if 0 + ZFSPool *pool_proxy = NULL; + if (bootdev) { + dprintf("%s bootdev already set\n", __func__); + return (EBUSY); + } +#endif + + zfs_bootfs = (char *)kmem_alloc(len, KM_SLEEP); + if (!zfs_bootfs) { + printf("%s string alloc failed\n", __func__); + return (ENOMEM); + } + zfs_bootfs[0] = '\0'; + + mutex_enter(&spa_namespace_lock); + spa = spa_next(NULL); + if (spa) { + bootfs = spa_bootfs(spa); + } + if (bootfs == 0) { + mutex_exit(&spa_namespace_lock); + dprintf("%s no bootfs, nothing to do\n", __func__); + kmem_free(zfs_bootfs, len); + return (0); + } + +#if 0 + /* Get pool proxy */ + if (!spa->spa_iokit_proxy || + (pool_proxy = spa->spa_iokit_proxy->proxy) == NULL) { + mutex_exit(&spa_namespace_lock); + dprintf("%s no spa_pool_proxy\n", __func__); + kmem_free(zfs_bootfs, len); + return (0); + } +#endif + + error = dsl_dsobj_to_dsname(spa_name(spa), + spa_bootfs(spa), zfs_bootfs); + mutex_exit(&spa_namespace_lock); + + if (error != 0) { + dprintf("%s bootfs to name failed\n", __func__); + kmem_free(zfs_bootfs, len); + return (ENODEV); + } + + printf("%s: publishing bootfs [%s]\n", __func__, zfs_bootfs); + + /* Create prop dict for the proxy, with 6 or more keys */ + if ((properties = OSDictionary::withCapacity(6)) == NULL) { + dprintf("%s prop dict allocation failed\n", __func__); + kmem_free(zfs_bootfs, len); + return (ENOMEM); + } + + /* Set Content Hint and Content */ + do { + const OSSymbol *partUUID; + + /* ZFS (BF01) partition type */ + if ((partUUID = OSSymbol::withCString( + "6A898CC3-1DD2-11B2-99A6-080020736631")) == NULL) { + dprintf("%s couldn't make partUUID\n", __func__); + break; + // kmem_free(zfs_bootfs, len); + // return (ENOMEM); + } + + /* Assign ZFS partiton UUID to both */ + if (properties->setObject(kIOMediaContentKey, + partUUID) == false || + properties->setObject(kIOMediaContentHintKey, + partUUID) == false) { + dprintf("%s content hint failed\n", __func__); + // kmem_free(zfs_bootfs, len); + // return (ENOMEM); + } + partUUID->release(); + } while (0); + + /* XXX Set dataset name, rdonly, and UUID */ + do { + OSString *nameStr; + OSString *uuidStr; + char uuid_cstr[UUID_PRINTABLE_STRING_LENGTH]; + uuid_t uuid; + + bzero(uuid, sizeof (uuid_t)); + bzero(uuid_cstr, UUID_PRINTABLE_STRING_LENGTH); + + zfs_vfs_uuid_gen(zfs_bootfs, uuid); + zfs_vfs_uuid_unparse(uuid, uuid_cstr); + + nameStr = OSString::withCString(zfs_bootfs); + uuidStr = OSString::withCString(uuid_cstr); + + if (properties->setObject(ZFS_BOOT_DATASET_NAME_KEY, + nameStr) == false || + properties->setObject(ZFS_BOOT_DATASET_UUID_KEY, + uuidStr) == false || + properties->setObject(ZFS_BOOT_DATASET_RDONLY_KEY, + kOSBooleanFalse) == false) { + dprintf("ZFSBootDevice::%s couldn't setup" + "property dict\n", __func__); + nameStr->release(); + uuidStr->release(); + kmem_free(zfs_bootfs, len); + return (ENOMEM); + } + nameStr->release(); + 
uuidStr->release(); + } while (0); + + /* Create proxy device */ + error = zfs_osx_proxy_create(zfs_bootfs); + if (error == 0) { + dataset = zfs_osx_proxy_get(zfs_bootfs); + } + /* Done with this string */ + kmem_free(zfs_bootfs, len); + zfs_bootfs = 0; + + if (!dataset) { + printf("%s: couldn't create proxy device\n", + __func__); + return (ENXIO); + } + + media = OSDynamicCast(IOMedia, dataset); + if (!media) { + printf("%s: couldn't cast proxy media\n", + __func__); + dataset->release(); + return (ENXIO); + } + +#if 0 + bootdev = new ZFSBootDevice; + + if (!bootdev) { + printf("%s: couldn't create boot device\n", __func__); + return (ENOMEM); + } + + if (bootdev->init(properties) == false) { + printf("%s init failed\n", __func__); + properties->release(); + bootdev->release(); + bootdev = 0; + return (ENXIO); + } + properties->release(); + properties = 0; + + if (bootdev->attach(pool_proxy) == false) { + printf("%s attach failed\n", __func__); + bootdev->release(); + bootdev = 0; + return (ENXIO); + } + + /* Technically should start but this doesn't do much */ + if (bootdev->start(pool_proxy) == false) { + printf("%s start failed\n", __func__); + bootdev->detach(pool_proxy); + bootdev->release(); + bootdev = 0; + return (ENXIO); + } + + /* Get matching started */ + bootdev->registerService(kIOServiceAsynchronous); + // bootdev->registerService(kIOServiceSynchronous); + + do { + if (bootdev->getClient() != 0) { + media = OSDynamicCast(IOMedia, + bootdev->getClient()->getClient()); + if (media) { + media->retain(); + break; + } + } + + /* Sleep until media is available */ + /* + * XXX Should use waitForServiceMatching or IONotifier + */ + IOSleep(200); + } while (!media); + + if (!media) { + /* XXX currently unreachable */ + printf("%s couldn't get bootdev media\n", __func__); + return (ENXIO); + } +#endif + + resourceService = IOService::getResourceService(); + if (!resourceService) { + dprintf("%s missing resource service\n", __func__); + /* Handle error */ + media->release(); + return (ENXIO); + } + +#if 1 + /* XXX publish an IOMedia as the BootUUIDMedia resource */ + /* uses same method as AppleFileSystemDriver */ + + /* Set IOMedia UUID */ + /* XXX skip (moved get uuid below) */ + // media->setProperty(kIOMediaUUIDKey, uuid); + /* Publish this IOMedia as the boot-uuid-media */ + IOService::publishResource(kBootUUIDMediaKey, media); + + /* Drop retain from earlier */ + media->release(); + /* Remove boot-uuid key so AppleFileSystem stops matching */ + resourceService->removeProperty(kBootUUIDKey); +#else + OSString *uuid = 0; + /* Get the current boot-uuid string */ + uuid = OSDynamicCast(OSString, + resourceService->getProperty(kBootUUIDKey, gIOServicePlane)); + if (!uuid) { + dprintf("%s missing boot-uuid IOResource\n", __func__); + /* Handle error */ + return (ENXIO); + } + printf("%s: got boot-uuid %s\n", __func__, uuid->getCStringNoCopy()); + + /* XXX Or use below and let AppleFileSystemDriver match it */ + /* Leaves the Apple_Boot content hint (at least for now) */ + media->setProperty(kIOMediaContentHintKey, "Apple_Boot"); + media->setProperty(kIOMediaUUIDKey, uuid); + /* Register for notifications (not matching) */ + media->registerService(kIOServiceAsynchronous); + /* Drop retain from earlier */ + media->release(); +#endif + + printf("%s done\n", __func__); + return (0); +} + +DSTATIC void +zfs_boot_import_thread(void *arg) +{ + nvlist_t *configs, *nv, *newnv; + nvpair_t *elem; + IOService *zfs_hl = 0; + OSSet *disks, *new_set = 0; + OSCollectionIterator *iter = 0; + OSObject 
*next; + IOMedia *media; + pool_list_t *pools = (pool_list_t *)arg; + uint64_t pool_state; + boolean_t pool_imported = B_FALSE; + int error = EINVAL; + + /* Verify pool list coult be cast */ + ASSERT3U(pools, !=, 0); + if (!pools) { + printf("%s %p %s\n", "zfs_boot_import_thread", + arg, "couldn't be cast as pool_list_t*"); + return; + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 1\n", __func__); + goto out_unlocked; + } + + new_set = OSSet::withCapacity(1); + /* To swap with pools->disks while locked */ + if (!new_set) { + dprintf("%s couldn't allocate new_set\n", __func__); + goto out_unlocked; + } + + /* Take pool list lock */ + mutex_enter(&pools->lock); + + zfs_hl = pools->zfs_hl; + + /* Check for work, then sleep on the lock */ + do { + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 2\n", __func__); + goto out_locked; + } + + /* Check for work */ + if (pools->disks->getCount() == 0) { + dprintf("%s no disks to check\n", __func__); + goto next_locked; + } + + /* Swap full set with a new empty one */ + ASSERT3U(new_set, !=, 0); + disks = pools->disks; + pools->disks = new_set; + new_set = 0; + + /* Release pool list lock */ + mutex_exit(&pools->lock); + + /* Create an iterator over the objects in the set */ + iter = OSCollectionIterator::withCollection(disks); + + /* couldn't be initialized */ + if (!iter) { + dprintf("%s %s %d %s\n", "zfs_boot_import_thread", + "couldn't get iterator from collection", + disks->getCount(), "disks skipped"); + + /* Merge disks back into pools->disks */ + mutex_enter(&pools->lock); + pools->disks->merge(disks); + mutex_exit(&pools->lock); + + /* Swap 'disks' back to new_set */ + disks->flushCollection(); + new_set = disks; + disks = 0; + + continue; + } + + /* Iterate over all disks */ + while ((next = iter->getNextObject()) != NULL) { + /* Cast each IOMedia object */ + media = OSDynamicCast(IOMedia, next); + + if (!iter->isValid()) { + /* Oh gosh, need to start over */ + iter->reset(); + continue; + } + + if (!media) { + dprintf("%s couldn't cast IOMedia\n", + __func__); + continue; + } + + /* Check this IOMedia device for a vdev label */ + if (!zfs_boot_probe_disk(pools, media)) { + dprintf("%s couldn't probe disk\n", + __func__); + continue; + } + } + + /* Clean up */ + media = 0; + iter->release(); + iter = 0; + + /* Swap 'disks' back to new_set */ + disks->flushCollection(); + new_set = disks; + disks = 0; + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 3\n", __func__); + goto out_unlocked; + } + + mutex_enter(&pools->lock); + /* Check for work */ + if (pools->disks->getCount() != 0) { + dprintf("%s more disks available, looping\n", __func__); + continue; + } + /* Release pool list lock */ + mutex_exit(&pools->lock); + + /* Generate a list of pool configs to import */ + configs = zfs_boot_get_configs(pools, B_TRUE); + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 4\n", __func__); + goto out_unlocked; + } + + /* Iterate over the nvlists (stored as nvpairs in nvlist) */ + elem = NULL; + while ((elem = nvlist_next_nvpair(configs, + elem)) != NULL) { + /* Cast the nvpair back to nvlist */ + nv = NULL; + verify(nvpair_value_nvlist(elem, &nv) == 0); + + /* Check vdev state */ + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + if (pool_state == POOL_STATE_DESTROYED) { + dprintf("%s skipping destroyed pool\n", + __func__); + continue; + 
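The import thread keeps its time under the pool-list lock short: while locked it swaps the shared disk set for an empty one, then probes the captured set with the lock dropped, so notification callbacks can keep queueing new IOMedia into the fresh set in the meantime. A compact sketch of that swap-and-drain loop with standard C++ primitives (hypothetical names; the real code uses OSSet, mutex_enter and cv_timedwait_sig):

#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

static std::mutex lock;
static std::condition_variable cv;
static std::vector<std::string> disks;     // filled by probe callbacks
static bool terminating = false;

static void
import_thread(void)
{
    std::unique_lock<std::mutex> l(lock);
    while (!terminating) {
        if (disks.empty()) {
            // Sleep until a probe callback signals new work (or timeout).
            cv.wait_for(l, std::chrono::seconds(1));
            continue;
        }
        std::vector<std::string> batch;
        batch.swap(disks);                  // swap under the lock
        l.unlock();
        for (const auto &d : batch)         // probe outside the lock
            printf("probing %s\n", d.c_str());
        l.lock();
    }
}

int
main(void)
{
    std::thread t(import_thread);
    {
        std::lock_guard<std::mutex> g(lock);
        disks.push_back("disk2");
    }
    cv.notify_one();
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    {
        std::lock_guard<std::mutex> g(lock);
        terminating = true;
    }
    cv.notify_one();
    t.join();
    return (0);
}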
} + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 5\n", __func__); + goto out_unlocked; + } + + /* Try import */ + newnv = spa_tryimport(nv); + nvlist_free(nv); + nv = 0; + if (newnv) { + dprintf("%s newnv: %p\n", __func__, newnv); + + /* Stop probing disks */ + if (pools->notifier) + pools->notifier->disable(); + + /* Do import */ + pool_imported = (spa_import(pools->pool_name, + newnv, 0, 0) == 0); + nvlist_free(newnv); + newnv = 0; + // pool_imported = spa_import_rootpool(nv); + } else { + dprintf("%s no newnv returned\n", __func__); + } + + dprintf("%s spa_import returned %d\n", __func__, + pool_imported); + + if (pool_imported) { + /* Get bootfs and publish IOMedia */ + error = zfs_boot_publish_bootfs(zfs_hl, pools); + if (error != 0) { + dprintf("%s publish bootfs error %d\n", + __func__, error); + } + + goto out_unlocked; + } else { + /* Resume notifications */ + if (pools->notifier) + pools->notifier->enable(true); + } + } + + /* Retake pool list lock */ + mutex_enter(&pools->lock); + +next_locked: + /* Check for work */ + if (pools->disks->getCount() != 0) { + continue; + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 6\n", __func__); + goto out_locked; + } + + dprintf("%s sleeping on lock\n", __func__); + /* Sleep on lock, thread is resumed with lock held */ + cv_timedwait_sig(&pools->cv, &pools->lock, + ddi_get_lbolt() + hz); + + /* Loop forever */ + } while (true); + +out_locked: + /* Unlock pool list lock */ + mutex_exit(&pools->lock); + +out_unlocked: + /* Cleanup new_set */ + if (new_set) { + new_set->flushCollection(); + new_set->release(); + new_set = 0; + } + + /* Teardown pool list, lock, etc */ + zfs_boot_free(); + + return; /* taskq_dispatch */ +#if 0 + thread_exit(); /* thread_create */ +#endif +} + +DSTATIC bool +zfs_boot_check_mountroot(char **pool_name, uint64_t *pool_guid) +{ + /* + * Check if the kext is loading during early boot + * and/or check if root is mounted (IORegistry?) + * Use PE Boot Args to determine the root pool name. + */ + char *zfs_boot; + char *split; + uint64_t len; + bool result = false; + uint64_t uptime = 0; + + + if (!pool_name || !pool_guid) { + dprintf("%s %s\n", __func__, + "invalid pool_name or pool_guid ptr"); + return (false); + } + + /* XXX Ugly hack to determine if this is early boot */ + /* + * Could just check if boot-uuid (or rd= or rootdev=) + * are set, and abort otherwise + * IOResource "boot-uuid" only published before root is + * mounted, or "boot-uuid-media" once discovered + */ + clock_get_uptime(&uptime); /* uptime since boot in nanoseconds */ + dprintf("%s uptime: %llu\n", __func__, uptime); + + /* 3 billion nanoseconds ~= 3 seconds */ + // if (uptime >= 3LLU<<30) { + /* 60 billion nanoseconds ~= 60 seconds */ + if (uptime >= 7LLU<<33) { + dprintf("%s %s\n", __func__, "Already booted"); + /* + * Start the getrootdir() from working, the vfs_start() call + * isn't called until first mount, which is too late for + * spa_async_dispatch(). 
+ */ + return (false); + } else { + dprintf("%s %s\n", __func__, "Boot time"); + } + + zfs_boot = (char *) kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (!zfs_boot) { + dprintf("%s couldn't allocate zfs_boot\n", __func__); + return (false); + } + + result = PE_parse_boot_argn("zfs_boot", zfs_boot, + ZFS_MAX_DATASET_NAME_LEN); + // dprintf( "Raw zfs_boot: [%llu] {%s}\n", + // (uint64_t)strlen(zfs_boot), zfs_boot); + + result = (result && (zfs_boot != 0) && strlen(zfs_boot) > 0); + + if (!result) { + result = PE_parse_boot_argn("rd", zfs_boot, + MAXPATHLEN); + result = (result && (zfs_boot != 0) && + strlen(zfs_boot) > 0 && + strncmp(zfs_boot, "zfs:", 4)); + // dprintf("Raw rd: [%llu] {%s}\n", + // (uint64_t)strlen(zfs_boot), zfs_boot ); + } + if (!result) { + result = PE_parse_boot_argn("rootdev", zfs_boot, + MAXPATHLEN); + result = (result && (zfs_boot != 0) && + strlen(zfs_boot) > 0 && + strncmp(zfs_boot, "zfs:", 4)); + // dprintf("Raw rootdev: [%llu] {%s}\n", + // (uint64_t)strlen(zfs_boot), zfs_boot ); + } + + /* + * XXX To Do - parse zpool_guid boot arg + */ + *pool_guid = 0; + + if (result) { + /* Check for first slash in zfs_boot */ + split = strchr(zfs_boot, '/'); + if (split) { + /* copy pool name up to first slash */ + len = (split - zfs_boot); + } else { + /* or copy whole string */ + len = strlen(zfs_boot); + } + + *pool_name = (char *) kmem_alloc(len+1, KM_SLEEP); + snprintf(*pool_name, len+1, "%s", zfs_boot); + + dprintf("Got zfs_boot: [%llu] {%s}->{%s}\n", + *pool_guid, zfs_boot, *pool_name); + } else { + dprintf("%s\n", "No zfs_boot\n"); + pool_name = 0; + } + + kmem_free(zfs_boot, ZFS_MAX_DATASET_NAME_LEN); + zfs_boot = 0; + return (result); +} + +bool +zfs_boot_init(IOService *zfs_hl) +{ + IONotifier *notifier = 0; + pool_list_t *pools = 0; + char *pool_name = 0; + uint64_t pool_guid = 0; + + zfs_boot_pool_list = 0; + + if (!zfs_hl) { + dprintf("%s: No zfs_hl provided\n", __func__); + return (false); + } + + if (!zfs_boot_check_mountroot(&pool_name, &pool_guid) || + (!pool_name && pool_guid == 0)) { + /* + * kext is not being loaded during early-boot, + * or no pool is specified for import. 
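The zfs_boot boot-arg names the root dataset (for example pool/ROOT/dataset); only the text before the first '/' becomes the pool name handed to the import thread, and the dataset itself is resolved later from the pool's bootfs property. The split in a standalone sketch (same strchr logic, minus the kernel allocators):

#include <cstdio>
#include <cstring>
#include <string>

static std::string
pool_name_from_bootarg(const char *zfs_boot)
{
    const char *split = strchr(zfs_boot, '/');
    size_t len = split ? (size_t)(split - zfs_boot) : strlen(zfs_boot);
    return (std::string(zfs_boot, len));
}

int
main(void)
{
    printf("%s\n", pool_name_from_bootarg("tank/ROOT/macos").c_str());  // tank
    printf("%s\n", pool_name_from_bootarg("tank").c_str());             // tank
    return (0);
}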
+ */ + dprintf("%s: check failed\n", __func__); + return (true); + } + + pools = (pool_list_t *) kmem_alloc(sizeof (pool_list_t), + KM_SLEEP); + if (!pools) { + goto error; + } + bzero(pools, sizeof (pool_list_t)); + + if ((pools->disks = OSSet::withCapacity( + ZFS_BOOT_PREALLOC_SET)) == NULL) { + /* Fail if memory couldn't be allocated */ + goto error; + } + pools->terminating = ZFS_BOOT_ACTIVE; + pools->pools = 0; + pools->names = 0; + pools->pool_guid = pool_guid; + pools->pool_name = pool_name; + pools->zfs_hl = zfs_hl; + + notifier = IOService::addMatchingNotification( + gIOFirstPublishNotification, IOService::serviceMatching( + "IOMediaBSDClient"), zfs_boot_probe_media, + zfs_hl, pools, 0); + + if (!notifier) { + /* Fail if memory couldn't be allocated */ + goto error; + } + pools->notifier = notifier; + + mutex_init(&pools->lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pools->cv, NULL, CV_DEFAULT, NULL); + + /* Finally, start the import thread */ + taskq_dispatch(system_taskq, zfs_boot_import_thread, + (void*)pools, TQ_SLEEP); +#if 0 +/* Alternate method of scheduling the import thread */ + (void) thread_create(NULL, 0, zfs_boot_import_thread, + pools, 0, &p0, + TS_RUN, minclsyspri); +#endif + + zfs_boot_pool_list = pools; + + return (true); + +error: + if (pools) { + if (pools->disks) { + pools->disks->flushCollection(); + pools->disks->release(); + pools->disks = 0; + } + kmem_free(pools, sizeof (pool_list_t)); + pools = 0; + } + return (false); +} + +/* Include these functions in all builds */ + +/* + * zfs_boot_update_bootinfo_vdev_leaf + * Inputs: spa: valid pool spa pointer. vd: valid vdev pointer. + * Return: 0 on success, positive integer errno on failure. + * Callers: zfs_boot_update_bootinfo_vdev + * + * called by bootinfo_vdev with each leaf vdev. 
+ */ +DSTATIC int +zfs_boot_update_bootinfo_vdev_leaf(OSArray *array, vdev_t *vd) +{ + OSDictionary *dict; + OSString *dev_str; + OSNumber *dev_size; + vdev_disk_t *dvd; + struct io_bootinfo *info; + int error; + + /* Validate inputs */ + if (!array || !vd) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* Should be called with leaf vdev */ + if (!vd->vdev_ops->vdev_op_leaf) { + dprintf("%s not a leaf vdev\n", __func__); + return (EINVAL); + } + + /* Skip hole vdevs */ + if (vd->vdev_ishole) { + dprintf("%s skipping hole in namespace\n", __func__); + return (0); + } + + /* No info available if missing */ + if (strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_MISSING) == 0) { + dprintf("%s skipping missing vdev\n", __func__); + return (0); + } + + /* Must be a disk, not a file */ + if (strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { + dprintf("%s skipping non-disk vdev\n", __func__); + return (0); + } + + /* Skip obviously non-bootable vdevs */ + if (vd->vdev_islog || + vd->vdev_isl2cache || vd->vdev_isspare) { + dprintf("%s skipping non-bootable\n", __func__); + return (0); + } + + /* Get vdev type-specific data */ + dvd = (vdev_disk_t *)vd->vdev_tsd; + if (!dvd || !dvd->vd_lh) { + dprintf("%s missing dvd or ldi handle\n", __func__); + return (0); + } + + /* Allocate an ldi io_bootinfo struct */ + info = (struct io_bootinfo *)kmem_alloc( + sizeof (struct io_bootinfo), KM_SLEEP); + if (!info) { + dprintf("%s info alloc failed\n", __func__); + return (ENOMEM); + } + bzero(info, sizeof (struct io_bootinfo)); + + /* Ask the vdev handle to fill in the info */ + error = ldi_ioctl(dvd->vd_lh, DKIOCGETBOOTINFO, + (intptr_t)info, 0, 0, 0); + if (error != 0) { + dprintf("%s ioctl error %d\n", __func__, error); + kmem_free(info, sizeof (struct io_bootinfo)); + return (EIO); + } + + /* Allocate dictionary to hold the keys */ + if ((dict = OSDictionary::withCapacity(2)) == NULL) { + dprintf("%s dictionary alloc failed\n", __func__); + kmem_free(info, sizeof (struct io_bootinfo)); + return (ENOMEM); + } + + /* Keys are path (string) and size (number) */ + dev_str = OSString::withCString(info->dev_path); + dev_size = OSNumber::withNumber(info->dev_size, + (8 * sizeof (info->dev_size))); + kmem_free(info, sizeof (struct io_bootinfo)); + info = 0; + + /* Add keys to dictionary or bail */ + if (!dev_str || !dev_size || + dict->setObject(kIOBootDevicePathKey, + dev_str) == false || + dict->setObject(kIOBootDeviceSizeKey, + dev_size) == false) { + dprintf("%s dictionary setup failed\n", __func__); + if (dev_str) dev_str->release(); + if (dev_size) dev_size->release(); + dict->release(); + dict = 0; + return (ENOMEM); + } + dev_str->release(); + dev_str = 0; + dev_size->release(); + dev_size = 0; + + /* Add dict to array */ + if (array->setObject(dict) == false) { + dprintf("%s couldn't set bootinfo\n", __func__); + dict->release(); + dict = 0; + return (ENOMEM); + } + dict->release(); + dict = 0; + + return (0); +} + +/* + * zfs_boot_update_bootinfo_vdev + * Inputs: spa: valid pool spa pointer. vd: valid vdev pointer. + * Return: 0 on success, positive integer errno on failure. + * Callers: zfs_boot_update_bootinfo + * + * called by bootinfo with root vdev, and recursively calls + * itself while iterating over children (vdevs only have a + * few levels of nesting at most). 
+ */ +DSTATIC int +zfs_boot_update_bootinfo_vdev(OSArray *array, vdev_t *vd) +{ + int c, error; + + /* Validate inputs */ + if (!array || !vd) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* Skip obviously non-bootable vdevs */ + if (vd->vdev_islog || + vd->vdev_isl2cache || vd->vdev_isspare) { + dprintf("%s skipping non-bootable\n", __func__); + return (0); + } + + /* Process leaf vdevs */ + if (vd->vdev_ops->vdev_op_leaf) { + error = zfs_boot_update_bootinfo_vdev_leaf(array, vd); + if (error) + dprintf("%s bootinfo_vdev_leaf error %d\n", + __func__, error); + return (error); + } + + /* Iterate over child vdevs */ + for (c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] == NULL) { + dprintf("%s hole in vdev namespace\n", __func__); + continue; + } + + /* Recursion */ + error = zfs_boot_update_bootinfo_vdev(array, + vd->vdev_child[c]); + if (error != 0) { + dprintf("%s bootinfo_vdev_leaf error %d\n", + __func__, error); + return (error); + } + } + + return (0); +} + +extern "C" { + +/* + * zfs_boot_update_bootinfo + * Inputs: spa: valid pool spa pointer. + * Return: 0 on success, positive integer errno on failure. + * Callers: spa_open_common, spa_vdev_add, spa_vdev_remove, + * spa_vdev_attach, spa_vdev_detach. + * + * Called from spa.c on changes to the vdev layout. This + * information is assigned to the pool proxy so all zvols + * and datasets will retrieve the property through IOKit + * since it is retrieved via recursion. + * (see bless-105/Misc/BLCreateBooterInformationDictionary.c). + * If IOBootDevice property is needed for each dataset and + * zvol, we can revisit this and assign/update on all of + * these (already implemented a prototype that worked fine). + * + * Note: bootinfo is only collected for data vdevs. + * XXX We only want boot helpers there, unless there is a + * compelling argument for log, cache, or spares having + * boot helpers. + */ +int +zfs_boot_update_bootinfo(spa_t *spa) +{ + ZFSPool *pool_proxy; + OSArray *array; + int error; + + if (!spa) { + dprintf("%s missing spa\n", __func__); + return (EINVAL); + } + + /* XXX Could count vdevs first? 
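zfs_boot_update_bootinfo_vdev() walks the vdev tree, pruning log, cache and spare vdevs, and the leaf helper records one path and size pair per disk (obtained through the DKIOCGETBOOTINFO ioctl in the real code). The traversal shape in plain C++, with a toy vdev struct instead of vdev_t and a vector of pairs instead of the OSArray of OSDictionaries (all names here are illustrative):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct toy_vdev {
    bool leaf, islog, iscache, isspare;
    std::string path;                  // meaningful for leaves only
    uint64_t size;
    std::vector<toy_vdev> children;
};

struct bootinfo { std::string path; uint64_t size; };

static void
collect_bootinfo(const toy_vdev &vd, std::vector<bootinfo> &out)
{
    if (vd.islog || vd.iscache || vd.isspare)
        return;                                 // not bootable, skip subtree
    if (vd.leaf) {
        out.push_back({vd.path, vd.size});      // leaf: record path + size
        return;
    }
    for (const auto &child : vd.children)
        collect_bootinfo(child, out);           // interior: recurse
}

int
main(void)
{
    toy_vdev disk2{true, false, false, false, "/dev/disk2", 1ULL << 34, {}};
    toy_vdev disk3{true, false, false, false, "/dev/disk3", 1ULL << 34, {}};
    toy_vdev slog{true, true, false, false, "/dev/disk4", 1ULL << 33, {}};  // log, skipped
    toy_vdev mirror{false, false, false, false, "", 0, {disk2, disk3}};
    toy_vdev root{false, false, false, false, "", 0, {mirror, slog}};

    std::vector<bootinfo> info;
    collect_bootinfo(root, info);
    for (const auto &b : info)
        printf("%s %llu bytes\n", b.path.c_str(), (unsigned long long)b.size);
    return (0);
}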
*/ + if ((array = OSArray::withCapacity(1)) == NULL) { + dprintf("%s allocation failed\n", __func__); + return (ENOMEM); + } + + /* Grab necessary locks */ + mutex_enter(&spa_namespace_lock); + spa_open_ref(spa, FTAG); + + /* Get pool proxy */ + if (!spa->spa_iokit_proxy || + (pool_proxy = spa->spa_iokit_proxy->proxy) == NULL) { + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + dprintf("%s no spa_pool_proxy\n", __func__); + return (0); + } + /* Avoid it disappearing from under us */ + pool_proxy->retain(); + + /* Don't need to hold this throughout */ + mutex_exit(&spa_namespace_lock); + + /* vdev state lock only requires an spa open ref */ + spa_vdev_state_enter(spa, SCL_NONE); + + /* Iterate over all vdevs */ + if ((error = zfs_boot_update_bootinfo_vdev(array, + spa->spa_root_vdev)) != 0) { + dprintf("%s bootinfo_vdev error %d\n", + __func__, error); + + /* Drop locks */ + (void) spa_vdev_state_exit(spa, NULL, 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + array->release(); + pool_proxy->release(); + return (error); + } + + /* Release locks, passing NULL vd (no change) */ + error = spa_vdev_state_exit(spa, NULL, 0); + if (error != 0) { + dprintf("%s spa_vdev_state_exit error %d\n", + __func__, error); + } + + /* setProperty adds a retain */ + pool_proxy->setProperty(kIOBootDeviceKey, array); + pool_proxy->release(); + array->release(); + + /* Drop locks */ + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + return (0); +} + +} /* extern "C" */ + +#if 0 +#ifdef ZFS_BOOT +/* Remainder only needed for boot */ + +#define DPRINTF_FUNC() dprintf("%s\n", __func__) + +#pragma mark - ZFSBootDevice + +OSDefineMetaClassAndStructors(ZFSBootDevice, IOBlockStorageDevice); +char ZFSBootDevice::vendorString[4] = "ZFS"; +char ZFSBootDevice::revisionString[4] = "0.1"; +char ZFSBootDevice::infoString[12] = "ZFS dataset"; + +#if 0 +int +zfs_boot_get_path(char *path, int len) +{ + OSString *disk = 0; + + if (!path || len == 0) { + dprintf("%s: invalid argument\n", __func__); + return (-1); + } + + if (bootdev) { + disk = OSDynamicCast(OSString, + bootdev->getProperty(kIOBSDNameKey, gIOServicePlane, + kIORegistryIterateRecursively)); + } + + if (disk) { + snprintf(path, len, "/dev/%s", disk->getCStringNoCopy()); + return (0); + } + + return (-1); +} +#endif + +bool +ZFSBootDevice::init(OSDictionary *properties) +{ + /* Allocate dictionaries and symbols */ + OSDictionary *pdict = OSDictionary::withCapacity(2); + OSDictionary *ddict = OSDictionary::withCapacity(4); + const OSSymbol *virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + const OSSymbol *ramSymbol = OSSymbol::withCString( + kIOPropertyInterconnectRAMKey); + const OSSymbol *ssdSymbol = OSSymbol::withCString( + kIOPropertyMediumTypeSolidStateKey); + OSNumber *physSize = OSNumber::withNumber((uint32_t)4096, 32); + OSNumber *logSize = OSNumber::withNumber((uint32_t)512, 32); + const OSSymbol *vendorSymbol = 0; + const OSSymbol *revisionSymbol = 0; + const OSSymbol *blankSymbol = 0; + OSBoolean *rdonly = 0; + OSString *str = 0; + const char *cstr = 0; + bool ret = false; + + DPRINTF_FUNC(); + + /* Validate allocations */ + if (!pdict || !ddict || !virtualSymbol || !ramSymbol || + !ssdSymbol || !physSize || !logSize) { + dprintf("ZFSBootDevice::%s allocation failed\n", __func__); + goto error; + } + + /* Init class statics every time an instance inits */ + /* Shared across instances, but doesn't hurt to reprint 
*/ + snprintf(vendorString, strlen("ZFS")+1, "ZFS"); + snprintf(revisionString, strlen("0.1")+1, "0.1"); + snprintf(infoString, strlen("ZFS dataset")+1, "ZFS dataset"); + + /* For IORegistry keys, cache OSSymbols for class statics */ + /* Leverages OSSymbol cahce pool to reuse across instances */ + vendorSymbol = OSSymbol::withCString(vendorString); + revisionSymbol = OSSymbol::withCString(revisionString); + blankSymbol = OSSymbol::withCString(""); + if (!vendorSymbol || !revisionSymbol || !blankSymbol) { + dprintf("ZFSBootDevice::%s class symbols failed\n", __func__); + goto error; + } + + /* Call super init */ + if (IOBlockStorageDevice::init(properties) == false) { + dprintf("ZFSBootDevice::%s device init failed\n", __func__); + goto error; + } + + /* Set class private vars */ + productString = NULL; + isReadOnly = false; // XXX should really be true initially + + /* Set Protocol Characteristics */ + if (pdict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + ramSymbol) == false || + pdict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol) == false) { + dprintf("%s pdict set properties failed\n", __func__); + goto error; + } + setProperty(kIOPropertyProtocolCharacteristicsKey, pdict); + + /* Set Device Characteristics */ + if (ddict->setObject(kIOPropertyVendorNameKey, + vendorSymbol) == false || + ddict->setObject(kIOPropertyProductRevisionLevelKey, + revisionSymbol) == false || + ddict->setObject(kIOPropertyProductSerialNumberKey, + blankSymbol) == false || + ddict->setObject(kIOPropertyPhysicalBlockSizeKey, + physSize) == false || + ddict->setObject(kIOPropertyLogicalBlockSizeKey, + logSize) == false || + ddict->setObject(kIOPropertyMediumTypeKey, + ssdSymbol) == false) { + dprintf("%s ddict set properties failed\n", __func__); + goto error; + } + setProperty(kIOPropertyDeviceCharacteristicsKey, ddict); + + /* Check for passed in readonly status */ + if (properties && (rdonly = OSDynamicCast(OSBoolean, + properties->getObject(ZFS_BOOT_DATASET_RDONLY_KEY))) != NULL) { + /* Got the boolean */ + isReadOnly = rdonly->getValue(); + dprintf("ZFSBootDevice %s set %s\n", __func__, + (isReadOnly ? 
"readonly" : "readwrite")); + } + + /* Check for passed in dataset UUID */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(ZFS_BOOT_DATASET_UUID_KEY))) != NULL && + (cstr = str->getCStringNoCopy()) != NULL) { + /* Got the string, try to set UUID */ + str->retain(); + if (ddict->setObject("Dataset UUID", str) == false) { + dprintf("ZFSBootDevice::%s failed UUID [%s]\n", + __func__, cstr); + str->release(); + goto error; + } + dprintf("ZFSBootDevice::%s set UUID [%s]\n", + __func__, cstr); + str->release(); + } + + /* Check for passed in dataset name */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(ZFS_BOOT_DATASET_NAME_KEY))) != NULL && + (cstr = str->getCStringNoCopy()) != NULL) { + /* Got the string, try to set name */ + str->retain(); + if (setDatasetName(cstr) == false) { + /* Unlikely */ + dprintf("ZFSBootDevice %s couldn't setup dataset" + " name property [%s]\n", __func__, cstr); + str->release(); + goto error; + } + + dprintf("ZFSBootDevice %s set dataset name [%s]\n", + __func__, cstr); + str->release(); + } else { + if (setDatasetName("invalid") == false) { + dprintf("ZFSBootDevice::%s setDatasetName failed\n", + __func__); + goto error; + } + dprintf("ZFSBootDevice %s set name [invalid]\n", __func__); + } + + /* Success */ + ret = true; + +error: + if (pdict) pdict->release(); + if (ddict) ddict->release(); + if (virtualSymbol) virtualSymbol->release(); + if (ramSymbol) ramSymbol->release(); + if (ssdSymbol) ssdSymbol->release(); + if (physSize) physSize->release(); + if (logSize) logSize->release(); + if (vendorSymbol) vendorSymbol->release(); + if (revisionSymbol) revisionSymbol->release(); + if (blankSymbol) blankSymbol->release(); + return (ret); +} + +void +ZFSBootDevice::free() +{ + char *pstring = (char *)productString; + productString = 0; + + if (pstring) kmem_free(pstring, strlen(pstring) + 1); + + IOBlockStorageDevice::free(); +} + +#if 0 +bool +ZFSBootDevice::attach(IOService *provider) +{ + DPRINTF_FUNC(); + // return (IOMedia::attach(provider)); + return (IOBlockStorageDevice::attach(provider)); +} + +void +ZFSBootDevice::detach(IOService *provider) +{ + DPRINTF_FUNC(); + // IOMedia::detach(provider); + IOBlockStorageDevice::detach(provider); +} + +bool +ZFSBootDevice::start(IOService *provider) +{ + DPRINTF_FUNC(); + // return (IOMedia::start(provider)); + return (IOBlockStorageDevice::start(provider)); +} + +void +ZFSBootDevice::stop(IOService *provider) +{ + DPRINTF_FUNC(); + // IOMedia::stop(provider); + IOBlockStorageDevice::stop(provider); +} + +IOService* +ZFSBootDevice::probe(IOService *provider, SInt32 *score) +{ + DPRINTF_FUNC(); + // return (IOMedia::probe(provider, score)); + return (IOBlockStorageDevice::probe(provider, score)); +} +#endif + +IOReturn +ZFSBootDevice::doSynchronizeCache(void) +{ + dprintf("ZFSBootDevice %s\n", __func__); + return (kIOReturnSuccess); +} + +IOReturn +ZFSBootDevice::doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + char zero[ZFS_BOOT_DEV_BSIZE]; + size_t len, cur, off = 0; + + DPRINTF_FUNC(); + + if (!buffer) { + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* Read vs. write */ + if (buffer->getDirection() == kIODirectionIn) { + /* Zero the read buffer */ + bzero(zero, ZFS_BOOT_DEV_BSIZE); + len = buffer->getLength(); + while (len > 0) { + cur = (len > ZFS_BOOT_DEV_BSIZE ? 
+ ZFS_BOOT_DEV_BSIZE : len); + buffer->writeBytes(/* offset */ off, + /* buf */ zero, /* length */ cur); + off += cur; + len -= cur; + } + // dprintf("%s: read: %llu %llu\n", + // __func__, block, nblks); + IOStorage::complete(completion, kIOReturnSuccess, + buffer->getLength()); + return (kIOReturnSuccess); + } + + if (buffer->getDirection() != kIODirectionOut) { + dprintf("%s invalid direction %d\n", __func__, + buffer->getDirection()); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* + * XXX For now this just returns error for all writes. + * If it turns out that mountroot/bdevvp try to + * verify writable status by reading a block and writing + * it back to disk, lie and say it succeeded. + */ + dprintf("%s: write: %llu %llu\n", __func__, block, nblks); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); +} + +IOReturn +ZFSBootDevice::doEjectMedia() +{ + DPRINTF_FUNC(); + /* XXX Called at shutdown, maybe return success? */ + return (kIOReturnError); +} + +IOReturn +ZFSBootDevice::doFormatMedia(UInt64 byteCapacity) +{ + DPRINTF_FUNC(); + /* XXX shouldn't need it */ + return (kIOReturnError); + // return (kIOReturnSuccess); +} + +UInt32 +ZFSBootDevice::doGetFormatCapacities(UInt64 *capacities, + UInt32 capacitiesMaxCount) const +{ + DPRINTF_FUNC(); + if (capacities && capacitiesMaxCount > 0) { + capacities[0] = (ZFS_BOOT_DEV_BSIZE * ZFS_BOOT_DEV_BCOUNT); + dprintf("ZFSBootDevice %s: capacity %llu\n", + __func__, capacities[0]); + } + + /* Always inform caller of capacity count */ + return (1); +} + +/* Assign dataset name from null-terminated string */ +bool +ZFSBootDevice::setDatasetName(const char *dsname) +{ + OSDictionary *dict; + OSString *dsstr; + char *newname, *oldname; + size_t len; + + DPRINTF_FUNC(); + + /* Validate arguments */ + if (!dsname || (len = strnlen(dsname, + ZFS_MAX_DATASET_NAME_LEN)) == 0) { + dprintf("%s: missing argument\n", __func__); + return (false); + } + + /* Truncate too-long names (shouldn't happen) */ + if (len == ZFS_MAX_DATASET_NAME_LEN && + dsname[ZFS_MAX_DATASET_NAME_LEN] != '\0') { + dprintf("%s: dsname too long [%s]\n", + __func__, dsname); + /* XXX Just truncate the name */ + len--; + } + + /* Allocate room for name plus null char */ + newname = (char *)kmem_alloc(len+1, KM_SLEEP); + if (!newname) { + dprintf("ZFSBootDevice::%s string alloc failed\n", __func__); + return (false); + } + snprintf(newname, len+1, "%s", dsname); + newname[len] = '\0'; /* just in case */ + + /* Save an OSString copy for IORegistry */ + dsstr = OSString::withCString(newname); + if (!dsstr) { + dprintf("ZFSBootDevice::%s OSString failed\n", __func__); + kmem_free(newname, len+1); + return (false); + } + + /* Swap into class private var */ + oldname = productString; + productString = newname; + newname = 0; + if (oldname) { + kmem_free(oldname, strlen(oldname)+1); + oldname = 0; + } + + /* Get and clone device characteristics prop dict */ + if ((dict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey))) == NULL || + (dict = OSDictionary::withDictionary(dict)) == NULL) { + dprintf("%s couldn't clone prop dict\n", __func__); + /* Should only happen during initialization */ + } + + if (dict) { + /* Copy string, add to dictionary, and replace prop dict */ + if (dict->setObject(kIOPropertyProductNameKey, + dsstr) == false || + setProperty(kIOPropertyDeviceCharacteristicsKey, + dict) == false) { + dprintf("%s couldn't set name\n", __func__); + dsstr->release(); + 
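In the (currently disabled) ZFSBootDevice path above, reads from the fake boot media are satisfied by zero-filling the caller's buffer one ZFS_BOOT_DEV_BSIZE block at a time, while writes are refused. Just the chunking arithmetic, as a runnable sketch that memcpy's into a byte array instead of calling IOMemoryDescriptor::writeBytes (the 512-byte block size here is only a stand-in):

#include <cstdio>
#include <cstring>

static const size_t DEV_BSIZE = 512;          // stands in for ZFS_BOOT_DEV_BSIZE

static void
zero_fill(unsigned char *buf, size_t buflen)
{
    unsigned char zero[DEV_BSIZE] = { 0 };
    size_t off = 0, len = buflen;

    while (len > 0) {
        size_t cur = (len > DEV_BSIZE ? DEV_BSIZE : len);
        memcpy(buf + off, zero, cur);         // one block (or tail) at a time
        off += cur;
        len -= cur;
    }
}

int
main(void)
{
    unsigned char buf[1300];
    memset(buf, 0xff, sizeof (buf));
    zero_fill(buf, sizeof (buf));
    printf("last byte: %d\n", (int)buf[sizeof (buf) - 1]);   // 0
    return (0);
}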
dict->release(); + return (false); + } + dict->release(); + dict = 0; + } + + /* Finally, set the IORegistryEntry/IOService name */ + setName(dsstr->getCStringNoCopy()); + dsstr->release(); + + return (true); +} + +/* Returns full dataset name from instance private var */ +char * +ZFSBootDevice::getProductString() +{ + dprintf("ZFSBootDevice %s [%s]\n", productString); + /* Return class private string */ + return (productString); +} + +/* Returns readonly status from instance private var */ +IOReturn +ZFSBootDevice::reportWriteProtection(bool *isWriteProtected) +{ + DPRINTF_FUNC(); + if (isWriteProtected) *isWriteProtected = isReadOnly; + return (kIOReturnSuccess); +} + +/* These return class static string for all instances */ +char * +ZFSBootDevice::getVendorString() +{ + dprintf("ZFSBootDevice %s [%s]\n", vendorString); + /* Return class static string */ + return (vendorString); +} +char * +ZFSBootDevice::getRevisionString() +{ + dprintf("ZFSBootDevice %s [%s]\n", revisionString); + /* Return class static string */ + return (revisionString); +} +char * +ZFSBootDevice::getAdditionalDeviceInfoString() +{ + dprintf("ZFSBootDevice %s [%s]\n", infoString); + /* Return class static string */ + return (infoString); +} + +/* Always return media present and unchanged */ +IOReturn +ZFSBootDevice::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + DPRINTF_FUNC(); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +/* Always report nonremovable and nonejectable */ +IOReturn +ZFSBootDevice::reportRemovability(bool *isRemoveable) +{ + DPRINTF_FUNC(); + if (isRemoveable) *isRemoveable = false; + return (kIOReturnSuccess); +} +IOReturn +ZFSBootDevice::reportEjectability(bool *isEjectable) +{ + DPRINTF_FUNC(); + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* Always report 512b blocksize */ +IOReturn +ZFSBootDevice::reportBlockSize(UInt64 *blockSize) +{ + DPRINTF_FUNC(); + if (!blockSize) + return (kIOReturnError); + + *blockSize = ZFS_BOOT_DEV_BSIZE; + return (kIOReturnSuccess); +} + +/* XXX Calculate from dev_bcount, should get size from objset */ +/* XXX Can issue message kIOMessageMediaParametersHaveChanged to update */ +IOReturn +ZFSBootDevice::reportMaxValidBlock(UInt64 *maxBlock) +{ + DPRINTF_FUNC(); + if (!maxBlock) + return (kIOReturnError); + + // *maxBlock = 0; + *maxBlock = ZFS_BOOT_DEV_BCOUNT - 1; + dprintf("ZFSBootDevice %s: maxBlock %llu\n", __func__, *maxBlock); + + return (kIOReturnSuccess); +} +#endif /* ZFS_BOOT */ +#endif /* 0 */ diff --git a/module/os/macos/zfs/zfs_ctldir.c b/module/os/macos/zfs/zfs_ctldir.c new file mode 100644 index 0000000000..87bc56344c --- /dev/null +++ b/module/os/macos/zfs/zfs_ctldir.c @@ -0,0 +1,1519 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri + * Brian Behlendorf + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2018 George Melikov. All Rights Reserved. + * Copyright (c) 2019 Datto, Inc. All rights reserved. + * Copyright (c) 2020 Jorgen Lundman. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' and 'shares' directory, but this may + * expand in the future. The elements are built dynamically, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by an user mode helper which invokes + * the mount procedure. Unmounts are handled by allowing the mount + * point to expire so the kernel may automatically unmount it. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same + * zfsvfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the '.zfs/snapshot/' paths + * (ie: snapshots) are complete ZFS filesystems and have their own unique + * zfsvfs_t. However, the fsid reported by these mounts will be the same + * as that used by the parent zfsvfs_t to make NFS happy. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" + +extern kmem_cache_t *znode_cache; +extern uint64_t vnop_num_vnodes; + +/* + * Apple differences; + * + * We don't have 'shares' directory, so only 'snapshot' is relevant. + * + * We can not issue mount from kernel, so involve zed. + * - see zfs_ctldir_snapdir.c + * + * All vnodes point to znode_t, no special case nodes. 
+ */ + +/* List of zfsctl mounts waiting to be mounted */ +static kmutex_t zfsctl_mounts_lock; +static list_t zfsctl_mounts_list; +struct zfsctl_mounts_waiting { + kmutex_t zcm_lock; + kcondvar_t zcm_cv; + list_node_t zcm_node; + char zcm_name[ZFS_MAX_DATASET_NAME_LEN]; +}; +typedef struct zfsctl_mounts_waiting zfsctl_mounts_waiting_t; + + +/* + * Control Directory Tunables (.zfs) + */ +int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; +int zfs_admin_snapshot = 1; +int zfs_auto_snapshot = 1; + +static kmutex_t zfsctl_unmount_lock; +static kcondvar_t zfsctl_unmount_cv; +static boolean_t zfsctl_unmount_thread_exit; + +static kmutex_t zfsctl_unmount_list_lock; +static list_t zfsctl_unmount_list; + +struct zfsctl_unmount_delay { + char *se_name; /* full snapshot name */ + spa_t *se_spa; /* pool spa */ + uint64_t se_objsetid; /* snapshot objset id */ + time_t se_time; + list_node_t se_nodelink; +}; +typedef struct zfsctl_unmount_delay zfsctl_unmount_delay_t; + + +/* + * Check if the given vnode is a part of the virtual .zfs directory. + */ +boolean_t +zfsctl_is_node(struct vnode *ip) +{ + return (ITOZ(ip)->z_is_ctldir); +} + +typedef int (**vnode_operations)(void *); + + +/* + * Allocate a new vnode with the passed id and ops. + */ +static struct vnode * +zfsctl_vnode_alloc(zfsvfs_t *zfsvfs, uint64_t id, + char *name) +{ + timestruc_t now; + struct vnode *vp = NULL; + znode_t *zp = NULL; + struct vnode_fsparam vfsp; + + printf("%s\n", __func__); + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + gethrestime(&now); + ASSERT3P(zp->z_dirlocks, ==, NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); + zp->z_zfsvfs = zfsvfs; + zp->z_id = id; + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_zn_prefetch = B_FALSE; + zp->z_moved = B_FALSE; + zp->z_is_sa = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_TRUE; + zp->z_is_stale = B_FALSE; + zp->z_sa_hdl = NULL; + zp->z_blksz = 0; + zp->z_seq = 0; + zp->z_mapcnt = 0; + zp->z_size = 0; + zp->z_pflags = 0; + zp->z_mode = 0; + zp->z_sync_cnt = 0; + zp->z_gen = 0; + zp->z_mode = (S_IFDIR | (S_IRWXU|S_IRWXG|S_IRWXO)); + zp->z_uid = 0; + zp->z_gid = 0; + ZFS_TIME_ENCODE(&now, zp->z_atime); + + zp->z_snap_mount_time = 0; /* Allow automount attempt */ + + strlcpy(zp->z_name_cache, name, sizeof (zp->z_name_cache)); + + dprintf("%s zp %p with vp %p zfsvfs %p vfs %p\n", __func__, + zp, vp, zfsvfs, zfsvfs->z_vfs); + + bzero(&vfsp, sizeof (vfsp)); + vfsp.vnfs_str = "zfs"; + vfsp.vnfs_mp = zfsvfs->z_vfs; + vfsp.vnfs_vtype = IFTOVT((mode_t)zp->z_mode); + vfsp.vnfs_fsnode = zp; + vfsp.vnfs_flags = VNFS_ADDFSREF; + + /* Tag root directory */ + if (id == zfsvfs->z_root) + vfsp.vnfs_markroot = 1; + + /* + * This creates a vnode with VSYSTEM set, this is so that unmount's + * vflush() (called before our vfs_unmount) will pass (and not block + * waiting for the usercount ref to be released). We then release the + * VROOT vnode in zfsctl_destroy, and release the usercount ref. 
+ * Because of this, we need to call vnode_recycle() ourselves in destroy + */ + if (id == ZFSCTL_INO_ROOT) + vfsp.vnfs_marksystem = 1; + + vfsp.vnfs_vops = zfs_ctldirops; + + while (vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp) != 0) { + kpreempt(KPREEMPT_SYNC); + } + atomic_inc_64(&vnop_num_vnodes); + + printf("Assigned zp %p with vp %p zfsvfs %p\n", zp, vp, zp->z_zfsvfs); + + vnode_settag(vp, VT_ZFS); + + zp->z_vid = vnode_vid(vp); + zp->z_vnode = vp; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); + if (id < zfsvfs->z_ctldir_startid) + zfsvfs->z_ctldir_startid = id; + mutex_exit(&zfsvfs->z_znodes_lock); + + return (vp); +} + +/* + * Lookup the vnode with given id, it will be allocated if needed. + */ +static struct vnode * +zfsctl_vnode_lookup(zfsvfs_t *zfsvfs, uint64_t id, + char *name) +{ + struct vnode *ip = NULL; + int error = 0; + + printf("%s\n", __func__); + + while (ip == NULL) { + + error = zfs_vfs_vget(zfsvfs->z_vfs, id, &ip, NULL); + if (error == 0 && ip != NULL) + break; + + /* May fail due to concurrent zfsctl_vnode_alloc() */ + ip = zfsctl_vnode_alloc(zfsvfs, id, name); + } + + return (ip); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. All other entities + * under the '.zfs' directory are created dynamically as needed. + * + * Because the dynamically created '.zfs' directory entries assume the use + * of 64-bit vnode numbers this support must be disabled on 32-bit systems. + */ +int +zfsctl_create(zfsvfs_t *zfsvfs) +{ + ASSERT(zfsvfs->z_ctldir == NULL); + + dprintf("%s\n", __func__); + + /* Create root node, tagged with VSYSTEM - see above */ + zfsvfs->z_ctldir = zfsctl_vnode_alloc(zfsvfs, ZFSCTL_INO_ROOT, + ZFS_CTLDIR_NAME); + if (zfsvfs->z_ctldir == NULL) + return (SET_ERROR(ENOENT)); + + vnode_ref(zfsvfs->z_ctldir); + VN_RELE(zfsvfs->z_ctldir); + + dprintf("%s: done %p\n", __func__, zfsvfs->z_ctldir); + + return (0); +} + +/* + * Destroy the '.zfs' directory or remove a snapshot from + * zfs_snapshots_by_name. Only called when the filesystem is unmounted. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + if (zfsvfs->z_ctldir) { + if (VN_HOLD(zfsvfs->z_ctldir) == 0) { + vnode_rele(zfsvfs->z_ctldir); + /* Because tagged VSYSTEM, we manually call recycle */ + vnode_recycle(zfsvfs->z_ctldir); + VN_RELE(zfsvfs->z_ctldir); + } + zfsvfs->z_ctldir = NULL; + } +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +struct vnode * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + VN_HOLD(ZTOZSB(zp)->z_ctldir); + return (ZTOZSB(zp)->z_ctldir); +} + + +struct vnode * +zfs_root_dotdot(struct vnode *vp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *rootzp = NULL; + struct vnode *retvp = NULL; + + dprintf("%s: for id %llu\n", __func__, zp->z_id); + + if (zp->z_id == ZFSCTL_INO_ROOT) + zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + else if (zp->z_id == ZFSCTL_INO_SNAPDIR) + retvp = zfsctl_root(zp); + else + retvp = zfsctl_vnode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, + "snapshot"); + + if (rootzp != NULL) + retvp = ZTOV(rootzp); + + return (retvp); +} + +/* + * Special case the handling of "..". 
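+ *
+ * Illustrative examples of how names resolve in zfsctl_root_lookup():
+ *   ".."         -> zfs_root_dotdot(): the dataset root for ".zfs",
+ *                   ".zfs" for ".zfs/snapshot", and ".zfs/snapshot" for a
+ *                   snapshot directory
+ *   "snapshot"   -> the vnode with id ZFSCTL_INO_SNAPDIR
+ *   "<snapname>" -> dmu_snapshot_lookup() returns the snapshot's id and the
+ *                   vnode is allocated with id ZFSCTL_INO_SNAPDIRS - id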
+ */ +int +zfsctl_root_lookup(struct vnode *dvp, char *name, struct vnode **vpp, + int flags, cred_t *cr, int *direntflags, struct componentname *realpnp) +{ + znode_t *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + int error = 0; + uint64_t id; + + dprintf("%s: '%s'\n", __func__, name); + + ZFS_ENTER(zfsvfs); + + if (strcmp(name, "..") == 0) { + *vpp = zfs_root_dotdot(dvp); + } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { + *vpp = zfsctl_vnode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, + name); + } else { + error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); + if (error != 0) + goto out; + *vpp = zfsctl_vnode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, + name); + } + + if (*vpp == NULL) { + error = SET_ERROR(ENOENT); + } + +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfsctl_vnop_lookup(struct vnop_lookup_args *ap) +#if 0 + struct vnop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ + int direntflags = 0; + int error; + struct componentname *cnp = ap->a_cnp; + char *filename = NULL; + int filename_num_bytes = 0; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + + /* + * Darwin uses namelen as an optimisation, for example it can be + * set to 5 for the string "alpha/beta" to look up "alpha". In this + * case we need to copy it out to null-terminate. + */ + if (cnp->cn_nameptr[cnp->cn_namelen] != 0) { + filename_num_bytes = cnp->cn_namelen + 1; + filename = (char *)kmem_alloc(filename_num_bytes, KM_SLEEP); + bcopy(cnp->cn_nameptr, filename, cnp->cn_namelen); + filename[cnp->cn_namelen] = '\0'; + } + + error = zfsctl_root_lookup(ap->a_dvp, + filename ? filename : cnp->cn_nameptr, + ap->a_vpp, /* flags */ 0, cr, &direntflags, NULL); + + /* If we are to create a directory, change error code for XNU */ + if ((error == ENOENT) && + (cnp->cn_flags & ISLASTCN)) { + if ((cnp->cn_nameiop == CREATE) || + (cnp->cn_nameiop == RENAME)) + error = EJUSTRETURN; + } + + if (filename != NULL) + kmem_free(filename, filename_num_bytes); + + return (error); +} + +/* Quick output function for readdir */ +#define DIRENT_RECLEN(namelen, ext) \ + ((ext) ? 
\ + ((sizeof (struct direntry) + (namelen) - (MAXPATHLEN-1) + 7) & ~7) \ + : \ + ((sizeof (struct dirent) - (NAME_MAX+1)) + (((namelen)+1 + 7) &~ 7))) + +static int zfsctl_dir_emit(const char *name, uint64_t id, enum vtype type, + struct vnop_readdir_args *ap, uint64_t **next) +{ + struct uio *uio = ap->a_uio; + boolean_t extended = (ap->a_flags & VNODE_READDIR_EXTENDED); + struct direntry *eodp; /* Extended */ + struct dirent *odp; /* Standard */ + int namelen; + void *buf; + int error = 0; + ushort_t reclen; + + dprintf("%s '%s'\n", __func__, name); + + namelen = strlen(name); + reclen = DIRENT_RECLEN(namelen, extended); + + if (reclen > uio_resid(uio)) + return (EINVAL); + + buf = kmem_zalloc(reclen, KM_SLEEP); + + if (extended) { + eodp = buf; + + /* + * NOTE: d_seekoff is the offset for the *next* entry - + * so poke in the previous struct with this id + */ + eodp->d_seekoff = uio_offset(uio) + 1; + + eodp->d_ino = id; + eodp->d_type = type; + + (void) bcopy(name, eodp->d_name, namelen + 1); + eodp->d_namlen = namelen; + eodp->d_reclen = reclen; + + } else { + odp = buf; + + odp->d_ino = id; + odp->d_type = type; + (void) bcopy(name, odp->d_name, namelen + 1); + odp->d_namlen = namelen; + odp->d_reclen = reclen; + + } + + /* Copyout this entry */ + error = uiomove(buf, (long)reclen, UIO_READ, uio); + + kmem_free(buf, reclen); + return (error); +} + +int +zfsctl_vnop_readdir_root(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error = 0; + uint64_t *next = NULL; + int entries = 0; + uint64_t offset; + struct uio *uio = ap->a_uio; + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + dprintf("%s\n", __func__); + + ZFS_ENTER(zfsvfs); + + *ap->a_numdirent = 0; + + offset = uio_offset(uio); + + while (offset < 3 && error == 0) { + + switch (offset) { + case 0: /* "." */ + error = zfsctl_dir_emit(".", ZFSCTL_INO_ROOT, + DT_DIR, ap, &next); + break; + + case 1: /* ".." */ + error = zfsctl_dir_emit("..", 2, + DT_DIR, ap, &next); + break; + + case 2: + error = zfsctl_dir_emit(ZFS_SNAPDIR_NAME, + ZFSCTL_INO_SNAPDIR, DT_DIR, ap, &next); + break; + } + + if (error == ENOENT) { + dprintf("end of snapshots reached\n"); + break; + } + + if (error != 0) { + dprintf("emit error\n"); + break; + } + + entries++; + offset++; + uio_setoffset(uio, offset); + } + + uio_setoffset(uio, offset); + + /* Finished without error? Set EOF */ + if (offset >= 3 && error == 0) { + *ap->a_eofflag = 1; + dprintf("Setting eof\n"); + } + + *ap->a_numdirent = entries; + dprintf("Returning %d entries\n", entries); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfsctl_vnop_readdir_snapdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error = 0; + uint64_t *next = NULL; + int entries = 0; + uint64_t offset; + struct uio *uio = ap->a_uio; + boolean_t case_conflict; + uint64_t id; + char snapname[MAXNAMELEN]; + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + dprintf("%s\n", __func__); + + ZFS_ENTER(zfsvfs); + + *ap->a_numdirent = 0; + + offset = uio_offset(uio); + + while (error == 0) { + + switch (offset) { + case 0: /* "." */ + error = zfsctl_dir_emit(".", ZFSCTL_INO_SNAPDIR, + DT_DIR, ap, &next); + break; + + case 1: /* ".." 
*/ + error = zfsctl_dir_emit("..", ZFSCTL_INO_ROOT, + DT_DIR, ap, &next); + break; + + default: + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), + FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, + MAXNAMELEN, snapname, &id, &offset, &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), + FTAG); + if (error) + break; + + error = zfsctl_dir_emit(snapname, + ZFSCTL_INO_SHARES - id, DT_DIR, ap, &next); + break; + } + + if (error != 0) { + dprintf("emit error\n"); + break; + } + + entries++; + offset++; + uio_setoffset(uio, offset); + } + + uio_setoffset(uio, offset); + + /* Finished without error? Set EOF */ + if (error == ENOENT) { + *ap->a_eofflag = 1; + dprintf("Setting eof\n"); + error = 0; + } + + *ap->a_numdirent = entries; + dprintf("Returning %d entries\n", entries); + + ZFS_EXIT(zfsvfs); + + return (error); +} + + +/* We need to spit out a valid "." ".." entries for mount to work */ +int +zfsctl_vnop_readdir_snapdirs(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error = 0; + uint64_t *next = NULL; + int entries = 0; + uint64_t offset; + struct uio *uio = ap->a_uio; + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + + *ap->a_numdirent = 0; + + offset = uio_offset(uio); + + dprintf("%s: for id %llu: offset %llu\n", __func__, + zp->z_id, offset); + + while (error == 0) { + + switch (offset) { + case 0: /* "." */ + error = zfsctl_dir_emit(".", ZFSCTL_INO_SNAPDIR, + DT_DIR, ap, &next); + break; + + case 1: /* ".." */ + error = zfsctl_dir_emit("..", ZFSCTL_INO_ROOT, + DT_DIR, ap, &next); + break; + + default: + error = ENOENT; + break; + } + + if (error != 0) { + dprintf("emit error\n"); + break; + } + + entries++; + offset++; + uio_setoffset(uio, offset); + } + + uio_setoffset(uio, offset); + + /* Finished without error? Set EOF */ + if (error == ENOENT) { + *ap->a_eofflag = 1; + dprintf("Setting eof\n"); + error = 0; + } + + *ap->a_numdirent = entries; + dprintf("Returning %d entries\n", entries); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfsctl_vnop_readdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + znode_t *zp = VTOZ(ap->a_vp); + + dprintf("%s\n", __func__); + + /* Which directory are we to output? */ + switch (zp->z_id) { + case ZFSCTL_INO_ROOT: + return (zfsctl_vnop_readdir_root(ap)); + case ZFSCTL_INO_SNAPDIR: + return (zfsctl_vnop_readdir_snapdir(ap)); + default: + return (zfsctl_vnop_readdir_snapdirs(ap)); + } + return (EINVAL); +} + +int +zfsctl_vnop_getattr(struct vnop_getattr_args *ap) +#if 0 + struct vnop_getattr_args { + struct vnode *a_vp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + vattr_t *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + timestruc_t now; + + dprintf("%s: active x%llx\n", __func__, vap->va_active); + + ZFS_ENTER(zfsvfs); + + gethrestime(&now); + + if (VATTR_IS_ACTIVE(vap, va_rdev)) + VATTR_RETURN(vap, va_rdev, zfsvfs->z_rdev); + if (VATTR_IS_ACTIVE(vap, va_nlink)) + VATTR_RETURN(vap, va_nlink, + vnode_isdir(vp) ? 
zp->z_size : zp->z_links); + if (VATTR_IS_ACTIVE(vap, va_total_size)) + VATTR_RETURN(vap, va_total_size, 512); + if (VATTR_IS_ACTIVE(vap, va_total_alloc)) + VATTR_RETURN(vap, va_total_alloc, 512); + if (VATTR_IS_ACTIVE(vap, va_data_size)) + VATTR_RETURN(vap, va_data_size, 0); + if (VATTR_IS_ACTIVE(vap, va_data_alloc)) + VATTR_RETURN(vap, va_data_alloc, 0); + if (VATTR_IS_ACTIVE(vap, va_iosize)) + VATTR_RETURN(vap, va_iosize, 512); + if (VATTR_IS_ACTIVE(vap, va_uid)) + VATTR_RETURN(vap, va_uid, 0); + if (VATTR_IS_ACTIVE(vap, va_gid)) + VATTR_RETURN(vap, va_gid, 0); + if (VATTR_IS_ACTIVE(vap, va_mode)) + VATTR_RETURN(vap, va_mode, S_IFDIR | + S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (VATTR_IS_ACTIVE(vap, va_flags)) + VATTR_RETURN(vap, va_flags, zfs_getbsdflags(zp)); + + if (VATTR_IS_ACTIVE(vap, va_acl)) { + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + VATTR_RETURN(vap, va_acl, NULL); + } + + // crtime, atime, mtime, ctime, btime + uint64_t timez[2]; + timez[0] = zfsvfs->z_mount_time; + timez[1] = 0; + + if (VATTR_IS_ACTIVE(vap, va_create_time)) { + ZFS_TIME_DECODE(&vap->va_create_time, timez); + VATTR_SET_SUPPORTED(vap, va_create_time); + } + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + ZFS_TIME_DECODE(&vap->va_access_time, timez); + VATTR_SET_SUPPORTED(vap, va_access_time); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + ZFS_TIME_DECODE(&vap->va_modify_time, timez); + VATTR_SET_SUPPORTED(vap, va_modify_time); + } + if (VATTR_IS_ACTIVE(vap, va_change_time)) { + ZFS_TIME_DECODE(&vap->va_change_time, timez); + VATTR_SET_SUPPORTED(vap, va_change_time); + } + if (VATTR_IS_ACTIVE(vap, va_backup_time)) { + ZFS_TIME_DECODE(&vap->va_backup_time, timez); + VATTR_SET_SUPPORTED(vap, va_backup_time); + } + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + ZFS_TIME_DECODE(&vap->va_addedtime, timez); + VATTR_SET_SUPPORTED(vap, va_addedtime); + } + + if (VATTR_IS_ACTIVE(vap, va_fileid)) + VATTR_RETURN(vap, va_fileid, zp->z_id); + if (VATTR_IS_ACTIVE(vap, va_linkid)) + VATTR_RETURN(vap, va_linkid, zp->z_id); + if (VATTR_IS_ACTIVE(vap, va_parentid)) { + switch (zp->z_id) { + case ZFSCTL_INO_ROOT: + // ".zfs" parent is mount, 2 on osx + VATTR_RETURN(vap, va_parentid, 2); + break; + case ZFSCTL_INO_SNAPDIR: + // ".zfs/snapshot" parent is ".zfs" + VATTR_RETURN(vap, va_parentid, ZFSCTL_INO_ROOT); + break; + default: + // ".zfs/snapshot/$name" parent ".zfs/snapshot" + VATTR_RETURN(vap, va_parentid, + ZFSCTL_INO_SNAPDIR); + break; + } + } + if (VATTR_IS_ACTIVE(vap, va_fsid)) + VATTR_RETURN(vap, va_fsid, zfsvfs->z_rdev); + + if (VATTR_IS_ACTIVE(vap, va_filerev)) + VATTR_RETURN(vap, va_filerev, 0); + if (VATTR_IS_ACTIVE(vap, va_gen)) + VATTR_RETURN(vap, va_gen, zp->z_gen); + if (VATTR_IS_ACTIVE(vap, va_type)) + VATTR_RETURN(vap, va_type, vnode_vtype(ZTOV(zp))); + if (VATTR_IS_ACTIVE(vap, va_name)) { + strlcpy(vap->va_name, zp->z_name_cache, MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + } + + /* Don't include '.' and '..' in the number of entries */ + if (VATTR_IS_ACTIVE(vap, va_nchildren) && vnode_isdir(vp)) { + VATTR_RETURN(vap, va_nchildren, + zp->z_links > 3 ?
zp->z_links-2 : 1); + } + if (VATTR_IS_ACTIVE(vap, va_dirlinkcount) && vnode_isdir(vp)) + VATTR_RETURN(vap, va_dirlinkcount, 1); + +#ifdef VNODE_ATTR_va_fsid64 + if (VATTR_IS_ACTIVE(vap, va_fsid64)) { + vap->va_fsid64.val[0] = + vfs_statfs(zfsvfs->z_vfs)->f_fsid.val[0]; + vap->va_fsid64.val[1] = vfs_typenum(zfsvfs->z_vfs); + VATTR_SET_SUPPORTED(vap, va_fsid64); + } +#endif + + ZFS_EXIT(zfsvfs); + + dprintf("%s: returned x%llx missed: x%llx\n", __func__, + vap->va_supported, vap->va_active &= ~vap->va_supported); + return (0); +} + +int +zfsctl_vnop_access(struct vnop_access_args *ap) +{ + int accmode = ap->a_action; + dprintf("zfsctl_access\n"); + + if (accmode & VWRITE) + return (EACCES); + return (0); +} + +int +zfsctl_vnop_open(struct vnop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (EACCES); + + return (zfsctl_snapshot_mount(ap->a_vp, 0)); +} + +int +zfsctl_vnop_close(struct vnop_close_args *ap) +{ + dprintf("%s\n", __func__); + return (0); +} + +int +zfsctl_vnop_inactive(struct vnop_inactive_args *ap) +{ + dprintf("%s\n", __func__); + return (0); +} + +int +zfsctl_vnop_reclaim(struct vnop_reclaim_args *ap) +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + dprintf("%s vp %p\n", __func__, vp); + vnode_removefsref(vp); /* ADDREF from vnode_create */ + vnode_clearfsnode(vp); /* vp->v_data = NULL */ + + mutex_enter(&zfsvfs->z_znodes_lock); + if (list_link_active(&zp->z_link_node)) { + list_remove(&zfsvfs->z_all_znodes, zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + zp->z_vnode = NULL; + kmem_cache_free(znode_cache, zp); + + return (0); +} + +/* + * Construct a full dataset name in full_name: "pool/dataset@snap_name" + */ +static int +zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, + char *full_name) +{ + objset_t *os = zfsvfs->z_os; + + if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) + return (SET_ERROR(EILSEQ)); + + dmu_objset_name(os, full_name); + if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) + return (SET_ERROR(ENAMETOOLONG)); + + (void) strcat(full_name, "@"); + (void) strcat(full_name, snap_name); + + return (0); +} + +int +zfsctl_snapshot_mount(struct vnode *vp, int flags) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int ret = 0; + /* + * If we are here for a snapdirs directory, attempt to get zed + * to mount the snapshot for the user. If successful, forward the + * vnop_open() to them (ourselves). + * Use a timeout in case zed is not running. + */ + + if (zfs_auto_snapshot == 0) + return (0); + + ZFS_ENTER(zfsvfs); + if (((zp->z_id >= zfsvfs->z_ctldir_startid) && + (zp->z_id <= ZFSCTL_INO_SNAPDIRS))) { + hrtime_t now; + now = gethrtime(); + + /* + * If z_snap_mount_time is set, check if it is old enough to + * retry, if so, set z_snap_mount_time to zero. + */ + if (now - zp->z_snap_mount_time > SEC2NSEC(60)) + atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, + (uint64_t)zp->z_snap_mount_time, + 0ULL); + + /* + * Attempt mount, make sure only to issue one request, by + * attempting to CAS in current time in place of zero. + */ + if (atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, 0ULL, + (uint64_t)now) == 0ULL) { + char full_name[ZFS_MAX_DATASET_NAME_LEN]; + + /* First! 
*/ + ret = zfsctl_snapshot_name(zfsvfs, zp->z_name_cache, + ZFS_MAX_DATASET_NAME_LEN, full_name); + + if (ret == 0) { + zfsctl_mounts_waiting_t *zcm; + + /* Create condvar to wait for mount to happen */ + + zcm = kmem_alloc( + sizeof (zfsctl_mounts_waiting_t), KM_SLEEP); + mutex_init(&zcm->zcm_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&zcm->zcm_cv, NULL, CV_DEFAULT, NULL); + strlcpy(zcm->zcm_name, full_name, + sizeof (zcm->zcm_name)); + + dprintf("%s: requesting mount for '%s'\n", + __func__, full_name); + + mutex_enter(&zfsctl_mounts_lock); + list_insert_tail(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + mutex_enter(&zcm->zcm_lock); + zfs_ereport_snapshot_post( + FM_EREPORT_ZFS_SNAPSHOT_MOUNT, + dmu_objset_spa(zfsvfs->z_os), full_name); + + /* Now we wait hoping zed comes back to us */ + ret = cv_timedwait(&zcm->zcm_cv, &zcm->zcm_lock, + ddi_get_lbolt() + (hz * 3)); + + dprintf("%s: finished waiting %d\n", + __func__, ret); + + mutex_exit(&zcm->zcm_lock); + + mutex_enter(&zfsctl_mounts_lock); + list_remove(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + mutex_destroy(&zcm->zcm_lock); + cv_destroy(&zcm->zcm_cv); + + kmem_free(zcm, + sizeof (zfsctl_mounts_waiting_t)); + + /* + * If we mounted, make it re-open it so + * the process that issued the access will + * see the mounted content + */ + if (ret >= 0) { + /* Remove the cache entry */ + cache_purge(vp); + cache_purge_negatives(vp); + ret = ERESTART; + } + } + } + } + + ZFS_EXIT(zfsvfs); + + return (ret); +} + +/* Called whenever zfs_vfs_mount() is called with a snapshot */ +void +zfsctl_mount_signal(char *osname, boolean_t mounting) +{ + zfsctl_mounts_waiting_t *zcm; + + dprintf("%s: looking for snapshot '%s'\n", __func__, osname); + + mutex_enter(&zfsctl_mounts_lock); + for (zcm = list_head(&zfsctl_mounts_list); + zcm; + zcm = list_next(&zfsctl_mounts_list, zcm)) { + if (strncmp(zcm->zcm_name, osname, sizeof (zcm->zcm_name)) == 0) + break; + } + mutex_exit(&zfsctl_mounts_lock); + + /* Is there someone to wake up? */ + if (zcm != NULL) { + mutex_enter(&zcm->zcm_lock); + cv_signal(&zcm->zcm_cv); + mutex_exit(&zcm->zcm_lock); + dprintf("%s: mount waiter found and signalled\n", __func__); + } + + zfsctl_unmount_delay_t *zcu; + + /* Add or remove mount to/from list of active mounts */ + + if (mounting) { + /* Add active mounts to the list */ + zcu = kmem_alloc(sizeof (zfsctl_unmount_delay_t), KM_SLEEP); + zcu->se_name = kmem_strdup(osname); + zcu->se_time = gethrestime_sec(); + list_link_init(&zcu->se_nodelink); + + mutex_enter(&zfsctl_unmount_list_lock); + list_insert_tail(&zfsctl_unmount_list, zcu); + mutex_exit(&zfsctl_unmount_list_lock); + + } else { + /* Unmounting */ + mutex_enter(&zfsctl_unmount_list_lock); + for (zcu = list_head(&zfsctl_unmount_list); + zcu != NULL; + zcu = list_next(&zfsctl_unmount_list, zcu)) { + if (strcmp(osname, zcu->se_name) == 0) { + list_remove(&zfsctl_unmount_list, zcu); + kmem_strfree(zcu->se_name); + kmem_free(zcu, sizeof (zfsctl_unmount_delay_t)); + break; + } + } + mutex_exit(&zfsctl_unmount_list_lock); + } +} + +int +zfsctl_snapshot_unmount_node(struct vnode *vp, const char *full_name, + int flags) +{ + znode_t *zp = VTOZ(vp); + + dprintf("%s\n", __func__); + + if (zp == NULL) + return (ENOENT); + + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int ret = ENOENT; + /* + * If we are here for a snapdirs directory, attempt to get zed + * to mount the snapshot for the user. If successful, forward the + * vnop_open() to them (ourselves). 
+ * Use a timeout in case zed is not running. + */ + + ZFS_ENTER(zfsvfs); + + if (zp->z_id == zfsvfs->z_root) { + hrtime_t now; + now = gethrtime(); + + /* + * If z_snap_mount_time is set, check if it is old enough to + * retry, if so, set z_snap_mount_time to zero. + */ + if (now - zp->z_snap_mount_time > SEC2NSEC(60)) + atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, + (uint64_t)zp->z_snap_mount_time, + 0ULL); + + /* + * Attempt unmount, make sure only to issue one request, by + * attempting to CAS in current time in place of zero. + */ + if (atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, 0ULL, + (uint64_t)now) == 0ULL) { + + ret = 0; + + /* First! */ + + if (ret == 0) { + zfsctl_mounts_waiting_t *zcm; + + /* Create condvar to wait for mount to happen */ + + zcm = kmem_alloc( + sizeof (zfsctl_mounts_waiting_t), KM_SLEEP); + mutex_init(&zcm->zcm_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&zcm->zcm_cv, NULL, CV_DEFAULT, NULL); + strlcpy(zcm->zcm_name, full_name, + sizeof (zcm->zcm_name)); + + dprintf("%s: requesting unmount for '%s'\n", + __func__, full_name); + + mutex_enter(&zfsctl_mounts_lock); + list_insert_tail(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + mutex_enter(&zcm->zcm_lock); + zfs_ereport_snapshot_post( + FM_EREPORT_ZFS_SNAPSHOT_UNMOUNT, + dmu_objset_spa(zfsvfs->z_os), full_name); + + /* Now we wait hoping zed comes back to us */ + ret = cv_timedwait(&zcm->zcm_cv, &zcm->zcm_lock, + ddi_get_lbolt() + (hz * 3)); + + dprintf("%s: finished waiting %d\n", + __func__, ret); + + mutex_exit(&zcm->zcm_lock); + + mutex_enter(&zfsctl_mounts_lock); + list_remove(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + kmem_free(zcm, + sizeof (zfsctl_mounts_waiting_t)); + + /* Allow mounts to happen immediately */ + zp->z_snap_mount_time = 0; + + /* + * If we unmounted, alert caller + */ + if (ret >= 0) + ret = 0; + + } + } + } + + ZFS_EXIT(zfsvfs); + + return (ret); +} + +int +zfsctl_snapshot_unmount(const char *snapname, int flags) +{ + znode_t *rootzp; + zfsvfs_t *zfsvfs; + + dprintf("%s\n", __func__); + + if (strchr(snapname, '@') == NULL) + return (0); + + int err = getzfsvfs(snapname, &zfsvfs); + if (err != 0) { + ASSERT3P(zfsvfs, ==, NULL); + return (0); + } + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); + + err = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (err == 0) { + zfsctl_snapshot_unmount_node(ZTOV(rootzp), snapname, flags); + VN_RELE(ZTOV(rootzp)); + } + + vfs_unbusy(zfsvfs->z_vfs); + return (0); +} + +int +zfsctl_vnop_mkdir(struct vnop_mkdir_args *ap) +#if 0 + struct vnop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + znode_t *dzp = VTOZ(ap->a_dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + char *dsname; + int error; + + if (zfs_admin_snapshot == 0) + return (SET_ERROR(EACCES)); + + dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfs_component_namecheck(ap->a_cnp->cn_nameptr, NULL, NULL) != 0) { + error = SET_ERROR(EILSEQ); + goto out; + } + + dmu_objset_name(zfsvfs->z_os, dsname); + + error = zfs_secpolicy_snapshot_perms(dsname, cr); + if (error != 0) + goto out; + + if (error == 0) { + error = dmu_objset_snapshot_one(dsname, ap->a_cnp->cn_nameptr); + if (error != 0) + goto out; + + error = zfsctl_root_lookup(ap->a_dvp, ap->a_cnp->cn_nameptr, + ap->a_vpp, 0, cr, NULL, NULL); + } + +out: + kmem_free(dsname, 
ZFS_MAX_DATASET_NAME_LEN); + + return (error); +} + +int +zfsctl_vnop_rmdir(struct vnop_rmdir_args *ap) +#if 0 + struct vnop_rmdir_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + znode_t *dzp = VTOZ(ap->a_dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + char *snapname, *real; + char *name = ap->a_cnp->cn_nameptr; + int error; + + dprintf("%s: '%s'\n", __func__, name); + + if (zfs_admin_snapshot == 0) + return (SET_ERROR(EACCES)); + + ZFS_ENTER(zfsvfs); + + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + error = dmu_snapshot_realname(zfsvfs->z_os, name, + real, ZFS_MAX_DATASET_NAME_LEN, NULL); + if (error == 0) { + name = real; + } else if (error != ENOTSUP) { + goto out; + } + } + + error = zfsctl_snapshot_name(zfsvfs, name, + ZFS_MAX_DATASET_NAME_LEN, snapname); + if (error == 0) + error = zfs_secpolicy_destroy_perms(snapname, cr); + if (error != 0) + goto out; + + error = zfsctl_snapshot_unmount_node(ap->a_vp, snapname, MNT_FORCE); + if ((error == 0) || (error == ENOENT)) { + error = dsl_destroy_snapshot(snapname, B_FALSE); + + /* Destroy the vnode */ + if (ap->a_vp != NULL) { + dprintf("%s: releasing vp\n", __func__); + vnode_recycle(ap->a_vp); + } + } + +out: + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static void +zfsctl_unmount_thread(void *notused) +{ + callb_cpr_t cpr; + zfsctl_unmount_delay_t *zcu; + time_t now; + CALLB_CPR_INIT(&cpr, &zfsctl_unmount_lock, callb_generic_cpr, FTAG); + + dprintf("%s is alive\n", __func__); + + mutex_enter(&zfsctl_unmount_lock); + while (!zfsctl_unmount_thread_exit) { + + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&zfsctl_unmount_cv, + &zfsctl_unmount_lock, ddi_get_lbolt() + (hz<<6)); + CALLB_CPR_SAFE_END(&cpr, &zfsctl_unmount_lock); + + if (!zfsctl_unmount_thread_exit) { + /* + * Loop all active mounts, if any are older + * than ZFSCTL_EXPIRE_SNAPSHOT, then we update + * their timestamp and attempt unmount. + */ + now = gethrestime_sec(); + mutex_enter(&zfsctl_unmount_list_lock); + for (zcu = list_head(&zfsctl_unmount_list); + zcu != NULL; + zcu = list_next(&zfsctl_unmount_list, zcu)) { + if ((now > zcu->se_time) && + ((now - zcu->se_time) > + zfs_expire_snapshot)) { + zcu->se_time = now; + zfsctl_snapshot_unmount(zcu->se_name, + 0); + } + } + mutex_exit(&zfsctl_unmount_list_lock); + } + } + + zfsctl_unmount_thread_exit = FALSE; + cv_broadcast(&zfsctl_unmount_cv); + CALLB_CPR_EXIT(&cpr); + dprintf("ZFS: zfsctl_unmount thread exit\n"); + thread_exit(); +} + +/* + * Initialize the various pieces we'll need to create and manipulate .zfs + * directories. Currently this is unused but available. 
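+ *
+ * As a rough summary of what is set up here (see the code below):
+ *   - zfsctl_mounts_list:  waiters blocked in zfsctl_snapshot_mount()
+ *   - zfsctl_unmount_list: currently mounted snapshots with timestamps
+ *   - zfsctl_unmount_thread: wakes roughly every 64 seconds (hz << 6
+ *     ticks) and calls zfsctl_snapshot_unmount() for entries older than
+ *     zfs_expire_snapshot seconds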
+ */ +void +zfsctl_init(void) +{ + mutex_init(&zfsctl_mounts_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsctl_mounts_list, sizeof (zfsctl_mounts_waiting_t), + offsetof(zfsctl_mounts_waiting_t, zcm_node)); + + mutex_init(&zfsctl_unmount_list_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsctl_unmount_list, sizeof (zfsctl_unmount_delay_t), + offsetof(zfsctl_unmount_delay_t, se_nodelink)); + + mutex_init(&zfsctl_unmount_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfsctl_unmount_cv, NULL, CV_DEFAULT, NULL); + zfsctl_unmount_thread_exit = FALSE; + + (void) thread_create(NULL, 0, zfsctl_unmount_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +/* + * Cleanup the various pieces we needed for .zfs directories. In particular + * ensure the expiry timer is canceled safely. + */ +void +zfsctl_fini(void) +{ + mutex_destroy(&zfsctl_mounts_lock); + list_destroy(&zfsctl_mounts_list); + + mutex_destroy(&zfsctl_unmount_list_lock); + list_destroy(&zfsctl_unmount_list); + + mutex_enter(&zfsctl_unmount_lock); + zfsctl_unmount_thread_exit = TRUE; + while (zfsctl_unmount_thread_exit) { + cv_signal(&zfsctl_unmount_cv); + cv_wait(&zfsctl_unmount_cv, &zfsctl_unmount_lock); + } + mutex_exit(&zfsctl_unmount_lock); + + mutex_destroy(&zfsctl_unmount_lock); + cv_destroy(&zfsctl_unmount_cv); +} + +module_param(zfs_admin_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); + +module_param(zfs_expire_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/module/os/macos/zfs/zfs_debug.c b/module/os/macos/zfs/zfs_debug.c new file mode 100644 index 0000000000..b201363899 --- /dev/null +++ b/module/os/macos/zfs/zfs_debug.c @@ -0,0 +1,264 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
+ */ + +#include + +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +kstat_t *zfs_dbgmsg_kstat; + +int zfs_dbgmsg_enable = 1; + +static int +zfs_dbgmsg_headers(char *buf, size_t size) +{ + (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message"); + + return (0); +} + +static int +zfs_dbgmsg_data(char *buf, size_t size, void *data) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data; + + (void) snprintf(buf, size, "%-12llu %-s\n", + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); + + return (0); +} + +static void * +zfs_dbgmsg_addr(kstat_t *ksp, loff_t n) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + if (n == 0) + ksp->ks_private = list_head(&zfs_dbgmsgs); + else if (zdm) + ksp->ks_private = list_next(&zfs_dbgmsgs, zdm); + + return (ksp->ks_private); +} + +static void +zfs_dbgmsg_purge(int max_size) +{ + zfs_dbgmsg_t *zdm; + int size; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + while (zfs_dbgmsg_size > max_size) { + zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + +static int +zfs_dbgmsg_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + zfs_dbgmsg_purge(0); + + return (0); +} + +/* + * Debug logging is enabled by default for production kernel builds. + * The overhead for this is negligible and the logs can be valuable when + * debugging. For non-production user space builds all debugging except + * logging is enabled since performance is no longer a concern. 
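+ *
+ * Rough sketch of the flow implemented below: zfs_dbgmsg_init() creates
+ * the "zfs/dbgmsg" kstat backed by the zfs_dbgmsgs list; __dprintf()
+ * formats a message and hands it to __zfs_dbgmsg(), which appends a
+ * zfs_dbgmsg_t and trims the list back to zfs_dbgmsg_maxsize (4MB by
+ * default) via zfs_dbgmsg_purge(). Writing to the kstat (KSTAT_WRITE in
+ * zfs_dbgmsg_update()) empties the buffer.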
+ */ +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); + + zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (zfs_dbgmsg_kstat) { + zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock; + zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX; + zfs_dbgmsg_kstat->ks_private = NULL; + zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update; + kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers, + zfs_dbgmsg_data, zfs_dbgmsg_addr); + kstat_install(zfs_dbgmsg_kstat); + } +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + if (zfs_dbgmsg_kstat) + kstat_delete(zfs_dbgmsg_kstat); + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT0(zfs_dbgmsg_size); +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' + */ + +/* + * MacOS X's dtrace doesn't handle the PROBEs, so + * we have a utility function that we can watch with + * sudo dtrace -qn '__zfs_dbgmsg:entry{printf("%s\n", stringof(arg0));}' + */ +noinline void +__zfs_dbgmsg(char *buf) +{ + int size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strlcpy(zdm->zdm_msg, buf, size); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + int size, i; + va_list adx; + char *buf, *nl; + char *prefix = (dprint) ? "dprintf: " : ""; + const char *newfile; + + /* + * Get rid of annoying prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + size += snprintf(NULL, 0, "%s%s:%d:%s(): ", prefix, newfile, line, + func); + + size++; /* null byte in the "buf" string */ + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. + */ + buf = kmem_alloc(size, KM_SLEEP); + int roger = 0; + + va_start(adx, fmt); + i = snprintf(buf, size + 1, "%s%s:%d:%s(): ", + prefix, newfile, line, func); + roger = vsnprintf(buf + i, size -i + 1, fmt, adx); + va_end(adx); + + /* + * Get rid of trailing newline for dprintf logs. 
+ */ + if (dprint && buf[0] != '\0') { + nl = &buf[strlen(buf) - 1]; + if (*nl == '\n') + *nl = '\0'; + } + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + + __zfs_dbgmsg(buf); + + /* Also emit string to log/console */ + printf("%s", buf); + + kmem_free(buf, size); +} + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + (void) printf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + (void) printf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} diff --git a/module/os/macos/zfs/zfs_dir.c b/module/os/macos/zfs/zfs_dir.c new file mode 100644 index 0000000000..b868baee7f --- /dev/null +++ b/module/os/macos/zfs/zfs_dir.c @@ -0,0 +1,1214 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, + boolean_t update, int *deflags, struct componentname *rpnp, uint64_t *zoid) +{ + boolean_t conflict = B_FALSE; + int error; + + if (zfsvfs->z_norm) { + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->cn_nameptr; + bufsz = rpnp->cn_namelen; + } + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (error == EOVERFLOW) + error = 0; + + if (zfsvfs->z_norm && !error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Lock a directory entry. A dirlock on protects that name + * in dzp's directory zap object. 
As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. + * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag, int *direntflags, struct componentname *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t *dl; + boolean_t update; + matchtype_t mt = 0; + uint64_t zoid; + int error = 0; + int cmpflags; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if ((name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || + (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. 
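+ *
+ * Worked example (informal): with casesensitivity=insensitive and a
+ * ZCIEXACT lookup, or casesensitivity=mixed and a non-ZCILOOK lookup,
+ * mt becomes MT_NORMALIZE | MT_MATCH_CASE; a ZCILOOK lookup on a mixed
+ * file system is left at MT_NORMALIZE only.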
+ */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + (zfsvfs->z_case == ZFS_CASE_MIXED && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* + * Wait until there are no locks on this name. + * + * Don't grab the lock if it is already held. However, cannot + * have both ZSHARED and ZHAVELOCK together. + */ + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked && !(flag & ZXATTR)) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) + break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namelock = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + /* + * If the z_name_lock was NOT held for this dirlock record it. + */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. It'll either see the old value, + * which belongs to it, or the new shared copy; either is OK. + */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? 
SET_ERROR(ENOENT) : 0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, + update, direntflags, realpnp, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EEXIST)); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, + int *deflg, struct componentname *rpnp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + struct vnode *vp; + int error = 0; + uint64_t parent; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + zhold(*zpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + + /* + * If we are a snapshot mounted under .zfs, return + * the inode pointer for the snapshot directory. + */ + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", &vp, 0, kcred, NULL, NULL); + if (error == 0) + *zpp = VTOZ(vp); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + vp = zfsctl_root(dzp); + if (vp != NULL) + *zpp = VTOZ(vp); + else + error = ENOENT; + } else { + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + if (error == 0) { + *zpp = zp; + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + rpnp = NULL; + } + + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->cn_nameptr, name, rpnp->cn_namelen); + + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. 
has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + ASSERT(zp->z_unlinked); + ASSERT(ZTOI(zp)->i_nlink == 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +static void +zfs_unlinked_drain_task(void *arg) +{ + zfsvfs_t *zfsvfs = arg; + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); + + /* + * Iterate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0 && + zfsvfs->z_drain_state == ZFS_DRAIN_RUNNING; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + + /* + * zrele() decrements the znode's ref count and may cause + * it to be synchronously freed. We interrupt freeing + * of this znode by checking the return value of + * dmu_objset_zfs_unmounting() in dmu_free_long_range() + * when an unmount is requested. + */ + zrele(zp); + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + } + zap_cursor_fini(&zc); + + mutex_enter(&zfsvfs->z_drain_lock); + zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN; + cv_broadcast(&zfsvfs->z_drain_cv); + mutex_exit(&zfsvfs->z_drain_lock); +} + +/* + * Sets z_draining then tries to dispatch async unlinked drain. + * If that fails executes synchronous unlinked drain. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + mutex_enter(&zfsvfs->z_drain_lock); + ASSERT(zfsvfs->z_drain_state == ZFS_DRAIN_SHUTDOWN); + zfsvfs->z_drain_state = ZFS_DRAIN_RUNNING; + mutex_exit(&zfsvfs->z_drain_lock); + + if (taskq_dispatch( + dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), + zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP) == 0) { + zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); + zfs_unlinked_drain_task(zfsvfs); + } +} + +/* + * Wait for the unlinked drain taskq task to stop. This will interrupt the + * unlinked set processing if it is in progress. 
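+ *
+ * Rough sketch of the z_drain_state handshake (see the functions above
+ * and below):
+ *   zfs_unlinked_drain():           SHUTDOWN -> RUNNING, dispatch task
+ *   zfs_unlinked_drain_task():      iterates while state == RUNNING,
+ *                                   then sets SHUTDOWN and broadcasts
+ *                                   z_drain_cv
+ *   zfs_unlinked_drain_stop_wait(): sets SHUTDOWN_REQ and waits on
+ *                                   z_drain_cv until state == SHUTDOWN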
+ */ +void +zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + mutex_enter(&zfsvfs->z_drain_lock); + while (zfsvfs->z_drain_state != ZFS_DRAIN_SHUTDOWN) { + zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN_REQ; + cv_wait(&zfsvfs->z_drain_cv, &zfsvfs->z_drain_lock); + } + mutex_exit(&zfsvfs->z_drain_lock); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || + S_ISLNK(ZTOI(xzp)->i_mode)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_zrele_async(xzp); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + zfs_zrele_async(xzp); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + int error; + + /* + * If this is an attribute directory, purge its contents. + */ + if (S_ISDIR(zp->z_mode) && (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + + return; + } + } + + /* + * Free up all the data in the file. We don't do this for directories + * because we need truncate and remove to be in the same tx, like in + * zfs_znode_delete(). Otherwise, if we crash here we'll end up with + * an inconsistent truncated zap object in the delete queue. Note a + * truncated file is harmless since it only contains user data. + */ + if (S_ISREG(zp->z_mode)) { + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. 
+ */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT(error == 0); + } + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + goto out; + } + + if (xzp) { + ASSERT(error == 0); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + /* + * Remove this znode from the unlinked set. If a rollback has + * occurred while a file was open and unlinked, then when the file + * is closed post-rollback it will not exist in the rolled-back + * version of the unlinked object. + */ + error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + zp->z_id, tx); + VERIFY(error == 0 || error == ENOENT); + + uint64_t count; + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) + zfs_zrele_async(xzp); +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can fail in the following cases: + * - if zp has been unlinked. + * - if the number of entries with the same hash (aka. colliding entries) + * exceeds the capacity of a leaf-block of fatzap and splitting of the + * leaf-block does not help. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t value; + int zp_is_dir = S_ISDIR(zp->z_mode); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOENT)); + } + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + } + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, + &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will roll back the SA updates done above.
+ */ + if (error != 0) { + mutex_exit(&zp->z_lock); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + mutex_exit(&zp->z_lock); + + mutex_enter(&dzp->z_lock); + dzp->z_size++; + if (zp_is_dir) + dzp->z_links++; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (ZTOZSB(zp)->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && + !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, + dl->dl_name, mt, tx); + } else { + error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, + tx); + } + + return (error); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. Can + * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. 
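+ * + * For example (illustrative): zfs_purgedir() above passes NULL, so an + * xattr file that loses its last link is added to the unlinked set right + * here, while a caller such as zfs_remove() typically passes &unlinked + * and, if it comes back true, calls zfs_unlinked_add() itself before + * committing the transaction.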
+ */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + int zp_is_dir = S_ISDIR(zp->z_mode); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (!(flag & ZRENAMING)) { + mutex_enter(&zp->z_lock); + + if (zp_is_dir && !zfs_dirempty(zp)) { + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOTEMPTY)); + } + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) { + mutex_exit(&zp->z_lock); + return (error); + } + + if (zp->z_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %lu is %u, " + "should be at least %u", zp->z_id, + (int)zp->z_links, zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; + } + if (--zp->z_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_links = 0; + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT(error == 0); + mutex_exit(&zp->z_lock); + } else { + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) + return (error); + } + + mutex_enter(&dzp->z_lock); + dzp->z_size--; /* one dirent removed */ + if (zp_is_dir) + dzp->z_links--; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. + * + * The internal ZAP size, rather than zp->z_size, needs to be checked since + * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. 
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + uint64_t count; + int error; + + if (dzp->z_dirlocks != NULL) + return (B_FALSE); + + error = zap_count(zfsvfs->z_os, dzp->z_id, &count); + if (error != 0 || count != 0) + return (B_FALSE); + + return (B_TRUE); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; +#ifdef DEBUG + uint64_t parent; +#endif + + *xzpp = NULL; + + if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) + return (error); + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + if (!zp->z_unlinked) + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + +#ifdef __APPLE__ + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(xzp, zfsvfs); +#endif + + *xzpp = xzp; + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xipp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + if (error) + return (error); + + if (xzp != NULL) { + *xzpp = xzp; + zfs_dirent_unlock(dl); + return (0); + } + + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(ENOENT)); + } + + if (zfs_is_readonly(zfsvfs)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. 
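+ * + * (This is presumably why the attribute directory created below uses + * mode 0777 with S_ISVTX: the directory itself is not the access-control + * point; the base file and the individual attribute files are.)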
+ */ + va.va_mask = ATTR_TYPE | ATTR_MODE | ATTR_UID | ATTR_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xzpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * you have write access to the entry, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + + if (zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (vnode_isreg(ZTOV(zp)) && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(ZTOV(zp), cr)); +} diff --git a/module/os/macos/zfs/zfs_file_os.c b/module/os/macos/zfs/zfs_file_os.c new file mode 100644 index 0000000000..60e2c05e55 --- /dev/null +++ b/module/os/macos/zfs/zfs_file_os.c @@ -0,0 +1,405 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include + +#define FILE_FD_NOTUSED -1 + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. + */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + struct vnode *vp = NULL; + vfs_context_t vctx; + int error; + + if (!(flags & O_CREAT) && (flags & O_WRONLY)) + flags |= O_EXCL; + + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_open(path, flags, mode, 0, &vp, vctx); + (void) vfs_context_rele(vctx); + if (error == 0 && + vp != NULL) { + zfs_file_t *zf; + zf = (zfs_file_t *)kmem_zalloc(sizeof (zfs_file_t), KM_SLEEP); + zf->f_vnode = vp; + zf->f_fd = FILE_FD_NOTUSED; + *fpp = zf; + } + + /* Optional, implemented O_APPEND: set offset to file size. */ + VERIFY0(flags & O_APPEND); + + return (error); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + vfs_context_t vctx; + vctx = vfs_context_create((vfs_context_t)0); + vnode_close(fp->f_vnode, fp->f_writes ? 
FWRITE : 0, vctx); + (void) vfs_context_rele(vctx); + + kmem_free(fp, sizeof (zfs_file_t)); +} + +static int +zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, + loff_t *off, ssize_t *resid) +{ + int error; + ssize_t local_resid = count; + + /* If we came with a 'fd' use it, as it can handle pipes. */ + if (fp->f_fd == FILE_FD_NOTUSED) + error = zfs_vn_rdwr(UIO_WRITE, fp->f_vnode, (caddr_t)buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + else + error = spl_vn_rdwr(UIO_WRITE, fp, (caddr_t)buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + + if (error != 0) + return (SET_ERROR(error)); + + fp->f_writes = 1; + + if (resid != NULL) + *resid = local_resid; + else if (local_resid != 0) + return (SET_ERROR(EIO)); + + *off += count - local_resid; + + return (0); +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_offset; + ssize_t rc; + + rc = zfs_file_write_impl(fp, buf, count, &off, resid); + if (rc == 0) + fp->f_offset = off; + + return (SET_ERROR(rc)); +} + +/* + * Stateless write - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + return (zfs_file_write_impl(fp, buf, count, &off, resid)); +} + +static ssize_t +zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off, + ssize_t *resid) +{ + int error; + ssize_t local_resid = count; + + /* If we have realvp, it's faster to call its spl_vn_rdwr */ + if (fp->f_fd == FILE_FD_NOTUSED) + error = zfs_vn_rdwr(UIO_READ, fp->f_vnode, buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + else + error = spl_vn_rdwr(UIO_READ, fp, buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + + if (error) + return (SET_ERROR(error)); + + *off += count - local_resid; + if (resid != NULL) + *resid = local_resid; + + return (SET_ERROR(0)); +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_offset; + int rc; + + rc = zfs_file_read_impl(fp, buf, count, &off, resid); + if (rc == 0) + fp->f_offset = off; + return (rc); +} + +/* + * Stateless read - os internal file pointer is not updated. 
+ * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to write + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unwritten bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + return (zfs_file_read_impl(fp, buf, count, &off, resid)); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success errno on failure (ESPIPE for non seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + if (*offp < 0 || *offp > MAXOFFSET_T) + return (EINVAL); + + switch (whence) { + case SEEK_SET: + fp->f_offset = *offp; + break; + case SEEK_CUR: + fp->f_offset += *offp; + *offp = fp->f_offset; + break; + case SEEK_END: + /* Implement this if eventually needed: get filesize */ + VERIFY0(whence == SEEK_END); + break; + } + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode. + * + * Returns 0 on success or error code of underlying getattr call on failure. + */ +int +zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr) +{ + vfs_context_t vctx; + int rc; + vattr_t vap; + + VATTR_INIT(&vap); + VATTR_WANTED(&vap, va_size); + VATTR_WANTED(&vap, va_mode); + + vctx = vfs_context_create((vfs_context_t)0); + rc = vnode_getattr(filp->f_vnode, &vap, vctx); + (void) vfs_context_rele(vctx); + + if (rc) + return (rc); + + zfattr->zfa_size = vap.va_size; + zfattr->zfa_mode = vap.va_mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. + */ +int +zfs_file_fsync(zfs_file_t *filp, int flags) +{ + vfs_context_t vctx; + int rc; + + vctx = vfs_context_create((vfs_context_t)0); + rc = VNOP_FSYNC(filp->f_vnode, (flags == FSYNC), vctx); + (void) vfs_context_rele(vctx); + return (rc); +} + +/* + * fallocate - allocate or free space on disk + * + * fp - file pointer + * mode (non-standard options for hole punching etc) + * offset - offset to start allocating or freeing from + * len - length to free / allocate + * + * OPTIONAL + */ +int +zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) +{ + return (0); +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_offset); +} + +/* + * Request file pointer private data + * + * fp - pointer to file + * + * Returns pointer to file private data. + */ +extern kmutex_t zfsdev_state_lock; +dev_t zfsdev_get_dev(void); + +void * +zfs_file_private(zfs_file_t *fp) +{ + dev_t dev; + void *zs; + + dev = zfsdev_get_dev(); + dprintf("%s: fetching dev x%x\n", __func__, dev); + if (dev == 0) + return (NULL); + + mutex_enter(&zfsdev_state_lock); + zs = zfsdev_get_state(minor(dev), ZST_ALL); + mutex_exit(&zfsdev_state_lock); + dprintf("%s: searching minor %d %p\n", __func__, minor(dev), zs); + + return (zs); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success. 
+ * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (EOPNOTSUPP); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * fpp - pointer to file pointer + * + * Returns 0 on success EBADF on failure. + */ +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + *fpp = getf(fd); + if (*fpp == NULL) + return (EBADF); + return (0); +} + +/* + * Drop reference to file pointer + * + * fd - input file descriptor + */ +void +zfs_file_put(int fd) +{ + releasef(fd); +} diff --git a/module/os/macos/zfs/zfs_fuid_os.c b/module/os/macos/zfs/zfs_fuid_os.c new file mode 100644 index 0000000000..8d6e9b9c54 --- /dev/null +++ b/module/os/macos/zfs/zfs_fuid_os.c @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#endif +#include + +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); + + return ((uint64_t)id); +} diff --git a/module/os/macos/zfs/zfs_ioctl_os.c b/module/os/macos/zfs/zfs_ioctl_os.c new file mode 100644 index 0000000000..d76718a3a9 --- /dev/null +++ b/module/os/macos/zfs/zfs_ioctl_os.c @@ -0,0 +1,403 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013, 2020 Jorgen Lundman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +int zfs_major = 0; +int zfs_bmajor = 0; +static void *zfs_devnode = NULL; +#define ZFS_MAJOR -24 + +boolean_t +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_vfs != NULL); +} + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + int error = 0; + + if (*zfvp == NULL || (*zfvp)->z_vfs == NULL) + return (SET_ERROR(ESRCH)); + + error = vfs_busy((*zfvp)->z_vfs, LK_NOWAIT); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + return (error); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + vfs_unbusy(zfsvfs->z_vfs); +} + +static uint_t zfsdev_private_tsd; + +static int +zfsdev_state_init(dev_t dev) +{ + zfsdev_state_t *zs, *zsprev = NULL; + minor_t minor; + boolean_t newzs = B_FALSE; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + minor = minor(dev); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == -1) + break; + zsprev = zs; + } + + if (!zs) { + zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); + newzs = B_TRUE; + } + + /* Store this dev_t in tsd, so zfs_get_private() can retrieve it */ + tsd_set(zfsdev_private_tsd, (void *)(uintptr_t)dev); + + zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); + zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); + + /* + * In order to provide for lock-free concurrent read access + * to the minor list in zfsdev_get_state_impl(), new entries + * must be completely written before linking them into the + * list whereas existing entries are already linked; the last + * operation must be updating zs_minor (from -1 to the new + * value). 
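+ * + * Illustratively (a sketch, not the exact reader code): a brand new entry + * is fully filled in (zs_onexit, zs_zevent, zs_minor) and only then linked + * via zsprev->zs_next, while a recycled entry is already visible, so its + * zs_minor is written only after the other fields; a reader that checks + * zs_minor before touching anything else never sees a half-initialized + * entry.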
+ */ + if (newzs) { + zs->zs_minor = minor; + zsprev->zs_next = zs; + } else { + zs->zs_minor = minor; + } + + return (0); +} + +dev_t +zfsdev_get_dev(void) +{ + return ((dev_t)tsd_get(zfsdev_private_tsd)); +} + +static int +zfsdev_state_destroy(dev_t dev) +{ + zfsdev_state_t *zs; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + tsd_set(zfsdev_private_tsd, NULL); + + zs = zfsdev_get_state(minor(dev), ZST_ALL); + + if (!zs) { + printf("%s: no cleanup for minor x%x\n", __func__, + minor(dev)); + return (0); + } + + ASSERT(zs != NULL); + if (zs->zs_minor != -1) { + zs->zs_minor = -1; + zfs_onexit_destroy(zs->zs_onexit); + zfs_zevent_destroy(zs->zs_zevent); + zs->zs_onexit = NULL; + zs->zs_zevent = NULL; + } + return (0); +} + +static int +zfsdev_open(dev_t dev, int flags, int devtype, struct proc *p) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + if (zfsdev_get_state(minor(dev), ZST_ALL)) { + mutex_exit(&zfsdev_state_lock); + return (0); + } + error = zfsdev_state_init(dev); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static int +zfsdev_release(dev_t dev, int flags, int devtype, struct proc *p) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfsdev_state_destroy(dev); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static int +zfsdev_ioctl(dev_t dev, ulong_t cmd, caddr_t arg, __unused int xflag, + struct proc *p) +{ + uint_t len, vecnum; + zfs_iocparm_t *zit; + zfs_cmd_t *zc; + int error, rc; + user_addr_t uaddr; + + /* Translate XNU ioctl to enum table: */ + len = IOCPARM_LEN(cmd); + vecnum = cmd - _IOWR('Z', ZFS_IOC_FIRST, zfs_iocparm_t); + zit = (void *)arg; + uaddr = (user_addr_t)zit->zfs_cmd; + + if (len != sizeof (zfs_iocparm_t)) { + /* + * printf("len %d vecnum: %d sizeof (zfs_cmd_t) %lu\n", + * len, vecnum, sizeof (zfs_cmd_t)); + */ + /* + * We can get plenty of raw ioctl()s here; for example, open() + * will cause spec_open() to issue DKIOCGETTHROTTLEMASK. + */ + return (EINVAL); + } + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + + error = zfsdev_ioctl_common(vecnum, zc, 0); + + rc = copyout(zc, uaddr, sizeof (*zc)); + + if (error == 0 && rc != 0) + error = -SET_ERROR(EFAULT); + + /* + * OSX must return(0) or XNU doesn't copyout(). Save the real + * error for userland in zfs_ioc_error. + */ + zit->zfs_ioc_error = error; + error = 0; + +out: + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); + +} + +/* for spa_iokit_dataset_proxy_create */ +#include +#include + +static int +zfs_ioc_osx_proxy_dataset(zfs_cmd_t *zc) +{ + int error; + const char *osname; + + /* XXX Get osname */ + osname = zc->zc_name; + + /* Create new virtual disk, and return /dev/disk name */ + error = zfs_osx_proxy_create(osname); + + if (error == 0) + error = zfs_osx_proxy_get_bsdname(osname, + zc->zc_value, sizeof (zc->zc_value)); + if (error == 0) + printf("%s: Created virtual disk '%s' for '%s'\n", __func__, + zc->zc_value, osname); + + return (error); +} + +void +zfs_ioctl_init_os(void) +{ + /* APPLE Specific ioctls */ + zfs_ioctl_register_pool(ZFS_IOC_PROXY_DATASET, + zfs_ioc_osx_proxy_dataset, zfs_secpolicy_config, + B_FALSE, POOL_CHECK_NONE); +} + +/* ioctl handler for block device.
Relay to zvol */ +static int +zfsdev_bioctl(dev_t dev, ulong_t cmd, caddr_t data, + __unused int flag, struct proc *p) +{ + return (zvol_os_ioctl(dev, cmd, data, 1, NULL, NULL)); +} + +static struct bdevsw zfs_bdevsw = { + .d_open = zvol_os_open, + .d_close = zvol_os_close, + .d_strategy = zvol_os_strategy, + .d_ioctl = zfsdev_bioctl, /* block ioctl handler */ + .d_dump = eno_dump, + .d_psize = zvol_os_get_volume_blocksize, + .d_type = D_DISK, +}; + +static struct cdevsw zfs_cdevsw = { + .d_open = zfsdev_open, + .d_close = zfsdev_release, + .d_read = zvol_os_read, + .d_write = zvol_os_write, + .d_ioctl = zfsdev_ioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = D_DISK +}; + +/* Callback to create a unique minor for each open */ +static int +zfs_devfs_clone(__unused dev_t dev, int action) +{ + static minor_t minorx; + + if (action == DEVFS_CLONE_ALLOC) { + mutex_enter(&zfsdev_state_lock); + minorx = zfsdev_minor_alloc(); + mutex_exit(&zfsdev_state_lock); + return (minorx); + } + return (-1); +} + +int +zfsdev_attach(void) +{ + dev_t dev; + + zfs_bmajor = bdevsw_add(-1, &zfs_bdevsw); + zfs_major = cdevsw_add_with_bdev(-1, &zfs_cdevsw, zfs_bmajor); + + if (zfs_major < 0) { + printf("ZFS: zfs_attach() failed to allocate a major number\n"); + return (-1); + } + + dev = makedev(zfs_major, 0); /* Get the device number */ + zfs_devnode = devfs_make_node_clone(dev, DEVFS_CHAR, UID_ROOT, + GID_WHEEL, 0666, zfs_devfs_clone, "zfs", 0); + if (!zfs_devnode) { + printf("ZFS: devfs_make_node() failed\n"); + return (-1); + } + + wrap_avl_init(); + wrap_unicode_init(); + wrap_nvpair_init(); + wrap_zcommon_init(); + wrap_icp_init(); + wrap_lua_init(); + + tsd_create(&zfsdev_private_tsd, NULL); + + kstat_osx_init(); + return (0); +} + +void +zfsdev_detach(void) +{ + kstat_osx_fini(); + + tsd_destroy(&zfsdev_private_tsd); + + wrap_lua_fini(); + wrap_icp_fini(); + wrap_zcommon_fini(); + wrap_nvpair_fini(); + wrap_unicode_fini(); + wrap_avl_fini(); + + if (zfs_devnode) { + devfs_remove(zfs_devnode); + zfs_devnode = NULL; + } + if (zfs_major) { + (void) cdevsw_remove(zfs_major, &zfs_cdevsw); + zfs_major = 0; + } +} + +uint64_t +zfs_max_nvlist_src_size_os(void) +{ + if (zfs_max_nvlist_src_size != 0) + return (zfs_max_nvlist_src_size); + + return (KMALLOC_MAX_SIZE); +} diff --git a/module/os/macos/zfs/zfs_kstat_osx.c b/module/os/macos/zfs/zfs_kstat_osx.c new file mode 100644 index 0000000000..91c8416bc5 --- /dev/null +++ b/module/os/macos/zfs/zfs_kstat_osx.c @@ -0,0 +1,869 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014, 2020 Jorgen Lundman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * In Solaris the tunable are set via /etc/system. Until we have a load + * time configuration, we add them to writable kstat tunables. + * + * This table is more or less populated from IllumOS mdb zfs_params sources + * https://github.com/illumos/illumos-gate/blob/master/ + * usr/src/cmd/mdb/common/modules/zfs/zfs.c#L336-L392 + * + */ + + + +osx_kstat_t osx_kstat = { + { "spa_version", KSTAT_DATA_UINT64 }, + { "zpl_version", KSTAT_DATA_UINT64 }, + + { "active_vnodes", KSTAT_DATA_UINT64 }, + { "vnop_debug", KSTAT_DATA_UINT64 }, + { "reclaim_nodes", KSTAT_DATA_UINT64 }, + { "ignore_negatives", KSTAT_DATA_UINT64 }, + { "ignore_positives", KSTAT_DATA_UINT64 }, + { "create_negatives", KSTAT_DATA_UINT64 }, + { "force_formd_normalized", KSTAT_DATA_UINT64 }, + { "skip_unlinked_drain", KSTAT_DATA_UINT64 }, + { "use_system_sync", KSTAT_DATA_UINT64 }, + + { "zfs_arc_max", KSTAT_DATA_UINT64 }, + { "zfs_arc_min", KSTAT_DATA_UINT64 }, + { "zfs_arc_meta_limit", KSTAT_DATA_UINT64 }, + { "zfs_arc_meta_min", KSTAT_DATA_UINT64 }, + { "zfs_arc_grow_retry", KSTAT_DATA_UINT64 }, + { "zfs_arc_shrink_shift", KSTAT_DATA_UINT64 }, + { "zfs_arc_p_min_shift", KSTAT_DATA_UINT64 }, + { "zfs_arc_average_blocksize", KSTAT_DATA_UINT64 }, + + { "l2arc_write_max", KSTAT_DATA_UINT64 }, + { "l2arc_write_boost", KSTAT_DATA_UINT64 }, + { "l2arc_headroom", KSTAT_DATA_UINT64 }, + { "l2arc_headroom_boost", KSTAT_DATA_UINT64 }, + { "l2arc_feed_secs", KSTAT_DATA_UINT64 }, + { "l2arc_feed_min_ms", KSTAT_DATA_UINT64 }, + + { "max_active", KSTAT_DATA_UINT64 }, + { "sync_read_min_active", KSTAT_DATA_UINT64 }, + { "sync_read_max_active", KSTAT_DATA_UINT64 }, + { "sync_write_min_active", KSTAT_DATA_UINT64 }, + { "sync_write_max_active", KSTAT_DATA_UINT64 }, + { "async_read_min_active", KSTAT_DATA_UINT64 }, + { "async_read_max_active", KSTAT_DATA_UINT64 }, + { "async_write_min_active", KSTAT_DATA_UINT64 }, + { "async_write_max_active", KSTAT_DATA_UINT64 }, + { "scrub_min_active", KSTAT_DATA_UINT64 }, + { "scrub_max_active", KSTAT_DATA_UINT64 }, + { "async_write_min_dirty_pct", KSTAT_DATA_INT64 }, + { "async_write_max_dirty_pct", KSTAT_DATA_INT64 }, + { "aggregation_limit", KSTAT_DATA_INT64 }, + { "read_gap_limit", KSTAT_DATA_INT64 }, + { "write_gap_limit", KSTAT_DATA_INT64 }, + + {"arc_lotsfree_percent", KSTAT_DATA_INT64 }, + {"zfs_dirty_data_max", KSTAT_DATA_INT64 }, + {"zfs_delay_max_ns", KSTAT_DATA_INT64 }, + {"zfs_delay_min_dirty_percent", KSTAT_DATA_INT64 }, + {"zfs_delay_scale", KSTAT_DATA_INT64 }, + {"spa_asize_inflation", KSTAT_DATA_INT64 }, + {"zfs_prefetch_disable", KSTAT_DATA_INT64 }, + {"zfetch_max_streams", KSTAT_DATA_INT64 }, + {"zfetch_min_sec_reap", KSTAT_DATA_INT64 }, + {"zfetch_array_rd_sz", KSTAT_DATA_INT64 }, + {"zfs_default_bs", KSTAT_DATA_INT64 }, + {"zfs_default_ibs", KSTAT_DATA_INT64 }, + {"metaslab_aliquot", KSTAT_DATA_INT64 }, + {"spa_max_replication_override", KSTAT_DATA_INT64 }, + {"spa_mode_global", KSTAT_DATA_INT64 }, + {"zfs_flags", KSTAT_DATA_INT64 }, + {"zfs_txg_timeout", 
KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_max", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_size", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_bshift", KSTAT_DATA_INT64 }, + {"vdev_mirror_shift", KSTAT_DATA_INT64 }, + {"zfs_scrub_limit", KSTAT_DATA_INT64 }, + {"zfs_no_scrub_io", KSTAT_DATA_INT64 }, + {"zfs_no_scrub_prefetch", KSTAT_DATA_INT64 }, + {"fzap_default_block_shift", KSTAT_DATA_INT64 }, + {"zfs_immediate_write_sz", KSTAT_DATA_INT64 }, + {"zfs_read_chunk_size", KSTAT_DATA_INT64 }, + {"zfs_nocacheflush", KSTAT_DATA_INT64 }, + {"zil_replay_disable", KSTAT_DATA_INT64 }, + {"metaslab_df_alloc_threshold", KSTAT_DATA_INT64 }, + {"metaslab_df_free_pct", KSTAT_DATA_INT64 }, + {"zio_injection_enabled", KSTAT_DATA_INT64 }, + {"zvol_immediate_write_sz", KSTAT_DATA_INT64 }, + + { "l2arc_noprefetch", KSTAT_DATA_INT64 }, + { "l2arc_feed_again", KSTAT_DATA_INT64 }, + { "l2arc_norw", KSTAT_DATA_INT64 }, + + {"zfs_recover", KSTAT_DATA_INT64 }, + + {"zfs_free_bpobj_enabled", KSTAT_DATA_INT64 }, + + {"zfs_send_corrupt_data", KSTAT_DATA_UINT64 }, + {"zfs_send_queue_length", KSTAT_DATA_UINT64 }, + {"zfs_recv_queue_length", KSTAT_DATA_UINT64 }, + + {"zvol_inhibit_dev", KSTAT_DATA_UINT64 }, + {"zfs_send_set_freerecords_bit", KSTAT_DATA_UINT64 }, + + {"zfs_write_implies_delete_child", KSTAT_DATA_UINT64 }, + {"zfs_send_holes_without_birth_time", KSTAT_DATA_UINT64 }, + + {"dbuf_cache_max_bytes", KSTAT_DATA_UINT64 }, + + {"zfs_vdev_queue_depth_pct", KSTAT_DATA_UINT64 }, + {"zio_dva_throttle_enabled", KSTAT_DATA_UINT64 }, + + {"zfs_lua_max_instrlimit", KSTAT_DATA_UINT64 }, + {"zfs_lua_max_memlimit", KSTAT_DATA_UINT64 }, + + {"zfs_trim_extent_bytes_max", KSTAT_DATA_UINT64 }, + {"zfs_trim_extent_bytes_min", KSTAT_DATA_UINT64 }, + {"zfs_trim_metaslab_skip", KSTAT_DATA_UINT64 }, + {"zfs_trim_txg_batch", KSTAT_DATA_UINT64 }, + {"zfs_trim_queue_limit", KSTAT_DATA_UINT64 }, + + {"zfs_send_unmodified_spill_blocks", KSTAT_DATA_UINT64 }, + {"zfs_special_class_metadata_reserve_pct", KSTAT_DATA_UINT64 }, + + {"zfs_vdev_raidz_impl", KSTAT_DATA_STRING }, + {"icp_gcm_impl", KSTAT_DATA_STRING }, + {"icp_aes_impl", KSTAT_DATA_STRING }, + {"zfs_fletcher_4_impl", KSTAT_DATA_STRING }, + + {"zfs_expire_snapshot", KSTAT_DATA_UINT64 }, + {"zfs_admin_snapshot", KSTAT_DATA_UINT64 }, + {"zfs_auto_snapshot", KSTAT_DATA_UINT64 }, + + {"zfs_spa_discard_memory_limit", KSTAT_DATA_UINT64 }, + {"zfs_async_block_max_blocks", KSTAT_DATA_UINT64 }, + {"zfs_initialize_chunk_size", KSTAT_DATA_UINT64 }, + {"zfs_scan_suspend_progress", KSTAT_DATA_UINT64 }, + {"zfs_removal_suspend_progress", KSTAT_DATA_UINT64 }, + {"zfs_livelist_max_entries", KSTAT_DATA_UINT64 }, + + {"zfs_allow_redacted_dataset_mount", KSTAT_DATA_UINT64 }, + {"zfs_checksum_events_per_second", KSTAT_DATA_UINT64 }, + {"zfs_commit_timeout_pct", KSTAT_DATA_UINT64 }, + {"zfs_compressed_arc_enabled", KSTAT_DATA_UINT64 }, + {"zfs_condense_indirect_commit_entry_delay_ms", KSTAT_DATA_UINT64 }, + {"zfs_condense_min_mapping_bytes", KSTAT_DATA_UINT64 }, + {"zfs_deadman_checktime_ms", KSTAT_DATA_UINT64 }, + {"zfs_deadman_failmode", KSTAT_DATA_STRING }, + {"zfs_deadman_synctime_ms", KSTAT_DATA_UINT64 }, + {"zfs_deadman_ziotime_ms", KSTAT_DATA_UINT64 }, + {"zfs_disable_ivset_guid_check", KSTAT_DATA_UINT64 }, + {"zfs_initialize_value", KSTAT_DATA_UINT64 }, + {"zfs_keep_log_spacemaps_at_export", KSTAT_DATA_UINT64 }, + {"l2arc_rebuild_blocks_min_l2size", KSTAT_DATA_UINT64 }, + {"l2arc_rebuild_enabled", KSTAT_DATA_UINT64 }, + {"l2arc_trim_ahead", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_new_alloc", 
KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_sync_cancel", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_sync_pause", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_zthr_cancel", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_zthr_pause", KSTAT_DATA_UINT64 }, + {"zfs_livelist_min_percent_shared", KSTAT_DATA_UINT64 }, + {"zfs_max_dataset_nesting", KSTAT_DATA_UINT64 }, + {"zfs_max_missing_tvds", KSTAT_DATA_UINT64 }, + {"metaslab_debug_load", KSTAT_DATA_UINT64 }, + {"metaslab_force_ganging", KSTAT_DATA_UINT64 }, + {"zfs_multihost_fail_intervals", KSTAT_DATA_UINT64 }, + {"zfs_multihost_import_intervals", KSTAT_DATA_UINT64 }, + {"zfs_multihost_interval", KSTAT_DATA_UINT64 }, + {"zfs_override_estimate_recordsize", KSTAT_DATA_UINT64 }, + {"zfs_remove_max_segment", KSTAT_DATA_UINT64 }, + {"zfs_resilver_min_time_ms", KSTAT_DATA_UINT64 }, + {"zfs_scan_legacy", KSTAT_DATA_UINT64 }, + {"zfs_scan_vdev_limit", KSTAT_DATA_UINT64 }, + {"zfs_slow_io_events_per_second", KSTAT_DATA_UINT64 }, + {"spa_load_verify_data", KSTAT_DATA_UINT64 }, + {"spa_load_verify_metadata", KSTAT_DATA_UINT64 }, + {"zfs_unlink_suspend_progress", KSTAT_DATA_UINT64 }, + {"zfs_vdev_min_ms_count", KSTAT_DATA_UINT64 }, + {"vdev_validate_skip", KSTAT_DATA_UINT64 }, + {"zfs_zevent_len_max", KSTAT_DATA_UINT64 }, + {"zio_slow_io_ms", KSTAT_DATA_UINT64 }, + +}; + +static char vdev_raidz_string[KSTAT_STRLEN] = { 0 }; +static char icp_gcm_string[KSTAT_STRLEN] = { 0 }; +static char icp_aes_string[KSTAT_STRLEN] = { 0 }; +static char zfs_fletcher_4_string[KSTAT_STRLEN] = { 0 }; + +static kstat_t *osx_kstat_ksp; + +#if !defined(__OPTIMIZE__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +extern kstat_t *arc_ksp; + +static int osx_kstat_update(kstat_t *ksp, int rw) +{ + osx_kstat_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + /* Darwin */ + + zfs_vnop_ignore_negatives = + ks->darwin_ignore_negatives.value.ui64; + zfs_vnop_ignore_positives = + ks->darwin_ignore_positives.value.ui64; + zfs_vnop_create_negatives = + ks->darwin_create_negatives.value.ui64; + zfs_vnop_force_formd_normalized_output = + ks->darwin_force_formd_normalized.value.ui64; + zfs_vnop_skip_unlinked_drain = + ks->darwin_skip_unlinked_drain.value.ui64; + zfs_vfs_sync_paranoia = + ks->darwin_use_system_sync.value.ui64; + + /* L2ARC */ + l2arc_write_max = ks->l2arc_write_max.value.ui64; + l2arc_write_boost = ks->l2arc_write_boost.value.ui64; + l2arc_headroom = ks->l2arc_headroom.value.ui64; + l2arc_headroom_boost = ks->l2arc_headroom_boost.value.ui64; + l2arc_feed_secs = ks->l2arc_feed_secs.value.ui64; + l2arc_feed_min_ms = ks->l2arc_feed_min_ms.value.ui64; + + l2arc_noprefetch = ks->l2arc_noprefetch.value.i64; + l2arc_feed_again = ks->l2arc_feed_again.value.i64; + l2arc_norw = ks->l2arc_norw.value.i64; + + /* vdev_queue */ + + zfs_vdev_max_active = + ks->zfs_vdev_max_active.value.ui64; + zfs_vdev_sync_read_min_active = + ks->zfs_vdev_sync_read_min_active.value.ui64; + zfs_vdev_sync_read_max_active = + ks->zfs_vdev_sync_read_max_active.value.ui64; + zfs_vdev_sync_write_min_active = + ks->zfs_vdev_sync_write_min_active.value.ui64; + zfs_vdev_sync_write_max_active = + ks->zfs_vdev_sync_write_max_active.value.ui64; + zfs_vdev_async_read_min_active = + ks->zfs_vdev_async_read_min_active.value.ui64; + zfs_vdev_async_read_max_active = + ks->zfs_vdev_async_read_max_active.value.ui64; + zfs_vdev_async_write_min_active = + ks->zfs_vdev_async_write_min_active.value.ui64; + zfs_vdev_async_write_max_active = + ks->zfs_vdev_async_write_max_active.value.ui64; + 
zfs_vdev_scrub_min_active = + ks->zfs_vdev_scrub_min_active.value.ui64; + zfs_vdev_scrub_max_active = + ks->zfs_vdev_scrub_max_active.value.ui64; + zfs_vdev_async_write_active_min_dirty_percent = + ks->zfs_vdev_async_write_active_min_dirty_percent.value.i64; + zfs_vdev_async_write_active_max_dirty_percent = + ks->zfs_vdev_async_write_active_max_dirty_percent.value.i64; + zfs_vdev_aggregation_limit = + ks->zfs_vdev_aggregation_limit.value.i64; + zfs_vdev_read_gap_limit = + ks->zfs_vdev_read_gap_limit.value.i64; + zfs_vdev_write_gap_limit = + ks->zfs_vdev_write_gap_limit.value.i64; + + arc_lotsfree_percent = + ks->arc_lotsfree_percent.value.i64; + zfs_dirty_data_max = + ks->zfs_dirty_data_max.value.i64; + zfs_delay_max_ns = + ks->zfs_delay_max_ns.value.i64; + zfs_delay_min_dirty_percent = + ks->zfs_delay_min_dirty_percent.value.i64; + zfs_delay_scale = + ks->zfs_delay_scale.value.i64; + spa_asize_inflation = + ks->spa_asize_inflation.value.i64; + zfs_prefetch_disable = + ks->zfs_prefetch_disable.value.i64; + zfetch_max_streams = + ks->zfetch_max_streams.value.i64; + zfetch_min_sec_reap = + ks->zfetch_min_sec_reap.value.i64; + zfetch_array_rd_sz = + ks->zfetch_array_rd_sz.value.i64; + zfs_default_bs = + ks->zfs_default_bs.value.i64; + zfs_default_ibs = + ks->zfs_default_ibs.value.i64; + metaslab_aliquot = + ks->metaslab_aliquot.value.i64; + spa_max_replication_override = + ks->spa_max_replication_override.value.i64; + spa_mode_global = + ks->spa_mode_global.value.i64; + zfs_flags = + ks->zfs_flags.value.i64; + zfs_txg_timeout = + ks->zfs_txg_timeout.value.i64; + zfs_vdev_cache_max = + ks->zfs_vdev_cache_max.value.i64; + zfs_vdev_cache_size = + ks->zfs_vdev_cache_size.value.i64; + zfs_no_scrub_io = + ks->zfs_no_scrub_io.value.i64; + zfs_no_scrub_prefetch = + ks->zfs_no_scrub_prefetch.value.i64; + fzap_default_block_shift = + ks->fzap_default_block_shift.value.i64; + zfs_immediate_write_sz = + ks->zfs_immediate_write_sz.value.i64; + zfs_read_chunk_size = + ks->zfs_read_chunk_size.value.i64; + zfs_nocacheflush = + ks->zfs_nocacheflush.value.i64; + zil_replay_disable = + ks->zil_replay_disable.value.i64; + metaslab_df_alloc_threshold = + ks->metaslab_df_alloc_threshold.value.i64; + metaslab_df_free_pct = + ks->metaslab_df_free_pct.value.i64; + zio_injection_enabled = + ks->zio_injection_enabled.value.i64; + zvol_immediate_write_sz = + ks->zvol_immediate_write_sz.value.i64; + + zfs_recover = + ks->zfs_recover.value.i64; + + zfs_free_bpobj_enabled = + ks->zfs_free_bpobj_enabled.value.i64; + + zfs_send_corrupt_data = + ks->zfs_send_corrupt_data.value.ui64; + zfs_send_queue_length = + ks->zfs_send_queue_length.value.ui64; + zfs_recv_queue_length = + ks->zfs_recv_queue_length.value.ui64; + + zvol_inhibit_dev = + ks->zvol_inhibit_dev.value.ui64; + zfs_send_set_freerecords_bit = + ks->zfs_send_set_freerecords_bit.value.ui64; + + zfs_write_implies_delete_child = + ks->zfs_write_implies_delete_child.value.ui64; + send_holes_without_birth_time = + ks->zfs_send_holes_without_birth_time.value.ui64; + + dbuf_cache_max_bytes = + ks->dbuf_cache_max_bytes.value.ui64; + + zfs_vdev_queue_depth_pct = + ks->zfs_vdev_queue_depth_pct.value.ui64; + + zio_dva_throttle_enabled = + (boolean_t)ks->zio_dva_throttle_enabled.value.ui64; + + zfs_lua_max_instrlimit = + ks->zfs_lua_max_instrlimit.value.ui64; + zfs_lua_max_memlimit = + ks->zfs_lua_max_memlimit.value.ui64; + + zfs_trim_extent_bytes_max = + ks->zfs_trim_extent_bytes_max.value.ui64; + zfs_trim_extent_bytes_min = + ks->zfs_trim_extent_bytes_min.value.ui64; + 
zfs_trim_metaslab_skip = + ks->zfs_trim_metaslab_skip.value.ui64; + zfs_trim_txg_batch = + ks->zfs_trim_txg_batch.value.ui64; + zfs_trim_queue_limit = + ks->zfs_trim_queue_limit.value.ui64; + + zfs_send_unmodified_spill_blocks = + ks->zfs_send_unmodified_spill_blocks.value.ui64; + zfs_special_class_metadata_reserve_pct = + ks->zfs_special_class_metadata_reserve_pct.value.ui64; + + // Check if string has changed (from KREAD), if so, update. + if (strcmp(vdev_raidz_string, + KSTAT_NAMED_STR_PTR(&ks->zfs_vdev_raidz_impl)) != 0) + vdev_raidz_impl_set( + KSTAT_NAMED_STR_PTR(&ks->zfs_vdev_raidz_impl)); + + if (strcmp(icp_gcm_string, + KSTAT_NAMED_STR_PTR(&ks->icp_gcm_impl)) != 0) + gcm_impl_set(KSTAT_NAMED_STR_PTR(&ks->icp_gcm_impl)); + + if (strcmp(icp_aes_string, + KSTAT_NAMED_STR_PTR(&ks->icp_aes_impl)) != 0) + aes_impl_set(KSTAT_NAMED_STR_PTR(&ks->icp_aes_impl)); + + if (strcmp(zfs_fletcher_4_string, + KSTAT_NAMED_STR_PTR(&ks->zfs_fletcher_4_impl)) != 0) + fletcher_4_impl_set( + KSTAT_NAMED_STR_PTR(&ks->zfs_fletcher_4_impl)); + + zfs_expire_snapshot = + ks->zfs_expire_snapshot.value.ui64; + zfs_admin_snapshot = + ks->zfs_admin_snapshot.value.ui64; + zfs_auto_snapshot = + ks->zfs_auto_snapshot.value.ui64; + + zfs_spa_discard_memory_limit = + ks->zfs_spa_discard_memory_limit.value.ui64; + zfs_async_block_max_blocks = + ks->zfs_async_block_max_blocks.value.ui64; + zfs_initialize_chunk_size = + ks->zfs_initialize_chunk_size.value.ui64; + zfs_scan_suspend_progress = + ks->zfs_scan_suspend_progress.value.ui64; + zfs_removal_suspend_progress = + ks->zfs_removal_suspend_progress.value.ui64; + zfs_livelist_max_entries = + ks->zfs_livelist_max_entries.value.ui64; + + zfs_allow_redacted_dataset_mount = + ks->zfs_allow_redacted_dataset_mount.value.ui64; + zfs_checksum_events_per_second = + ks->zfs_checksum_events_per_second.value.ui64; + zfs_commit_timeout_pct = + ks->zfs_commit_timeout_pct.value.ui64; + zfs_compressed_arc_enabled = + ks->zfs_compressed_arc_enabled.value.ui64; + zfs_condense_indirect_commit_entry_delay_ms = + ks->zfs_condense_indirect_commit_entry_delay_ms.value.ui64; + zfs_condense_min_mapping_bytes = + ks->zfs_condense_min_mapping_bytes.value.ui64; + zfs_deadman_checktime_ms = + ks->zfs_deadman_checktime_ms.value.ui64; + + if (strcmp(zfs_deadman_failmode, + KSTAT_NAMED_STR_PTR(&ks->zfs_vdev_raidz_impl)) != 0) { + char *buf = + KSTAT_NAMED_STR_PTR(&ks->zfs_vdev_raidz_impl); + if (strcmp(buf, "wait") == 0) + zfs_deadman_failmode = "wait"; + if (strcmp(buf, "continue") == 0) + zfs_deadman_failmode = "continue"; + if (strcmp(buf, "panic") == 0) + zfs_deadman_failmode = "panic"; + param_set_deadman_failmode_common(buf); + } + + zfs_deadman_synctime_ms = + ks->zfs_deadman_synctime_ms.value.ui64; + zfs_deadman_ziotime_ms = + ks->zfs_deadman_ziotime_ms.value.ui64; + zfs_disable_ivset_guid_check = + ks->zfs_disable_ivset_guid_check.value.ui64; + zfs_initialize_value = + ks->zfs_initialize_value.value.ui64; + zfs_keep_log_spacemaps_at_export = + ks->zfs_keep_log_spacemaps_at_export.value.ui64; + l2arc_rebuild_blocks_min_l2size = + ks->l2arc_rebuild_blocks_min_l2size.value.ui64; + l2arc_rebuild_enabled = + ks->l2arc_rebuild_enabled.value.ui64; + l2arc_trim_ahead = ks->l2arc_trim_ahead.value.ui64; + zfs_livelist_condense_new_alloc = + ks->zfs_livelist_condense_new_alloc.value.ui64; + zfs_livelist_condense_sync_cancel = + ks->zfs_livelist_condense_sync_cancel.value.ui64; + zfs_livelist_condense_sync_pause = + ks->zfs_livelist_condense_sync_pause.value.ui64; + zfs_livelist_condense_zthr_cancel = + 
ks->zfs_livelist_condense_zthr_cancel.value.ui64; + zfs_livelist_condense_zthr_pause = + ks->zfs_livelist_condense_zthr_pause.value.ui64; + zfs_livelist_min_percent_shared = + ks->zfs_livelist_min_percent_shared.value.ui64; + zfs_max_dataset_nesting = + ks->zfs_max_dataset_nesting.value.ui64; + zfs_max_missing_tvds = + ks->zfs_max_missing_tvds.value.ui64; + metaslab_debug_load = ks->metaslab_debug_load.value.ui64; + metaslab_force_ganging = + ks->metaslab_force_ganging.value.ui64; + zfs_multihost_fail_intervals = + ks->zfs_multihost_fail_intervals.value.ui64; + zfs_multihost_import_intervals = + ks->zfs_multihost_import_intervals.value.ui64; + zfs_multihost_interval = + ks->zfs_multihost_interval.value.ui64; + zfs_override_estimate_recordsize = + ks->zfs_override_estimate_recordsize.value.ui64; + zfs_remove_max_segment = + ks->zfs_remove_max_segment.value.ui64; + zfs_resilver_min_time_ms = + ks->zfs_resilver_min_time_ms.value.ui64; + zfs_scan_legacy = ks->zfs_scan_legacy.value.ui64; + zfs_scan_vdev_limit = + ks->zfs_scan_vdev_limit.value.ui64; + zfs_slow_io_events_per_second = + ks->zfs_slow_io_events_per_second.value.ui64; + spa_load_verify_data = + ks->spa_load_verify_data.value.ui64; + spa_load_verify_metadata = + ks->spa_load_verify_metadata.value.ui64; + zfs_unlink_suspend_progress = + ks->zfs_unlink_suspend_progress.value.ui64; + zfs_vdev_min_ms_count = ks->zfs_vdev_min_ms_count.value.ui64; + vdev_validate_skip = ks->vdev_validate_skip.value.ui64; + zfs_zevent_len_max = ks->zfs_zevent_len_max.value.ui64; + zio_slow_io_ms = ks->zio_slow_io_ms.value.ui64; + + + } else { + + /* kstat READ */ + ks->spa_version.value.ui64 = SPA_VERSION; + ks->zpl_version.value.ui64 = ZPL_VERSION; + + /* Darwin */ + ks->darwin_active_vnodes.value.ui64 = vnop_num_vnodes; + ks->darwin_reclaim_nodes.value.ui64 = vnop_num_reclaims; + ks->darwin_ignore_negatives.value.ui64 = + zfs_vnop_ignore_negatives; + ks->darwin_ignore_positives.value.ui64 = + zfs_vnop_ignore_positives; + ks->darwin_create_negatives.value.ui64 = + zfs_vnop_create_negatives; + ks->darwin_force_formd_normalized.value.ui64 = + zfs_vnop_force_formd_normalized_output; + ks->darwin_skip_unlinked_drain.value.ui64 = + zfs_vnop_skip_unlinked_drain; + ks->darwin_use_system_sync.value.ui64 = zfs_vfs_sync_paranoia; + + /* L2ARC */ + ks->l2arc_write_max.value.ui64 = l2arc_write_max; + ks->l2arc_write_boost.value.ui64 = l2arc_write_boost; + ks->l2arc_headroom.value.ui64 = l2arc_headroom; + ks->l2arc_headroom_boost.value.ui64 = l2arc_headroom_boost; + ks->l2arc_feed_secs.value.ui64 = l2arc_feed_secs; + ks->l2arc_feed_min_ms.value.ui64 = l2arc_feed_min_ms; + + ks->l2arc_noprefetch.value.i64 = l2arc_noprefetch; + ks->l2arc_feed_again.value.i64 = l2arc_feed_again; + ks->l2arc_norw.value.i64 = l2arc_norw; + + /* vdev_queue */ + ks->zfs_vdev_max_active.value.ui64 = + zfs_vdev_max_active; + ks->zfs_vdev_sync_read_min_active.value.ui64 = + zfs_vdev_sync_read_min_active; + ks->zfs_vdev_sync_read_max_active.value.ui64 = + zfs_vdev_sync_read_max_active; + ks->zfs_vdev_sync_write_min_active.value.ui64 = + zfs_vdev_sync_write_min_active; + ks->zfs_vdev_sync_write_max_active.value.ui64 = + zfs_vdev_sync_write_max_active; + ks->zfs_vdev_async_read_min_active.value.ui64 = + zfs_vdev_async_read_min_active; + ks->zfs_vdev_async_read_max_active.value.ui64 = + zfs_vdev_async_read_max_active; + ks->zfs_vdev_async_write_min_active.value.ui64 = + zfs_vdev_async_write_min_active; + ks->zfs_vdev_async_write_max_active.value.ui64 = + zfs_vdev_async_write_max_active; + 
ks->zfs_vdev_scrub_min_active.value.ui64 = + zfs_vdev_scrub_min_active; + ks->zfs_vdev_scrub_max_active.value.ui64 = + zfs_vdev_scrub_max_active; + ks->zfs_vdev_async_write_active_min_dirty_percent.value.i64 = + zfs_vdev_async_write_active_min_dirty_percent; + ks->zfs_vdev_async_write_active_max_dirty_percent.value.i64 = + zfs_vdev_async_write_active_max_dirty_percent; + ks->zfs_vdev_aggregation_limit.value.i64 = + zfs_vdev_aggregation_limit; + ks->zfs_vdev_read_gap_limit.value.i64 = + zfs_vdev_read_gap_limit; + ks->zfs_vdev_write_gap_limit.value.i64 = + zfs_vdev_write_gap_limit; + + ks->arc_lotsfree_percent.value.i64 = + arc_lotsfree_percent; + ks->zfs_dirty_data_max.value.i64 = + zfs_dirty_data_max; + ks->zfs_delay_max_ns.value.i64 = + zfs_delay_max_ns; + ks->zfs_delay_min_dirty_percent.value.i64 = + zfs_delay_min_dirty_percent; + ks->zfs_delay_scale.value.i64 = + zfs_delay_scale; + ks->spa_asize_inflation.value.i64 = + spa_asize_inflation; + ks->zfs_prefetch_disable.value.i64 = + zfs_prefetch_disable; + ks->zfetch_max_streams.value.i64 = + zfetch_max_streams; + ks->zfetch_min_sec_reap.value.i64 = + zfetch_min_sec_reap; + ks->zfetch_array_rd_sz.value.i64 = + zfetch_array_rd_sz; + ks->zfs_default_bs.value.i64 = + zfs_default_bs; + ks->zfs_default_ibs.value.i64 = + zfs_default_ibs; + ks->metaslab_aliquot.value.i64 = + metaslab_aliquot; + ks->spa_max_replication_override.value.i64 = + spa_max_replication_override; + ks->spa_mode_global.value.i64 = + spa_mode_global; + ks->zfs_flags.value.i64 = + zfs_flags; + ks->zfs_txg_timeout.value.i64 = + zfs_txg_timeout; + ks->zfs_vdev_cache_max.value.i64 = + zfs_vdev_cache_max; + ks->zfs_vdev_cache_size.value.i64 = + zfs_vdev_cache_size; + ks->zfs_no_scrub_io.value.i64 = + zfs_no_scrub_io; + ks->zfs_no_scrub_prefetch.value.i64 = + zfs_no_scrub_prefetch; + ks->fzap_default_block_shift.value.i64 = + fzap_default_block_shift; + ks->zfs_immediate_write_sz.value.i64 = + zfs_immediate_write_sz; + ks->zfs_read_chunk_size.value.i64 = + zfs_read_chunk_size; + ks->zfs_nocacheflush.value.i64 = + zfs_nocacheflush; + ks->zil_replay_disable.value.i64 = + zil_replay_disable; + ks->metaslab_df_alloc_threshold.value.i64 = + metaslab_df_alloc_threshold; + ks->metaslab_df_free_pct.value.i64 = + metaslab_df_free_pct; + ks->zio_injection_enabled.value.i64 = + zio_injection_enabled; + ks->zvol_immediate_write_sz.value.i64 = + zvol_immediate_write_sz; + + ks->zfs_recover.value.i64 = + zfs_recover; + + ks->zfs_free_bpobj_enabled.value.i64 = + zfs_free_bpobj_enabled; + + ks->zfs_send_corrupt_data.value.ui64 = + zfs_send_corrupt_data; + ks->zfs_send_queue_length.value.ui64 = + zfs_send_queue_length; + ks->zfs_recv_queue_length.value.ui64 = + zfs_recv_queue_length; + + ks->zvol_inhibit_dev.value.ui64 = + zvol_inhibit_dev; + ks->zfs_send_set_freerecords_bit.value.ui64 = + zfs_send_set_freerecords_bit; + + ks->zfs_write_implies_delete_child.value.ui64 = + zfs_write_implies_delete_child; + ks->zfs_send_holes_without_birth_time.value.ui64 = + send_holes_without_birth_time; + + ks->dbuf_cache_max_bytes.value.ui64 = dbuf_cache_max_bytes; + + ks->zfs_vdev_queue_depth_pct.value.ui64 = + zfs_vdev_queue_depth_pct; + ks->zio_dva_throttle_enabled.value.ui64 = + (uint64_t)zio_dva_throttle_enabled; + + ks->zfs_lua_max_instrlimit.value.ui64 = zfs_lua_max_instrlimit; + ks->zfs_lua_max_memlimit.value.ui64 = zfs_lua_max_memlimit; + + ks->zfs_trim_extent_bytes_max.value.ui64 = + zfs_trim_extent_bytes_max; + ks->zfs_trim_extent_bytes_min.value.ui64 = + zfs_trim_extent_bytes_min; + 
ks->zfs_trim_metaslab_skip.value.ui64 = + zfs_trim_metaslab_skip; + ks->zfs_trim_txg_batch.value.ui64 = + zfs_trim_txg_batch; + ks->zfs_trim_queue_limit.value.ui64 = + zfs_trim_queue_limit; + + ks->zfs_send_unmodified_spill_blocks.value.ui64 = + zfs_send_unmodified_spill_blocks; + ks->zfs_special_class_metadata_reserve_pct.value.ui64 = + zfs_special_class_metadata_reserve_pct; + + vdev_raidz_impl_get(vdev_raidz_string, + sizeof (vdev_raidz_string)); + kstat_named_setstr(&ks->zfs_vdev_raidz_impl, vdev_raidz_string); + + gcm_impl_get(icp_gcm_string, sizeof (icp_gcm_string)); + kstat_named_setstr(&ks->icp_gcm_impl, icp_gcm_string); + + aes_impl_get(icp_aes_string, sizeof (icp_aes_string)); + kstat_named_setstr(&ks->icp_aes_impl, icp_aes_string); + + fletcher_4_get(zfs_fletcher_4_string, + sizeof (zfs_fletcher_4_string)); + kstat_named_setstr(&ks->zfs_fletcher_4_impl, + zfs_fletcher_4_string); + + ks->zfs_expire_snapshot.value.ui64 = zfs_expire_snapshot; + ks->zfs_admin_snapshot.value.ui64 = zfs_admin_snapshot; + ks->zfs_auto_snapshot.value.ui64 = zfs_auto_snapshot; + + ks->zfs_spa_discard_memory_limit.value.ui64 = + zfs_spa_discard_memory_limit; + ks->zfs_async_block_max_blocks.value.ui64 = + zfs_async_block_max_blocks; + ks->zfs_initialize_chunk_size.value.ui64 = + zfs_initialize_chunk_size; + ks->zfs_scan_suspend_progress.value.ui64 = + zfs_scan_suspend_progress; + ks->zfs_livelist_max_entries.value.ui64 = + zfs_livelist_max_entries; + + ks->zfs_allow_redacted_dataset_mount.value.ui64 = + zfs_allow_redacted_dataset_mount; + ks->zfs_checksum_events_per_second.value.ui64 = + zfs_checksum_events_per_second; + ks->zfs_commit_timeout_pct.value.ui64 = zfs_commit_timeout_pct; + ks->zfs_compressed_arc_enabled.value.ui64 = + zfs_compressed_arc_enabled; + ks->zfs_condense_indirect_commit_entry_delay_ms.value.ui64 = + zfs_condense_indirect_commit_entry_delay_ms; + ks->zfs_condense_min_mapping_bytes.value.ui64 = + zfs_condense_min_mapping_bytes; + ks->zfs_deadman_checktime_ms.value.ui64 = + zfs_deadman_checktime_ms; + + kstat_named_setstr(&ks->zfs_deadman_failmode, + zfs_deadman_failmode); + + ks->zfs_deadman_synctime_ms.value.ui64 = + zfs_deadman_synctime_ms; + ks->zfs_deadman_ziotime_ms.value.ui64 = zfs_deadman_ziotime_ms; + ks->zfs_disable_ivset_guid_check.value.ui64 = + zfs_disable_ivset_guid_check; + ks->zfs_initialize_value.value.ui64 = zfs_initialize_value; + ks->zfs_keep_log_spacemaps_at_export.value.ui64 = + zfs_keep_log_spacemaps_at_export; + ks->l2arc_rebuild_blocks_min_l2size.value.ui64 = + l2arc_rebuild_blocks_min_l2size; + ks->l2arc_rebuild_enabled.value.ui64 = l2arc_rebuild_enabled; + ks->l2arc_trim_ahead.value.ui64 = l2arc_trim_ahead; + ks->zfs_livelist_condense_new_alloc.value.ui64 = + zfs_livelist_condense_new_alloc; + ks->zfs_livelist_condense_sync_cancel.value.ui64 = + zfs_livelist_condense_sync_cancel; + ks->zfs_livelist_condense_sync_pause.value.ui64 = + zfs_livelist_condense_sync_pause; + ks->zfs_livelist_condense_zthr_cancel.value.ui64 = + zfs_livelist_condense_zthr_cancel; + ks->zfs_livelist_condense_zthr_pause.value.ui64 = + zfs_livelist_condense_zthr_pause; + ks->zfs_livelist_min_percent_shared.value.ui64 = + zfs_livelist_min_percent_shared; + ks->zfs_max_dataset_nesting.value.ui64 = + zfs_max_dataset_nesting; + ks->zfs_max_missing_tvds.value.ui64 = zfs_max_missing_tvds; + ks->metaslab_debug_load.value.ui64 = metaslab_debug_load; + ks->metaslab_force_ganging.value.ui64 = metaslab_force_ganging; + ks->zfs_multihost_fail_intervals.value.ui64 = + zfs_multihost_fail_intervals; + 
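These assignments are the read half of the kstat bridge: on KSTAT_WRITE the globals are refreshed from the values userland pushed into ks_data, and on read ks_data is refreshed from the current tunables before being handed back. For reference, a minimal sketch of that shape with hypothetical names (my_tunable, my_kstat_data); the real callback is the surrounding osx_kstat_update(), and registration follows in kstat_osx_init() below.

#include <sys/kstat.h>

/* Illustrative sketch only -- a single writable, virtual named kstat. */
static uint64_t my_tunable = 123;		/* global exported to userland */

typedef struct my_kstat {
	kstat_named_t my_stat;
} my_kstat_t;

static my_kstat_t my_kstat_data = {
	{ .name = "my_tunable", .data_type = KSTAT_DATA_UINT64 },
};

static int
my_kstat_update(kstat_t *ksp, int rw)
{
	my_kstat_t *ks = ksp->ks_data;

	if (rw == KSTAT_WRITE)		/* userland wrote: pull into the global */
		my_tunable = ks->my_stat.value.ui64;
	else				/* read: publish the current value */
		ks->my_stat.value.ui64 = my_tunable;
	return (0);
}

Registration then mirrors kstat_osx_init() below: kstat_create() with KSTAT_TYPE_NAMED and KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE, point ks_data at the struct, set ks_update, and kstat_install().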
ks->zfs_multihost_import_intervals.value.ui64 = + zfs_multihost_import_intervals; + ks->zfs_multihost_interval.value.ui64 = zfs_multihost_interval; + ks->zfs_override_estimate_recordsize.value.ui64 = + zfs_override_estimate_recordsize; + ks->zfs_remove_max_segment.value.ui64 = zfs_remove_max_segment; + ks->zfs_resilver_min_time_ms.value.ui64 = + zfs_resilver_min_time_ms; + ks->zfs_scan_legacy.value.ui64 = zfs_scan_legacy; + ks->zfs_scan_vdev_limit.value.ui64 = zfs_scan_vdev_limit; + ks->zfs_slow_io_events_per_second.value.ui64 = + zfs_slow_io_events_per_second; + ks->spa_load_verify_data.value.ui64 = spa_load_verify_data; + ks->spa_load_verify_metadata.value.ui64 = + spa_load_verify_metadata; + ks->zfs_unlink_suspend_progress.value.ui64 = + zfs_unlink_suspend_progress; + ks->zfs_vdev_min_ms_count.value.ui64 = zfs_vdev_min_ms_count; + ks->vdev_validate_skip.value.ui64 = vdev_validate_skip; + ks->zfs_zevent_len_max.value.ui64 = zfs_zevent_len_max; + ks->zio_slow_io_ms.value.ui64 = zio_slow_io_ms; + } + + return (0); +} + + + +int +kstat_osx_init(void) +{ + osx_kstat_ksp = kstat_create("zfs", 0, "tunable", "darwin", + KSTAT_TYPE_NAMED, sizeof (osx_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + + if (osx_kstat_ksp != NULL) { + osx_kstat_ksp->ks_data = &osx_kstat; + osx_kstat_ksp->ks_update = osx_kstat_update; + kstat_install(osx_kstat_ksp); + } + + return (0); +} + +void +kstat_osx_fini(void) +{ + if (osx_kstat_ksp != NULL) { + kstat_delete(osx_kstat_ksp); + osx_kstat_ksp = NULL; + } +} diff --git a/module/os/macos/zfs/zfs_osx.cpp b/module/os/macos/zfs/zfs_osx.cpp new file mode 100644 index 0000000000..898ff897ec --- /dev/null +++ b/module/os/macos/zfs/zfs_osx.cpp @@ -0,0 +1,310 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013-2020, Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +// Define the superclass. 
+#define super IOService + +OSDefineMetaClassAndStructors(net_lundman_zfs_zvol, IOService) + +extern "C" { + +#include +#include +#include + +extern SInt32 zfs_active_fs_count; + +#ifdef DEBUG +#define ZFS_DEBUG_STR " (DEBUG mode)" +#else +#define ZFS_DEBUG_STR "" +#endif + +static char spl_gitrev[64] = ZFS_META_GITREV; + +SYSCTL_DECL(_zfs); +SYSCTL_NODE(, OID_AUTO, zfs, CTLFLAG_RD, 0, ""); +SYSCTL_STRING(_zfs, OID_AUTO, kext_version, + CTLFLAG_RD | CTLFLAG_LOCKED, + spl_gitrev, 0, "ZFS KEXT Version"); + + +extern kern_return_t _start(kmod_info_t *ki, void *data); +extern kern_return_t _stop(kmod_info_t *ki, void *data); + +__attribute__((visibility("default"))) KMOD_EXPLICIT_DECL(net.lundman.zfs, + "1.0.0", _start, _stop) +kmod_start_func_t *_realmain = 0; +kmod_stop_func_t *_antimain = 0; +int _kext_apple_cc = __APPLE_CC__; + +} // Extern "C" + +bool +net_lundman_zfs_zvol::init(OSDictionary* dict) +{ + bool res; + + /* Need an OSSet for open clients */ + _openClients = OSSet::withCapacity(1); + if (_openClients == NULL) { + dprintf("client OSSet failed"); + return (false); + } + + res = super::init(dict); + + // IOLog("ZFS::init\n"); + return (res); +} + +void +net_lundman_zfs_zvol::free(void) +{ + OSSafeReleaseNULL(_openClients); + + // IOLog("ZFS::free\n"); + super::free(); +} + +bool +net_lundman_zfs_zvol::isOpen(const IOService *forClient) const +{ + bool ret; + ret = IOService::isOpen(forClient); + return (ret); +} + +bool +net_lundman_zfs_zvol::handleOpen(IOService *client, + IOOptionBits options, void *arg) +{ + bool ret = true; + + dprintf(""); + + _openClients->setObject(client); + ret = _openClients->containsObject(client); + + return (ret); +} + +bool +net_lundman_zfs_zvol::handleIsOpen(const IOService *client) const +{ + bool ret; + + dprintf(""); + + ret = _openClients->containsObject(client); + + return (ret); +} + +void +net_lundman_zfs_zvol::handleClose(IOService *client, + IOOptionBits options) +{ + dprintf(""); + + if (_openClients->containsObject(client) == false) { + dprintf("not open"); + } + + _openClients->removeObject(client); +} + +IOService* +net_lundman_zfs_zvol::probe(IOService *provider, SInt32 *score) +{ + IOService *res = super::probe(provider, score); + return (res); +} + + +/* + * + * ************************************************************************ + * + * Kernel Module Load + * + * ************************************************************************ + * + */ + +bool +net_lundman_zfs_zvol::start(IOService *provider) +{ + bool res = super::start(provider); + + IOLog("ZFS: Loading module ... \n"); + + if (!res) + return (res); + + /* Fire up all SPL modules and threads */ + spl_start(NULL, NULL); + + /* registerService() allows zconfigd to match against the service */ + this->registerService(); + + /* + * hostid is left as 0 on OSX, and left to be set if developers wish to + * use it. If it is 0, we will hash the hardware.uuid into a 32 bit + * value and set the hostid. 
+ */ + if (!zone_get_hostid(NULL)) { + uint32_t myhostid = 0; + IORegistryEntry *ioregroot = + IORegistryEntry::getRegistryRoot(); + if (ioregroot) { + IORegistryEntry *macmodel = + ioregroot->getChildEntry(gIOServicePlane); + + if (macmodel) { + OSObject *ioplatformuuidobj; + ioplatformuuidobj = + macmodel->getProperty(kIOPlatformUUIDKey); + if (ioplatformuuidobj) { + OSString *ioplatformuuidstr = + OSDynamicCast(OSString, + ioplatformuuidobj); + + myhostid = fnv_32a_str( + ioplatformuuidstr-> + getCStringNoCopy(), + FNV1_32A_INIT); + + sysctlbyname("kern.hostid", NULL, NULL, + &myhostid, sizeof (myhostid)); + printf("ZFS: hostid set to %08x from " + "UUID '%s'\n", myhostid, + ioplatformuuidstr-> + getCStringNoCopy()); + } + } + } + } + + /* Register ZFS KEXT Version sysctl - separate to kstats */ + sysctl_register_oid(&sysctl__zfs); + sysctl_register_oid(&sysctl__zfs_kext_version); + + /* Init LDI */ + int error = 0; + error = ldi_init(NULL); + if (error) { + IOLog("%s ldi_init error %d\n", __func__, error); + goto failure; + } + + /* Start ZFS itself */ + zfs_kmod_init(); + + /* Register fs with XNU */ + zfs_vfsops_init(); + + /* + * When is the best time to start the system_taskq? It is strictly + * speaking not used by SPL, but by ZFS. ZFS should really start it? + */ + system_taskq_init(); + + res = zfs_boot_init((IOService *)this); + + printf("ZFS: Loaded module v%s-%s%s, " + "ZFS pool version %s, ZFS filesystem version %s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, + SPA_VERSION_STRING, ZPL_VERSION_STRING); + + return (true); + +failure: + spl_stop(NULL, NULL); + sysctl_unregister_oid(&sysctl__zfs_kext_version); + sysctl_unregister_oid(&sysctl__zfs); + return (false); +} + +/* Here we are, at the end of all things */ +void +net_lundman_zfs_zvol::stop(IOService *provider) +{ + + zfs_boot_fini(); + + IOLog("ZFS: Attempting to unload ...\n"); + + super::stop(provider); + + system_taskq_fini(); + + zfs_vfsops_fini(); + + zfs_kmod_fini(); + + ldi_fini(); + + sysctl_unregister_oid(&sysctl__zfs_kext_version); + sysctl_unregister_oid(&sysctl__zfs); + + spl_stop(NULL, NULL); + + printf("ZFS: Unloaded module v%s-%s%s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); + + /* + * There is no way to ensure all threads have actually got to the + * thread_exit() call, before we exit here (and XNU unloads all + * memory for the KEXT). So we increase the odds of that happening + * by delaying a little bit before we return to XNU. Quite possibly + * the worst "solution" but Apple has not given any good options. + */ + delay(hz*5); +} diff --git a/module/os/macos/zfs/zfs_vfsops.c b/module/os/macos/zfs/zfs_vfsops.c new file mode 100644 index 0000000000..16ad11e4eb --- /dev/null +++ b/module/os/macos/zfs/zfs_vfsops.c @@ -0,0 +1,2951 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
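The hostid path in start() above folds the IOPlatformUUID string into a 32-bit value with FNV-1a (fnv_32a_str() seeded with FNV1_32A_INIT) and publishes it through the kern.hostid sysctl. As a standalone reference, a sketch of the same hash; the function name below is hypothetical and only illustrates the algorithm, not the SPL's actual implementation.

#include <stdint.h>

/*
 * 32-bit FNV-1a over a NUL-terminated string: start from the offset
 * basis 2166136261 (FNV1_32A_INIT), then xor in each byte and multiply
 * by the FNV prime 16777619.
 */
static uint32_t
example_fnv_32a_str(const char *str)
{
	uint32_t hash = 2166136261U;

	while (*str != '\0') {
		hash ^= (uint8_t)*str++;
		hash *= 16777619U;
	}
	return (hash);
}

Feeding the same UUID string always yields the same hostid, which is the point: the value survives reboots without having to persist anything.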
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek .
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+/* Portions Copyright 2013,2020 Jorgen Lundman */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zfs_comutil.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+unsigned int zfs_vnop_skip_unlinked_drain = 0;
+
+int zfs_module_start(kmod_info_t *ki, void *data);
+int zfs_module_stop(kmod_info_t *ki, void *data);
+extern int getzfsvfs(const char *dsname, zfsvfs_t **zfvp);
+
+void arc_os_init(void);
+void arc_os_fini(void);
+
+/*
+ * AVL tree of hardlink entries, which we need to map for Finder. The va_linkid
+ * needs to be unique for each hardlink target, and we must also be able to
+ * return the znode from vget(va_linkid). Unfortunately, the va_linkid is
+ * 32-bit (lost in the syscall translation to the userland struct). We sort
+ * the AVL tree by
+ * -> directory id
+ * -> z_id
+ * -> name
+ *
+ */
+static int hardlinks_compare(const void *arg1, const void *arg2)
+{
+	const hardlinks_t *node1 = arg1;
+	const hardlinks_t *node2 = arg2;
+	int value;
+	if (node1->hl_parent > node2->hl_parent)
+		return (1);
+	if (node1->hl_parent < node2->hl_parent)
+		return (-1);
+	if (node1->hl_fileid > node2->hl_fileid)
+		return (1);
+	if (node1->hl_fileid < node2->hl_fileid)
+		return (-1);
+
+	value = strncmp(node1->hl_name, node2->hl_name, PATH_MAX);
+	if (value < 0)
+		return (-1);
+	if (value > 0)
+		return (1);
+	return (0);
+}
+
+/*
+ * Lookup the same information from linkid, to get at parentid, objid and name
+ */
+static int hardlinks_compare_linkid(const void *arg1, const void *arg2)
+{
+	const hardlinks_t *node1 = arg1;
+	const hardlinks_t *node2 = arg2;
+	if (node1->hl_linkid > node2->hl_linkid)
+		return (1);
+	if (node1->hl_linkid < node2->hl_linkid)
+		return (-1);
+	return (0);
+}
+
+extern int
+zfs_obtain_xattr(znode_t *, const char *, mode_t, cred_t *, vnode_t **, int);
+
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our kext
+ * from being unloaded after a umount -f
+ */
+uint32_t zfs_active_fs_count = 0;
+
+extern void zfs_ioctl_init(void);
+extern void zfs_ioctl_fini(void);
+
+int
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+	return (!!(vfs_isrdonly(zfsvfs->z_vfs)));
+}
+
+/*
+ * The OS sync is ignored by default, as ZFS handles internal periodic
+ * syncs (as per illumos). Unfortunately, we cannot tell when users run
+ * "sync" by hand. Sync is called on umount, though.
+ */
+uint64_t zfs_vfs_sync_paranoia = 0;
+
+int
+zfs_vfs_sync(struct mount *vfsp, __unused int waitfor,
+    __unused vfs_context_t context)
+{
+	/*
+	 * Data integrity is job one. We don't want a compromised kernel
+	 * writing to the storage pool, so we never sync during panic.
+	 */
+	if (spl_panicstr())
+		return (0);
+
+	/* Check if sysctl setting wants sync - and we are not unmounting */
+	if (zfs_vfs_sync_paranoia == 0 &&
+	    !vfs_isunmount(vfsp))
+		return (0);
+
+	if (vfsp != NULL) {
+		/*
+		 * Sync a specific filesystem.
+ */ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + dsl_pool_t *dp; + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (spl_system_inshutdown() && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == B_TRUE) { + zfsvfs->z_atime = B_TRUE; + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOATIME); + } else { + zfsvfs->z_atime = B_FALSE; + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOATIME); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + /* + * Apple does have MNT_NOUSERXATTR mount option, but unfortunately + * the VFS layer returns EACCESS if xattr access is attempted. + * Finder etc, will do so, even if filesystem capabilities is set + * without xattr, rendering the mount option useless. We no longer + * set it, and handle xattrs being disabled internally. + */ + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_xattr = B_FALSE; + // vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOUSERXATTR); + } else { + zfsvfs->z_xattr = B_TRUE; + // vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOUSERXATTR); + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + // zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_TRUE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_RDONLY); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_RDONLY); + } +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NODEV); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NODEV); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOSUID); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOSUID); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOEXEC); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOEXEC); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + zfsvfs->z_show_ctldir = newval; + cache_purgevfs(zfsvfs->z_vfs); +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + // zfsvfs_t *zfsvfs = arg; + // zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + 
zfsvfs->z_acl_inherit = newval; +} + +static void +finderbrowse_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_DONTBROWSE); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_DONTBROWSE); + } +} +static void +ignoreowner_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_IGNORE_OWNERSHIP); + } else { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_IGNORE_OWNERSHIP); + } +} + +static void +mimic_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + + if (newval == 0) { + strlcpy(vfsstatfs->f_fstypename, "zfs", MFSTYPENAMELEN); + } else { + strlcpy(vfsstatfs->f_fstypename, "hfs", MFSTYPENAMELEN); + } +} + +static int +zfs_register_callbacks(struct mount *vfsp) +{ + struct dsl_dataset *ds = NULL; + + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + boolean_t finderbrowse = B_FALSE; + boolean_t do_finderbrowse = B_FALSE; + boolean_t ignoreowner = B_FALSE; + boolean_t do_ignoreowner = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfs_fsprivate(vfsp); + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ +#define vfs_optionisset(X, Y, Z) (vfs_flags(X)&(Y)) + + if (vfs_optionisset(vfsp, MNT_RDONLY, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NODEV, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } + /* xnu SETUID, not IllumOS SUID */ + if (vfs_optionisset(vfsp, MNT_NOSUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NOUSERXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_DONTBROWSE, NULL)) { + finderbrowse = B_FALSE; + do_finderbrowse = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_IGNORE_OWNERSHIP, NULL)) { + ignoreowner = B_TRUE; + do_ignoreowner = B_TRUE; + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... 
+ */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + // This appears to be PROP_PRIVATE, investigate if we want this + // ZOL calls this ACLTYPE + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_BROWSE), finderbrowse_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_IGNOREOWNER), + ignoreowner_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_MIMIC), mimic_changed_cb, zfsvfs); + + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + if (do_finderbrowse) + finderbrowse_changed_cb(zfsvfs, finderbrowse); + if (do_ignoreowner) + ignoreowner_changed_cb(zfsvfs, ignoreowner); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Takes a dataset, a property, a value and that value's setpoint as + * found in the ZAP. Checks if the property has been changed in the vfs. + * If so, val and setpoint will be overwritten with updated content. + * Otherwise, they are left unchanged. 
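The chain of dsl_prop_register() calls in zfs_register_callbacks() above follows the usual convention: each callback has the signature (void *arg, uint64_t newval), is invoked immediately with the property's current value at registration time (which is why the temporary mount options are stashed beforehand and replayed afterwards), and again on every later change. A stripped-down sketch of the pairing, with a hypothetical callback for illustration:

/* Illustrative only: one property callback plus its (un)registration. */
static void
example_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	/* React to the new value, e.g. toggle cached state or a vfs flag. */
	zfsvfs->z_atime = (newval != 0);
}

static int
example_register(objset_t *os, zfsvfs_t *zfsvfs)
{
	dsl_dataset_t *ds = dmu_objset_ds(os);
	int error;

	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	/* Fires once with the current value, then on every change. */
	error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
	    example_changed_cb, zfsvfs);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	if (error != 0)
		dsl_prop_unregister_all(ds, zfsvfs);
	return (error);
}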
+ */ +int +zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) +{ + int error; + zfsvfs_t *zfvp; + mount_t vfsp; + objset_t *os; + uint64_t tmp = *val; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + if (dmu_objset_type(os) != DMU_OST_ZFS) + return (EINVAL); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + mutex_exit(&os->os_user_ptr_lock); + if (zfvp == NULL) + return (ESRCH); + + vfsp = zfvp->z_vfs; + + switch (zfs_prop) { + case ZFS_PROP_ATIME: +// if (vfsp->vfs_do_atime) +// tmp = vfsp->vfs_atime; + break; + case ZFS_PROP_RELATIME: +// if (vfsp->vfs_do_relatime) +// tmp = vfsp->vfs_relatime; + break; + case ZFS_PROP_DEVICES: +// if (vfsp->vfs_do_devices) +// tmp = vfsp->vfs_devices; + break; + case ZFS_PROP_EXEC: +// if (vfsp->vfs_do_exec) +// tmp = vfsp->vfs_exec; + break; + case ZFS_PROP_SETUID: +// if (vfsp->vfs_do_setuid) +// tmp = vfsp->vfs_setuid; + break; + case ZFS_PROP_READONLY: +// if (vfsp->vfs_do_readonly) +// tmp = vfsp->vfs_readonly; + break; + case ZFS_PROP_XATTR: +// if (vfsp->vfs_do_xattr) +// tmp = vfsp->vfs_xattr; + break; + case ZFS_PROP_NBMAND: +// if (vfsp->vfs_do_nbmand) +// tmp = vfsp->vfs_nbmand; + break; + default: + return (ENOENT); + } + + if (tmp != *val) { + (void) strlcpy(setpoint, "temporary", ZFS_MAX_DATASET_NAME_LEN); + *val = tmp; + } + return (0); +} + + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + /* Volume status "all ok" */ + zfsvfs->z_notification_conditions = 0; + zfsvfs->z_freespace_notify_warninglimit = 0; + zfsvfs->z_freespace_notify_dangerlimit = 0; + zfsvfs->z_freespace_notify_desiredlevel = 0; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.\n", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + error = zfs_get_zplprop(os, ZFS_PROP_ACLMODE, &val); + if (error != 0) + return (error); + zfsvfs->z_acl_mode = (uint_t)val; + + zfs_get_zplprop(os, ZFS_PROP_LASTUNMOUNT, &val); + zfsvfs->z_last_unmount_time = val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
+ */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); + if ((error == 0) && (val == ZFS_XATTR_SA)) + zfsvfs->z_xattr_sa = B_TRUE; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + return (0); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. 
+ */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, B_TRUE, + zfsvfs, &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + return (error); +} + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + zfsvfs->z_ctldir_startid = ZFSCTL_INO_SNAPDIRS; + + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + + int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), + ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (int i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + rw_init(&zfsvfs->z_hardlinks_lock, NULL, RW_DEFAULT, NULL); + avl_create(&zfsvfs->z_hardlinks, hardlinks_compare, + sizeof (hardlinks_t), offsetof(hardlinks_t, hl_node)); + avl_create(&zfsvfs->z_hardlinks_linkid, hardlinks_compare_linkid, + sizeof (hardlinks_t), offsetof(hardlinks_t, hl_node_linkid)); + zfsvfs->z_rdonly = 0; + + mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + boolean_t readonly = vfs_isrdonly(zfsvfs->z_vfs); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_FALSE); + else + if (!zfs_vnop_skip_unlinked_drain) + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. 
+ * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } + } + + /* restore readonly bit */ + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_TRUE); + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i, size = zfsvfs->z_hold_size; + + dprintf("+zfsvfs_free\n"); + + zfs_fuid_destroy(zfsvfs); + + cv_destroy(&zfsvfs->z_drain_cv); + mutex_destroy(&zfsvfs->z_drain_lock); + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + kmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + kmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + + dprintf("ZFS: Unloading hardlink AVLtree: %lu\n", + avl_numnodes(&zfsvfs->z_hardlinks)); + void *cookie = NULL; + hardlinks_t *hardlink; + rw_destroy(&zfsvfs->z_hardlinks_lock); + while ((hardlink = avl_destroy_nodes(&zfsvfs->z_hardlinks_linkid, + &cookie))) { + } + cookie = NULL; + while ((hardlink = avl_destroy_nodes(&zfsvfs->z_hardlinks, &cookie))) { + kmem_free(hardlink, sizeof (*hardlink)); + } + avl_destroy(&zfsvfs->z_hardlinks); + avl_destroy(&zfsvfs->z_hardlinks_linkid); + + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + dprintf("-zfsvfs_free\n"); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { +#if 0 + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } +#endif + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + + +static int +zfs_domount(struct mount *vfsp, dev_t mount_dev, char *osname, + vfs_context_t ctx) +{ + int error = 0; + zfsvfs_t *zfsvfs; + uint64_t mimic = 0; + struct timeval tv; + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, B_FALSE, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + zfsvfs->z_rdev = mount_dev; + + /* HFS sets this prior to mounting */ + vfs_setflags(vfsp, (uint64_t)((unsigned int)MNT_DOVOLFS)); + /* 
Advisory locking should be handled at the VFS layer */ + vfs_setlocklocal(vfsp); + + /* + * Record the mount time (for Spotlight) + */ + microtime(&tv); + zfsvfs->z_mount_time = tv.tv_sec; + + vfs_setfsprivate(vfsp, zfsvfs); + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + + error = dsl_prop_get_integer(osname, "com.apple.mimic", &mimic, NULL); + if (zfsvfs->z_rdev) { + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + vfsstatfs->f_fsid.val[0] = zfsvfs->z_rdev; + vfsstatfs->f_fsid.val[1] = vfs_typenum(vfsp); + } else { + // Otherwise, ask VFS to give us a random unique one. + vfs_getnewfsid(vfsp); + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + zfsvfs->z_rdev = vfsstatfs->f_fsid.val[0]; + } + + /* + * If we are readonly (ie, waiting for rootmount) we need to reply + * honestly, so launchd runs fsck_zfs and mount_zfs + */ + if (mimic) { + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + strlcpy(vfsstatfs->f_fstypename, "hfs", MFSTYPENAMELEN); + } + + /* + * Set features for file system. + */ + zfs_set_fuid_feature(zfsvfs); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + zfsvfs_t *fs_zfsvfs; + + dmu_fsname(osname, fsname); + error = getzfsvfs(fsname, &fs_zfsvfs); + if (error == 0) { + if (fs_zfsvfs->z_unmounted) + error = SET_ERROR(EINVAL); + vfs_unbusy(fs_zfsvfs->z_vfs); + } + if (error) { + printf("file system '%s' is unmounted : error %d\n", + fsname, + error); + goto out; + } + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, "xattr", &pval, + NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + zfsctl_mount_signal(osname, B_TRUE); + + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + vfs_setflags(vfsp, (uint64_t)((unsigned int)MNT_JOURNALED)); + + if ((vfs_flags(vfsp) & MNT_ROOTFS) != 0) { + /* Root FS */ + vfs_clearflags(vfsp, + (uint64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); + vfs_clearflags(vfsp, + (uint64_t)((unsigned int)MNT_IGNORE_OWNERSHIP)); + } + +#if 1 // Want .zfs or not + if (!zfsvfs->z_issnap) { + zfsctl_create(zfsvfs); + } +#endif + +out: + if (error) { + vfs_setfsprivate(vfsp, NULL); + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); + } +} + +/* + * zfs_vfs_mountroot + * Given a device vnode created by vfs_mountroot bdevvp, + * and with the root pool already imported, root mount the + * dataset specified in the pool's bootfs property. + * + * Inputs: + * mp: VFS mount struct + * devvp: device vnode, currently only used to retrieve the + * dev_t for the fsid. 
Could vnode_get, vnode_ref, vnode_put, + * with matching get/rele/put in zfs_vfs_umount, but this is + * already done by XNU as well. + * ctx: VFS context, unused. + * + * Return: + * 0 on success, positive int on failure. + */ +int +zfs_vfs_mountroot(struct mount *mp, struct vnode *devvp, vfs_context_t ctx) +{ + /* + * static int zfsrootdone = 0; + */ + zfsvfs_t *zfsvfs = NULL; + spa_t *spa = 0; + char *zfs_bootfs = 0; + dev_t dev = 0; + int error = EINVAL; + + printf("ZFS: %s\n", __func__); + ASSERT(mp); + ASSERT(devvp); + ASSERT(ctx); + if (!mp || !devvp | !ctx) { + cmn_err(CE_NOTE, "%s: missing one of mp %p devvp %p" + " or ctx %p", __func__, mp, devvp, ctx); + return (EINVAL); + } + + /* Look up bootfs variable from pool here */ + zfs_bootfs = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (!zfs_bootfs) { + cmn_err(CE_NOTE, "%s: bootfs alloc failed", + __func__); + return (ENOMEM); + } + + mutex_enter(&spa_namespace_lock); + spa = spa_next(NULL); + if (!spa) { + mutex_exit(&spa_namespace_lock); + cmn_err(CE_NOTE, "%s: no pool available", + __func__); + goto out; + } + + error = dsl_dsobj_to_dsname(spa_name(spa), + spa_bootfs(spa), zfs_bootfs); + if (error != 0) { + mutex_exit(&spa_namespace_lock); + cmn_err(CE_NOTE, "%s: bootfs to name error %d", + __func__, error); + goto out; + } + mutex_exit(&spa_namespace_lock); + + /* + * By setting the dev_t value in the mount vfsp, + * mount_zfs will be called with the /dev/diskN + * proxy, but we can leave the dataset name in + * the mountedfrom field + */ + dev = vnode_specrdev(devvp); + + dprintf("Setting readonly\n"); + + if ((error = zfs_domount(mp, dev, zfs_bootfs, ctx)) != 0) { + printf("zfs_domount: error %d", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfs_fsprivate(mp); + ASSERT(zfsvfs); + if (!zfsvfs) { + cmn_err(CE_NOTE, "missing zfsvfs"); + goto out; + } + + /* Set this mount to read-only */ + zfsvfs->z_rdonly = 1; + + /* + * Due to XNU mount flags, readonly gets set off for a short + * while, which means mimic will kick in if enabled. But we need + * to reply with true "zfs" until root has been remounted RW, so + * that launchd tries to run mount_zfs instead of mount_hfs + */ + mimic_changed_cb(zfsvfs, B_FALSE); + + /* + * Leave rootvp held. The root file system is never unmounted. + * + * XXX APPLE + * xnu will in fact call vfs_unmount on the root filesystem + * during shutdown/reboot. + */ + +out: + + if (zfs_bootfs) { + kmem_free(zfs_bootfs, MAXPATHLEN); + } + return (error); + +} + +/*ARGSUSED*/ +int +zfs_vfs_mount(struct mount *vfsp, vnode_t *mvp /* devvp */, + user_addr_t data, vfs_context_t context) +{ + char *osname = NULL; + char *options = NULL; + int error = 0; + int rdonly = 0; + int mflag = 0; + char *proxy = NULL; + struct zfs_mount_args mnt_args; + size_t osnamelen = 0; + uint32_t cmdflags = 0; + + cmdflags = (uint32_t)vfs_flags(vfsp) & MNT_CMDFLAGS; + rdonly = vfs_isrdonly(vfsp); + + if (!data) { + /* + * From 10.12, if you set VFS_TBLCANMOUNTROOT, XNU will + * call vfs_mountroot if set (and we can not set it), OR + * call vfs_mount if not set. Since data is always passed NULL + * in this case, we know we are supposed to call mountroot. + */ + dprintf("ZFS: vfs_mount -> vfs_mountroot\n"); + return (zfs_vfs_mountroot(vfsp, mvp, context)); + } + + /* + * Get the objset name (the "special" mount argument). + */ + if (data) { + + // Clear the struct, so that "flags" is null if only given path. 
+ bzero(&mnt_args, sizeof (mnt_args)); + + osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if (vfs_context_is64bit(context)) { + if ((error = ddi_copyin((void *)data, + (caddr_t)&mnt_args, sizeof (mnt_args), 0))) { + dprintf("%s: error on mnt_args copyin %d\n", + __func__, error); + goto out; + } + } else { + user32_addr_t tmp; + if ((error = ddi_copyin((void *)data, + (caddr_t)&tmp, sizeof (tmp), 0))) { + printf("%s: error on mnt_args copyin32 %d\n", + __func__, error); + goto out; + } + /* munge into LP64 addr */ + mnt_args.fspec = (char *)CAST_USER_ADDR_T(tmp); + } + + // Copy over the string + if ((error = ddi_copyinstr((const void *)mnt_args.fspec, osname, + MAXPATHLEN, &osnamelen))) { + dprintf("%s: error on osname copyin %d\n", + __func__, error); + if (!mvp) + goto out; + } + } + + proxy = kmem_alloc(MAXPATHLEN, KM_SLEEP); + *proxy = 0; + + /* + * Translate /dev/disk path into dataset name + * After this; + * "proxy" will have "/dev/disk" (IF given) + * "osname" has the dataset name as usual + */ + if (strncmp(osname, "/dev/disk", 9) == 0) { + strlcpy(proxy, osname, MAXPATHLEN); + error = zfs_osx_proxy_get_osname(osname, + osname, MAXPATHLEN); + if (error != 0) { + printf("%s couldn't get dataset from %s\n", + __func__, osname); + error = ENOENT; + goto out; + } + dprintf("%s got new osname %s\n", __func__, osname); + } + + if (mnt_args.struct_size == sizeof (mnt_args)) { + mflag = mnt_args.mflag; + options = kmem_alloc(mnt_args.optlen, KM_SLEEP); + error = ddi_copyin((const void *)mnt_args.optptr, + (caddr_t)options, mnt_args.optlen, 0); + } + + if (mflag & MS_RDONLY) { + dprintf("%s: adding MNT_RDONLY\n", __func__); + cmdflags |= MNT_RDONLY; + } + + if (mflag & MS_OVERLAY) { + dprintf("%s: adding MNT_UNION\n", __func__); + cmdflags |= MNT_UNION; + } + + if (mflag & MS_FORCE) { + dprintf("%s: adding MNT_FORCE\n", __func__); + cmdflags |= MNT_FORCE; + } + + if (mflag & MS_REMOUNT) { + dprintf("%s: adding MNT_UPDATE on MS_REMOUNT\n", __func__); + cmdflags |= MNT_UPDATE; + } + + vfs_setflags(vfsp, (uint64_t)cmdflags); + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (cmdflags & MNT_UPDATE) { + + error = 0; + // Used after fsck + if (cmdflags & MNT_RELOAD) { + goto out; + } + + /* refresh mount options */ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + + if (zfsvfs != NULL) { + if (zfsvfs->z_rdonly == 0 && + (cmdflags & MNT_RDONLY || + vfs_isrdonly(vfsp))) { + /* downgrade */ + dprintf("%s: downgrade requested\n", __func__); + zfsvfs->z_rdonly = 1; + readonly_changed_cb(zfsvfs, B_TRUE); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + if (error) { + dprintf("%s: remount returned %d", + __func__, error); + } + } + + if (vfs_iswriteupgrade(vfsp)) { + /* upgrade */ + dprintf("%s: upgrade requested\n", __func__); + zfsvfs->z_rdonly = 0; + readonly_changed_cb(zfsvfs, B_FALSE); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + if (error) { + dprintf("%s: remount returned %d", + __func__, error); + } + } + } + goto out; + } + + if (vfs_fsprivate(vfsp) != NULL) { + dprintf("already mounted\n"); + error = 0; + goto out; + } + + error = zfs_domount(vfsp, 0, osname, context); + if (error) { + dprintf("%s: zfs_domount returned %d\n", + __func__, error); + goto out; + } + +out: + + if (error == 0) { + + /* Indicate to VFS that we support ACLs. 
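For context on the copy-in just parsed: the userland mount helper passes a struct zfs_mount_args as the data argument to mount(2), and the kernel only trusts the option fields once struct_size matches. A hedged userland-side sketch, assuming a layout that simply mirrors the fields the kernel reads here (fspec, mflag, optptr, optlen, struct_size); the authoritative definition lives in the mount headers, which are not part of this hunk.

#include <sys/mount.h>
#include <string.h>

/* Assumed mirror of the fields referenced by zfs_vfs_mount() above. */
struct zfs_mount_args {
	const char	*fspec;		/* dataset name or /dev/diskN proxy */
	int		mflag;		/* kernel checks MS_RDONLY/MS_OVERLAY/MS_FORCE/MS_REMOUNT */
	char		*optptr;	/* option string */
	int		optlen;
	int		struct_size;	/* lets the kernel detect old/short callers */
};

static int
example_mount(const char *dataset, const char *mountpoint)
{
	struct zfs_mount_args args;

	memset(&args, 0, sizeof (args));
	args.fspec = dataset;
	args.struct_size = sizeof (args);

	/* XNU mount(2): fs type name, mount point, flags, fs-private data. */
	return (mount("zfs", mountpoint, 0, &args));
}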
*/ + vfs_setextendedsecurity(vfsp); + + // Set /dev/disk name if we have one, otherwise, datasetname + vfs_mountedfrom(vfsp, proxy && *proxy ? proxy : osname); + + } + + if (error) + dprintf("zfs_vfs_mount: error %d\n", error); + + if (osname) + kmem_free(osname, MAXPATHLEN); + + if (proxy) + kmem_free(proxy, MAXPATHLEN); + + if (options) + kmem_free(options, mnt_args.optlen); + + return (error); +} + + +int +zfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, + __unused vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t log_blksize; + uint64_t log_blkcnt; + + // dprintf("vfs_getattr\n"); + + ZFS_ENTER(zfsvfs); + + /* + * Finder will show the old/incorrect size, we can force a sync of the + * pool to make it correct, but that has side effects which are + * undesirable. + */ + /* txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); */ + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + VFSATTR_RETURN(fsap, f_objcount, usedobjs); + VFSATTR_RETURN(fsap, f_maxobjcount, 0x7fffffffffffffff); + /* + * Carbon depends on f_filecount and f_dircount so + * make up some values based on total objects. + */ + VFSATTR_RETURN(fsap, f_filecount, usedobjs - (usedobjs / 4)); + VFSATTR_RETURN(fsap, f_dircount, usedobjs / 4); + + /* + * Model after HFS in working out if we should use the legacy size + * 512, or go to 4096. Note that XNU only likes those two + * blocksizes, so we don't use the ZFS recordsize + */ + log_blkcnt = (u_int64_t)((refdbytes + availbytes) >> SPA_MINBLOCKSHIFT); + log_blksize = (log_blkcnt > 0x000000007fffffff) ? + 4096 : (1 << SPA_MINBLOCKSHIFT); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + VFSATTR_RETURN(fsap, f_bsize, log_blksize); + VFSATTR_RETURN(fsap, f_iosize, zfsvfs->z_max_blksz); + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + VFSATTR_RETURN(fsap, f_blocks, + (u_int64_t)((refdbytes + availbytes) / log_blksize)); + VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)(availbytes / log_blksize)); + VFSATTR_RETURN(fsap, f_bavail, fsap->f_bfree); + VFSATTR_RETURN(fsap, f_bused, fsap->f_blocks - fsap->f_bfree); + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + VFSATTR_RETURN(fsap, f_ffree, (u_int64_t)MIN(availobjs, fsap->f_bfree)); + VFSATTR_RETURN(fsap, f_files, fsap->f_ffree + usedobjs); + + if (VFSATTR_IS_ACTIVE(fsap, f_fsid)) { + fsap->f_fsid.val[0] = zfsvfs->z_rdev; + fsap->f_fsid.val[1] = vfs_typenum(mp); + VFSATTR_SET_SUPPORTED(fsap, f_fsid); + } + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_HARDLINKS | // ZFS + VOL_CAP_FMT_SPARSE_FILES | // ZFS + VOL_CAP_FMT_2TB_FILESIZE | // ZFS + VOL_CAP_FMT_JOURNAL | VOL_CAP_FMT_JOURNAL_ACTIVE | // ZFS + VOL_CAP_FMT_SYMBOLICLINKS | // msdos.. 
+ // ZFS has root times just fine + /* VOL_CAP_FMT_NO_ROOT_TIMES | */ + // Ask XNU to remember zero-runs, instead of writing + // zeros to it. + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_64BIT_OBJECT_IDS | + /* VOL_CAP_FMT_DECMPFS_COMPRESSION | */ + VOL_CAP_FMT_HIDDEN_FILES; + + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_ATTRLIST | // ZFS + VOL_CAP_INT_NFSEXPORT | // ZFS + VOL_CAP_INT_EXTENDED_SECURITY | // ZFS +#if NAMEDSTREAMS + VOL_CAP_INT_NAMEDSTREAMS | // ZFS +#endif + VOL_CAP_INT_EXTENDED_ATTR | // ZFS + VOL_CAP_INT_VOL_RENAME | // msdos.. + VOL_CAP_INT_ADVLOCK | + // ZFS does not yet have exchangedata (it's in a branch) + /* VOL_CAP_INT_EXCHANGEDATA| */ + // ZFS does not yet have copyfile + /* VOL_CAP_INT_COPYFILE| */ + // ZFS does not yet have allocate + /* VOL_CAP_INT_ALLOCATE| */ + VOL_CAP_INT_FLOCK; + + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = + 0; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED2] = + 0; + + /* + * This is the list of valid capabilities at time of + * compile. The valid list should have them all defined + * and the "capability" list above should enable only + * those we have implemented + */ + fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE | + VOL_CAP_FMT_OPENDENYMODES | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_64BIT_OBJECT_IDS | + VOL_CAP_FMT_NO_VOLUME_SIZES | + VOL_CAP_FMT_DECMPFS_COMPRESSION | + VOL_CAP_FMT_HIDDEN_FILES; + fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK | + VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_USERACCESS | +#if NAMEDSTREAMS + VOL_CAP_INT_NAMEDSTREAMS | +#endif + VOL_CAP_INT_MANLOCK; + + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED2] = 0; + + /* Check if we are case-sensitive */ + if (zfsvfs->z_case == ZFS_CASE_SENSITIVE) + fsap->f_capabilities.capabilities[ + VOL_CAPABILITIES_FORMAT] |= + VOL_CAP_FMT_CASE_SENSITIVE; + + /* Check if xattr is enabled */ + if (zfsvfs->z_xattr == B_TRUE) { + fsap->f_capabilities.capabilities[ + VOL_CAPABILITIES_INTERFACES] |= + VOL_CAP_INT_EXTENDED_ATTR; + } + + // Check if mimic is on + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + if (strcmp(vfsstatfs->f_fstypename, "hfs") == 0) { + fsap->f_capabilities.capabilities[ + VOL_CAPABILITIES_FORMAT] |= + VOL_CAP_FMT_DECMPFS_COMPRESSION; + } + + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + fsap->f_attributes.validattr.commonattr = + ATTR_CMN_NAME | + ATTR_CMN_DEVID | + ATTR_CMN_FSID | + ATTR_CMN_OBJTYPE | + ATTR_CMN_OBJTAG | + ATTR_CMN_OBJID | + ATTR_CMN_OBJPERMANENTID | + ATTR_CMN_PAROBJID | + /* ATTR_CMN_SCRIPT | */ + ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME | + ATTR_CMN_CHGTIME | + ATTR_CMN_ACCTIME | + /* ATTR_CMN_BKUPTIME | */ + ATTR_CMN_FNDRINFO | 
+ ATTR_CMN_OWNERID | + ATTR_CMN_GRPID | + ATTR_CMN_ACCESSMASK | + ATTR_CMN_FLAGS | + ATTR_CMN_USERACCESS | + ATTR_CMN_EXTENDED_SECURITY | + ATTR_CMN_UUID | + ATTR_CMN_GRPUUID | +#ifdef ATTR_CMN_DOCUMENT_ID + ATTR_CMN_DOCUMENT_ID | +#endif +#ifdef ATTR_CMN_GEN_COUNT + ATTR_CMN_GEN_COUNT | +#endif + 0; + fsap->f_attributes.validattr.volattr = + ATTR_VOL_FSTYPE | + ATTR_VOL_SIGNATURE | + ATTR_VOL_SIZE | + ATTR_VOL_SPACEFREE | + ATTR_VOL_SPACEAVAIL | + ATTR_VOL_MINALLOCATION | + ATTR_VOL_ALLOCATIONCLUMP | + ATTR_VOL_IOBLOCKSIZE | + ATTR_VOL_OBJCOUNT | + ATTR_VOL_FILECOUNT | + ATTR_VOL_DIRCOUNT | + ATTR_VOL_MAXOBJCOUNT | + /* ATTR_VOL_MOUNTPOINT | */ + ATTR_VOL_NAME | + ATTR_VOL_MOUNTFLAGS | + /* ATTR_VOL_MOUNTEDDEVICE | */ + /* ATTR_VOL_ENCODINGSUSED | */ + ATTR_VOL_CAPABILITIES | + ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.validattr.dirattr = + ATTR_DIR_LINKCOUNT | + ATTR_DIR_ENTRYCOUNT | + ATTR_DIR_MOUNTSTATUS; + fsap->f_attributes.validattr.fileattr = + ATTR_FILE_LINKCOUNT | + ATTR_FILE_TOTALSIZE | + ATTR_FILE_ALLOCSIZE | + /* ATTR_FILE_IOBLOCKSIZE */ + ATTR_FILE_DEVTYPE | + /* ATTR_FILE_FORKCOUNT */ + /* ATTR_FILE_FORKLIST */ + ATTR_FILE_DATALENGTH | + ATTR_FILE_DATAALLOCSIZE | + ATTR_FILE_RSRCLENGTH | + ATTR_FILE_RSRCALLOCSIZE; + fsap->f_attributes.validattr.forkattr = 0; + fsap->f_attributes.nativeattr.commonattr = + ATTR_CMN_NAME | + ATTR_CMN_DEVID | + ATTR_CMN_FSID | + ATTR_CMN_OBJTYPE | + ATTR_CMN_OBJTAG | + ATTR_CMN_OBJID | + ATTR_CMN_OBJPERMANENTID | + ATTR_CMN_PAROBJID | + /* ATTR_CMN_SCRIPT | */ + ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME | + /* ATTR_CMN_CHGTIME | */ /* Supported but not native */ + ATTR_CMN_ACCTIME | + /* ATTR_CMN_BKUPTIME | */ + /* ATTR_CMN_FNDRINFO | */ + ATTR_CMN_OWNERID | /* Supported but not native */ + ATTR_CMN_GRPID | /* Supported but not native */ + ATTR_CMN_ACCESSMASK | /* Supported but not native */ + ATTR_CMN_FLAGS | + ATTR_CMN_USERACCESS | + ATTR_CMN_EXTENDED_SECURITY | + ATTR_CMN_UUID | + ATTR_CMN_GRPUUID | +#ifdef ATTR_CMN_DOCUMENT_ID + ATTR_CMN_DOCUMENT_ID | +#endif +#ifdef ATTR_CMN_GEN_COUNT + ATTR_CMN_GEN_COUNT | +#endif + 0; + fsap->f_attributes.nativeattr.volattr = + ATTR_VOL_FSTYPE | + ATTR_VOL_SIGNATURE | + ATTR_VOL_SIZE | + ATTR_VOL_SPACEFREE | + ATTR_VOL_SPACEAVAIL | + ATTR_VOL_MINALLOCATION | + ATTR_VOL_ALLOCATIONCLUMP | + ATTR_VOL_IOBLOCKSIZE | + ATTR_VOL_OBJCOUNT | + ATTR_VOL_FILECOUNT | + ATTR_VOL_DIRCOUNT | + ATTR_VOL_MAXOBJCOUNT | + /* ATTR_VOL_MOUNTPOINT | */ + ATTR_VOL_NAME | + ATTR_VOL_MOUNTFLAGS | + /* ATTR_VOL_MOUNTEDDEVICE | */ + /* ATTR_VOL_ENCODINGSUSED */ + ATTR_VOL_CAPABILITIES | + ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.nativeattr.dirattr = 0; + fsap->f_attributes.nativeattr.fileattr = + /* ATTR_FILE_LINKCOUNT | */ /* Supported but not native */ + ATTR_FILE_TOTALSIZE | + ATTR_FILE_ALLOCSIZE | + /* ATTR_FILE_IOBLOCKSIZE */ + ATTR_FILE_DEVTYPE | + /* ATTR_FILE_FORKCOUNT */ + /* ATTR_FILE_FORKLIST */ + ATTR_FILE_DATALENGTH | + ATTR_FILE_DATAALLOCSIZE | + ATTR_FILE_RSRCLENGTH | + ATTR_FILE_RSRCALLOCSIZE; + fsap->f_attributes.nativeattr.forkattr = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + if (VFSATTR_IS_ACTIVE(fsap, f_create_time)) { + char osname[MAXNAMELEN]; + uint64_t value; + + // Get dataset name + dmu_objset_name(zfsvfs->z_os, osname); + dsl_prop_get_integer(osname, "CREATION", + &value, NULL); + fsap->f_create_time.tv_sec = value; + fsap->f_create_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_create_time); + } + if (VFSATTR_IS_ACTIVE(fsap, f_modify_time)) { + timestruc_t now; + uint64_t mtime[2]; + + 
gethrestime(&now); + ZFS_TIME_ENCODE(&now, mtime); + // fsap->f_modify_time = mtime; + ZFS_TIME_DECODE(&fsap->f_modify_time, mtime); + + VFSATTR_SET_SUPPORTED(fsap, f_modify_time); + } + /* + * For Carbon compatibility, pretend to support this legacy/unused + * attribute + */ + if (VFSATTR_IS_ACTIVE(fsap, f_backup_time)) { + fsap->f_backup_time.tv_sec = 0; + fsap->f_backup_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_backup_time); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + char osname[MAXNAMELEN], *slash; + dmu_objset_name(zfsvfs->z_os, osname); + + slash = strrchr(osname, '/'); + if (slash) { + /* Advance past last slash */ + slash += 1; + } else { + /* Copy whole osname (pool root) */ + slash = osname; + } + strlcpy(fsap->f_vol_name, slash, MAXPATHLEN); + + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + dprintf("vfs_getattr: volume name '%s'\n", fsap->f_vol_name); + } + + /* If we are mimicking, we need userland know we are really ZFS */ + VFSATTR_RETURN(fsap, f_fssubtype, MNTTYPE_ZFS_SUBTYPE); + + /* + * According to joshade over at + * https://github.com/joshado/liberate-applefileserver/blob/ + * master/liberate.m + * the following values need to be returned for it to be considered + * by Apple's AFS. + */ + VFSATTR_RETURN(fsap, f_signature, 0x482b); /* "H+" in ascii */ + VFSATTR_RETURN(fsap, f_carbon_fsid, 0); + // Make up a UUID here, based on the name + if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) { + + char osname[MAXNAMELEN]; + int error; + // Get dataset name + dmu_objset_name(zfsvfs->z_os, osname); + dprintf("%s: osname [%s]\n", __func__, osname); + + if ((error = zfs_vfs_uuid_gen(osname, + fsap->f_uuid)) != 0) { + dprintf("%s uuid_gen error %d\n", __func__, error); + } else { + /* return f_uuid in fsap */ + VFSATTR_SET_SUPPORTED(fsap, f_uuid); + } + } + + uint64_t missing = 0; + missing = (fsap->f_active ^ (fsap->f_active & fsap->f_supported)); + if (missing != 0) { + dprintf("%s: asked %08llx reply %08llx missing %08llx\n", + __func__, fsap->f_active, fsap->f_supported, + missing); + } + + ZFS_EXIT(zfsvfs); + + return (0); +} + +int +zfs_vnode_lock(vnode_t *vp, int flags) +{ + int error; + + ASSERT(vp != NULL); + + error = vn_lock(vp, flags); + return (error); +} + +/* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data + * blocks but can't because they are all pinned by entries in these caches. + */ + +/* Get vnode for the root object of this mount */ +int +zfs_vfs_root(struct mount *mp, vnode_t **vpp, __unused vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + znode_t *rootzp = NULL; + int error; + + if (!zfsvfs) { + struct vfsstatfs *stat = 0; + if (mp) stat = vfs_statfs(mp); + if (stat) + dprintf("%s mp on %s from %s\n", __func__, + stat->f_mntonname, stat->f_mntfromname); + dprintf("%s no zfsvfs yet for mp\n", __func__); + return (EINVAL); + } + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + else + *vpp = NULL; + + ZFS_EXIT(zfsvfs); + + if (error == 0 && *vpp != NULL) + if (vnode_vtype(*vpp) != VDIR) { + panic("%s: not a directory\n", __func__); + } + + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. 
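The capability and attribute masks filled in by zfs_vfs_getattr() above are what userland sees through getattrlist(2) on the mount point. A rough illustration follows (a sketch, not part of this patch; the mount path "/Volumes/tank" is a placeholder) showing how a tool can read back the two capability words the handler advertises:

/* Userland sketch: query VOL_CAPABILITIES advertised by the getattr handler. */
#include <stdio.h>
#include <string.h>
#include <sys/attr.h>
#include <unistd.h>

struct volcaps_reply {
	u_int32_t		length;	/* size of the returned data */
	vol_capabilities_attr_t	caps;	/* capabilities + valid masks */
} __attribute__((packed, aligned(4)));

int
main(void)
{
	struct attrlist al;
	struct volcaps_reply reply;

	memset(&al, 0, sizeof (al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.volattr = ATTR_VOL_INFO | ATTR_VOL_CAPABILITIES;

	if (getattrlist("/Volumes/tank", &al, &reply, sizeof (reply), 0) != 0) {
		perror("getattrlist");
		return (1);
	}
	printf("format caps:    0x%08x (valid 0x%08x)\n",
	    reply.caps.capabilities[VOL_CAPABILITIES_FORMAT],
	    reply.caps.valid[VOL_CAPABILITIES_FORMAT]);
	printf("interface caps: 0x%08x (valid 0x%08x)\n",
	    reply.caps.capabilities[VOL_CAPABILITIES_INTERFACES],
	    reply.caps.valid[VOL_CAPABILITIES_INTERFACES]);
	return (0);
}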
+ */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + /* + * We have experienced deadlocks with dmu_recv_end happening between + * suspend_fs() and resume_fs(). Clearly something is not quite ready + * so we will wait for pools to be synced first. + * This is considered a temporary solution until we can work out + * the full issue. + */ + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* + * If someone has not already unmounted this file system, + * drain the iput_taskq to ensure all active references to the + * zfs_sb_t have been handled only then can it be safely destroyed. + */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but iputs run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. + * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. + */ + int round = 0; + while (!list_empty(&zfsvfs->z_all_znodes)) { + taskq_wait_outstanding(dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + if (++round > 1 && !unmounting) + break; + break; /* Only loop once - osx can get stuck */ + } + } + + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ + cache_purgevfs(zfsvfs->z_parent->z_vfs); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + /* + * At this point there are no VFS ops active, and any new VFS ops + * will fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. We also grab an extra reference to all + * the remaining inodes so that the kernel does not attempt to free + * any inodes of a suspended fs. This can cause deadlocks since the + * zfs_resume_fs() process may involve starting threads, which might + * attempt to free unreferenced inodes to free up memory for the new + * thread. + */ + if (!unmounting) { + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (zp->z_sa_hdl) + zfs_znode_dmu_fini(zp); + if (VN_HOLD(ZTOV(zp)) == 0) { + vnode_ref(ZTOV(zp)); + zp->z_suspended = B_TRUE; + VN_RELE(ZTOV(zp)); + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + } + + /* + * If we are unmounting, set the unmounted flag and let new VFS ops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other VFS ops will fail with EIO. 
+ */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + /* + * Evict cached data. We must write out any dirty data before + * disowning the dataset. + */ + objset_t *os = zfsvfs->z_os; + boolean_t os_dirty = B_FALSE; + for (int t = 0; t < TXG_SIZE; t++) { + if (dmu_objset_is_dirty(os, t)) { + os_dirty = B_TRUE; + break; + } + } + if (!zfs_is_readonly(zfsvfs) && os_dirty) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + } + dmu_objset_evict_dbufs(zfsvfs->z_os); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); + + return (0); +} + +int +zfs_vfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + objset_t *os; + char osname[MAXNAMELEN]; + int ret; + /* cred_t *cr = (cred_t *)vfs_context_ucred(context); */ + int destroyed_zfsctl = 0; + + dprintf("%s\n", __func__); + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* Save osname for later */ + dmu_objset_name(zfsvfs->z_os, osname); + + /* + * We might skip the sync called in the unmount path, since + * zfs_vfs_sync() is generally ignoring xnu's calls, and alas, + * mount_isforce() is set AFTER that sync call, so we can not + * detect unmount is inflight. But why not just sync now, it + * is safe. Optionally, sync if (mount_isforce()); + */ + spa_sync_allpools(); + + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots. + */ + cache_purgevfs(zfsvfs->z_parent->z_vfs); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + * + * Unfortunately, XNU will check for mounts in preflight, and + * simply not call us at all if snapshots are mounted. + * We expect userland to unmount snapshots now. + */ + + ret = vflush(mp, NULLVP, SKIPSYSTEM); + + if (mntflags & MNT_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * We must release ctldir before vflush on osx. + */ + if (zfsvfs->z_ctldir != NULL) { + destroyed_zfsctl = 1; + zfsctl_destroy(zfsvfs); + } + + /* + * Flush all the files. + */ + ret = vflush(mp, NULLVP, + (mntflags & MNT_FORCE) ? 
FORCECLOSE|SKIPSYSTEM : SKIPSYSTEM); + + if ((ret != 0) && !(mntflags & MNT_FORCE)) { + if (destroyed_zfsctl) + zfsctl_create(zfsvfs); + return (ret); + } + + /* If we are ourselves a snapshot */ + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + /* Wake up anyone waiting for unmount */ + zfsctl_mount_signal(osname, B_FALSE); + } + + if (!vfs_isrdonly(zfsvfs->z_vfs) && + spa_writeable(dmu_objset_spa(zfsvfs->z_os)) && + !(mntflags & MNT_FORCE)) { + /* Update the last-unmount time for Spotlight's next mount */ + timestruc_t now; + dmu_tx_t *tx; + int error; + uint64_t value; + + dprintf("ZFS: '%s' Updating spotlight LASTUNMOUNT property\n", + osname); + + gethrestime(&now); + zfsvfs->z_last_unmount_time = now.tv_sec; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + value = zfsvfs->z_last_unmount_time; + error = zap_update(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_prop_to_name(ZFS_PROP_LASTUNMOUNT), + 8, 1, + &value, tx); + dmu_tx_commit(tx); + } + dprintf("ZFS: '%s' set lastunmount to 0x%lx (%d)\n", + osname, zfsvfs->z_last_unmount_time, error); + } + + /* + * Last chance to dump unreferenced system files. + */ + (void) vflush(mp, NULLVP, FORCECLOSE); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + zfs_freevfs(zfsvfs->z_vfs); + + dprintf("zfs_osx_proxy_remove"); + zfs_osx_proxy_remove(osname); + + return (0); +} + +static int +zfs_vget_internal(zfsvfs_t *zfsvfs, ino64_t ino, vnode_t **vpp) +{ + znode_t *zp; + int err; + + dprintf("vget get %llu\n", ino); + + /* + * Check to see if we expect to find this in the hardlink avl tree of + * hashes. Use the MSB set high as indicator. + */ + hardlinks_t *findnode = NULL; + if ((1ULL<<31) & ino) { + hardlinks_t *searchnode; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + + dprintf("ZFS: vget looking for (%llx,%llu)\n", ino, ino); + + searchnode->hl_linkid = ino; + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks_linkid, searchnode, + &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + + kmem_free(searchnode, sizeof (hardlinks_t)); + + if (findnode) { + dprintf("ZFS: vget found (%llu, %llu, %u): '%s'\n", + findnode->hl_parent, + findnode->hl_fileid, findnode->hl_linkid, + findnode->hl_name); + // Lookup the actual zp instead + ino = findnode->hl_fileid; + } // findnode + } // MSB set + + + /* We can not be locked during zget. */ + if (!ino) { + dprintf("%s: setting ino from %lld to 2\n", __func__, ino); + ino = 2; + } + + err = zfs_zget(zfsvfs, ino, &zp); + + if (err) { + dprintf("zget failed %d\n", err); + return (err); + } + + /* Don't expose EA objects! */ + if (zp->z_pflags & ZFS_XATTR) { + err = ENOENT; + goto out; + } + if (zp->z_unlinked) { + err = EINVAL; + goto out; + } + + *vpp = ZTOV(zp); + + err = zfs_vnode_lock(*vpp, 0 /* flags */); + + /* + * Spotlight requires that vap->va_name() is set when returning + * from vfs_vget, so that vfs_getrealpath() can succeed in returning + * a path to mds. 
+ */ + char *name = kmem_alloc(MAXPATHLEN + 2, KM_SLEEP); + + /* Root can't lookup in ZAP */ + if (zp->z_id == zfsvfs->z_root) { + + dmu_objset_name(zfsvfs->z_os, name); + dprintf("vget: set root '%s'\n", name); + vnode_update_identity(*vpp, NULL, name, + strlen(name), 0, VNODE_UPDATE_NAME); + + } else { + uint64_t parent; + + // if its a hardlink cache + if (findnode) { + + dprintf("vget: updating vnode to '%s' parent %llu\n", + findnode->hl_name, findnode->hl_parent); + + vnode_update_identity(*vpp, + NULL, findnode->hl_name, + strlen(findnode->hl_name), 0, + VNODE_UPDATE_NAME|VNODE_UPDATE_PARENT); + mutex_enter(&zp->z_lock); + strlcpy(zp->z_name_cache, findnode->hl_name, PATH_MAX); + zp->z_finder_parentid = findnode->hl_parent; + mutex_exit(&zp->z_lock); + + + // If we already have the name, cached in zfs_vnop_lookup + } else if (zp->z_name_cache[0]) { + dprintf("vget: cached name '%s'\n", zp->z_name_cache); + vnode_update_identity(*vpp, NULL, zp->z_name_cache, + strlen(zp->z_name_cache), 0, + VNODE_UPDATE_NAME); + + /* If needed, if findnode is set, update the parentid */ + + } else { + + /* Lookup name from ID, grab parent */ + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)) == 0); + + if (zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), name) == 0) { + + dprintf("vget: set name '%s'\n", name); + vnode_update_identity(*vpp, NULL, name, + strlen(name), 0, + VNODE_UPDATE_NAME); + } else { + dprintf("vget: unable to get name for %llu\n", + zp->z_id); + } // !zap_search + } + } // rootid + + kmem_free(name, MAXPATHLEN + 2); + +out: + + if (err != 0) { + VN_RELE(ZTOV(zp)); + *vpp = NULL; + } + + dprintf("vget return %d\n", err); + return (err); +} + +/* + * Get a vnode from a file id (ignoring the generation) + * + * Use by NFS Server (readdirplus) and VFS (build_path) + */ +int +zfs_vfs_vget(struct mount *mp, ino64_t ino, vnode_t **vpp, + __unused vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + int error; + + dprintf("%s: %llu\n", __func__, ino); + + ZFS_ENTER(zfsvfs); + + /* We also need to handle (.zfs) and (.zfs/snapshot). */ + if ((ino == ZFSCTL_INO_ROOT) && (zfsvfs->z_ctldir != NULL)) { + if (VN_HOLD(zfsvfs->z_ctldir) == 0) { + *vpp = zfsvfs->z_ctldir; + error = 0; + } else { + error = ENOENT; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * This one is trickier, we have no reference to it, but it is + * in the all list. A little expensive to search list, but at + * least "snapshot" is infrequently accessed + * We also need to check if it is a ".zfs/snapshot/$name" entry - + * luckily we keep the "lowest" ID seen, so we only need to check + * when it is in the range. + */ + if (zfsvfs->z_ctldir != NULL) { + + /* + * Either it is the snapdir itself, or one of the snapshot + * directories inside it + */ + if ((ino == ZFSCTL_INO_SNAPDIR) || + ((ino >= zfsvfs->z_ctldir_startid) && + (ino <= ZFSCTL_INO_SNAPDIRS))) { + znode_t *zp; + + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (zp->z_id == ino) + break; + if (zp->z_id == ZFSCTL_INO_SHARES - ino) + break; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + error = ENOENT; + if (zp != NULL) { + if (VN_HOLD(ZTOV(zp)) == 0) { + *vpp = ZTOV(zp); + error = 0; + } + } + + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * On Mac OS X we always export the root directory id as 2. 
+ * So we don't expect to see the real root directory id + * from zfs_vfs_vget KPI (unless of course the real id was + * already 2). + */ + ino = INO_XNUTOZFS(ino, zfsvfs->z_root); + + error = zfs_vget_internal(zfsvfs, ino, vpp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +int +zfs_vfs_setattr(__unused struct mount *mp, __unused struct vfs_attr *fsap, + __unused vfs_context_t context) +{ + // 10a286 bits has an implementation of this: to set volume name. + return (ENOTSUP); +} + +/* + * NFS Server File Handle File ID + */ +typedef struct zfs_zfid { + uint8_t zf_object[8]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[8]; /* gen[i] = gen >> (8 * i) */ +} zfs_zfid_t; + +/* + * File handle to vnode pointer + */ +int +zfs_vfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, + vnode_t **vpp, __unused vfs_context_t context) +{ + dprintf("%s\n", __func__); + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + zfs_zfid_t *zfid = (zfs_zfid_t *)fhp; + znode_t *zp; + uint64_t obj_num = 0; + uint64_t fid_gen = 0; + uint64_t zp_gen; + int i; + int error; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fhlen < sizeof (zfs_zfid_t)) { + error = EINVAL; + goto out; + } + + /* + * Grab the object and gen numbers in an endian neutral manner + */ + for (i = 0; i < sizeof (zfid->zf_object); i++) + obj_num |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + + if ((error = zfs_zget(zfsvfs, obj_num, &zp))) { + goto out; + } + + zp_gen = zp->z_gen; + if (zp_gen == 0) + zp_gen = 1; + + if (zp->z_unlinked || zp_gen != fid_gen) { + vnode_put(ZTOV(zp)); + error = EINVAL; + goto out; + } + *vpp = ZTOV(zp); +out: + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Vnode pointer to File handle + * + * XXX Do we want to check the DSL sharenfs property? + */ +int +zfs_vfs_vptofh(vnode_t *vp, int *fhlenp, unsigned char *fhp, + __unused vfs_context_t context) +{ + dprintf("%s\n", __func__); + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + zfs_zfid_t *zfid = (zfs_zfid_t *)fhp; + znode_t *zp = VTOZ(vp); + uint64_t obj_num; + uint64_t zp_gen; + int i; + + if (*fhlenp < sizeof (zfs_zfid_t)) { + return (EOVERFLOW); + } + + ZFS_ENTER(zfsvfs); + + obj_num = zp->z_id; + zp_gen = zp->z_gen; + if (zp_gen == 0) + zp_gen = 1; + + /* + * Store the object and gen numbers in an endian neutral manner + */ + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(obj_num >> (8 * i)); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(zp_gen >> (8 * i)); + + *fhlenp = sizeof (zfs_zfid_t); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Reopen zfsvfs_t::z_os and release VOPs. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err, err2; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. 
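The file-handle routines above pack the object and generation numbers one byte at a time, as the zfs_zfid_t comment notes, so a handle decodes identically on big- and little-endian hosts. A minimal self-contained sketch of that round trip (userland, outside the patch):

/* Endian-neutral pack/unpack as used by zfs_vfs_vptofh()/zfs_vfs_fhtovp(). */
#include <assert.h>
#include <stdint.h>

static void
zfid_pack(uint8_t field[8], uint64_t v)
{
	for (int i = 0; i < 8; i++)
		field[i] = (uint8_t)(v >> (8 * i));	/* byte i = bits 8i..8i+7 */
}

static uint64_t
zfid_unpack(const uint8_t field[8])
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v |= ((uint64_t)field[i]) << (8 * i);
	return (v);
}

int
main(void)
{
	uint8_t obj[8];

	zfid_pack(obj, 0x1122334455667788ULL);
	assert(zfid_unpack(obj) == 0x1122334455667788ULL);
	return (0);
}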
+ */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+	dsl_pool_config_enter(dp, FTAG);
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_config_exit(dp, FTAG);
+
+	err = zfsvfs_init(zfsvfs, os);
+	if (err != 0)
+		goto bail;
+
+	ds->ds_dir->dd_activity_cancelled = B_FALSE;
+	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+	zfs_set_fuid_feature(zfsvfs);
+
+	/*
+	 * Attempt to re-establish all the active inodes with their
+	 * dbufs. If a zfs_rezget() fails, then we unhash the inode
+	 * and mark it stale. This prevents a collision if a new
+	 * inode/object is created which must use the same inode
+	 * number. The stale inode will be released when the
+	 * VFS prunes the dentry holding the remaining references
+	 * on the stale inode.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		err2 = zfs_rezget(zp);
+		if (err2) {
+			zp->z_is_stale = B_TRUE;
+		}
+
+		/* see comment in zfs_suspend_fs() */
+		if (zp->z_suspended) {
+			if (vnode_getwithref(ZTOV(zp)) == 0) {
+				vnode_rele(ZTOV(zp));
+				zfs_zrele_async(zp);
+				zp->z_suspended = B_FALSE;
+			}
+		}
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	if (!vfs_isrdonly(zfsvfs->z_vfs) && !zfsvfs->z_unmounted) {
+		/*
+		 * zfs_suspend_fs() could have interrupted freeing
+		 * of dnodes. We need to restart this freeing so
+		 * that we don't "leak" the space.
+		 */
+		zfs_unlinked_drain(zfsvfs);
+	}
+
+	cache_purgevfs(zfsvfs->z_parent->z_vfs);
+
+bail:
+	/* release the VFS ops */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	if (err) {
+		/*
+		 * Since we couldn't setup the sa framework, try to force
+		 * unmount this file system.
+		 */
+		if (zfsvfs->z_os)
+			zfs_vfs_unmount(zfsvfs->z_vfs, 0, NULL);
+	}
+	return (err);
+}
+
+
+void
+zfs_freevfs(struct mount *vfsp)
+{
+	zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp);
+
+	dprintf("+freevfs\n");
+
+	vfs_setfsprivate(vfsp, NULL);
+
+	zfsvfs_free(zfsvfs);
+
+	atomic_dec_32(&zfs_active_fs_count);
+	dprintf("-freevfs\n");
+}
+
+struct fromname_struct {
+	char *oldname;
+	char *newname;
+};
+typedef struct fromname_struct fromname_t;
+
+static int
+zfsvfs_update_fromname_callback(mount_t mp, void *arg)
+{
+	fromname_t *frna = (fromname_t *)arg;
+	struct vfsstatfs *vsf = vfs_statfs(mp);
+
+	if (strncmp(vsf->f_mntfromname, frna->oldname,
+	    sizeof (vsf->f_mntfromname)) == 0) {
+		vfs_mountedfrom(mp, frna->newname);
+		return (VFS_RETURNED_DONE);
+	}
+
+	return (VFS_RETURNED);
+}
+
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+	fromname_t frna;
+
+	// find oldname's vfsp
+	// vfs_mountedfrom(vfsp, newname);
+	frna.oldname = oldname;
+	frna.newname = newname;
+	vfs_iterate(0, zfsvfs_update_fromname_callback, (void *)&frna);
+}
+
+void
+zfs_init(void)
+{
+
+	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
+	/*
+	 * Initialize .zfs directory structures
+	 */
+	zfsctl_init();
+
+	/*
+	 * Initialize znode cache, vnode ops, etc...
+	 */
+	zfs_znode_init();
+
+	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+
+	/* Start arc_os - reclaim thread */
+	arc_os_init();
+
+}
+
+void
+zfs_fini(void)
+{
+	arc_os_fini();
+	zfsctl_fini();
+	zfs_znode_fini();
+}
+
+int
+zfs_busy(void)
+{
+	return (zfs_active_fs_count != 0);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. + */ + zfs_vfs_unmount(zfsvfs->z_vfs, 0, NULL); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal(dmu_objset_spa(os), "upgrade", tx, + "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, + newvers, dmu_objset_id(os)); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLMODE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the coresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (vfs_isunmount(zfvp->z_vfs))) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} diff --git a/module/os/macos/zfs/zfs_vnops.c b/module/os/macos/zfs/zfs_vnops.c new file mode 100644 index 0000000000..11ed964911 --- /dev/null +++ b/module/os/macos/zfs/zfs_vnops.c @@ -0,0 +1,4560 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2013, 2020 Jorgen Lundman + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int zfs_vnop_force_formd_normalized_output = 0; /* disabled by default */ + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. 
To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) zrele() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call zrele() within a tx then use zfs_zrele_async(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. 
+ * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * zrele(...); // release held znodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * zrele(...); // release held znodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* + * Virus scanning is unsupported. It would be possible to add a hook + * here to performance the required virus scan. This could be done + * entirely in the kernel or potentially as an update to invoke a + * scanning utility. + */ +static int +zfs_vscan(struct vnode *vp, cred_t *cr, int async) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_open(struct vnode *vp, int mode, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Honor ZFS_APPENDONLY file attribute */ + if ((mode & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & O_APPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Virus scan eligible files on open */ + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(zp->z_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (zfs_vscan(vp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +int +zfs_close(struct vnode *vp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(zp->z_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(zfs_vscan(vp, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +/* + * Lseek support for finding holes (cmd == SEEK_HOLE) and + * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
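The ZFS_APPENDONLY check in zfs_open() above is what a user hits when a file carries the append-only flag. A hedged userland sketch follows, assuming UF_APPEND is mapped to ZFS_APPENDONLY by setattr as on other ZFS ports (the path is a placeholder):

/* Sketch: append-only files reject plain write opens but allow O_APPEND. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/Volumes/tank/log";	/* placeholder path */

	if (chflags(path, UF_APPEND) != 0)	/* mark file append-only */
		perror("chflags");

	int fd = open(path, O_WRONLY);		/* expected to fail: EPERM */
	if (fd < 0)
		perror("open(O_WRONLY)");

	fd = open(path, O_WRONLY | O_APPEND);	/* expected to succeed */
	if (fd >= 0)
		close(fd);
	return (0);
}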
+ */ +static int +zfs_holey_common(struct vnode *vp, int cmd, loff_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +int +zfs_holey(struct vnode *vp, int cmd, loff_t *off) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_holey_common(vp, cmd, off); + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* SEEK_HOLE && SEEK_DATA */ + +#if defined(_KERNEL) +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +static void +update_pages(vnode_t *vp, int64_t start, int64_t len, + objset_t *os, uint64_t oid) +{ + znode_t *zp = VTOZ(vp); + int error = 0; + vm_offset_t vaddr = 0; + upl_t upl; + upl_page_info_t *pl = NULL; + int upl_size; + int upl_page; + off_t off; + + off = start & (PAGE_SIZE - 1); + start &= ~PAGE_MASK; + + upl_size = (off + len + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + // dprintf("update_pages: start 0x%llx len 0x%llx: 1st off x%llx\n", + // start, len, off); + /* + * Create a UPL for the current range and map its + * page list into the kernel virtual address space. + */ + error = ubc_create_upl(vp, start, upl_size, &upl, &pl, + UPL_FILE_IO | UPL_SET_LITE); + if ((error != KERN_SUCCESS) || !upl) { + printf("ZFS: update_pages failed to ubc_create_upl: %d\n", + error); + return; + } + + if (ubc_upl_map(upl, &vaddr) != KERN_SUCCESS) { + printf("ZFS: update_pages failed to ubc_upl_map: %d\n", + error); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + return; + } + for (upl_page = 0; len > 0; ++upl_page) { + uint64_t nbytes = MIN(PAGESIZE - off, len); + /* + * We don't want a new page to "appear" in the middle of + * the file update (because it may not get the write + * update data), so we grab a lock to block + * zfs_getpage(). + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (pl && upl_valid_page(pl, upl_page)) { + rw_exit(&zp->z_map_lock); + (void) dmu_read(os, oid, start+off, nbytes, + (void *)(vaddr+off), DMU_READ_PREFETCH); + + } else { // !upl_valid_page + rw_exit(&zp->z_map_lock); + } + vaddr += PAGE_SIZE; + start += PAGE_SIZE; + len -= nbytes; + off = 0; + } + + /* + * Unmap the page list and free the UPL. + */ + (void) ubc_upl_unmap(upl); + /* + * We want to abort here since due to dmu_write() + * we effectively didn't dirty any pages. 
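zfs_holey_common() above backs the lseek(2) SEEK_HOLE/SEEK_DATA interface. A minimal userland sketch that walks the data regions of a sparse file (error handling trimmed; only compiled where SEEK_HOLE/SEEK_DATA are defined):

/* Sketch: enumerate data regions of a sparse file via SEEK_DATA/SEEK_HOLE. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
print_data_regions(const char *path)
{
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		return;

	off_t end = lseek(fd, 0, SEEK_END);
	off_t data = 0, hole;

	for (;;) {
		data = lseek(fd, data, SEEK_DATA);	/* next data region */
		if (data < 0 || data >= end)
			break;				/* ENXIO: no more data */
		hole = lseek(fd, data, SEEK_HOLE);	/* end of that region */
		printf("data: [%lld, %lld)\n",
		    (long long)data, (long long)hole);
		data = hole;
	}
	close(fd);
}

int
main(int argc, char **argv)
{
	if (argc > 1)
		print_data_regions(argv[1]);
	return (0);
}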
+ */ + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(struct vnode *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + int len = nbytes; + int error = 0; + vm_offset_t vaddr = 0; + upl_t upl; + upl_page_info_t *pl = NULL; + off_t upl_start; + int upl_size; + int upl_page; + off_t off; + + upl_start = uio_offset(uio); + off = upl_start & PAGE_MASK; + upl_start &= ~PAGE_MASK; + upl_size = (off + nbytes + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + /* + * Create a UPL for the current range and map its + * page list into the kernel virtual address space. + */ + error = ubc_create_upl(vp, upl_start, upl_size, &upl, &pl, + UPL_FILE_IO | UPL_SET_LITE); + if ((error != KERN_SUCCESS) || !upl) { + return (EIO); + } + + if (ubc_upl_map(upl, &vaddr) != KERN_SUCCESS) { + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + return (ENOMEM); + } + + for (upl_page = 0; len > 0; ++upl_page) { + uint64_t bytes = MIN(PAGE_SIZE - off, len); + if (pl && upl_valid_page(pl, upl_page)) { + uio_setrw(uio, UIO_READ); + error = uiomove((caddr_t)vaddr + off, bytes, UIO_READ, + uio); + } else { + error = dmu_read_uio(os, zp->z_id, uio, bytes); + } + + vaddr += PAGE_SIZE; + len -= bytes; + off = 0; + if (error) + break; + } + + /* + * Unmap the page list and free the UPL. + */ + (void) ubc_upl_unmap(upl); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + + return (error); +} +#endif /* _KERNEL */ + +unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ +unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: ip - inode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - O_SYNC flags; used to provide FRSYNC semantics. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * inode - atime updated if byte count > 0 + */ +/* ARGSUSED */ +int +zfs_read(struct vnode *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + boolean_t frsync = B_FALSE; + + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio_offset(uio) < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio_resid(uio) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef FRSYNC + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + * Only do this for non-snapshots. + * + * Some platforms do not support FRSYNC and instead map it + * to O_SYNC, which results in unnecessary calls to zil_commit. We + * only honor FRSYNC requests on platforms which support it. 
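mappedread() and update_pages() size their UPL by rounding the requested byte range out to whole pages: the start is rounded down to a page boundary and the length rounded up so every touched page is covered. A compact sketch of that arithmetic with a worked example (the EX_* names are local to the sketch):

/* Sketch: UPL start/offset/size rounding for a 4 KiB page size. */
#include <assert.h>
#include <stdint.h>

#define	EX_PAGE_SIZE	4096ULL
#define	EX_PAGE_MASK	(EX_PAGE_SIZE - 1)

int
main(void)
{
	uint64_t offset = 0x1234, nbytes = 0x2100;
	uint64_t off = offset & EX_PAGE_MASK;		/* offset in first page */
	uint64_t upl_start = offset & ~EX_PAGE_MASK;	/* round start down */
	uint64_t upl_size = (off + nbytes + EX_PAGE_MASK) & ~EX_PAGE_MASK;

	assert(off == 0x234);
	assert(upl_start == 0x1000);
	assert(upl_size == 0x3000);	/* pages 0x1000-0x3fff cover the range */
	assert(upl_start + upl_size >= offset + nbytes);
	return (0);
}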
+ */ + frsync = !!(ioflag & FRSYNC); +#endif + if (zfsvfs->z_log && + (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, + uio_offset(uio), uio_resid(uio), RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio_offset(uio) >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio_offset(uio) < zp->z_size); + ssize_t n = MIN(uio_resid(uio), zp->z_size - uio_offset(uio)); + + while (n > 0) { + ssize_t nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio_offset(uio), zfs_read_chunk_size)); + + if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { + error = mappedread(vp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } + +out: + zfs_rangelock_exit(lr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: ip - inode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - O_APPEND flag set if in append mode. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +int +zfs_write(struct vnode *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + ssize_t start_resid = uio_resid(uio); + rlim64_t limit = MAXOFFSET_T; + const iovec_t *aiov = NULL; + arc_buf_t *abuf = NULL; + int write_eof; + + /* + * Fasttrack empty write + */ + ssize_t n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + sa_bulk_attr_t bulk[4]; + int count = 0; + uint64_t mtime[2], ctime[2]; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM + */ + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && + (uio_offset(uio) < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Validate file offset + */ + offset_t woff = ioflag & O_APPEND ? zp->z_size : uio_offset(uio); + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + int max_blksz = zfsvfs->z_max_blksz; + xuio_t *xuio = NULL; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFAULT)); + } + + /* + * If in append mode, set the io offset pointer to eof. 
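The zfs_read() loop above splits a request at zfs_read_chunk_size (1 MiB) boundaries, so a request that starts mid-chunk issues one short read and then chunk-aligned reads. A standalone sketch of that splitting (the EX_* macros stand in for the kernel's MIN/P2PHASE helpers):

/* Sketch: chunking a large read at zfs_read_chunk_size boundaries. */
#include <stdint.h>
#include <stdio.h>

#define	EX_CHUNK		(1024ULL * 1024ULL)	/* zfs_read_chunk_size */
#define	EX_P2PHASE(x, a)	((x) & ((a) - 1))	/* a is a power of two */
#define	EX_MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t offset = 3 * EX_CHUNK + 4096;	/* start 4 KiB into a chunk */
	uint64_t n = 3 * EX_CHUNK;		/* bytes left to read */

	while (n > 0) {
		uint64_t nbytes = EX_MIN(n,
		    EX_CHUNK - EX_P2PHASE(offset, EX_CHUNK));
		printf("read %llu bytes at %llu\n",
		    (unsigned long long)nbytes, (unsigned long long)offset);
		offset += nbytes;
		n -= nbytes;
	}
	return (0);
}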
+ */ + zfs_locked_range_t *lr; + if (ioflag & O_APPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio_setoffset(uio, woff); + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + uint64_t end_size = MAX(zp->z_size, woff + n); + zilog_t *zilog = zfsvfs->z_log; + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio_offset(uio); + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, + zp->z_uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, + zp->z_gid) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + abuf = NULL; + if (xuio) { + + } else if (n >= max_blksz && woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If rangelock_enter() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
+ */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + ssize_t tx_bytes = 0; + + if (woff + nbytes > zp->z_size) + vnode_pager_setsize(vp, woff + nbytes); + + /* + * This conditional is always true in OSX, it is kept so + * the sources look familiar to other platforms + */ + if (abuf == NULL) { + tx_bytes = uio_resid(uio); + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + tx_bytes -= uio_resid(uio); + } else { + tx_bytes = nbytes; + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { + dmu_return_arcbuf(abuf); + dmu_tx_commit(tx); + break; + } + } + ASSERT(tx_bytes <= uio_resid(uio)); + uioskip(uio, tx_bytes); + } + if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { + update_pages(vp, woff, tx_bytes, zfsvfs->z_os, + zp->z_id); + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + uint32_t uid = KUID_TO_SUID(zp->z_uid); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + zp->z_mode = newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio_offset(uio)) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio_offset(uio)); + ASSERT(error == 0); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. 
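The z_size update loop above is a lock-free "advance to at least this offset" pattern built on compare-and-swap: re-read the current size and only install the new end offset if the size still holds the value just observed, retrying when a concurrent writer wins the race. A minimal userland rendering with C11 atomics (a sketch; the kernel code uses atomic_cas_64):

/* Sketch: lock-free monotonic size advance. */
#include <stdatomic.h>
#include <stdint.h>

static void
advance_size(_Atomic uint64_t *sizep, uint64_t new_end)
{
	uint64_t cur = atomic_load(sizep);

	/* Stop once someone (us or a racing writer) covers new_end. */
	while (cur < new_end &&
	    !atomic_compare_exchange_weak(sizep, &cur, new_end)) {
		/* cur was reloaded by the failed CAS; the loop re-checks it */
	}
}

int
main(void)
{
	_Atomic uint64_t size = 100;

	advance_size(&size, 4096);	/* grows to 4096 */
	advance_size(&size, 512);	/* no-op: already past 512 */
	return (atomic_load(&size) == 4096 ? 0 : 1);
}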
+ */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + + ASSERT(tx_bytes == nbytes); + n -= nbytes; + } + + zfs_rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio_resid(uio) == start_resid) { + dprintf("%s: error resid %llu\n", __func__, uio_resid(uio)); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (O_SYNC | O_DSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * Write the bytes to a file. + * + * IN: zp - znode of file to be written to + * data - bytes to write + * len - number of bytes to write + * pos - offset to start writing at + * + * OUT: resid - remaining bytes to write + * + * RETURN: 0 if success + * positive error code if failure + * + * Timestamps: + * zp - ctime|mtime updated if byte count > 0 + */ +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *presid) +{ + int error = 0; + ssize_t resid; + + error = zfs_vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, + pos, UIO_SYSSPACE, IO_SYNC, RLIM64_INFINITY, NOCRED, &resid); + + if (error) { + return (SET_ERROR(error)); + } else if (presid == NULL) { + if (resid != 0) { + error = SET_ERROR(EIO); + } + } else { + *presid = resid; + } + return (error); +} + +/* + * Drop a reference on the passed inode asynchronously. This ensures + * that the caller will never drop the last reference on an inode in + * the current context. Doing so while holding open a tx could result + * in a deadlock if iput_final() re-enters the filesystem code. + */ +void +zfs_zrele_async(znode_t *zp) +{ + struct vnode *vp = ZTOV(zp); + objset_t *os = ITOZSB(vp)->z_os; + + ASSERT(os != NULL); + + if (vnode_iocount(vp) == 1) + VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), + (task_func_t *)vnode_put, vp, TQ_SLEEP) != TASKQID_INVALID); + else + zrele(zp); +} + +/* ARGSUSED */ +void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. 
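zfs_write_simple() above returns EIO for a short write when no residual pointer is supplied, and otherwise reports the residual to the caller. An illustrative in-kernel caller (a hypothetical helper, not part of this patch):

/* Sketch: synchronous small write where a short write is treated as an error. */
static int
example_write_marker(znode_t *zp)
{
	static const char marker[] = "updated\n";

	/* presid == NULL: zfs_write_simple() returns EIO on a short write */
	return (zfs_write_simple(zp, marker, sizeof (marker) - 1,
	    0 /* offset */, NULL));
}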
+ */ + zfs_zrele_async(zp); + return (SET_ERROR(ENOENT)); + } + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_access(struct vnode *vp, int mode, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held inode reference for it. + * + * IN: zdp - znode of directory to search. + * nm - name of entry to lookup. + * flags - LOOKUP_XATTR set if looking for an attribute. + * cr - credentials of caller. + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: zpp - znode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, + cred_t *cr, int *direntflags, struct componentname *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + int error = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *zpp = NULL; + + /* + * OsX has separate vnops for XATTR activity + */ + + + if (!S_ISDIR(zdp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTDIR)); + } + + /* + * Check accessibility of directory. + */ + + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the ip of the created or trunc'd file. + * + * IN: dzp - znode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - file flag. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated if new entry created + * zp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr(vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *zpp = NULL; + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + zhold(dzp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible igrab(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (strcmp(name, "..") == 0) + error = SET_ERROR(EISDIR); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + uint64_t projid = ZFS_DEFAULT_PROJID; + + /* + * Create a new file object and update the directory + * to reference it. 
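+ * Note that on macOS the vnode is only attached after the
+ * transaction commits (zfs_znode_getvnode() below), so the
+ * error paths in between operate on a vnode-less znode.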
+ */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + + if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EINVAL); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + + error = dmu_tx_assign(tx, + (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + /* + * Failed, have zp but on OsX we don't have a vp, as it + * would have been attached below, and we've cleared out + * zp, signal then not to call zrele() on it. + */ + if (ZTOV(zp) == NULL) { + zfs_znode_free(zp); + zp = NULL; + } + + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, zfsvfs); + + } else { + int aflags = (flag & O_APPEND) ? V_APPEND : 0; + + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl) { + error = SET_ERROR(EEXIST); + goto out; + } + /* + * Can't open a directory for writing. + */ + if (S_ISDIR(zp->z_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. 
+ */ + if (S_ISREG(zp->z_mode) && + (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + if (dl) { + zfs_dirent_unlock(dl); + dl = NULL; + } + error = zfs_freesp(zp, 0, 0, mode, TRUE); + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + zrele(zp); + } else { + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dzp - znode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * flags - case flags. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime + * ip - ctime (if nlink > 0) + */ + +uint64_t null_xattr = 0; + +/*ARGSUSED*/ +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; + uint64_t links; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + struct componentname *realnmp = NULL; + struct componentname realnm = { 0 }; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + + realnm.cn_nameptr = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + realnm.cn_namelen = MAXPATHLEN; + realnmp = &realnm; + } + +top: + xattr_obj = 0; + xzp = NULL; + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp))) { + if (realnmp) + kmem_free(realnm.cn_nameptr, realnm.cn_namelen); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (S_ISDIR(zp->z_mode)) { + error = SET_ERROR(EPERM); + goto out; + } + + mutex_enter(&zp->z_lock); + may_delete_now = vnode_iocount(ZTOV(zp)) == 1 && + !(zp->z_is_mapped); + mutex_exit(&zp->z_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the inode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + obj = zp->z_id; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? 
*/ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + goto top; + } + if (realnmp) + kmem_free(realnm.cn_nameptr, realnm.cn_namelen); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + vnode_iocount(ZTOV(zp)) == 1 && + !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + zfs_external_acl(zp) == acl_obj; + } + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(xzp->z_nlink, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; + links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT0(error); + } + /* + * Add to the unlinked set because a new reference could be + * taken concurrently resulting in a deferred destruction. + */ + zfs_unlinked_add(zp, tx); + mutex_exit(&zp->z_lock); + } else if (unlinked) { + mutex_exit(&zp->z_lock); + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + if (realnmp) + kmem_free(realnm.cn_nameptr, realnm.cn_namelen); + + zfs_dirent_unlock(dl); + + if (delete_now) + zrele(zp); + else + zfs_zrele_async(zp); + + if (xzp) + zfs_zrele_async(xzp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dzp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dzp - znode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * flags - case flags. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created directory. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime updated + * zpp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(S_ISDIR(vap->va_mode)); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (dirname == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr(vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ +top: + *zpp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + /* + * Now put new name in parent dir. 
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + +out: + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, zfsvfs); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (error != 0) { + zrele(zp); + } else { + } + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dzp - znode of directory to remove from. + * name - name of directory to be removed. + * cwd - inode of current working directory. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, + int flags) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (ZTOTYPE(zp) != VDIR) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (zp == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * Grab a lock on the directory to make sure that no one is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + goto top; + } + dmu_tx_abort(tx); + zrele(zp); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + zrele(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. + * + * IN: ip - inode of directory to read. + * ctx - directory entry context. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + int flags, int *a_numdirent) +{ + + znode_t *zp = VTOZ(vp); + boolean_t extended = (flags & VNODE_READDIR_EXTENDED); + struct direntry *eodp; /* Extended */ + struct dirent *odp; /* Standard */ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + int error = 0; + uint8_t prefetch; + uint8_t type; + int numdirent = 0; + char *bufptr; + boolean_t isdotdir = B_TRUE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + goto out; + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio_curriovlen(uio) <= 0) { + error = EINVAL; + goto out; + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + goto out; + } + + error = 0; + os = zfsvfs->z_os; + offset = uio_offset(uio); + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + bytes_wanted = uio_curriovlen(uio); + bufsize = (size_t)bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + bufptr = (char *)outbuf; + + /* + * Transform to file-system independent format + */ + + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + uint64_t *next = NULL; + size_t namelen; + int force_formd_normalized_output; + size_t nfdlen; + + /* + * Special case `.', `..', and `.zfs'. 
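+ * These use the reserved cookie values 0, 1 and 2 described in
+ * the comment above; real ZAP cursors never collide with them
+ * because the low bits of a serialized cursor are always zero.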
+ */ + if (offset == 0) { + (void) strlcpy(zap.za_name, ".", MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = (zp->z_id == zfsvfs->z_root) ? 2 : zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strlcpy(zap.za_name, "..", MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = (parent == zfsvfs->z_root) ? 2 : parent; + objnum = (zp->z_id == zfsvfs->z_root) ? 1 : objnum; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strlcpy(zap.za_name, ZFS_CTLDIR_NAME, + MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + + /* This is not a special case directory */ + isdotdir = B_FALSE; + + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + } + + /* emit start */ + +#define DIRENT_RECLEN(namelen, ext) \ + ((ext) ? \ + ((sizeof (struct direntry) + (namelen) - (MAXPATHLEN-1) + 7) & ~7) \ + : \ + ((sizeof (struct dirent) - (NAME_MAX+1)) + (((namelen)+1 + 7) &~ 7))) + + /* + * Check if name will fit. + * + * Note: non-ascii names may expand (3x) when converted to NFD + */ + namelen = strlen(zap.za_name); + + /* sysctl to force formD normalization of vnop output */ + if (zfs_vnop_force_formd_normalized_output && + !is_ascii_str(zap.za_name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + if (force_formd_normalized_output) + namelen = MIN(extended ? MAXPATHLEN-1 : MAXNAMLEN, + namelen * 3); + + reclen = DIRENT_RECLEN(namelen, extended); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = (EINVAL); + goto update; + } + break; + } + + if (extended) { + /* + * Add extended flag entry: + */ + eodp = (struct direntry *)bufptr; + /* NOTE: d_seekoff is the offset for the *next* entry */ + next = &(eodp->d_seekoff); + eodp->d_ino = INO_ZFSTOXNU(objnum, zfsvfs->z_root); + eodp->d_type = type; + + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. 
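+ * utf8_normalizestr() writes the decomposed form directly into
+ * d_name; if it fails, or formD output is not forced, the
+ * stored bytes are copied unchanged.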
+ */ + namelen = strlen(zap.za_name); + if (!force_formd_normalized_output || + utf8_normalizestr((const u_int8_t *)zap.za_name, + namelen, (u_int8_t *)eodp->d_name, &nfdlen, + MAXPATHLEN-1, UTF_DECOMPOSED) != 0) { + /* ASCII or normalization failed, copy zap */ + if ((namelen > 0)) + (void) bcopy(zap.za_name, eodp->d_name, + namelen + 1); + } else { + /* Normalization succeeded (in buffer) */ + namelen = nfdlen; + } + eodp->d_namlen = namelen; + eodp->d_reclen = reclen = + DIRENT_RECLEN(namelen, extended); + + } else { + /* + * Add normal entry: + */ + + odp = (struct dirent *)bufptr; + odp->d_ino = INO_ZFSTOXNU(objnum, zfsvfs->z_root); + odp->d_type = type; + + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. + */ + namelen = strlen(zap.za_name); + if (!force_formd_normalized_output || + utf8_normalizestr((const u_int8_t *)zap.za_name, + namelen, (u_int8_t *)odp->d_name, &nfdlen, + MAXNAMLEN, UTF_DECOMPOSED) != 0) { + /* ASCII or normalization failed, copy zap */ + if ((namelen > 0)) + (void) bcopy(zap.za_name, odp->d_name, + namelen + 1); + } else { + /* Normalization succeeded (in buffer). */ + namelen = nfdlen; + } + odp->d_namlen = namelen; + odp->d_reclen = reclen = + DIRENT_RECLEN(namelen, extended); + } + + outcount += reclen; + bufptr += reclen; + numdirent++; + + ASSERT(outcount <= bufsize); + + /* emit done */ + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + if (extended) + *next = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* All done, copy temporary buffer to userland */ + if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + /* + * Reset the pointer. + */ + offset = uio_offset(uio); + } + + +update: + zap_cursor_fini(&zc); + if (outbuf) { + kmem_free(outbuf, bufsize); + } + + if (error == ENOENT) + error = 0; + + uio_setoffset(uio, offset); + if (a_numdirent) + *a_numdirent = numdirent; + +out: + ZFS_EXIT(zfsvfs); + + dprintf("-zfs_readdir: num %d\n", numdirent); + + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +int +zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + vnode_t *vp = ZTOV(zp); + + if (zp->z_is_mapped /* && !(syncflag & FNODSYNC) */ && + vnode_isreg(vp) && !vnode_isswap(vp)) { + cluster_push(vp, /* waitdata ? IO_SYNC : */ 0); + } + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED && + !vnode_isrecycled(ZTOV(zp))) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + + return (0); +} + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If ATTR_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * ct - caller context + * + * OUT: vap - attribute values. 
+ * + * RETURN: 0 (always succeeds) + */ +int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint64_t links; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = /* (flags&ATTR_NOACLCHECK)?B_TRUE: */ B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + VERIFY3P(zp->z_zfsvfs, ==, vfs_fsprivate(vnode_mount(vp))); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vnode_isblk(vp) || vnode_ischr(vp)) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + mutex_enter(&zp->z_lock); + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; + vap->va_nodeid = zp->z_id; + if (vnode_isvroot((vp)) && zfs_show_ctldir(zp)) + links = zp->z_links + 1; + else + links = zp->z_links; + vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ + vap->va_size = zp->z_size; + if (vnode_isblk(vp) || vnode_ischr(vp)) + vap->va_rdev = zfs_cmpldev(rdev); + + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. 
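+ * This is only done when the caller passed an xvattr_t
+ * (xoap != NULL) and the filesystem version is new enough to
+ * store these flags (z_use_fuids).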
+ */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vnode_isreg(vp)) { + zfs_sa_get_scanstamp(zp, xvap); + } + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)); + ZFS_TIME_DECODE(&xoap->xoa_createtime, times); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_crtime, crtime); + + mutex_exit(&zp->z_lock); + + /* + * If we are told to ignore owners, we scribble over the + * uid and gid here unless root. + */ + if (((unsigned int)vfs_flags(zfsvfs->z_vfs)) & MNT_IGNORE_OWNERSHIP) { + if (kauth_cred_getuid(cr) != 0) { + vap->va_uid = UNKNOWNUID; + vap->va_gid = UNKNOWNGID; + } + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +#ifdef NOTSUREYET +/* + * For the operation of changing file's user/group/project, we need to + * handle not only the main object that is assigned to the file directly, + * but also the ones that are used by the file via hidden xattr directory. 
+ * + * Because the xattr directory may contains many EA entries, as to it may + * be impossible to change all of them via the transaction of changing the + * main object's user/group/project attributes. Then we have to change them + * via other multiple independent transactions one by one. It may be not good + * solution, but we have no better idea yet. + */ +static int +zfs_setattr_dir(znode_t *dzp) +{ + struct vnode *dxip = ZTOI(dzp); + struct vnode *xip = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + objset_t *os = zfsvfs->z_os; + zap_cursor_t zc; + zap_attribute_t zap; + zfs_dirlock_t *dl; + znode_t *zp; + dmu_tx_t *tx = NULL; + uint64_t uid, gid; + sa_bulk_attr_t bulk[4]; + int count; + int err; + + zap_cursor_init(&zc, os, dzp->z_id); + while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { + count = 0; + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + err = ENXIO; + break; + } + + err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, + ZEXISTS, NULL, NULL); + if (err == ENOENT) + goto next; + if (err) + break; + + xip = ZTOI(zp); + if (zp->z_uid == dzp->z_uid && + zp->z_gid == dzp->z_gid && + zp->z_projid == dzp->z_projid) + goto next; + + tx = dmu_tx_create(os); + if (!(zp->z_pflags & ZFS_PROJID)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + break; + + mutex_enter(&dzp->z_lock); + + if (zp->z_uid != dxzp->z_uid) { + zp->z_uid = dzp->z_uid; + uid = zfs_uid_read(dzp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + } + + if (KGID_TO_SGID(zp->z_gid) != KGID_TO_SGID(dxzp->z_gid)) { + zp->z_gid = dzp->z_gid; + gid = zfs_gid_read(dzp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + } + + if (zp->z_projid != dzp->z_projid) { + if (!(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, + sizeof (zp->z_pflags)); + } + + zp->z_projid = dzp->z_projid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), + NULL, &zp->z_projid, sizeof (zp->z_projid)); + } + + mutex_exit(&dzp->z_lock); + + if (likely(count > 0)) { + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + tx = NULL; + if (err != 0 && err != ENOENT) + break; + +next: + if (zp) { + zrele(zp); + zp = NULL; + zfs_dirent_unlock(dl); + } + zap_cursor_advance(&zc); + } + + if (tx) + dmu_tx_abort(tx); + if (zp) { + zrele(zp); + zfs_dirent_unlock(dl); + } + zap_cursor_fini(&zc); + + return (err == ENOENT ? 0 : err); +} +#endif + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If ATTR_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime updated, mtime updated if size changed. 
+ */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + struct vnode *vp; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t *tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2], atime[2], crtime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2 = 0; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t fuid_dirtied = B_FALSE; + boolean_t handle_eadir = B_FALSE; + sa_bulk_attr_t *bulk, *xattr_bulk; + int count = 0, xattr_count = 0, bulks = 9; + + if (mask == 0) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + vp = ZTOV(zp); + + /* + * If this is a xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + if (xoap != NULL && (mask & ATTR_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + } + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & ATTR_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & ATTR_SIZE && S_ISDIR(zp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & ATTR_SIZE && !S_ISREG(zp->z_mode) && !S_ISFIFO(zp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); + xva_init(tmpxvattr); + + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || + ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } + + if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { + err = SET_ERROR(EPERM); + goto out3; + } + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. 
+ */ + if (mask & (ATTR_ATIME | ATTR_MTIME)) { + if (((mask & ATTR_ATIME) && + TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & ATTR_MTIME) && + TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (zfs_is_readonly(zfsvfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & ATTR_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (ATTR_ATIME|ATTR_MTIME) || + ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + B_FALSE, cr); + } + + if (mask & (ATTR_UID|ATTR_GID)) { + int idmask = (mask & (ATTR_UID|ATTR_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & ATTR_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & ATTR_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both ATTR_UID and ATTR_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (ATTR_UID|ATTR_GID)) && + take_owner && take_group) || + ((idmask == ATTR_UID) && take_owner) || + ((idmask == ATTR_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + B_FALSE, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + (void) secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (ATTR_UID|ATTR_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & ATTR_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
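+ * tmpxvattr remembers which requests are cleared below so they
+ * can be re-applied once the attributes have been set.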
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!S_ISREG(zp->z_mode) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & ATTR_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, B_FALSE, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) + goto out3; + + trim_mask |= ATTR_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. 
+ */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) + goto out3; + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { + handle_eadir = B_TRUE; + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); + if (err) + goto out2; + } + if (mask & ATTR_UID) { + new_kuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_kuid != zp->z_uid && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_kuid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & ATTR_GID) { + new_kgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); + if (new_kgid != zp->z_gid && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_kgid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + zrele(attrzp); + err = EDQUOT; + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & ATTR_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & ATTR_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. 
+ */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&attrzp->z_acl_lock); + mutex_enter(&attrzp->z_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (ATTR_UID|ATTR_GID)) { + + if (mask & ATTR_UID) { + new_uid = new_kuid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & ATTR_GID) { + new_gid = new_kgid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & ATTR_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & ATTR_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3P(aclp, !=, NULL); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { + zp->z_atime_dirty = B_FALSE; + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, sizeof (atime)); + } + + if (mask & (ATTR_MTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (mask & (ATTR_CTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_ctime, ctime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + } + + if (mask & ATTR_CRTIME) { + ZFS_TIME_ENCODE(&vap->va_crtime, crtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, + crtime, sizeof (crtime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + if (attrzp && mask) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, &ctime, + sizeof (ctime)); + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & ATTR_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(S_ISREG(zp->z_mode)); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + if (err == 0 && xattr_count > 0) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (attrzp) + zrele(attrzp); + if (err == ERESTART) + goto top; + } else { + if (count > 0) + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + if (attrzp) { + zrele(attrzp); + } + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(tmpxvattr, sizeof (xvattr_t)); + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + zfs_zrele_async(zl->zl_znode); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = ZTOZSB(zp)->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. 
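+ * If blocking here could deadlock against a rename walking the
+ * tree in the other direction, drop everything and start over;
+ * the object id comparison below decides which side backs off.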
+ */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (oidp == szp->z_id) /* We're a descendant of szp */ + return (SET_ERROR(EINVAL)); + + if (oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(ZTOZSB(zp), oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), + &oidp, sizeof (oidp)); + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdzp - Source directory containing the "old entry". + * snm - Old entry name. + * tdzp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdzp,tdzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, + cred_t *cr, int flags) +{ + znode_t *szp, *tzp; + zfsvfs_t *zfsvfs = ZTOZSB(sdzp); + zilog_t *zilog; + uint64_t addtime[2]; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr; + int error = 0; + int zflg = 0; + boolean_t waited = B_FALSE; + + if (snm == NULL || tnm == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; + + ZFS_VERIFY_ZP(tdzp); + + /* + * We check i_sb because snapshots and the ctldir must have different + * super blocks. + */ + // Can't we use zp->z_zfsvfs in place of zp->vp->v_vfs ? + if (VTOM(ZTOV(tdzp)) != VTOM(ZTOV(sdzp)) || + zfsctl_is_node(ZTOV(tdzp))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + /* + * First compare the two name arguments without + * considering any case folding. 
+ */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + zrele(tzp); + } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + zrele(szp); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow renames into our tree when the project + * IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto out; + + if (S_ISDIR(szp->z_mode)) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) + goto out; + } + + /* + * Does target exist? 
+ */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (S_ISDIR(szp->z_mode)) { + if (!S_ISDIR(tzp->z_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + } else { + if (S_ISDIR(tzp->z_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) + /* If renamex(VFS_RENAME_EXCL) is used, error out */ + if (flags & VFS_RENAME_EXCL) { + error = EEXIST; + goto out; + } +#endif + + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(szp); + if (tzp) + zrele(tzp); + goto top; + } + dmu_tx_abort(tx); + zrele(szp); + if (tzp) + zrele(tzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + /* + * If we moved an entry into a different directory + * (sdzp != tdzp) then we also need to update ADDEDTIME + * (ADDTIME) property for FinderInfo. We are already + * inside error == 0 conditional + */ + if ((sdzp != tdzp) && + zfsvfs->z_use_sa == B_TRUE) { + timestruc_t now; + gethrestime(&now); + ZFS_TIME_ENCODE(&now, addtime); + error = sa_update(szp->z_sa_hdl, + SA_ZPL_ADDTIME(zfsvfs), (void *)&addtime, + sizeof (addtime), tx); + } + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + + /* + * Update cached name - for vget, and access + * without calling vnop_lookup first - it is + * easier to clear it out and let getattr + * look it up if needed. + */ + if (tzp) { + mutex_enter(&tzp->z_lock); + tzp->z_name_cache[0] = 0; + mutex_exit(&tzp->z_lock); + } + if (szp) { + mutex_enter(&szp->z_lock); + szp->z_name_cache[0] = 0; + mutex_exit(&szp->z_lock); + } + + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. 
The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdl, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } else { + /* + * If we had removed the existing target, subsequent + * call to zfs_link_create() to add back the same entry + * but, the new dnode (szp) should not fail. + */ + ASSERT(tzp == NULL); + } + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + zrele(szp); + if (tzp) { + zrele(tzp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dzp - Directory to contain new symbolic link. + * name - Name of directory entry in dip. + * vap - Attributes of new entry. + * link - Name for new symlink entry. + * cr - credentials of caller. + * flags - case flags + * + * OUT: zpp - Znode for new symbolic link. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dip - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, + znode_t **zpp, cred_t *cr, int flags) +{ + znode_t *zp; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t len = strlen(link); + int error; + int zflg = ZNEW; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + boolean_t waited = B_FALSE; + + ASSERT(S_ISLNK(vap->va_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENAMETOOLONG)); + } + + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } +top: + *zpp = NULL; + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create a new object for the symlink. + * for version 4 ZPL datsets the symlink will be an SA attribute + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); + /* + * Insert the new object into the directory. + */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + } + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, zfsvfs); + + if (error == 0) { + *zpp = zp; + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } else { + zrele(zp); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by ip. + * + * IN: ip - inode of symbolic link + * uio - structure to contain the link path. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + */ +/* ARGSUSED */ +int +zfs_readlink(struct vnode *vp, uio_t *uio, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdzp referencing szp. + * + * IN: tdzp - Directory to contain new entry. + * szp - znode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * flags - case flags. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdzp - ctime|mtime updated + * szp - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, + int flags) +{ + struct vnode *svp = ZTOV(szp); + znode_t *tzp; + zfsvfs_t *zfsvfs = ZTOZSB(tdzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uint64_t parent; + uid_t owner; + boolean_t waited = B_FALSE; + boolean_t is_tmpfile = 0; + uint64_t txg; + + ASSERT(S_ISDIR(tdzp->z_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(tdzp); + zilog = zfsvfs->z_log; + +#ifdef __APPLE__ + if (VTOM(svp) != VTOM(ZTOV(tdzp))) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } +#endif + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. 
+ */ + if (vnode_isdir(svp)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + ZFS_VERIFY_ZP(szp); + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow hard link creation in our tree when the + * project IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(szp->z_uid), + cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + +top: + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); + if (is_tmpfile) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, tdzp); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + } else if (is_tmpfile) { + /* restore z_unlinked since when linking failed */ + szp->z_unlinked = B_TRUE; + } + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(struct vnode *vp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vnode_recycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. 
+ */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vnode_recycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + // boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t skipaclchk = B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + // boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t skipaclchk = B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: zp - znode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * zp - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. 
+ */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} diff --git a/module/os/macos/zfs/zfs_vnops_osx.c b/module/os/macos/zfs/zfs_vnops_osx.c new file mode 100644 index 0000000000..15942ff5df --- /dev/null +++ b/module/os/macos/zfs/zfs_vnops_osx.c @@ -0,0 +1,5276 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013 Will Andrews + * Copyright (c) 2013, 2020 Jorgen Lundman + */ + +/* + * XXX GENERAL COMPATIBILITY ISSUES + * + * 'name' is a common argument, but in OS X (and FreeBSD), we need to pass + * the componentname pointer, so other things can use them. We should + * change the 'name' argument to be an opaque name pointer, and define + * OS-dependent macros that yield the desired results when needed. + * + * On OS X, VFS performs access checks before calling anything, so + * zfs_zaccess_* calls are not used. Not true on FreeBSD, though. Perhaps + * those calls should be conditionally #if 0'd? + * + * On OS X, VFS & I/O objects are often opaque, e.g. uio_t and struct vnode + * require using functions to access elements of an object. Should convert + * the Solaris code to use macros on other platforms. + * + * OS X and FreeBSD appear to use similar zfs-vfs interfaces; see Apple's + * comment in zfs_remove() about the fact that VFS holds the last ref while + * in Solaris it's the ZFS code that does. On FreeBSD, the code Apple + * refers to here results in a panic if the branch is actually taken. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + + + +#ifdef _KERNEL +#include +#include +unsigned int zfs_vnop_ignore_negatives = 0; +unsigned int zfs_vnop_ignore_positives = 0; +unsigned int zfs_vnop_create_negatives = 1; +#endif + +#define DECLARE_CRED(ap) \ + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context) +#define DECLARE_CONTEXT(ap) \ + caller_context_t *ct = (caller_context_t *)(ap)->a_context +#define DECLARE_CRED_AND_CONTEXT(ap) \ + DECLARE_CRED(ap); \ + DECLARE_CONTEXT(ap) + +/* Empty FinderInfo struct */ +static u_int32_t emptyfinfo[8] = {0}; + +/* + * zfs vfs operations. 
+ */ +static struct vfsops zfs_vfsops_template = { + zfs_vfs_mount, + zfs_vfs_start, + zfs_vfs_unmount, + zfs_vfs_root, + zfs_vfs_quotactl, + zfs_vfs_getattr, + zfs_vfs_sync, + zfs_vfs_vget, + zfs_vfs_fhtovp, + zfs_vfs_vptofh, + zfs_vfs_init, + zfs_vfs_sysctl, + zfs_vfs_setattr, +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) + NULL, /* vfs_ioctl */ + NULL, /* vfs_vget_snapdir */ + NULL +#else + {NULL} +#endif +}; + +#define ZFS_VNOP_TBL_CNT 6 + +static struct vnodeopv_desc *zfs_vnodeop_opv_desc_list[ZFS_VNOP_TBL_CNT] = +{ + &zfs_dvnodeop_opv_desc, + &zfs_fvnodeop_opv_desc, + &zfs_symvnodeop_opv_desc, + &zfs_xdvnodeop_opv_desc, + &zfs_fifonodeop_opv_desc, + &zfs_ctldir_opv_desc, +}; + +static vfstable_t zfs_vfsconf; + +int +zfs_vnop_removexattr_int(zfsvfs_t *zfsvfs, znode_t *zp, const char *name, + cred_t *cr); + +int +zfs_vfs_init(__unused struct vfsconf *vfsp) +{ + return (0); +} + +int +zfs_vfs_start(__unused struct mount *mp, __unused int flags, + __unused vfs_context_t context) +{ + return (0); +} + +int +zfs_vfs_quotactl(__unused struct mount *mp, __unused int cmds, + __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context) +{ + dprintf("%s ENOTSUP\n", __func__); + return (ENOTSUP); +} + +static kmutex_t zfs_findernotify_lock; +static kcondvar_t zfs_findernotify_thread_cv; +static boolean_t zfs_findernotify_thread_exit; + +#define VNODE_EVENT_ATTRIB 0x00000008 + +static int +zfs_findernotify_callback(mount_t mp, __unused void *arg) +{ + /* Do some quick checks to see if it is ZFS */ + struct vfsstatfs *vsf = vfs_statfs(mp); + + // Filesystem ZFS? + if (vsf->f_fssubtype == MNTTYPE_ZFS_SUBTYPE) { + vfs_context_t kernelctx = spl_vfs_context_kernel(); + struct vnode *rootvp, *vp; + + /* + * Since potentially other filesystems could be using "our" + * fssubtype, and we don't always announce as "zfs" due to + * hfs-mimic requirements, we have to make extra care here to + * make sure this "mp" really is ZFS. + */ + zfsvfs_t *zfsvfs; + + zfsvfs = vfs_fsprivate(mp); + + /* + * The first entry in struct zfsvfs is the vfs ptr, so they + * should be equal if it is ZFS + */ + if (!zfsvfs || + (mp != zfsvfs->z_vfs)) + return (VFS_RETURNED); + + /* Guard against unmount */ + ZFS_ENTER_ERROR(zfsvfs, VFS_RETURNED); + + /* Check if space usage has changed enough to bother updating */ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t delta; + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + if (availbytes >= zfsvfs->z_findernotify_space) { + delta = availbytes - zfsvfs->z_findernotify_space; + } else { + delta = zfsvfs->z_findernotify_space - availbytes; + } + +#define ZFS_FINDERNOTIFY_THRESHOLD (1ULL<<20) + + /* Under the limit ? 
*/ + if (delta <= ZFS_FINDERNOTIFY_THRESHOLD) goto out; + + /* Over threshold, so we will notify Finder; remember the value */ + zfsvfs->z_findernotify_space = availbytes; + + /* If old value is zero (first run), don't bother */ + if (availbytes == delta) + goto out; + + dprintf("ZFS: findernotify %p space delta %llu\n", mp, delta); + + // Grab the root zp + if (!VFS_ROOT(mp, 0, &rootvp)) { + + struct componentname cn; + char *tmpname = ".fseventsd"; + + bzero(&cn, sizeof (cn)); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + // cn.cn_context = kernelctx; + cn.cn_pnbuf = tmpname; + cn.cn_pnlen = sizeof (tmpname); + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = strlen(tmpname); + + // Attempt to lookup .fseventsd + if (!VOP_LOOKUP(rootvp, &vp, &cn, kernelctx)) { + + // Send the event to wake up Finder + struct vnode_attr vattr; + // Also calls VATTR_INIT + spl_vfs_get_notify_attributes(&vattr); + // Fill in vap + vnode_getattr(vp, &vattr, kernelctx); + // Send event + spl_vnode_notify(vp, VNODE_EVENT_ATTRIB, + &vattr); + + // Cleanup vp + vnode_put(vp); + + } // VOP_LOOKUP + + // Cleanup rootvp + vnode_put(rootvp); + + } // VFS_ROOT + +out: + ZFS_EXIT(zfsvfs); + + } // SUBTYPE_ZFS + + return (VFS_RETURNED); +} + + +static void +zfs_findernotify_thread(void *notused) +{ + callb_cpr_t cpr; + + dprintf("ZFS: findernotify thread start\n"); + CALLB_CPR_INIT(&cpr, &zfs_findernotify_lock, callb_generic_cpr, FTAG); + + mutex_enter(&zfs_findernotify_lock); + while (!zfs_findernotify_thread_exit) { + + /* Sleep 32 seconds */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&zfs_findernotify_thread_cv, + &zfs_findernotify_lock, ddi_get_lbolt() + (hz<<5)); + CALLB_CPR_SAFE_END(&cpr, &zfs_findernotify_lock); + + if (!zfs_findernotify_thread_exit) + vfs_iterate(LK_NOWAIT, zfs_findernotify_callback, NULL); + + } + + zfs_findernotify_thread_exit = FALSE; + cv_broadcast(&zfs_findernotify_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops zfs_findernotify_lock */ + dprintf("ZFS: findernotify thread exit\n"); + thread_exit(); +} + +void +zfs_start_notify_thread(void) +{ + mutex_init(&zfs_findernotify_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfs_findernotify_thread_cv, NULL, CV_DEFAULT, NULL); + zfs_findernotify_thread_exit = FALSE; + (void) thread_create(NULL, 0, zfs_findernotify_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + + +void +zfs_stop_notify_thread(void) +{ + mutex_enter(&zfs_findernotify_lock); + zfs_findernotify_thread_exit = TRUE; + /* + * The findernotify thread will set zfs_findernotify_thread_exit back to + * FALSE when it is finished exiting; we're waiting for that.
+ */ + while (zfs_findernotify_thread_exit) { + cv_signal(&zfs_findernotify_thread_cv); + cv_wait(&zfs_findernotify_thread_cv, &zfs_findernotify_lock); + } + mutex_exit(&zfs_findernotify_lock); + mutex_destroy(&zfs_findernotify_lock); + cv_destroy(&zfs_findernotify_thread_cv); +} + +int +zfs_vfs_sysctl(int *name, __unused uint_t namelen, user_addr_t oldp, + size_t *oldlenp, user_addr_t newp, size_t newlen, + __unused vfs_context_t context) +{ +#if 0 + int error; + switch (name[0]) { + case ZFS_SYSCTL_FOOTPRINT: { + zfs_footprint_stats_t *footprint; + size_t copyinsize; + size_t copyoutsize; + int max_caches; + int act_caches; + + if (newp) { + return (EINVAL); + } + if (!oldp) { + *oldlenp = sizeof (zfs_footprint_stats_t); + return (0); + } + copyinsize = *oldlenp; + if (copyinsize < sizeof (zfs_footprint_stats_t)) { + *oldlenp = sizeof (zfs_footprint_stats_t); + return (ENOMEM); + } + footprint = kmem_alloc(copyinsize, KM_SLEEP); + + max_caches = copyinsize - sizeof (zfs_footprint_stats_t); + max_caches += sizeof (kmem_cache_stats_t); + max_caches /= sizeof (kmem_cache_stats_t); + + footprint->version = ZFS_FOOTPRINT_VERSION; + + footprint->memory_stats.current = zfs_footprint.current; + footprint->memory_stats.target = zfs_footprint.target; + footprint->memory_stats.highest = zfs_footprint.highest; + footprint->memory_stats.maximum = zfs_footprint.maximum; + + arc_get_stats(&footprint->arc_stats); + + kmem_cache_stats(&footprint->cache_stats[0], max_caches, + &act_caches); + footprint->caches_count = act_caches; + footprint->thread_count = zfs_threads; + + copyoutsize = sizeof (zfs_footprint_stats_t) + + ((act_caches - 1) * sizeof (kmem_cache_stats_t)); + + error = ddi_copyout(footprint, oldp, copyoutsize, 0); + + kmem_free(footprint, copyinsize); + + return (error); + } + + case ZFS_SYSCTL_CONFIG_DEBUGMSG: + error = sysctl_int(oldp, oldlenp, newp, newlen, + &zfs_msg_buf_enabled); + return (error); + + case ZFS_SYSCTL_CONFIG_zdprintf: +#ifdef ZFS_DEBUG + error = sysctl_int(oldp, oldlenp, newp, newlen, + &zfs_zdprintf_enabled); +#else + error = ENOTSUP; +#endif + return (error); + } +#endif + return (ENOTSUP); +} + +/* + * All these functions could be declared as 'static' but to assist with + * dtrace debugging, we do not. + */ + +int +zfs_vnop_open(struct vnop_open_args *ap) +#if 0 + struct vnop_open_args { + struct vnode *a_vp; + int a_mode; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + int err = 0; + + err = zfs_open(ap->a_vp, ap->a_mode, 0, cr); + + if (err) dprintf("zfs_open() failed %d\n", err); + return (err); +} + +int +zfs_vnop_close(struct vnop_close_args *ap) +#if 0 + struct vnop_close_args { + struct vnode *a_vp; + int a_fflag; + vfs_context_t a_context; + }; +#endif +{ +// int count = 1; +// int offset = 0; +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + + return (zfs_close(ap->a_vp, ap->a_fflag, cr)); +} + +int +zfs_vnop_ioctl(struct vnop_ioctl_args *ap) +#if 0 + struct vnop_ioctl_args { + struct vnode *a_vp; + ulong_t a_command; + caddr_t a_data; + int a_fflag; + kauth_cred_t a_cred; + struct proc *a_p; + }; +#endif +{ + /* OS X has no use for zfs_ioctl(). 
*/ + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + DECLARE_CRED_AND_CONTEXT(ap); + + dprintf("vnop_ioctl %08lx: VTYPE %d\n", ap->a_command, + vnode_vtype(ZTOV(zp))); + + ZFS_ENTER(zfsvfs); + if (IFTOVT((mode_t)zp->z_mode) == VFIFO) { + dprintf("ZFS: FIFO ioctl %02lx ('%lu' + %lu)\n", + ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff); + error = fifo_ioctl(ap); + error = 0; + ZFS_EXIT(zfsvfs); + goto out; + } + + if ((IFTOVT((mode_t)zp->z_mode) == VBLK) || + (IFTOVT((mode_t)zp->z_mode) == VCHR)) { + dprintf("ZFS: spec ioctl %02lx ('%lu' + %lu)\n", + ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff); + error = spec_ioctl(ap); + ZFS_EXIT(zfsvfs); + goto out; + } + ZFS_EXIT(zfsvfs); + + switch (ap->a_command) { + + /* ioctl supported by ZFS and POSIX */ + + case F_FULLFSYNC: + dprintf("%s F_FULLFSYNC\n", __func__); +#ifdef F_BARRIERFSYNC + case F_BARRIERFSYNC: + dprintf("%s F_BARRIERFSYNC\n", __func__); +#endif + error = zfs_fsync(VTOZ(ap->a_vp), /* flag */0, cr); + break; + + case F_CHKCLEAN: + dprintf("%s F_CHKCLEAN\n", __func__); + /* + * normally calls http://fxr.watson.org/fxr/source/bsd/ + * vfs/vfs_cluster.c?v=xnu-2050.18.24#L5839 + */ + /* XXX Why don't we? */ + off_t fsize = zp->z_size; + error = is_file_clean(ap->a_vp, fsize); + break; + + case F_RDADVISE: + dprintf("%s F_RDADVISE\n", __func__); + uint64_t file_size; + struct radvisory *ra; + int len; + + ra = (struct radvisory *)(ap->a_data); + + file_size = zp->z_size; + len = ra->ra_count; + + /* XXX Check request size */ + if (ra->ra_offset > file_size) { + dprintf("invalid request offset\n"); + error = EFBIG; + break; + } + + if ((ra->ra_offset + len) > file_size) { + len = file_size - ra->ra_offset; + dprintf("%s truncating F_RDADVISE from" + " %08x -> %08x\n", __func__, + ra->ra_count, len); + } + + /* + * Rather than advisory_read (which calls + * cluster_io->VNOP_BLOCKMAP), prefetch + * the level 0 metadata and level 1 data + * at the requested offset + length. + */ + // error = advisory_read(ap->a_vp, file_size, + // ra->ra_offset, len); + dmu_prefetch(zfsvfs->z_os, zp->z_id, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch(zfsvfs->z_os, zp->z_id, + 1, ra->ra_offset, len, + ZIO_PRIORITY_SYNC_READ); +#if 0 + { + const char *name = vnode_getname(ap->a_vp); + printf("%s F_RDADVISE: prefetch issued for " + "[%s](0x%016llx) (0x%016llx 0x%08x)\n", __func__, + (name ? 
name : ""), zp->z_id, + ra->ra_offset, len); + if (name) vnode_putname(name); + } +#endif + + break; + + case SPOTLIGHT_GET_MOUNT_TIME: + case SPOTLIGHT_IOC_GET_MOUNT_TIME: + case SPOTLIGHT_FSCTL_GET_MOUNT_TIME: + dprintf("%s SPOTLIGHT_GET_MOUNT_TIME\n", __func__); + *(uint32_t *)ap->a_data = zfsvfs->z_mount_time; + break; + case SPOTLIGHT_GET_UNMOUNT_TIME: + dprintf("%s SPOTLIGHT_GET_UNMOUNT_TIME\n", __func__); + *(uint32_t *)ap->a_data = zfsvfs->z_last_unmount_time; + break; + case SPOTLIGHT_FSCTL_GET_LAST_MTIME: + case SPOTLIGHT_IOC_GET_LAST_MTIME: + dprintf("%s SPOTLIGHT_FSCTL_GET_LAST_MTIME\n", + __func__); + *(uint32_t *)ap->a_data = zfsvfs->z_last_unmount_time; + break; + + case HFS_SET_ALWAYS_ZEROFILL: + dprintf("%s HFS_SET_ALWAYS_ZEROFILL\n", __func__); + /* Required by Spotlight search */ + break; + case HFS_EXT_BULKACCESS_FSCTL: + dprintf("%s HFS_EXT_BULKACCESS_FSCTL\n", __func__); + /* Required by Spotlight search */ + break; + + /* ioctl required to simulate HFS mimic behavior */ + case 0x80005802: + dprintf("%s 0x80005802 unknown\n", __func__); + /* unknown - from subsystem read, 'X', 2 */ + break; + + case HFS_GETPATH: + case HFSIOC_GETPATH: + dprintf("%s HFS_GETPATH\n", __func__); + { + struct vfsstatfs *vfsp; + struct vnode *file_vp; + ino64_t cnid; + int outlen; + char *bufptr; + int flags = 0; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(zfsvfs->z_vfs); + if (proc_suser(current_proc()) && + kauth_cred_getuid((kauth_cred_t)cr) != + vfsp->f_owner) { + error = EACCES; + goto out; + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(ap->a_vp)) { + error = EINVAL; + goto out; + } + + /* We are passed a string containing inode # */ + bufptr = (char *)ap->a_data; + cnid = strtoul(bufptr, NULL, 10); + if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) { + flags |= BUILDPATH_VOLUME_RELATIVE; + } + + if ((error = zfs_vfs_vget(zfsvfs->z_vfs, cnid, + &file_vp, (vfs_context_t)ct))) { + goto out; + } + error = build_path(file_vp, bufptr, MAXPATHLEN, + &outlen, flags, (vfs_context_t)ct); + vnode_put(file_vp); + + dprintf("ZFS: HFS_GETPATH done %d : '%s'\n", + error, error ? 
"" : bufptr); + } + break; + + case HFS_TRANSFER_DOCUMENT_ID: + case HFSIOC_TRANSFER_DOCUMENT_ID: + dprintf("%s HFS_TRANSFER_DOCUMENT_ID\n", __func__); + { + u_int32_t to_fd = *(u_int32_t *)ap->a_data; + file_t *to_fp; + struct vnode *to_vp; + znode_t *to_zp; + + to_fp = getf(to_fd); + if (to_fp == NULL) { + error = EBADF; + goto out; + } + + to_vp = getf_vnode(to_fp); + + if ((error = vnode_getwithref(to_vp))) { + releasef(to_fd); + goto out; + } + + /* Confirm it is inside our mount */ + if (((zfsvfs_t *)vfs_fsprivate( + vnode_mount(to_vp))) != zfsvfs) { + error = EXDEV; + goto transfer_out; + } + + to_zp = VTOZ(to_vp); + + /* Source should have UF_TRACKED */ + if (!(zp->z_pflags & ZFS_TRACKED)) { + dprintf("ZFS: source is not TRACKED\n"); + error = EINVAL; + /* dest should NOT have UF_TRACKED */ + } else if (to_zp->z_pflags & ZFS_TRACKED) { + dprintf("ZFS: dest already TRACKED\n"); + error = EEXIST; + /* should be valid types */ + } else if ( + (IFTOVT((mode_t)zp->z_mode) == VDIR) || + (IFTOVT((mode_t)zp->z_mode) == VREG) || + (IFTOVT((mode_t)zp->z_mode) == VLNK)) { + /* + * Make sure source has a document id + * - although it can't + */ + if (!zp->z_document_id) + zfs_setattr_generate_id(zp, 0, + NULL); + + /* transfer over */ + to_zp->z_document_id = + zp->z_document_id; + zp->z_document_id = 0; + to_zp->z_pflags |= ZFS_TRACKED; + zp->z_pflags &= ~ZFS_TRACKED; + + /* Commit to disk */ + zfs_setattr_set_documentid(to_zp, + B_TRUE); + zfs_setattr_set_documentid(zp, + B_TRUE); /* also update flags */ + dprintf("ZFS: Moved docid %u from " + "id %llu to id %llu\n", + to_zp->z_document_id, zp->z_id, + to_zp->z_id); + } +transfer_out: + vnode_put(to_vp); + releasef(to_fd); + } + break; + + + case F_MAKECOMPRESSED: + dprintf("%s F_MAKECOMPRESSED\n", __func__); + /* + * Not entirely sure what this does, but HFS comments + * include: "Make the file compressed; truncate & + * toggle BSD bits" + * makes compressed copy of allocated blocks + * shortens file to new length + * sets BSD bits to indicate per-file compression + * + * On HFS, locks cnode and compresses its data. ZFS + * inband compression makes this obsolete. + */ + if (vfs_isrdonly(zfsvfs->z_vfs) || + !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + error = EROFS; + goto out; + } + + /* Are there any other usecounts/FDs? */ + if (vnode_isinuse(ap->a_vp, 1)) { + error = EBUSY; + goto out; + } + + if (zp->z_pflags & ZFS_IMMUTABLE) { + error = EINVAL; + goto out; + } + + /* Return failure */ + error = EINVAL; + break; + + case HFS_PREV_LINK: + case HFS_NEXT_LINK: + case HFSIOC_PREV_LINK: + case HFSIOC_NEXT_LINK: + dprintf("%s HFS_PREV/NEXT_LINK\n", __func__); + { + /* + * Find sibling linkids with hardlinks. a_data points + * to the "current" linkid, and look up either prev + * or next (a_command) linkid. Return in a_data. + */ + uint32_t linkfileid; + struct vfsstatfs *vfsp; + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(zfsvfs->z_vfs); + if ((kauth_cred_getuid(cr) == 0) && + kauth_cred_getuid(cr) != vfsp->f_owner) { + error = EACCES; + goto out; + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(ap->a_vp)) { + error = EINVAL; + goto out; + } + linkfileid = *(uint32_t *)ap->a_data; + if (linkfileid < 16) { /* kHFSFirstUserCatalogNodeID */ + error = EINVAL; + goto out; + } + + /* + * Attempt to find the linkid in the hardlink_link + * AVL tree. If found, call to get prev or next. 
+ */ + hardlinks_t *searchnode, *findnode, *sibling; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_linkid = linkfileid; + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks_linkid, + searchnode, &loc); + kmem_free(searchnode, sizeof (hardlinks_t)); + + if (!findnode) { + rw_exit(&zfsvfs->z_hardlinks_lock); + *(uint32_t *)ap->a_data = 0; + dprintf("ZFS: HFS_NEXT_LINK/HFS_PREV_LINK %u " + "not found\n", linkfileid); + goto out; + } + + if (ap->a_command != HFS_NEXT_LINK) { + + while ((sibling = + AVL_NEXT(&zfsvfs->z_hardlinks_linkid, + findnode)) != NULL) { + if (findnode->hl_fileid == + sibling->hl_fileid) + break; + } + + } else { + + while ((sibling = + AVL_PREV(&zfsvfs->z_hardlinks_linkid, + findnode)) != NULL) { + if (findnode->hl_fileid == + sibling->hl_fileid) + break; + } + + } + rw_exit(&zfsvfs->z_hardlinks_lock); + + dprintf("ZFS: HFS_%s_LINK %u sibling %u\n", + (ap->a_command != HFS_NEXT_LINK) ? "NEXT" : "PREV", + linkfileid, + sibling ? sibling->hl_linkid : 0); + + // Did we get a new node? + if (sibling == NULL) { + *(uint32_t *)ap->a_data = 0; + goto out; + } + + *(uint32_t *)ap->a_data = sibling->hl_linkid; + error = 0; + } + break; + + case HFS_RESIZE_PROGRESS: + case HFSIOC_RESIZE_PROGRESS: + dprintf("%s HFS_RESIZE_PROGRESS\n", __func__); + /* fail as if requested of non-root fs */ + error = EINVAL; + break; + + case HFS_RESIZE_VOLUME: + case HFSIOC_RESIZE_VOLUME: + dprintf("%s HFS_RESIZE_VOLUME\n", __func__); + /* fail as if requested of non-root fs */ + error = EINVAL; + break; + + case HFS_CHANGE_NEXT_ALLOCATION: + case HFSIOC_CHANGE_NEXT_ALLOCATION: + dprintf("%s HFS_CHANGE_NEXT_ALLOCATION\n", __func__); + /* fail as if requested of non-root fs */ + error = EINVAL; + break; + + case HFS_CHANGE_NEXTCNID: + case HFSIOC_CHANGE_NEXTCNID: + dprintf("%s HFS_CHANGE_NEXTCNID\n", __func__); + /* FIXME : fail as though read only */ + error = EROFS; + break; + + case F_FREEZE_FS: + dprintf("%s F_FREEZE_FS\n", __func__); + /* Dont support freeze */ + error = ENOTSUP; + break; + + case F_THAW_FS: + dprintf("%s F_THAW_FS\n", __func__); + /* dont support fail as though insufficient privilege */ + error = EACCES; + break; + + case HFS_BULKACCESS_FSCTL: + case HFSIOC_BULKACCESS: + dprintf("%s HFS_BULKACCESS_FSCTL\n", __func__); + /* Respond as if HFS_STANDARD flag is set */ + error = EINVAL; + break; + + case HFS_FSCTL_GET_VERY_LOW_DISK: + case HFSIOC_GET_VERY_LOW_DISK: + dprintf("%s HFS_FSCTL_GET_VERY_LOW_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_dangerlimit; + break; + + case HFS_FSCTL_SET_VERY_LOW_DISK: + case HFSIOC_SET_VERY_LOW_DISK: + dprintf("%s HFS_FSCTL_SET_VERY_LOW_DISK\n", __func__); + if (*(uint32_t *)ap->a_data >= + zfsvfs->z_freespace_notify_warninglimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_dangerlimit = + *(uint32_t *)ap->a_data; + } + break; + + case HFS_FSCTL_GET_LOW_DISK: + case HFSIOC_GET_LOW_DISK: + dprintf("%s HFS_FSCTL_GET_LOW_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_warninglimit; + break; + + case HFS_FSCTL_SET_LOW_DISK: + case HFSIOC_SET_LOW_DISK: + dprintf("%s HFS_FSCTL_SET_LOW_DISK\n", __func__); + if (*(uint32_t *)ap->a_data >= + zfsvfs->z_freespace_notify_desiredlevel || + *(uint32_t *)ap->a_data <= + zfsvfs->z_freespace_notify_dangerlimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_warninglimit = + *(uint32_t *)ap->a_data; + } + break; + + case 
HFS_FSCTL_GET_DESIRED_DISK: + case HFSIOC_GET_DESIRED_DISK: + dprintf("%s HFS_FSCTL_GET_DESIRED_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_desiredlevel; + break; + + case HFS_FSCTL_SET_DESIRED_DISK: + case HFSIOC_SET_DESIRED_DISK: + dprintf("%s HFS_FSCTL_SET_DESIRED_DISK\n", __func__); + if (*(uint32_t *)ap->a_data <= + zfsvfs->z_freespace_notify_warninglimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_desiredlevel = + *(uint32_t *)ap->a_data; + } + break; + + case HFS_VOLUME_STATUS: + case HFSIOC_VOLUME_STATUS: + dprintf("%s HFS_VOLUME_STATUS\n", __func__); + /* For now we always reply "all ok" */ + *(uint32_t *)ap->a_data = + zfsvfs->z_notification_conditions; + break; + + case HFS_SET_BOOT_INFO: + dprintf("%s HFS_SET_BOOT_INFO\n", __func__); + /* + * ZFS booting is not supported, mimic selection + * of a non-root HFS volume + */ + *(uint32_t *)ap->a_data = 0; + error = EINVAL; + break; + case HFS_GET_BOOT_INFO: + { + u_int32_t vcbFndrInfo[8]; + printf("%s HFS_GET_BOOT_INFO\n", __func__); + /* + * ZFS booting is not supported, mimic selection + * of a non-root HFS volume + */ + memset(vcbFndrInfo, 0, sizeof (vcbFndrInfo)); + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + vcbFndrInfo[6] = vfsstatfs->f_fsid.val[0]; + vcbFndrInfo[7] = vfsstatfs->f_fsid.val[1]; + bcopy(vcbFndrInfo, ap->a_data, + sizeof (vcbFndrInfo)); + } + break; + case HFS_MARK_BOOT_CORRUPT: + dprintf("%s HFS_MARK_BOOT_CORRUPT\n", __func__); + /* + * ZFS booting is not supported, mimic selection + * of a non-root HFS volume + */ + *(uint32_t *)ap->a_data = 0; + error = EINVAL; + break; + + case HFS_FSCTL_GET_JOURNAL_INFO: + case HFSIOC_GET_JOURNAL_INFO: + dprintf("%s HFS_FSCTL_GET_JOURNAL_INFO\n", __func__); + /* + * XXX We're setting the mount as 'Journaled' + * so this might conflict + * Respond as though journal is empty/disabled + */ + { + struct hfs_journal_info *jip; + jip = (struct hfs_journal_info *)ap->a_data; + jip->jstart = 0; + jip->jsize = 0; + } + break; + + case HFS_DISABLE_METAZONE: + dprintf("%s HFS_DISABLE_METAZONE\n", __func__); + /* fail as though insufficient privs */ + error = EACCES; + break; + +#ifdef HFS_GET_FSINFO + case HFS_GET_FSINFO: + case HFSIOC_GET_FSINFO: + dprintf("%s HFS_GET_FSINFO\n", __func__); + break; +#endif + +#ifdef HFS_REPIN_HOTFILE_STATE + case HFS_REPIN_HOTFILE_STATE: + case HFSIOC_REPIN_HOTFILE_STATE: + dprintf("%s HFS_REPIN_HOTFILE_STATE\n", __func__); + break; +#endif + +#ifdef HFS_SET_HOTFILE_STATE + case HFS_SET_HOTFILE_STATE: + case HFSIOC_SET_HOTFILE_STATE: + dprintf("%s HFS_SET_HOTFILE_STATE\n", __func__); + break; +#endif + +#ifdef APFSIOC_GET_NEAR_LOW_DISK + case APFSIOC_GET_NEAR_LOW_DISK: + dprintf("%s APFSIOC_GET_NEAR_LOW_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_warninglimit; + break; +#endif + +#ifdef APFSIOC_SET_NEAR_LOW_DISK + case APFSIOC_SET_NEAR_LOW_DISK: + dprintf("%s APFSIOC_SET_NEAR_LOW_DISK\n", __func__); + if (*(uint32_t *)ap->a_data >= + zfsvfs->z_freespace_notify_desiredlevel || + *(uint32_t *)ap->a_data <= + zfsvfs->z_freespace_notify_dangerlimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_warninglimit = + *(uint32_t *)ap->a_data; + } + break; +#endif + + /* End HFS mimic ioctl */ + + default: + dprintf("%s: Unknown ioctl %02lx ('%lu' + %lu)\n", + __func__, ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff); + error = ENOTTY; + } + +out: + if (error) { + dprintf("%s: failing ioctl: %02lx ('%lu' + %lu) returned 
%d\n", + __func__, ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff, error); + } + + return (error); +} + + +int +zfs_vnop_read(struct vnop_read_args *ap) +#if 0 + struct vnop_read_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; +#endif +{ + int ioflag = zfs_ioflags(ap->a_ioflag); + int error; + /* uint64_t resid; */ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + + /* resid = uio_resid(ap->a_uio); */ + error = zfs_read(ap->a_vp, ap->a_uio, ioflag, cr); + + if (error) dprintf("vnop_read %d\n", error); + return (error); +} + +int +zfs_vnop_write(struct vnop_write_args *ap) +#if 0 + struct vnop_write_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; +#endif +{ + int ioflag = zfs_ioflags(ap->a_ioflag); + int error; + DECLARE_CRED(ap); + + // dprintf("zfs_vnop_write(vp %p, offset 0x%llx size 0x%llx\n", + // ap->a_vp, uio_offset(ap->a_uio), uio_resid(ap->a_uio)); + + error = zfs_write(ap->a_vp, ap->a_uio, ioflag, cr); + + /* + * Mac OS X: pageout requires that the UBC file size be current. + * Possibly, we could update it only if size has changed. + */ + + /* if (tx_bytes != 0) { */ + if (!error) { + ubc_setsize(ap->a_vp, VTOZ(ap->a_vp)->z_size); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_access(struct vnop_access_args *ap) +#if 0 + struct vnop_access_args { + struct vnodeop_desc *a_desc; + struct vnode a_vp; + int a_action; + vfs_context_t a_context; + }; +#endif +{ + int error = ENOTSUP; + int action = ap->a_action; + int mode = 0; + DECLARE_CRED(ap); + + /* + * KAUTH_VNODE_READ_EXTATTRIBUTES, as well? + * KAUTH_VNODE_WRITE_EXTATTRIBUTES + */ + if (action & KAUTH_VNODE_READ_DATA) + mode |= VREAD; + if (action & KAUTH_VNODE_WRITE_DATA) + mode |= VWRITE; + if (action & KAUTH_VNODE_EXECUTE) + mode |= VEXEC; + + dprintf("vnop_access: action %04x -> mode %04x\n", action, mode); + error = zfs_access(ap->a_vp, mode, 0, cr); + + if (error) dprintf("%s: error %d\n", __func__, error); + return (error); +} + + +/* + * hard link references? + * Read the comment in zfs_getattr_znode_unlocked for the reason + * for this hackery. Since getattr(VA_NAME) is extremely common + * call in OSX, we opt to always save the name. We need to be careful + * as zfs_dirlook can return ctldir node as well (".zfs"). + * Hardlinks also need to be able to return the correct parentid. + */ +static void zfs_cache_name(struct vnode *vp, struct vnode *dvp, char *filename) +{ + znode_t *zp; + if (!vp || + !filename || + !filename[0] || + zfsctl_is_node(vp) || + !VTOZ(vp)) + return; + + // Only cache files, or we might end up caching "." + if (!vnode_isreg(vp)) + return; + + zp = VTOZ(vp); + + mutex_enter(&zp->z_lock); + + strlcpy(zp->z_name_cache, filename, + MAXPATHLEN); + + // If hardlink, remember the parentid. 
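+	// A hardlinked regular file is reachable through more than one
+	// parent directory, so record the directory this lookup came in
+	// through in z_finder_parentid; getattr can then report a parent
+	// id consistent with the path the caller actually used.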
+ if (((zp->z_links > 1) || (zp->z_finder_hardlink)) && + (IFTOVT((mode_t)zp->z_mode) == VREG) && dvp) { + zp->z_finder_parentid = VTOZ(dvp)->z_id; + } + + mutex_exit(&zp->z_lock); +} + + +int +zfs_vnop_lookup(struct vnop_lookup_args *ap) +#if 0 + struct vnop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ + struct componentname *cnp = ap->a_cnp; + DECLARE_CRED(ap); + int error; + int negative_cache = 0; + znode_t *zp = NULL; + int direntflags = 0; + char filename[MAXNAMELEN]; + + *ap->a_vpp = NULL; /* In case we return an error */ + + /* + * Darwin uses namelen as an optimisation, for example it can be + * set to 5 for the string "alpha/beta" to look up "alpha". In this + * case we need to copy it out to null-terminate. + */ + bcopy(cnp->cn_nameptr, filename, cnp->cn_namelen); + filename[cnp->cn_namelen] = '\0'; + +#if 1 + /* + * cache_lookup() returns 0 for no-entry + * -1 for cache found (a_vpp set) + * ENOENT for negative cache + */ + error = cache_lookup(ap->a_dvp, ap->a_vpp, cnp); + if (error) { + /* We found a cache entry, positive or negative. */ + if (error == -1) { /* Positive entry? */ + if (!zfs_vnop_ignore_positives) { + error = 0; + goto exit; /* Positive cache, return it */ + } + /* Release iocount held by cache_lookup */ + vnode_put(*ap->a_vpp); + } + /* Negatives are only followed if not CREATE, from HFS+. */ + if (cnp->cn_nameiop != CREATE) { + if (!zfs_vnop_ignore_negatives) { + goto exit; /* Negative cache hit */ + } + negative_cache = 1; + } + } +#endif + + dprintf("+vnop_lookup '%s' %s\n", filename, + negative_cache ? "negative_cache":""); + + /* + * 'cnp' passed to us is 'readonly' as XNU does not expect a return + * name, but most likely expects it correct in getattr. + */ + struct componentname cn2; + cn2.cn_nameptr = filename; + cn2.cn_namelen = MAXNAMELEN; + cn2.cn_nameiop = cnp->cn_nameiop; + cn2.cn_flags = cnp->cn_flags; + + error = zfs_lookup(VTOZ(ap->a_dvp), filename, &zp, /* flags */ 0, cr, + &direntflags, &cn2); + /* flags can be LOOKUP_XATTR | FIGNORECASE */ + +#if 1 + /* + * It appears that VFS layer adds negative cache entries for us, so + * we do not need to add them here, or they are duplicated. + */ + if ((error == ENOENT) && zfs_vnop_create_negatives) { + if ((ap->a_cnp->cn_nameiop == CREATE || + ap->a_cnp->cn_nameiop == RENAME) && + (cnp->cn_flags & ISLASTCN)) { + error = EJUSTRETURN; + goto exit; + } + /* Insert name into cache (as non-existent) if appropriate. 
*/ + if ((cnp->cn_flags & MAKEENTRY) && + ap->a_cnp->cn_nameiop != CREATE) { + cache_enter(ap->a_dvp, NULL, ap->a_cnp); + dprintf("Negative-cache made for '%s'\n", + filename); + } + } /* ENOENT */ +#endif + +exit: + + if (error == 0 && (zp != NULL)) { + dprintf("back with zp %p: name '%s'\n", zp, filename); + + *ap->a_vpp = ZTOV(zp); + + zfs_cache_name(*ap->a_vpp, ap->a_dvp, filename); + + } + + dprintf("-vnop_lookup %d : dvp %llu '%s'\n", error, + VTOZ(ap->a_dvp)->z_id, filename); + + return (error); +} + +int +zfs_vnop_create(struct vnop_create_args *ap) +#if 0 + struct vnop_create_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + DECLARE_CRED(ap); + vcexcl_t excl; + int mode = 0; /* FIXME */ + int error; + znode_t *zp = NULL; + + dprintf("vnop_create: '%s'\n", cnp->cn_nameptr); + + /* + * extern int zfs_create(struct vnode *dvp, char *name, vattr_t *vap, + * int excl, int mode, struct vnode **vpp, cred_t *cr); + */ + excl = (vap->va_vaflags & VA_EXCLUSIVE) ? EXCL : NONEXCL; + + error = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, excl, mode, + &zp, cr, 0, NULL); + if (!error) { + cache_purge_negatives(ap->a_dvp); + *ap->a_vpp = ZTOV(zp); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + + +static int zfs_remove_hardlink(struct vnode *vp, struct vnode *dvp, char *name) +{ + /* + * Because we store hash of hardlinks in an AVLtree, we need to remove + * any entries in it upon deletion. Since it is complicated to know + * if an entry was a hardlink, we simply check if the avltree has the + * name. + */ + hardlinks_t *searchnode, *findnode; + avl_index_t loc; + + if (!vp || !VTOZ(vp)) + return (1); + if (!dvp || !VTOZ(dvp)) + return (1); + znode_t *zp = VTOZ(vp); + znode_t *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int ishardlink = 0; + + ishardlink = ((zp->z_links > 1) && + (IFTOVT((mode_t)zp->z_mode) == VREG)) ? 1 : 0; + if (zp->z_finder_hardlink) + ishardlink = 1; + + if (!ishardlink) + return (0); + + dprintf("ZFS: removing hash (%llu,%llu,'%s')\n", + dzp->z_id, zp->z_id, name); + + // Attempt to remove from hardlink avl, if its there + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = dzp->z_id == zfsvfs->z_root ? 2 : dzp->z_id; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, name, PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(searchnode, sizeof (hardlinks_t)); + + // Found it? remove it + if (findnode) { + rw_enter(&zfsvfs->z_hardlinks_lock, RW_WRITER); + avl_remove(&zfsvfs->z_hardlinks, findnode); + avl_remove(&zfsvfs->z_hardlinks_linkid, findnode); + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(findnode, sizeof (*findnode)); + dprintf("ZFS: removed hash '%s'\n", name); + mutex_enter(&zp->z_lock); + zp->z_name_cache[0] = 0; + zp->z_finder_parentid = 0; + mutex_exit(&zp->z_lock); + return (1); + } + return (0); +} + + +static int zfs_rename_hardlink(struct vnode *vp, struct vnode *tvp, + struct vnode *fdvp, struct vnode *tdvp, + char *from, char *to) +{ + /* + * Because we store hash of hardlinks in an AVLtree, we need to update + * any entries in it upon rename. Since it is complicated to know + * if an entry was a hardlink, we simply check if the avltree has the + * name. 
+ */ + hardlinks_t *searchnode, *findnode, *delnode; + avl_index_t loc; + uint64_t parent_fid, parent_tid; + int ishardlink = 0; + + if (!vp || !VTOZ(vp)) + return (0); + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ishardlink = ((zp->z_links > 1) && + (IFTOVT((mode_t)zp->z_mode) == VREG)) ? 1 : 0; + if (zp->z_finder_hardlink) + ishardlink = 1; + + if (!ishardlink) + return (0); + + if (!fdvp || !VTOZ(fdvp)) + return (0); + parent_fid = VTOZ(fdvp)->z_id; + parent_fid = parent_fid == zfsvfs->z_root ? 2 : parent_fid; + + if (!tdvp || !VTOZ(tdvp)) { + parent_tid = parent_fid; + } else { + parent_tid = VTOZ(tdvp)->z_id; + parent_tid = parent_tid == zfsvfs->z_root ? 2 : parent_tid; + } + + dprintf("ZFS: looking to rename hardlinks (%llu,%llu,%s)\n", + parent_fid, zp->z_id, from); + + + // Attempt to remove from hardlink avl, if its there + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = parent_fid; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, from, PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + + // Found it? update it + if (findnode) { + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_WRITER); + + // Technically, we do not need to re-do the _linkid AVL here. + avl_remove(&zfsvfs->z_hardlinks, findnode); + avl_remove(&zfsvfs->z_hardlinks_linkid, findnode); + + // If we already have a hashid for "to" and the rename + // presumably unlinked it, we need to remove it first. + searchnode->hl_parent = parent_tid; + strlcpy(searchnode->hl_name, to, PATH_MAX); + delnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + if (delnode) { + dprintf("ZFS: apparently %llu:'%s' exists, deleting\n", + parent_tid, to); + avl_remove(&zfsvfs->z_hardlinks, delnode); + avl_remove(&zfsvfs->z_hardlinks_linkid, delnode); + kmem_free(delnode, sizeof (*delnode)); + } + + dprintf("ZFS: renamed hash %llu (%llu:'%s' to %llu:'%s'): %s\n", + zp->z_id, + parent_fid, from, + parent_tid, to, + delnode ? "deleted":""); + + // Update source node to new hash, and name. 
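+		// The same node is linked into both the name-keyed and the
+		// linkid-keyed trees, so it is pulled out of both above and
+		// re-inserted into both once the key fields change.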
+ findnode->hl_parent = parent_tid; + strlcpy(findnode->hl_name, to, PATH_MAX); + // zp->z_finder_parentid = parent_tid; + + avl_add(&zfsvfs->z_hardlinks, findnode); + avl_add(&zfsvfs->z_hardlinks_linkid, findnode); + + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(searchnode, sizeof (hardlinks_t)); + + return (1); + } + + kmem_free(searchnode, sizeof (hardlinks_t)); + return (0); +} + + +int +zfs_vnop_remove(struct vnop_remove_args *ap) +#if 0 + struct vnop_remove_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_remove: %p (%s)\n", ap->a_vp, ap->a_cnp->cn_nameptr); + + /* + * extern int zfs_remove ( struct vnode *dvp, char *name, cred_t *cr, + * caller_context_t *ct, int flags); + */ + error = zfs_remove(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, cr, + /* flags */0); + if (!error) { + cache_purge(ap->a_vp); + + zfs_remove_hardlink(ap->a_vp, + ap->a_dvp, + ap->a_cnp->cn_nameptr); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_mkdir(struct vnop_mkdir_args *ap) +#if 0 + struct vnop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_mkdir '%s'\n", ap->a_cnp->cn_nameptr); + +#if 0 + /* Let's deny OS X fseventd for now */ + if (ap->a_cnp->cn_nameptr && + strcmp(ap->a_cnp->cn_nameptr, ".fseventsd") == 0) + return (EINVAL); +#endif + +#if 0 + /* spotlight for now */ + if (ap->a_cnp->cn_nameptr && + strcmp(ap->a_cnp->cn_nameptr, ".Spotlight-V100") == 0) + return (EINVAL); +#endif + /* + * extern int zfs_mkdir(struct vnode *dvp, char *dirname, vattr_t *vap, + * struct vnode **vpp, cred_t *cr, caller_context_t *ct, int flags, + * vsecattr_t *vsecp); + */ + znode_t *zp = NULL; + error = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, ap->a_vap, + &zp, cr, /* flags */0, /* vsecp */NULL); + if (!error) { + *ap->a_vpp = ZTOV(zp); + cache_purge_negatives(ap->a_dvp); + vnode_update_identity(*ap->a_vpp, ap->a_dvp, + (const char *)ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen, + 0, VNODE_UPDATE_NAME); + + VERIFY3P(zp->z_zfsvfs, ==, + vfs_fsprivate(vnode_mount(*ap->a_vpp))); + + + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_rmdir(struct vnop_rmdir_args *ap) +#if 0 + struct vnop_rmdir_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_rmdir\n"); + + /* + * extern int zfs_rmdir(struct vnode *dvp, char *name, + * struct vnode *cwd, cred_t *cr, caller_context_t *ct, int flags); + */ + error = zfs_rmdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, + /* cwd */NULL, cr, /* flags */0); + if (!error) { + cache_purge(ap->a_vp); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_readdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error; + DECLARE_CRED(ap); + + dprintf("+readdir: %p\n", ap->a_vp); + + /* + * XXX This interface needs vfs_has_feature. 
+ * XXX zfs_readdir() also needs to grow support for passing back the + * number of entries (OS X/FreeBSD) and cookies (FreeBSD). However, + * it should be the responsibility of the OS caller to malloc/free + * space for that. + */ + + /* + * extern int zfs_readdir(struct vnode *vp, uio_t *uio, cred_t *cr, + * int *eofp, int flags, int *a_numdirent); + */ + *ap->a_numdirent = 0; + + error = zfs_readdir(ap->a_vp, ap->a_uio, cr, ap->a_eofflag, ap->a_flags, + ap->a_numdirent); + + /* .zfs dirs can be completely empty */ + if (*ap->a_numdirent == 0) + *ap->a_numdirent = 2; /* . and .. */ + + if (error) { + dprintf("-readdir %d (nument %d)\n", error, *ap->a_numdirent); + } + return (error); +} + +int +zfs_vnop_fsync(struct vnop_fsync_args *ap) +#if 0 + struct vnop_fsync_args { + struct vnode *a_vp; + int a_waitfor; + vfs_context_t a_context; + }; +#endif +{ + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs; +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int err; + + /* + * Check if this znode has already been synced, freed, and recycled + * by znode_pageout_func. + * + * XXX What is this? Substitute for Illumos vn_has_cached_data()? + */ + if (zp == NULL) + return (0); + + zfsvfs = zp->z_zfsvfs; + + if (!zfsvfs) + return (0); + + /* + * If we come here via vnode_create()->vclean() we can not end up in + * zil_commit() or we will deadlock. But we know that vnop_reclaim will + * be called next, so we just return success. + */ + if (vnode_isrecycled(ap->a_vp)) + return (0); + + err = zfs_fsync(VTOZ(ap->a_vp), /* flag */0, cr); + + if (err) dprintf("%s err %d\n", __func__, err); + + return (err); +} + +int +zfs_vnop_getattr(struct vnop_getattr_args *ap) +#if 0 + struct vnop_getattr_args { + struct vnode *a_vp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + int error; + DECLARE_CRED_AND_CONTEXT(ap); + + /* dprintf("+vnop_getattr zp %p vp %p\n", VTOZ(ap->a_vp), ap->a_vp); */ + + error = zfs_getattr(ap->a_vp, ap->a_vap, /* flags */0, cr, ct); + + if (error == 0) { + error = zfs_getattr_znode_unlocked(ap->a_vp, ap->a_vap); + } + if (error) + dprintf("-vnop_getattr '%p' %d\n", (ap->a_vp), error); + + return (error); +} + +int +zfs_vnop_setattr(struct vnop_setattr_args *ap) +#if 0 + struct vnop_setattr_args { + struct vnode *a_vp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + vattr_t *vap = ap->a_vap; + uint_t mask = vap->va_mask; + int error = 0; + int hfscompression = 0; + znode_t *zp = VTOZ(ap->a_vp); + + /* Translate OS X requested mask to ZFS */ + mask = vap->va_mask; + + /* + * Both 'flags' and 'acl' can come to setattr, but without 'mode' set. + * However, ZFS assumes 'mode' is also set. We need to look up 'mode' in + * this case. + */ + if ((VATTR_IS_ACTIVE(vap, va_flags) || VATTR_IS_ACTIVE(vap, va_acl)) && + !VATTR_IS_ACTIVE(vap, va_mode)) { + uint64_t mode; + + mask |= ATTR_MODE; + + dprintf("fetching MODE for FLAGS or ACL\n"); + ZFS_ENTER(zp->z_zfsvfs); + ZFS_VERIFY_ZP(zp); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zp->z_zfsvfs), &mode, + sizeof (mode)); + vap->va_mode = mode; + ZFS_EXIT(zp->z_zfsvfs); + } + if (VATTR_IS_ACTIVE(vap, va_flags)) { + + /* + * If TRACKED is wanted, and not previously set, + * go set DocumentID + */ + if ((vap->va_flags & UF_TRACKED) && + !(zp->z_pflags & ZFS_TRACKED)) { + zfs_setattr_generate_id(zp, 0, NULL); + /* flags updated in vnops */ + zfs_setattr_set_documentid(zp, B_FALSE); + } + + /* If they are trying to turn on compression.. 
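+		 * Setting UF_COMPRESSED is how decmpfs marks a file it has
+		 * just rewritten into the com.apple.decmpfs xattr; note it
+		 * here so the truncate-to-zero that normally follows can be
+		 * recognised (and undone) in the size-change handling below.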
*/ + if (vap->va_flags & UF_COMPRESSED) { + zp->z_skip_truncate_undo_decmpfs = B_TRUE; + dprintf("setattr trying to set COMPRESSED!\n"); + } + /* Map OS X file flags to zfs file flags */ + zfs_setbsdflags(zp, vap->va_flags); + dprintf("OS X flags %08x changed to ZFS %04llx\n", + vap->va_flags, zp->z_pflags); + vap->va_flags = zp->z_pflags; + + } + + vap->va_mask = mask; + + /* + * If z_skip_truncate_undo_decmpfs is set, and they are trying to + * va_size == 0 (truncate), we undo the decmpfs work here. This is + * because we can not stop (no error, or !feature works) macOS from + * using decmpfs. + */ +#ifndef DECMPFS_XATTR_NAME +#define DECMPFS_XATTR_NAME "com.apple.decmpfs" +#endif + if ((VATTR_IS_ACTIVE(vap, va_total_size) || + VATTR_IS_ACTIVE(vap, va_data_size)) && + zp->z_skip_truncate_undo_decmpfs) { + zp->z_skip_truncate_undo_decmpfs = B_FALSE; + + dprintf("setattr setsize with compress attempted\n"); + + if (zfs_vnop_removexattr_int(zp->z_zfsvfs, zp, + DECMPFS_XATTR_NAME, NULL) == 0) { + /* Successfully deleted the XATTR - skip truncate */ + VATTR_CLEAR_ACTIVE(vap, va_total_size); + VATTR_CLEAR_ACTIVE(vap, va_data_size); + dprintf("setattr skipping truncate!\n"); + } + } + + error = zfs_setattr(VTOZ(ap->a_vp), ap->a_vap, /* flag */0, cr); + + dprintf("vnop_setattr: called on vp %p with mask %04x, err=%d\n", + ap->a_vp, mask, error); + + if (!error) { + /* If successful, tell OS X which fields ZFS set. */ + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + dprintf("ZFS: setattr new size %llx %llx\n", + vap->va_size, ubc_getsize(ap->a_vp)); + ubc_setsize(ap->a_vp, vap->va_size); + VATTR_SET_SUPPORTED(vap, va_data_size); + } + if (VATTR_IS_ACTIVE(vap, va_mode)) + VATTR_SET_SUPPORTED(vap, va_mode); + if (VATTR_IS_ACTIVE(vap, va_acl)) + VATTR_SET_SUPPORTED(vap, va_acl); + if (VATTR_IS_ACTIVE(vap, va_uid)) + VATTR_SET_SUPPORTED(vap, va_uid); + if (VATTR_IS_ACTIVE(vap, va_gid)) + VATTR_SET_SUPPORTED(vap, va_gid); + if (VATTR_IS_ACTIVE(vap, va_access_time)) + VATTR_SET_SUPPORTED(vap, va_access_time); + if (VATTR_IS_ACTIVE(vap, va_modify_time)) + VATTR_SET_SUPPORTED(vap, va_modify_time); + if (VATTR_IS_ACTIVE(vap, va_change_time)) + VATTR_SET_SUPPORTED(vap, va_change_time); + if (VATTR_IS_ACTIVE(vap, va_create_time)) + VATTR_SET_SUPPORTED(vap, va_create_time); + if (VATTR_IS_ACTIVE(vap, va_backup_time)) + VATTR_SET_SUPPORTED(vap, va_backup_time); + if (VATTR_IS_ACTIVE(vap, va_flags)) { + VATTR_SET_SUPPORTED(vap, va_flags); + } + + } + +#if 1 + uint64_t missing = 0; + missing = (vap->va_active ^ (vap->va_active & vap->va_supported)); + if (missing != 0) { + dprintf("vnop_setattr:: asked %08llx replied %08llx " + "missing %08llx\n", vap->va_active, + vap->va_supported, missing); + } +#endif + + if (error) + dprintf("ZFS: vnop_setattr return failure %d\n", error); + return (error); +} + +int +zfs_vnop_rename(struct vnop_rename_args *ap) +#if 0 + struct vnop_rename_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_rename\n"); + + /* + * extern int zfs_rename(struct vnode *sdvp, char *snm, + * struct vnode *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, + * int flags); + */ + error = zfs_rename(VTOZ(ap->a_fdvp), ap->a_fcnp->cn_nameptr, + VTOZ(ap->a_tdvp), ap->a_tcnp->cn_nameptr, cr, /* flags */0); + + if (!error) { + cache_purge_negatives(ap->a_fdvp); + 
cache_purge_negatives(ap->a_tdvp); + cache_purge(ap->a_fvp); + + zfs_rename_hardlink(ap->a_fvp, ap->a_tvp, + ap->a_fdvp, ap->a_tdvp, + ap->a_fcnp->cn_nameptr, + ap->a_tcnp->cn_nameptr); + if (ap->a_tvp) { + cache_purge(ap->a_tvp); + } + +#ifdef __APPLE__ + /* + * After a rename, the VGET path /.vol/$fsid/$ino fails for + * a short period on hardlinks (until someone calls lookup). + * So until we can figure out exactly why this is, we drive + * a lookup here to ensure that vget will work + * (Finder/Spotlight). + */ + if (ap->a_fvp && VTOZ(ap->a_fvp) && + VTOZ(ap->a_fvp)->z_finder_hardlink) { + struct vnode *vp; + if (VOP_LOOKUP(ap->a_tdvp, &vp, ap->a_tcnp, + spl_vfs_context_kernel()) == 0) + vnode_put(vp); + } +#endif + + } + + if (error) dprintf("%s: error %d\n", __func__, error); + return (error); +} + +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) +int +zfs_vnop_renamex(struct vnop_renamex_args *ap) +#if 0 + struct vnop_renamex_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + struct vnode_attr *a_vap; // Reserved for future use + vfs_rename_flags_t a_flags; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + int error; + + dprintf("vnop_renamex\n"); + + /* + * extern int zfs_rename(struct vnode *sdvp, char *snm, + * struct vnode *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, + * int flags); + * + * Currently, hfs only supports one flag, VFS_RENAME_EXCL, so + * we will do the same. Since zfs_rename() only has logic for + * FIGNORECASE, passing VFS_RENAME_EXCL should be ok, if a bit + * hacky. + */ + error = zfs_rename(VTOZ(ap->a_fdvp), ap->a_fcnp->cn_nameptr, + VTOZ(ap->a_tdvp), ap->a_tcnp->cn_nameptr, cr, + (ap->a_flags&VFS_RENAME_EXCL)); + + if (!error) { + cache_purge_negatives(ap->a_fdvp); + cache_purge_negatives(ap->a_tdvp); + cache_purge(ap->a_fvp); + + zfs_rename_hardlink(ap->a_fvp, ap->a_tvp, + ap->a_fdvp, ap->a_tdvp, + ap->a_fcnp->cn_nameptr, + ap->a_tcnp->cn_nameptr); + if (ap->a_tvp) { + cache_purge(ap->a_tvp); + } + +#ifdef __APPLE__ + /* + * After a rename, the VGET path /.vol/$fsid/$ino fails for + * a short period on hardlinks (until someone calls lookup). + * So until we can figure out exactly why this is, we drive + * a lookup here to ensure that vget will work + * (Finder/Spotlight). + */ + if (ap->a_fvp && VTOZ(ap->a_fvp) && + VTOZ(ap->a_fvp)->z_finder_hardlink) { + struct vnode *vp; + if (VOP_LOOKUP(ap->a_tdvp, &vp, ap->a_tcnp, + spl_vfs_context_kernel()) == 0) + vnode_put(vp); + } +#endif + + } + + if (error) dprintf("%s: error %d\n", __func__, error); + return (error); +} +#endif // vnop_renamex_args + +int +zfs_vnop_symlink(struct vnop_symlink_args *ap) +#if 0 + struct vnop_symlink_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + char *a_target; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + int error; + + dprintf("vnop_symlink\n"); + + /* + * extern int zfs_symlink(struct vnode *dvp, struct vnode **vpp, + * char *name, vattr_t *vap, char *link, cred_t *cr); + */ + + /* OS X doesn't need to set vap->va_mode? 
*/ + znode_t *zp = NULL; + error = zfs_symlink(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, + ap->a_vap, ap->a_target, &zp, cr, 0); + if (!error) { + *ap->a_vpp = ZTOV(zp); + cache_purge_negatives(ap->a_dvp); + } else { + dprintf("%s: error %d\n", __func__, error); + } + /* XXX zfs_attach_vnode()? */ + return (error); +} + + +int +zfs_vnop_readlink(struct vnop_readlink_args *ap) +#if 0 + struct vnop_readlink_args { + struct vnode *vp; + struct uio *uio; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + + dprintf("vnop_readlink\n"); + + /* + * extern int zfs_readlink(struct vnode *vp, uio_t *uio, cred_t *cr, + * caller_context_t *ct); + */ + return (zfs_readlink(ap->a_vp, ap->a_uio, cr)); +} + +int +zfs_vnop_link(struct vnop_link_args *ap) +#if 0 + struct vnop_link_args { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_link\n"); + + /* XXX Translate this inside zfs_link() instead. */ + if (vnode_mount(ap->a_vp) != vnode_mount(ap->a_tdvp)) { + dprintf("%s: vp and tdvp on different mounts\n", __func__); + return (EXDEV); + } + + /* + * XXX Understand why Apple made this comparison in so many places where + * others do not. + */ + if (ap->a_cnp->cn_namelen >= ZAP_MAXNAMELEN) { + dprintf("%s: name too long %d\n", __func__, + ap->a_cnp->cn_namelen); + return (ENAMETOOLONG); + } + + /* + * extern int zfs_link(struct vnode *tdvp, struct vnode *svp, + * char *name, cred_t *cr, caller_context_t *ct, int flags); + */ + + error = zfs_link(VTOZ(ap->a_tdvp), VTOZ(ap->a_vp), + ap->a_cnp->cn_nameptr, cr, 0); + if (!error) { + // Set source vnode to multipath too, zfs_get_vnode() + // handles the target + vnode_setmultipath(ap->a_vp); + cache_purge(ap->a_vp); + cache_purge_negatives(ap->a_tdvp); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_pagein(struct vnop_pagein_args *ap) +#if 0 + struct vnop_pagein_args { + struct vnode *a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_foffset; + size_t a_size; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + /* XXX Crib this from the Apple zfs_vnops.c. 
*/ + struct vnode *vp = ap->a_vp; + offset_t off = ap->a_f_offset; + size_t len = ap->a_size; + upl_t upl = ap->a_pl; + vm_offset_t upl_offset = ap->a_pl_offset; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + caddr_t vaddr = NULL; + /* vm_offset_t vaddr = NULL; */ + int flags = ap->a_flags; + int need_unlock = 0; + int error = 0; + uint64_t file_sz; + + dprintf("+vnop_pagein: %p/%p off 0x%llx size 0x%lx filesz 0x%llx\n", + zp, vp, off, len, zp->z_size); + + if (upl == (upl_t)NULL) + panic("zfs_vnop_pagein: no upl!"); + + if (len <= 0) { + dprintf("zfs_vnop_pagein: invalid size %ld", len); + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, 0); + return (EINVAL); + } + + ZFS_ENTER(zfsvfs); + + file_sz = zp->z_size; + + ASSERT(vn_has_cached_data(vp)); + /* ASSERT(zp->z_dbuf_held && zp->z_phys); */ + /* can't fault passed EOF */ + if ((off < 0) || (off >= file_sz) || + (len & PAGE_MASK) || (upl_offset & PAGE_MASK)) { + dprintf("passed EOF or size error\n"); + ZFS_EXIT(zfsvfs); + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + (UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY)); + return (EFAULT); + } + + /* + * If we already own the lock, then we must be page faulting in the + * middle of a write to this file (i.e., we are writing to this file + * using data from a mapped region of the file). + */ + if (!rw_write_held(&zp->z_map_lock)) { + rw_enter(&zp->z_map_lock, RW_WRITER); + need_unlock = TRUE; + } + + + if (ubc_upl_map(upl, (vm_offset_t *)&vaddr) != KERN_SUCCESS) { + dprintf("zfs_vnop_pagein: failed to ubc_upl_map"); + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, 0); + if (need_unlock) + rw_exit(&zp->z_map_lock); + ZFS_EXIT(zfsvfs); + return (ENOMEM); + } + + dprintf("vaddr %p with upl_off 0x%lx\n", vaddr, upl_offset); + vaddr += upl_offset; + + /* Can't read beyond EOF - but we need to zero those extra bytes. */ + if (off + len > file_sz) { + uint64_t newend = file_sz - off; + + dprintf("ZFS: pagein zeroing offset 0x%llx for 0x%llx bytes.\n", + newend, len - newend); + memset(&vaddr[newend], 0, len - newend); + len = newend; + } + /* + * Fill pages with data from the file. + */ + while (len > 0) { + uint64_t readlen; + + readlen = MIN(PAGESIZE, len); + + dprintf("pagein from off 0x%llx len 0x%llx into " + "address %p (len 0x%lx)\n", + off, readlen, vaddr, len); + + error = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, readlen, + (void *)vaddr, DMU_READ_PREFETCH); + if (error) { + printf("zfs_vnop_pagein: dmu_read err %d\n", error); + break; + } + off += readlen; + vaddr += readlen; + len -= readlen; + } + ubc_upl_unmap(upl); + + if (!(flags & UPL_NOCOMMIT)) { + if (error) + ubc_upl_abort_range(upl, upl_offset, ap->a_size, + (UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY)); + else + ubc_upl_commit_range(upl, upl_offset, ap->a_size, + (UPL_COMMIT_CLEAR_DIRTY | + UPL_COMMIT_FREE_ON_EMPTY)); + } + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + /* + * We can't grab the range lock for the page as reader which would stop + * truncation as this leads to deadlock. So we need to recheck the file + * size. 
+ */ + if (ap->a_f_offset >= file_sz) + error = EFAULT; + if (need_unlock) + rw_exit(&zp->z_map_lock); + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s error %d\n", __func__, error); + return (error); +} + + + + +static int +zfs_pageout(zfsvfs_t *zfsvfs, znode_t *zp, upl_t upl, vm_offset_t upl_offset, + offset_t off, size_t size, int flags) +{ + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t filesz; + int err = 0; + size_t len = size; + + dprintf("+vnop_pageout: %p/%p off 0x%llx len 0x%lx upl_off 0x%lx: " + "blksz 0x%x, z_size 0x%llx upl %p flags 0x%x\n", zp, ZTOV(zp), + off, len, upl_offset, zp->z_blksz, + zp->z_size, upl, flags); + + if (upl == (upl_t)NULL) { + dprintf("ZFS: vnop_pageout: failed on NULL upl\n"); + return (EINVAL); + } + /* + * We can't leave this function without either calling upl_commit or + * upl_abort. So use the non-error version. + */ + ZFS_ENTER_IFERROR(zfsvfs) { + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, + UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); + dprintf("ZFS: vnop_pageout: abort on z_unmounted\n"); + ZFS_EXIT(zfsvfs); + return (EIO); + } + + + ASSERT(vn_has_cached_data(ZTOV(zp))); + /* ASSERT(zp->z_dbuf_held); */ /* field no longer present in znode. */ + + if (len <= 0) { + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, + UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); + err = EINVAL; + goto exit; + } + if (vnode_vfsisrdonly(ZTOV(zp))) { + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + UPL_ABORT_FREE_ON_EMPTY); + err = EROFS; + goto exit; + } + + filesz = zp->z_size; /* get consistent copy of zp_size */ + + if (off < 0 || off >= filesz || (off & PAGE_MASK_64) || + (len & PAGE_MASK)) { + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + UPL_ABORT_FREE_ON_EMPTY); + err = EINVAL; + goto exit; + } + + uint64_t pgsize = roundup(filesz, PAGESIZE); + + /* Any whole pages beyond the end of the file while we abort */ + if ((size + off) > pgsize) { + printf("ZFS: pageout abort outside pages (rounded 0x%llx > " + "UPLlen 0x%llx\n", pgsize, size + off); + ubc_upl_abort_range(upl, pgsize, + pgsize - (size + off), + UPL_ABORT_FREE_ON_EMPTY); + } + + dprintf("ZFS: starting with size %lx\n", len); + +top: + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + /* + * can't push pages passed end-of-file + */ + filesz = zp->z_size; + if (off >= filesz) { + /* ignore all pages */ + err = 0; + goto out; + } else if (off + len > filesz) { +#if 0 + int npages = btopr(filesz - off); + page_t *trunc; + + page_list_break(&pp, &trunc, npages); + /* ignore pages past end of file */ + if (trunc) + pvn_write_done(trunc, flags); +#endif + len = filesz - off; + } + + tx = dmu_tx_create(zfsvfs->z_os); + if (!tx) { + dprintf("ZFS: zfs_vnops_osx: NULL TX encountered!\n"); + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + UPL_ABORT_FREE_ON_EMPTY); + err = EINVAL; + goto exit; + } + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + if (err == ERESTART) { + zfs_rangelock_exit(lr); + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + goto out; + } + + caddr_t va; + + if (ubc_upl_map(upl, (vm_offset_t *)&va) != KERN_SUCCESS) { + err = EINVAL; + goto out; + } + + va += upl_offset; + while (len >= PAGESIZE) { + ssize_t sz = PAGESIZE; + + dprintf("pageout: dmu_write off 0x%llx size 0x%lx\n", off, sz); + + 
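+		/*
+		 * Copy this page from the mapped UPL address into the DMU
+		 * inside the transaction; 'va' walks the mapping while 'off'
+		 * tracks the file offset.
+		 */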
dmu_write(zfsvfs->z_os, zp->z_id, off, sz, va, tx); + va += sz; + off += sz; + len -= sz; + } + + /* + * The last, possibly partial block needs to have the data zeroed that + * would extend past the size of the file. + */ + if (len > 0) { + ssize_t sz = len; + + dprintf("pageout: dmu_writeX off 0x%llx size 0x%lx\n", off, sz); + dmu_write(zfsvfs->z_os, zp->z_id, off, sz, va, tx); + + va += sz; + off += sz; + len -= sz; + + /* + * Zero out the remainder of the PAGE that didn't fit within + * the file size. + */ + // bzero(va, PAGESIZE-sz); + // dprintf("zero last 0x%lx bytes.\n", PAGESIZE-sz); + + } + ubc_upl_unmap(upl); + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0, + NULL, NULL); + } + dmu_tx_commit(tx); + +out: + zfs_rangelock_exit(lr); + if (flags & UPL_IOSYNC) + zil_commit(zfsvfs->z_log, zp->z_id); + + if (!(flags & UPL_NOCOMMIT)) { + if (err) + ubc_upl_abort_range(upl, upl_offset, size, + (UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY)); + else + ubc_upl_commit_range(upl, upl_offset, size, + (UPL_COMMIT_CLEAR_DIRTY | + UPL_COMMIT_FREE_ON_EMPTY)); + } +exit: + ZFS_EXIT(zfsvfs); + if (err) dprintf("%s err %d\n", __func__, err); + return (err); +} + + + +int +zfs_vnop_pageout(struct vnop_pageout_args *ap) +#if 0 + struct vnop_pageout_args { + struct vnode *a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_foffset; + size_t a_size; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + int flags = ap->a_flags; + upl_t upl = ap->a_pl; + vm_offset_t upl_offset = ap->a_pl_offset; + size_t len = ap->a_size; + offset_t off = ap->a_f_offset; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = NULL; + int error; + + if (!zp || !zp->z_zfsvfs) { + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort(upl, + (UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY)); + dprintf("ZFS: vnop_pageout: null zp or zfsvfs\n"); + return (ENXIO); + } + + zfsvfs = zp->z_zfsvfs; + + dprintf("+vnop_pageout: off 0x%llx len 0x%lx upl_off 0x%lx: " + "blksz 0x%x, z_size 0x%llx\n", off, len, upl_offset, zp->z_blksz, + zp->z_size); + + /* + * XXX Crib this too, although Apple uses parts of zfs_putapage(). + * Break up that function into smaller bits so it can be reused. + */ + error = zfs_pageout(zfsvfs, zp, upl, upl_offset, ap->a_f_offset, + len, flags); + + return (error); +} + + +static int bluster_pageout(zfsvfs_t *zfsvfs, znode_t *zp, upl_t upl, + upl_offset_t upl_offset, off_t f_offset, int size, + uint64_t filesize, int flags, caddr_t vaddr, + dmu_tx_t *tx) +{ + int io_size; + int rounded_size; + off_t max_size; + int is_clcommit = 0; + + if ((flags & UPL_NOCOMMIT) == 0) + is_clcommit = 1; + + /* + * If they didn't specify any I/O, then we are done... 
+ * we can't issue an abort because we don't know how + * big the upl really is + */ + if (size <= 0) { + dprintf("%s invalid size %d\n", __func__, size); + return (EINVAL); + } + + if (vnode_vfsisrdonly(ZTOV(zp))) { + if (is_clcommit) + ubc_upl_abort_range(upl, upl_offset, size, + UPL_ABORT_FREE_ON_EMPTY); + dprintf("%s: readonly fs\n", __func__); + return (EROFS); + } + + /* + * can't page-in from a negative offset + * or if we're starting beyond the EOF + * or if the file offset isn't page aligned + * or the size requested isn't a multiple of PAGE_SIZE + */ + if (f_offset < 0 || f_offset >= filesize || + (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) { + if (is_clcommit) + ubc_upl_abort_range(upl, upl_offset, size, + UPL_ABORT_FREE_ON_EMPTY); + dprintf("%s: invalid offset or size\n", __func__); + return (EINVAL); + } + max_size = filesize - f_offset; + + if (size < max_size) + io_size = size; + else + io_size = max_size; + + rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + if (size > rounded_size) { + if (is_clcommit) + ubc_upl_abort_range(upl, upl_offset + rounded_size, + size - rounded_size, UPL_ABORT_FREE_ON_EMPTY); + } + +#if 1 + if (f_offset + size > filesize) { + dprintf("ZFS: lowering size %u to %llu\n", + size, f_offset > filesize ? 0 : filesize - f_offset); + if (f_offset > filesize) + size = 0; + else + size = filesize - f_offset; + } +#endif + + dmu_write(zfsvfs->z_os, zp->z_id, f_offset, size, + &vaddr[upl_offset], tx); + + return (0); +} + + + + +/* + * In V2 of vnop_pageout, we are given a NULL upl, so that we can + * grab the file locks first, then request the upl to lock down pages. + */ +int +zfs_vnop_pageoutv2(struct vnop_pageout_args *ap) +#if 0 + struct vnop_pageout_args { + struct vnode *a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_foffset; + size_t a_size; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + int a_flags = ap->a_flags; + vm_offset_t a_pl_offset = ap->a_pl_offset; + size_t a_size = ap->a_size; + upl_t upl = ap->a_pl; + upl_page_info_t *pl; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = NULL; + int error = 0; + uint64_t filesize; + zfs_locked_range_t *lr; + dmu_tx_t *tx; + caddr_t vaddr = NULL; + int merror = 0; + + /* + * We can still get into this function as non-v2 style, by the default + * pager (ie, swap - when we eventually support it) + */ + if (upl) { + dprintf("ZFS: Relaying vnop_pageoutv2 to vnop_pageout\n"); + return (zfs_vnop_pageout(ap)); + } + + if (!zp || !zp->z_zfsvfs) { + dprintf("ZFS: vnop_pageout: null zp or zfsvfs\n"); + return (ENXIO); + } + + if (ZTOV(zp) == NULL) { + dprintf("ZFS: vnop_pageout: null vp\n"); + return (ENXIO); + } + + // XNU can call us with iocount == 0 && usecount == 0. Grab + // a ref now so the vp doesn't reclaim while we are in here. 
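+	// The iocount taken here is released with vnode_put() on every
+	// return path below, including the abort paths.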
+ if (vnode_get(ZTOV(zp)) != 0) { + dprintf("ZFS: vnop_pageout: vnode_ref failed.\n"); + return (ENXIO); + } + + mutex_enter(&zp->z_lock); + + sa_handle_t *z_sa_hdl; + z_sa_hdl = zp->z_sa_hdl; + if (!z_sa_hdl) { + mutex_exit(&zp->z_lock); + vnode_put(ZTOV(zp)); + dprintf("ZFS: vnop_pageout: null sa_hdl\n"); + return (ENXIO); + } + + zfsvfs = zp->z_zfsvfs; + + mutex_exit(&zp->z_lock); + + if (error) { + dprintf("ZFS: %s: can't hold_sa: %d\n", __func__, error); + vnode_put(ZTOV(zp)); + return (ENXIO); + } + + dprintf("+vnop_pageout2: off 0x%llx len 0x%lx upl_off 0x%lx: " + "blksz 0x%x, z_size 0x%llx\n", ap->a_f_offset, a_size, + a_pl_offset, zp->z_blksz, + zp->z_size); + + + /* Start the pageout request */ + /* + * We can't leave this function without either calling upl_commit or + * upl_abort. So use the non-error version. + */ + ZFS_ENTER_IFERROR(zfsvfs) { + dprintf("ZFS: vnop_pageoutv2: abort on z_unmounted\n"); + error = EIO; + goto exit_abort; + } + if (vfs_flags(zfsvfs->z_vfs) & MNT_RDONLY) { + dprintf("ZFS: vnop_pageoutv2: readonly\n"); + error = EROFS; + goto exit_abort; + } + ASSERT(vn_has_cached_data(ZTOV(zp))); + + lr = zfs_rangelock_enter(&zp->z_rangelock, ap->a_f_offset, a_size, + RL_WRITER); + + /* Grab UPL now */ + int request_flags; + + /* + * we're in control of any UPL we commit + * make sure someone hasn't accidentally passed in UPL_NOCOMMIT + */ + a_flags &= ~UPL_NOCOMMIT; + a_pl_offset = 0; + + if (a_flags & UPL_MSYNC) { + request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; + } else { + request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; + } + + error = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, + request_flags); + if (error || (upl == NULL)) { + dprintf("ZFS: Failed to create UPL! %d\n", error); + goto pageout_done; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, ap->a_f_offset, ap->a_size); + + // NULL z_sa_hdl + if (z_sa_hdl != NULL) + dmu_tx_hold_sa(tx, z_sa_hdl, B_FALSE); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + ubc_upl_abort(upl, (UPL_ABORT_ERROR|UPL_ABORT_FREE_ON_EMPTY)); + goto pageout_done; + } + + off_t f_offset; + int64_t offset; + int64_t isize; + int64_t pg_index; + + filesize = zp->z_size; /* get consistent copy of zp_size */ + + isize = ap->a_size; + f_offset = ap->a_f_offset; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0; ) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + dprintf("ZFS: failed on pg_index\n"); + dmu_tx_commit(tx); + ubc_upl_abort_range(upl, 0, isize, + UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + } + + dprintf("ZFS: isize %llu pg_index %llu\n", isize, pg_index); + /* + * initialize the offset variables before we touch the UPL. + * a_f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on. + * isize is the offset into the UPL of the last non-clean page. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + + offset = 0; + pg_index = 0; + while (isize > 0) { + int64_t xsize; + int64_t num_of_pages; + + // printf("isize %d for page %d\n", isize, pg_index); + + if (!upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_DIRTY, so it's possible + * to get back empty slots in the UPL. 
+ * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + if (!upl_dirty_page(pl, pg_index)) { + /* + * hfs has a call to panic here, but we trigger this + * *a lot* so unsure what is going on + */ + dprintf("zfs_vnop_pageoutv2: unforeseen clean page " + "@ index %lld for UPL %p\n", pg_index, upl); + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + continue; + } + + /* + * We know that we have at least one dirty page. + * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize > 0) { + if (!upl_dirty_page(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + + if (!vnode_isswap(vp)) { + off_t end_of_range; + + end_of_range = f_offset + xsize - 1; + if (end_of_range >= filesize) { + end_of_range = (off_t)(filesize - 1); + } + } + + // Map it if needed + if (!vaddr) { + if ((ubc_upl_map(upl, (vm_offset_t *)&vaddr) != + KERN_SUCCESS) || vaddr == NULL) { + error = EINVAL; + vaddr = NULL; + dprintf("ZFS: unable to map\n"); + goto out; + } + dprintf("ZFS: Mapped %p\n", vaddr); + } + + + dprintf("ZFS: bluster offset %lld fileoff %lld size %lld " + "filesize %lld\n", offset, f_offset, xsize, filesize); + merror = bluster_pageout(zfsvfs, zp, upl, offset, f_offset, + xsize, filesize, a_flags, vaddr, tx); + /* remember the first error */ + if ((error == 0) && (merror)) + error = merror; + + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + } // while isize + + /* finish off transaction */ + if (error == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, ap->a_f_offset, + a_size, 0, NULL, NULL); + } + dmu_tx_commit(tx); + + // unmap + if (vaddr) { + ubc_upl_unmap(upl); + vaddr = NULL; + } +out: + zfs_rangelock_exit(lr); + if (a_flags & UPL_IOSYNC) + zil_commit(zfsvfs->z_log, zp->z_id); + + if (error) + ubc_upl_abort(upl, (UPL_ABORT_ERROR|UPL_ABORT_FREE_ON_EMPTY)); + else + ubc_upl_commit_range(upl, 0, a_size, UPL_COMMIT_FREE_ON_EMPTY); + + upl = NULL; + + vnode_put(ZTOV(zp)); + + ZFS_EXIT(zfsvfs); + if (error) + dprintf("ZFS: pageoutv2 failed %d\n", error); + return (error); + +pageout_done: + zfs_rangelock_exit(lr); + +exit_abort: + dprintf("ZFS: pageoutv2 aborted %d\n", error); + // VERIFY(ubc_create_upl(vp, off, len, &upl, &pl, flags) == 0); + // ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + + vnode_put(ZTOV(zp)); + + if (zfsvfs) + ZFS_EXIT(zfsvfs); + return (error); +} + + + + + + +int +zfs_vnop_mmap(struct vnop_mmap_args *ap) +#if 0 + struct vnop_mmap_args { + struct vnode *a_vp; + int a_fflags; + kauth_cred_t a_cred; + struct proc *a_p; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs; + + if (!zp) + return (ENODEV); + + zfsvfs = zp->z_zfsvfs; + + dprintf("+vnop_mmap: %p\n", ap->a_vp); + + ZFS_ENTER(zfsvfs); + + if (!vnode_isreg(vp)) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + mutex_enter(&zp->z_lock); + zp->z_is_mapped = 1; + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + dprintf("-vnop_mmap\n"); + return (0); 
+}
+
+int
+zfs_vnop_mnomap(struct vnop_mnomap_args *ap)
+#if 0
+	struct vnop_mnomap_args {
+		struct vnode *a_vp;
+		int a_fflags;
+		kauth_cred_t a_cred;
+		struct proc *a_p;
+	};
+#endif
+{
+	struct vnode *vp = ap->a_vp;
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	dprintf("+vnop_mnomap: %p\n", ap->a_vp);
+
+	ZFS_ENTER(zfsvfs);
+
+	if (!vnode_isreg(vp)) {
+		ZFS_EXIT(zfsvfs);
+		return (ENODEV);
+	}
+	mutex_enter(&zp->z_lock);
+	/*
+	 * If a file has been mmapped even once, it needs to keep
+	 * "z_is_mapped" set, because it will potentially keep pages in the
+	 * UPL cache that we need to update on writes. We can either drop
+	 * the UPL pages here, or simply keep updating both places in
+	 * zfs_write().
+	 */
+	/* zp->z_is_mapped = 0; */
+	mutex_exit(&zp->z_lock);
+
+	ZFS_EXIT(zfsvfs);
+	dprintf("-vnop_mnomap\n");
+	return (0);
+}
+
+
+
+int
+zfs_vnop_inactive(struct vnop_inactive_args *ap)
+#if 0
+	struct vnop_inactive_args {
+		struct vnode *a_vp;
+		vfs_context_t a_context;
+	};
+#endif
+{
+	struct vnode *vp = ap->a_vp;
+	zfs_inactive(vp);
+	return (0);
+}
+
+
+
+#ifdef _KERNEL
+uint64_t vnop_num_reclaims = 0;
+uint64_t vnop_num_vnodes = 0;
+#endif
+
+
+int
+zfs_vnop_reclaim(struct vnop_reclaim_args *ap)
+#if 0
+	struct vnop_reclaim_args {
+		struct vnode *a_vp;
+		vfs_context_t a_context;
+	};
+#endif
+{
+	/*
+	 * Care needs to be taken here; we may already have called reclaim
+	 * from vnop_inactive, and if so, very little needs to be done.
+	 */
+
+	struct vnode *vp = ap->a_vp;
+	znode_t *zp = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+
+	/* Destroy the vm object and flush associated pages. */
+#ifndef __APPLE__
+	vnode_destroy_vobject(vp);
+#endif
+
+	/* Already been released? */
+	zp = VTOZ(vp);
+	ASSERT(zp != NULL);
+	dprintf("+vnop_reclaim zp %p/%p type %d\n", zp, vp, vnode_vtype(vp));
+	if (!zp) goto out;
+
+	zfsvfs = zp->z_zfsvfs;
+
+	if (!zfsvfs) {
+		dprintf("ZFS: vnop_reclaim with zfsvfs == NULL\n");
+		return (0);
+	}
+
+	if (zfsctl_is_node(vp)) {
+		dprintf("ZFS: vnop_reclaim with ctldir node\n");
+		return (0);
+	}
+
+	ZTOV(zp) = NULL;
+
+	/*
+	 * Purge old data structures associated with the znode.
+ */ + vnode_clearfsnode(vp); /* vp->v_data = NULL */ + vnode_removefsref(vp); /* ADDREF from vnode_create */ + atomic_dec_64(&vnop_num_vnodes); + + dprintf("+vnop_reclaim zp %p/%p unlinked %d unmount " + "%d sa_hdl %p\n", zp, vp, zp->z_unlinked, + zfsvfs->z_unmounted, zp->z_sa_hdl); + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + zfs_znode_free(zp); + } else { + zfs_zinactive(zp); + zfs_znode_free(zp); + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); + +#ifdef _KERNEL + atomic_inc_64(&vnop_num_reclaims); +#endif + +out: + return (0); +} + + + + + +int +zfs_vnop_mknod(struct vnop_mknod_args *ap) +#if 0 + struct vnop_mknod_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *vap; + vfs_context_t a_context; + }; +#endif +{ + struct vnop_create_args create_ap; + int error; + + dprintf("%s\n", __func__); + + bzero(&create_ap, sizeof (struct vnop_create_args)); + + create_ap.a_dvp = ap->a_dvp; + create_ap.a_vpp = ap->a_vpp; + create_ap.a_cnp = ap->a_cnp; + create_ap.a_vap = ap->a_vap; + create_ap.a_context = ap->a_context; + + error = zfs_vnop_create(&create_ap); + if (error) dprintf("%s error %d\n", __func__, error); + return (error); +} + +int +zfs_vnop_allocate(struct vnop_allocate_args *ap) +#if 0 + struct vnop_allocate_args { + struct vnode *a_vp; + off_t a_length; + u_int32_t a_flags; + off_t *a_bytesallocated; + off_t a_offset; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs; + uint64_t wantedsize = 0, filesize = 0; + int err = 0; + + dprintf("%s %llu %d %llu %llu: '%s'\n", __func__, ap->a_length, + ap->a_flags, (ap->a_bytesallocated ? *ap->a_bytesallocated : 0), + ap->a_offset, zp->z_name_cache); + + /* + * This code has been reverted: + * https://github.com/openzfsonosx/zfs/issues/631 + * Most likely not correctly aligned, and too-large offsets. + */ + return (0); + + if (!zp || !zp->z_sa_hdl) + return (ENODEV); + +// *ap->a_bytesallocated = 0; + + if (!vnode_isreg(vp)) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + + filesize = zp->z_size; + wantedsize = ap->a_length; + + if (ap->a_flags & ALLOCATEFROMPEOF) + wantedsize += filesize; + else if (ap->a_flags & ALLOCATEFROMVOL) + /* blockhint = ap->a_offset / blocksize */ // yeah, no idea + dprintf("%s: help, allocatefromvolume set?\n", __func__); + + dprintf("%s: filesize %llu wantedsize %llu\n", __func__, + filesize, wantedsize); + + // If we are extending + if (wantedsize > filesize) { + + err = zfs_freesp(zp, wantedsize, 0, FWRITE, B_TRUE); + + // If we are truncating, Apple claims this code is never called. 
+ } else if (wantedsize < filesize) { + + dprintf("%s: file shrinking branch taken?\n", __func__); + + } + + if (!err) { + *(ap->a_bytesallocated) = wantedsize - filesize; + } + + ZFS_EXIT(zfsvfs); + dprintf("-%s: %d\n", __func__, err); + return (err); +} + +int +zfs_vnop_whiteout(struct vnop_whiteout_args *ap) +#if 0 + struct vnop_whiteout_args { + struct vnode *a_dvp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + dprintf("vnop_whiteout: ENOTSUP\n"); + + return (ENOTSUP); +} + +int +zfs_vnop_pathconf(struct vnop_pathconf_args *ap) +#if 0 + struct vnop_pathconf_args { + struct vnode *a_vp; + int a_name; + register_t *a_retval; + vfs_context_t a_context; + }; +#endif +{ + int32_t *valp = ap->a_retval; + int error = 0; + + dprintf("+vnop_pathconf a_name %d\n", ap->a_name); + + switch (ap->a_name) { + case _PC_LINK_MAX: + *valp = INT_MAX; + break; + case _PC_PIPE_BUF: + *valp = PIPE_BUF; + break; + case _PC_CHOWN_RESTRICTED: + *valp = 200112; /* POSIX */ + break; + case _PC_NO_TRUNC: + *valp = 200112; /* POSIX */ + break; + case _PC_NAME_MAX: + case _PC_NAME_CHARS_MAX: + *valp = ZAP_MAXNAMELEN - 1; /* 255 */ + break; + case _PC_PATH_MAX: + case _PC_SYMLINK_MAX: + *valp = PATH_MAX; /* 1024 */ + break; + case _PC_CASE_SENSITIVE: + { + znode_t *zp = VTOZ(ap->a_vp); + *valp = 1; + if (zp && zp->z_zfsvfs) { + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + *valp = (zfsvfs->z_case == ZFS_CASE_SENSITIVE) ? 1 : 0; + } + } + break; + case _PC_CASE_PRESERVING: + *valp = 1; + break; +/* + * OS X 10.6 does not define this. + */ +#ifndef _PC_XATTR_SIZE_BITS +#define _PC_XATTR_SIZE_BITS 26 +#endif +/* + * Even though ZFS has 64 bit limit on XATTR size, there would appear to be a + * limit in SMB2 that the bit size returned has to be 18, or we will get an + * error from most XATTR calls (STATUS_ALLOTTED_SPACE_EXCEEDED). + */ +#ifndef AD_XATTR_SIZE_BITS +#define AD_XATTR_SIZE_BITS 18 +#endif + case _PC_XATTR_SIZE_BITS: + *valp = AD_XATTR_SIZE_BITS; + break; + case _PC_FILESIZEBITS: + *valp = 64; + break; + default: + printf("ZFS: unknown pathconf %d called.\n", ap->a_name); + error = EINVAL; + } + + if (error) dprintf("%s vp %p : %d\n", __func__, ap->a_vp, error); + return (error); +} + +int +zfs_vnop_getxattr(struct vnop_getxattr_args *ap) +#if 0 + struct vnop_getxattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + char *a_name; + struct uio *a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct uio *uio = ap->a_uio; + struct componentname cn = { 0 }; + int error = 0; + int size = 0; + uint64_t resid = uio ? uio_resid(uio) : 0; + znode_t *xdzp = NULL, *xzp = NULL; + + dprintf("+getxattr vp %p: '%s'\n", ap->a_vp, ap->a_name); + + /* xattrs disabled? 
*/ + if (zfsvfs->z_xattr == B_FALSE) { + return (ENOTSUP); + } + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_use_sa && zfsvfs->z_xattr_sa && zp->z_is_sa) { + char *value = NULL; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + if (!resid) { /* Lookup size */ + + rw_enter(&zp->z_xattr_lock, RW_READER); + size = zpl_xattr_get_sa(vp, ap->a_name, NULL, 0); + rw_exit(&zp->z_xattr_lock); + if (size > 0) { + *ap->a_size = size; + goto out; + } + } + + if (resid) { + value = kmem_alloc(resid, KM_SLEEP); + rw_enter(&zp->z_xattr_lock, RW_READER); + size = zpl_xattr_get_sa(vp, ap->a_name, value, resid); + rw_exit(&zp->z_xattr_lock); + + /* Finderinfo checks */ + if (!error && resid && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + + /* Must be 32 bytes */ + if (resid != sizeof (emptyfinfo) || + size != sizeof (emptyfinfo)) { + error = ERANGE; + kmem_free(value, resid); + goto out; + } + + /* If FinderInfo is empty > it doesn't exist */ + if (bcmp(value, emptyfinfo, + sizeof (emptyfinfo)) == 0) { + error = ENOATTR; + kmem_free(value, resid); + goto out; + } + + /* According to HFS zero out some fields */ + finderinfo_update((uint8_t *)value, zp); + } + + if (size > 0) + error = uiomove((const char *)value, size, 0, + uio); + + kmem_free(value, resid); + + goto out; + } + } + + /* Legacy xattr */ + + /* Grab the hidden attribute directory vnode. */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, 0))) { + goto out; + } + + cn.cn_namelen = strlen(ap->a_name) + 1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Lookup the attribute name. */ + if ((error = zfs_dirlook(xdzp, (char *)ap->a_name, &xzp, 0, NULL, + &cn))) { + goto out; + } + + /* + * If we are dealing with FinderInfo, we duplicate the UIO first + * so that we can uiomove to/from it to modify contents. + */ + if (!error && uio && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + ssize_t local_resid; + zfs_file_t zf; + u_int8_t finderinfo[32]; + static u_int32_t emptyfinfo[8] = {0}; + + /* Read the attribute data. */ + /* FinderInfo is 32 bytes */ + if ((user_size_t)uio_resid(uio) < 32) { + error = ERANGE; + goto out; + } + + /* Use the convenience wrappers to read to SYSSPACE */ + zf.f_vnode = ZTOV(xzp); + zf.f_fd = -1; + + error = zfs_file_pread(&zf, &finderinfo, + sizeof (finderinfo), 0ULL, &local_resid); + + if (local_resid != 0) { + error = ERANGE; + } else { + + /* Update size if requested */ + if (ap->a_size) + *ap->a_size = (size_t)sizeof (finderinfo); + + /* According to HFS we are to zero out some fields */ + finderinfo_update((uint8_t *)&finderinfo, zp); + + /* If Finder Info is empty then it doesn't exist. */ + if (bcmp(finderinfo, emptyfinfo, + sizeof (emptyfinfo)) == 0) { + error = ENOATTR; + } else { + + /* Copy out the data we just modified */ + error = uiomove((const char *)&finderinfo, + sizeof (finderinfo), 0, uio); + + } /* Not empty */ + } /* Correct size */ + + /* We are done */ + goto out; + } /* Is finder info */ + + /* If NOT finderinfo */ + + if (uio == NULL) { + + /* Query xattr size. 
*/ + if (ap->a_size) { + mutex_enter(&xzp->z_lock); + *ap->a_size = (size_t)xzp->z_size; + mutex_exit(&xzp->z_lock); + } + + } else { + + /* Read xattr */ + error = zfs_read(ZTOV(xzp), uio, 0, cr); + + if (ap->a_size && uio) { + *ap->a_size = (size_t)resid - uio_resid(ap->a_uio); + } + + } + +out: + + if (error == ENOENT) + error = ENOATTR; + + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + if (xzp) { + zrele(xzp); + } + if (xdzp) { + zrele(xdzp); + } + + ZFS_EXIT(zfsvfs); + dprintf("-getxattr vp %p : %d size %lu: %s\n", ap->a_vp, error, + !error && ap->a_size ? *ap->a_size : 0, + ap->a_uio == NULL ? "sizelookup" : "xattrread"); + return (error); +} + +int +zfs_vnop_setxattr(struct vnop_setxattr_args *ap) +#if 0 + struct vnop_setxattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + char *a_name; + struct uio *a_uio; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + struct vnode *xvp = NULLVP; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct uio *uio = ap->a_uio; + int flag; + int error = 0; + znode_t *xdzp = NULL; + + dprintf("+setxattr vp %p '%s' (enabled: %d) resid %llu\n", ap->a_vp, + ap->a_name, zfsvfs->z_xattr, uio_resid(ap->a_uio)); + + /* xattrs disabled? */ + if (zfsvfs->z_xattr == B_FALSE) { + return (ENOTSUP); + } + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + + ZFS_ENTER(zfsvfs); + + if (strlen(ap->a_name) >= ZAP_MAXNAMELEN) { + error = ENAMETOOLONG; + goto out; + } + + if (ap->a_options & XATTR_CREATE) + flag = ZNEW; /* expect no pre-existing entry */ + else if (ap->a_options & XATTR_REPLACE) + flag = ZEXISTS; /* expect an existing entry */ + else + flag = 0; + + + /* Preferentially store the xattr as a SA for better performance */ + if (zfsvfs->z_use_sa && zfsvfs->z_xattr_sa && zp->z_is_sa) { + char *value; + uint64_t size; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + + /* New, expect it to not exist .. */ + if ((flag & ZNEW) && + (zpl_xattr_get_sa(vp, ap->a_name, NULL, 0) > 0)) { + error = EEXIST; + rw_exit(&zp->z_xattr_lock); + goto out; + } + + /* Replace, XATTR must exist .. */ + if ((flag & ZEXISTS) && + ((error = + zpl_xattr_get_sa(vp, ap->a_name, NULL, 0)) <= 0) && + error == -ENOENT) { + error = ENOATTR; + rw_exit(&zp->z_xattr_lock); + goto out; + } + + size = uio_resid(uio); + value = kmem_alloc(size, KM_SLEEP); + + size_t bytes; + + /* Copy in the xattr value */ + uiocopy((const char *)value, size, UIO_WRITE, + uio, &bytes); + + + /* Finderinfo checks */ + if (!error && bytes && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + + /* Must be 32 bytes */ + if (bytes != sizeof (emptyfinfo)) { + error = ERANGE; + rw_exit(&zp->z_xattr_lock); + kmem_free(value, size); + goto out; + } + + /* According to HFS we are to zero out some fields */ + finderinfo_update((uint8_t *)value, zp); + } + + error = zpl_xattr_set_sa(vp, ap->a_name, + value, bytes, + flag, cr); + rw_exit(&zp->z_xattr_lock); + kmem_free(value, size); + + goto out; + } + + /* Legacy xattr */ + + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, CREATE_XATTR_DIR))) { + goto out; + } + + /* Lookup or create the named attribute. */ + error = zpl_obtain_xattr(xdzp, ap->a_name, VTOZ(vp)->z_mode, cr, + &xvp, flag); + if (error) + goto out; + + /* Write the attribute data. 
*/ + ASSERT(uio != NULL); + + /* OsX setxattr() replaces xattrs */ + error = zfs_freesp(VTOZ(xvp), 0, 0, VTOZ(vp)->z_mode, TRUE); + + /* Special case for Finderinfo */ + if (!error && uio && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + + u_int8_t finderinfo[32]; + + /* Read the attribute data. */ + /* FinderInfo is 32 bytes */ + if ((user_size_t)uio_resid(uio) < 32) { + error = ERANGE; + goto out; + } + + /* Copy in the finderinfo to our space */ + error = uiomove((const char *)&finderinfo, + sizeof (finderinfo), 0, uio); + if (error) + goto out; + + /* Zero out some fields, according to HFS */ + finderinfo_update((uint8_t *)&finderinfo, zp); + + /* + * TODO: + * When writing FINDERINFO, we need to replace the + * ADDEDTIME date with actual crtime and not let + * userland overwrite it. + */ + + /* Empty Finderinfo is non-existent. */ + if (bcmp(finderinfo, emptyfinfo, sizeof (emptyfinfo)) == 0) { + /* Attempt to delete it? */ + error = zfs_remove(xdzp, (char *)ap->a_name, cr, 0); + goto out; + } + + /* Build a new uio to call zfs_write() to make it go in txg */ + uio_t *luio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); + if (luio == NULL) { + error = ENOMEM; + goto out; + } + uio_addiov(luio, (user_addr_t)&finderinfo, sizeof (finderinfo)); + + error = zfs_write(xvp, luio, 0, cr); + + if (uio_resid(luio) != 0) + error = ERANGE; + + uio_free(luio); + + goto out; + } /* Finderinfo */ + + /* Write XATTR to disk */ + error = zfs_write(xvp, uio, 0, cr); + +out: + + if (error == ENOENT) + error = ENOATTR; + + if (xdzp) { + zrele(xdzp); + } + if (xvp) { + VN_RELE(xvp); + } + + ZFS_EXIT(zfsvfs); + dprintf("-setxattr vp %p: err %d: resid %llx\n", ap->a_vp, error, + uio_resid(ap->a_uio)); + return (error); +} + +int +zfs_vnop_removexattr_int(zfsvfs_t *zfsvfs, znode_t *zp, const char *name, + cred_t *cr) +{ + struct vnode *vp = ZTOV(zp); + struct componentname cn = { 0 }; + int error; + uint64_t xattr; + znode_t *xdzp = NULL, *xzp = NULL; + + dprintf("+removexattr_int vp %p '%s'\n", vp, name); + + ZFS_ENTER(zfsvfs); + + /* + * Recursive attributes are not allowed. + */ + if (zp->z_pflags & ZFS_XATTR) { + error = EINVAL; + goto out; + } + + if (zfsvfs->z_use_sa && zfsvfs->z_xattr_sa && zp->z_is_sa) { + nvlist_t *nvl; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + nvl = zp->z_xattr_cached; + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + + dprintf("ZFS: removexattr nvlist_remove said %d\n", error); + if (!error) { + /* Update the SA for adds, modss, and removals. */ + error = -zfs_sa_set_xattr(zp); + rw_exit(&zp->z_xattr_lock); + goto out; + } + rw_exit(&zp->z_xattr_lock); + } + + sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr, sizeof (xattr)); + if (xattr == 0) { + error = ENOATTR; + goto out; + } + + /* Grab the hidden attribute directory vnode. */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, 0))) { + goto out; + } + + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Lookup the attribute name. 
*/ + if ((error = zfs_dirlook(xdzp, (char *)name, &xzp, 0, NULL, + &cn))) { + if (error == ENOENT) + error = ENOATTR; + goto out; + } + + error = zfs_remove(xdzp, (char *)name, cr, /* flags */0); + +out: + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + + if (xzp) { + zrele(xzp); + } + if (xdzp) { + zrele(xdzp); + } + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s vp %p: error %d\n", __func__, vp, error); + return (error); +} + +int +zfs_vnop_removexattr(struct vnop_removexattr_args *ap) +#if 0 + struct vnop_removexattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + char *a_name; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + dprintf("+removexattr vp %p '%s'\n", ap->a_vp, ap->a_name); + + /* xattrs disabled? */ + if (zfsvfs->z_xattr == B_FALSE) { + return (ENOTSUP); + } + + return (zfs_vnop_removexattr_int(zfsvfs, zp, ap->a_name, cr)); +} + + +int +zfs_vnop_listxattr(struct vnop_listxattr_args *ap) +#if 0 + struct vnop_listxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct uio *uio = ap->a_uio; + zap_cursor_t zc; + zap_attribute_t za; + objset_t *os; + size_t size = 0; + char *nameptr; + char nfd_name[ZAP_MAXNAMELEN]; + size_t namelen; + int error = 0; + uint64_t xattr; + int force_formd_normalized_output; + znode_t *xdzp = NULL; + + dprintf("+listxattr vp %p: \n", ap->a_vp); + + /* xattrs disabled? */ + if (zfsvfs->z_xattr == B_FALSE) { + return (EINVAL); + } + + ZFS_ENTER(zfsvfs); + + /* + * Recursive attributes are not allowed. + */ + if (zp->z_pflags & ZFS_XATTR) { + error = EINVAL; + goto out; + } + + if (zfsvfs->z_use_sa && zp->z_is_sa && zp->z_xattr_cached) { + nvpair_t *nvp = NULL; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != + NULL) { + ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); + + namelen = strlen(nvpair_name(nvp)) + 1; /* Null byte */ + + /* Just checking for space requirements? */ + if (uio == NULL) { + size += namelen; + } else { + if (namelen > uio_resid(uio)) { + error = ERANGE; + break; + } + dprintf("ZFS: listxattr '%s'\n", + nvpair_name(nvp)); + error = uiomove((caddr_t)nvpair_name(nvp), + namelen, UIO_READ, uio); + if (error) + break; + } + } /* while nvlist */ + } /* SA xattr */ + if (error) goto out; + + /* Do we even have any attributes? */ + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr, + sizeof (xattr)) || (xattr == 0)) { + goto out; /* all done */ + } + + /* Grab the hidden attribute directory vnode. */ + if (zfs_get_xattrdir(zp, &xdzp, cr, 0) != 0) { + goto out; + } + os = zfsvfs->z_os; + + for (zap_cursor_init(&zc, os, xdzp->z_id); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (xattr_protected(za.za_name)) + continue; /* skip */ + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. 
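+		 * (For example, a name stored precomposed as U+00E9 is
+		 * exported as "e" followed by the combining acute U+0301.)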
+ */ + namelen = strlen(za.za_name); + + if (force_formd_normalized_output && + !is_ascii_str(za.za_name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + if (force_formd_normalized_output && + utf8_normalizestr((const u_int8_t *)za.za_name, namelen, + (u_int8_t *)nfd_name, &namelen, sizeof (nfd_name), + UTF_DECOMPOSED) == 0) { + nameptr = nfd_name; + } else { + nameptr = &za.za_name[0]; + } + ++namelen; /* account for NULL termination byte */ + if (uio == NULL) { + size += namelen; + } else { + if (namelen > uio_resid(uio)) { + error = ERANGE; + break; + } + error = uiomove((caddr_t)nameptr, namelen, UIO_READ, + uio); + if (error) + break; + } + } + zap_cursor_fini(&zc); +out: + if (uio == NULL) { + *ap->a_size = size; + } + if (xdzp) { + zrele(xdzp); + } + + ZFS_EXIT(zfsvfs); + if (error) { + dprintf("%s vp %p: error %d size %ld\n", __func__, + ap->a_vp, error, size); + } + return (error); +} + +#ifdef HAVE_NAMED_STREAMS +int +zfs_vnop_getnamedstream(struct vnop_getnamedstream_args *ap) +#if 0 + struct vnop_getnamedstream_args { + struct vnode *a_vp; + struct vnode **a_svpp; + char *a_name; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + struct vnode **svpp = ap->a_svpp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct componentname cn = { 0 }; + int error = ENOATTR; + znode_t *xdzp = NULL; + znode_t *xzp = NULL; + + dprintf("+getnamedstream vp %p '%s': op %u\n", ap->a_vp, ap->a_name, + ap->a_operation); + + *svpp = NULLVP; + + ZFS_ENTER(zfsvfs); + + /* + * Mac OS X only supports the "com.apple.ResourceFork" stream. + */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, + sizeof (XATTR_RESOURCEFORK_NAME)) != 0) + goto out; + + /* Only regular files */ + if (!vnode_isreg(vp)) { + return (EPERM); + } + + /* Grab the hidden attribute directory vnode. */ + if (zfs_get_xattrdir(zp, &xdzp, cr, 0) != 0) + goto out; + + cn.cn_namelen = strlen(ap->a_name) + 1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Lookup the attribute name. */ + if ((error = zfs_dirlook(xdzp, (char *)ap->a_name, &xzp, 0, NULL, + &cn))) { + if (error == ENOENT) + error = ENOATTR; + } else { + *svpp = ZTOV(xzp); + } + + kmem_free(cn.cn_nameptr, cn.cn_namelen); + +out: + if (xdzp) + zrele(xdzp); + +#if 0 // Disabled, not sure its required and empty vnodes are odd. + /* + * If the lookup is NS_OPEN, they are accessing "..namedfork/rsrc" + * to which we should return 0 with empty vp to empty file. + * See hfs_vnop_getnamedstream() + */ + if ((error == ENOATTR) && + ap->a_operation == NS_OPEN) { + + if ((error = zfs_get_xattrdir(zp, &xdvp, cr, + CREATE_XATTR_DIR)) == 0) { + /* Lookup or create the named attribute. 
*/ + error = zpl_obtain_xattr(VTOZ(xdvp), ap->a_name, + VTOZ(vp)->z_mode, cr, ap->a_svpp, + ZNEW); + vnode_put(xdvp); + } + } +#endif + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s vp %p: error %d\n", __func__, ap->a_vp, error); + return (error); +} + +int +zfs_vnop_makenamedstream(struct vnop_makenamedstream_args *ap) +#if 0 + struct vnop_makenamedstream_args { + struct vnode *a_vp; + struct vnode **a_svpp; + char *a_name; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct componentname cn; + struct vnode_attr vattr; + int error = 0; + znode_t *xdzp = NULL; + znode_t *xzp = NULL; + + dprintf("+makenamedstream vp %p: '%s'\n", ap->a_vp, ap->a_name); + + *ap->a_svpp = NULLVP; + + ZFS_ENTER(zfsvfs); + + /* Only regular files can have a resource fork stream. */ + if (!vnode_isreg(vp)) { + error = EPERM; + goto out; + } + + /* + * Mac OS X only supports the "com.apple.ResourceFork" stream. + */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, + sizeof (XATTR_RESOURCEFORK_NAME)) != 0) { + error = ENOATTR; + goto out; + } + + /* Grab the hidden attribute directory vnode. */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, CREATE_XATTR_DIR))) + goto out; + + bzero(&cn, sizeof (cn)); + cn.cn_nameiop = CREATE; + cn.cn_flags = ISLASTCN; + cn.cn_nameptr = (char *)ap->a_name; + cn.cn_namelen = strlen(cn.cn_nameptr); + + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_type, VREG); + VATTR_SET(&vattr, va_mode, VTOZ(vp)->z_mode & ~S_IFMT); + + error = zfs_create(xdzp, (char *)ap->a_name, &vattr, NONEXCL, + VTOZ(vp)->z_mode, &xzp, cr, 0, NULL); + + if (error == 0) + *ap->a_svpp = ZTOV(xzp); + +out: + if (xdzp) + zrele(xdzp); + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s vp %p: error %d\n", __func__, ap->a_vp, error); + return (error); +} + +int +zfs_vnop_removenamedstream(struct vnop_removenamedstream_args *ap) +#if 0 + struct vnop_removenamedstream_args { + struct vnode *a_vp; + struct vnode **a_svpp; + char *a_name; + }; +#endif +{ + struct vnode *svp = ap->a_svp; + znode_t *zp = VTOZ(svp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + + dprintf("zfs_vnop_removenamedstream: %p '%s'\n", + svp, ap->a_name); + ZFS_ENTER(zfsvfs); + + /* + * Mac OS X only supports the "com.apple.ResourceFork" stream. + */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, + sizeof (XATTR_RESOURCEFORK_NAME)) != 0) { + error = ENOATTR; + goto out; + } + + /* ### MISING CODE ### */ + /* + * It turns out that even though APPLE uses makenamedstream() to + * create a stream, for example compression, they use vnop_removexattr + * to delete it, so this appears not in use. + */ + dprintf("zfs_vnop_removenamedstream\n"); + error = EPERM; +out: + ZFS_EXIT(zfsvfs); + return (ENOTSUP); +} +#endif /* HAVE_NAMED_STREAMS */ + +/* + * The Darwin kernel's HFS+ appears to implement this by two methods, + * + * if (ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) is set + * ** Copy the data of the files over (including rsrc) + * + * if not set + * ** exchange FileID between the two nodes, copy over vnode information + * like that of *time records, uid/gid, flags, mode, linkcount, + * finderinfo, c_desc, c_attr, c_flag, and cache_purge(). 
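+ *
+ * (Userland reaches this vnop through the exchangedata(2) syscall and the
+ * FSOPT_EXCHANGE_DATA_ONLY option mentioned above; since the body below
+ * returns ENOTSUP, such callers currently see the operation fail.)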
+ * + * This call is deprecated in 10.8 + */ +int +zfs_vnop_exchange(struct vnop_exchange_args *ap) +#if 0 + struct vnop_exchange_args { + struct vnode *a_fvp; + struct vnode *a_tvp; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + vnode_t *fvp = ap->a_fvp; + vnode_t *tvp = ap->a_tvp; + znode_t *fzp; + zfsvfs_t *zfsvfs; + + /* The files must be on the same volume. */ + if (vnode_mount(fvp) != vnode_mount(tvp)) { + dprintf("%s fvp and tvp not in same mountpoint\n", + __func__); + return (EXDEV); + } + + if (fvp == tvp) { + dprintf("%s fvp == tvp\n", __func__); + return (EINVAL); + } + + /* Only normal files can be exchanged. */ + if (!vnode_isreg(fvp) || !vnode_isreg(tvp)) { + dprintf("%s fvp or tvp is not a regular file\n", + __func__); + return (EINVAL); + } + + fzp = VTOZ(fvp); + zfsvfs = fzp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + + /* ADD MISSING CODE HERE */ + + ZFS_EXIT(zfsvfs); + printf("vnop_exchange: ENOTSUP\n"); + return (ENOTSUP); +} + +int +zfs_vnop_revoke(struct vnop_revoke_args *ap) +#if 0 + struct vnop_revoke_args { + struct vnode *a_vp; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + return (vn_revoke(ap->a_vp, ap->a_flags, ap->a_context)); +} + +int +zfs_vnop_blktooff(struct vnop_blktooff_args *ap) +#if 0 + struct vnop_blktooff_args { + struct vnode *a_vp; + daddr64_t a_lblkno; + off_t *a_offset; + }; +#endif +{ + dprintf("vnop_blktooff: 0\n"); + return (ENOTSUP); +} + +int +zfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) +#if 0 + struct vnop_offtoblk_args { + struct vnode *a_vp; + off_t a_offset; + daddr64_t *a_lblkno; + }; +#endif +{ + dprintf("+vnop_offtoblk\n"); + return (ENOTSUP); +} + +int +zfs_vnop_blockmap(struct vnop_blockmap_args *ap) +#if 0 + struct vnop_blockmap_args { + struct vnode *a_vp; + off_t a_foffset; + size_t a_size; + daddr64_t *a_bpn; + size_t *a_run; + void *a_poff; + int a_flags; +}; +#endif +{ + dprintf("+vnop_blockmap\n"); + return (ENOTSUP); + +#if 0 + znode_t *zp; + zfsvfs_t *zfsvfs; + + ASSERT(ap); + ASSERT(ap->a_vp); + ASSERT(ap->a_size); + + if (!ap->a_bpn) { + return (0); + } + + if (vnode_isdir(ap->a_vp)) { + return (ENOTSUP); + } + + zp = VTOZ(ap->a_vp); + if (!zp) + return (ENODEV); + + zfsvfs = zp->z_zfsvfs; + if (!zfsvfs) + return (ENODEV); + + /* Return full request size as contiguous */ + if (ap->a_run) { + // *ap->a_run = ap->a_size; + *ap->a_run = 0; + } + if (ap->a_poff) { + *((int *)(ap->a_poff)) = 0; + /* + * returning offset of -1 asks the + * caller to zero the ranges + */ + // *((int *)(ap->a_poff)) = -1; + } + *ap->a_bpn = 0; +// *ap->a_bpn = (daddr64_t)(ap->a_foffset / zfsvfs->z_max_blksz); + + dprintf("%s ret %lu %d %llu\n", __func__, + ap->a_size, *((int *)(ap->a_poff)), *((uint64_t *)(ap->a_bpn))); + + return (0); +#endif +} + +int +zfs_vnop_strategy(struct vnop_strategy_args *ap) +#if 0 + struct vnop_strategy_args { + struct buf *a_bp; + }; +#endif +{ + dprintf("vnop_strategy: 0\n"); + return (ENOTSUP); +} + +int +zfs_vnop_select(struct vnop_select_args *ap) +#if 0 + struct vnop_select_args { + struct vnode *a_vp; + int a_which; + int a_fflags; + kauth_cred_t a_cred; + void *a_wql; + struct proc *a_p; + }; +#endif +{ + dprintf("vnop_select: 1\n"); + return (1); +} + +#ifdef WITH_READDIRATTR +int +zfs_vnop_readdirattr(struct vnop_readdirattr_args *ap) +#if 0 + struct vnop_readdirattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct attrlist *a_alist; + struct uio *a_uio; + ulong_t a_maxcount; + ulong_t a_options; + ulong_t *a_newstate; + int *a_eofflag; + ulong_t *a_actualcount; 
+ vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + struct attrlist *alp = ap->a_alist; + struct uio *uio = ap->a_uio; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zap_cursor_t zc; + zap_attribute_t zap; + attrinfo_t attrinfo; + int maxcount = ap->a_maxcount; + uint64_t offset = (uint64_t)uio_offset(uio); + u_int32_t fixedsize; + u_int32_t maxsize; + u_int32_t attrbufsize; + void *attrbufptr = NULL; + void *attrptr; + void *varptr; /* variable-length storage area */ + boolean_t user64 = vfs_context_is64bit(ap->a_context); + int prefetch = 0; + int error = 0; + +#if 0 + dprintf("+vnop_readdirattr\n"); +#endif + + *(ap->a_actualcount) = 0; + *(ap->a_eofflag) = 0; + + /* + * Check for invalid options or invalid uio. + */ + if (((ap->a_options & ~(FSOPT_NOINMEMUPDATE | FSOPT_NOFOLLOW)) != 0) || + (uio_resid(uio) <= 0) || (maxcount <= 0)) { + dprintf("%s invalid argument\n"); + return (EINVAL); + } + /* + * Reject requests for unsupported attributes. + */ + if ((alp->bitmapcount != ZFS_ATTR_BIT_MAP_COUNT) || + (alp->commonattr & ~ZFS_ATTR_CMN_VALID) || + (alp->dirattr & ~ZFS_ATTR_DIR_VALID) || + (alp->fileattr & ~ZFS_ATTR_FILE_VALID) || + (alp->volattr != 0 || alp->forkattr != 0)) { + dprintf("%s unsupported attr\n"); + return (EINVAL); + } + /* + * Check if we should prefetch znodes + */ + if ((alp->commonattr & ~ZFS_DIR_ENT_ATTRS) || + (alp->dirattr != 0) || (alp->fileattr != 0)) { + prefetch = TRUE; + } + + /* + * Setup a buffer to hold the packed attributes. + */ + fixedsize = sizeof (u_int32_t) + getpackedsize(alp, user64); + maxsize = fixedsize; + if (alp->commonattr & ATTR_CMN_NAME) + maxsize += ZAP_MAXNAMELEN + 1; + attrbufptr = (void*)kmem_alloc(maxsize, KM_SLEEP); + if (attrbufptr == NULL) { + dprintf("%s kmem_alloc failed\n"); + return (ENOMEM); + } + attrptr = attrbufptr; + varptr = (char *)attrbufptr + fixedsize; + + attrinfo.ai_attrlist = alp; + attrinfo.ai_varbufend = (char *)attrbufptr + maxsize; + attrinfo.ai_context = ap->a_context; + + ZFS_ENTER(zfsvfs); + + /* + * Initialize the zap iterator cursor. + */ + + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, zfsvfs->z_os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, zfsvfs->z_os, zp->z_id, offset); + } + + while (1) { + ino64_t objnum; + enum vtype vtype = VNON; + znode_t *tmp_zp = NULL; + + /* + * Note that the low 4 bits of the cookie returned by zap is + * always zero. This allows us to use the low nibble for + * "special" entries: + * We use 0 for '.', and 1 for '..' (ignored here). + * If this is the root of the filesystem, we use the offset 2 + * for the *'.zfs' directory. + */ + if (offset <= 1) { + offset = 2; + continue; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strlcpy(zap.za_name, ZFS_CTLDIR_NAME, + MAXNAMELEN); + objnum = ZFSCTL_INO_ROOT; + vtype = VDIR; + } else { + /* + * Grab next entry. 
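+ *
+ * (Because real ZAP cookies always have their low four bits clear, the
+ * synthetic offsets 0..3 used above for ".", ".." and ".zfs" can never
+ * collide with a serialized cursor position.)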
+ */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + *(ap->a_eofflag) = (error == ENOENT); + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + error = ENXIO; + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + vtype = DTTOVT(ZFS_DIRENT_TYPE(zap.za_first_integer)); + /* Check if vtype is MIA */ + if ((vtype == 0) && !prefetch && (alp->dirattr || + alp->fileattr || + (alp->commonattr & ATTR_CMN_OBJTYPE))) { + prefetch = 1; + } + } + + /* Grab znode if required */ + if (prefetch) { + dmu_prefetch(zfsvfs->z_os, objnum, 0, 0); + if ((error = zfs_zget(zfsvfs, objnum, &tmp_zp)) == 0) { + if (vtype == VNON) { + /* SA_LOOKUP? */ + vtype = IFTOVT(tmp_zp->z_mode); + } + } else { + tmp_zp = NULL; + error = ENXIO; + goto skip_entry; + /* + * Currently ".zfs" entry is skipped, as we have + * no methods to pack that into the attrs (all + * helper functions take znode_t *, and .zfs is + * not a znode_t *). Add dummy .zfs code here if + * it is desirable to show .zfs in Finder. + */ + } + } + + /* + * Setup for the next item's attribute list + */ + *((u_int32_t *)attrptr) = 0; /* byte count slot */ + attrptr = ((u_int32_t *)attrptr) + 1; /* fixed attr start */ + attrinfo.ai_attrbufpp = &attrptr; + attrinfo.ai_varbufpp = &varptr; + + /* + * Pack entries into attribute buffer. + */ + if (alp->commonattr) { + commonattrpack(&attrinfo, zfsvfs, tmp_zp, zap.za_name, + objnum, vtype, user64); + } + if (alp->dirattr && vtype == VDIR) { + dirattrpack(&attrinfo, tmp_zp); + } + if (alp->fileattr && vtype != VDIR) { + fileattrpack(&attrinfo, zfsvfs, tmp_zp); + } + /* All done with tmp znode. */ + if (prefetch && tmp_zp) { + vnode_put(ZTOV(tmp_zp)); + tmp_zp = NULL; + } + attrbufsize = ((char *)varptr - (char *)attrbufptr); + + /* + * Make sure there's enough buffer space remaining. + */ + if (uio_resid(uio) < 0 || + attrbufsize > (u_int32_t)uio_resid(uio)) { + break; + } else { + *((u_int32_t *)attrbufptr) = attrbufsize; + error = uiomove((caddr_t)attrbufptr, attrbufsize, + UIO_READ, uio); + if (error != 0) + break; + attrptr = attrbufptr; + /* Point to variable-length storage */ + varptr = (char *)attrbufptr + fixedsize; + *(ap->a_actualcount) += 1; + + /* + * Move to the next entry, fill in the previous offset. 
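+ *
+ * (Entries that could not be packed, such as ".zfs" which has no backing
+ * znode, jump straight to skip_entry below; the cursor still advances and
+ * the entry is simply omitted from the results.)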
+ */ + skip_entry: + if ((offset > 2) || ((offset == 2) && + !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + /* Termination checks */ + if (--maxcount <= 0 || uio_resid(uio) < 0 || + (u_int32_t)uio_resid(uio) < (fixedsize + + ZAP_AVENAMELEN)) { + break; + } + } + } +update: + zap_cursor_fini(&zc); + + if (attrbufptr) { + kmem_free(attrbufptr, maxsize); + } + if (error == ENOENT) { + error = 0; + } + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + /* XXX newstate TBD */ + *ap->a_newstate = zp->z_atime[0] + zp->z_atime[1]; + uio_setoffset(uio, offset); + + ZFS_EXIT(zfsvfs); + dprintf("-readdirattr: error %d\n", error); + return (error); +} +#endif + + +#ifdef WITH_SEARCHFS +int +zfs_vnop_searchfs(struct vnop_searchfs_args *ap) +#if 0 + struct vnop_searchfs_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + void *a_searchparams1; + void *a_searchparams2; + struct attrlist *a_searchattrs; + ulong_t a_maxmatches; + struct timeval *a_timelimit; + struct attrlist *a_returnattrs; + ulong_t *a_nummatches; + ulong_t a_scriptcode; + ulong_t a_options; + struct uio *a_uio; + struct searchstate *a_searchstate; + vfs_context_t a_context; + }; +#endif +{ + printf("vnop_searchfs called, type %d\n", vnode_vtype(ap->a_vp)); + + *(ap->a_nummatches) = 0; + + return (ENOTSUP); +} +#endif + + + +/* + * Predeclare these here so that the compiler assumes that this is an "old + * style" function declaration that does not include arguments so that we won't + * get type mismatch errors in the initializations that follow. + */ +static int zfs_inval(void); +static int zfs_isdir(void); + +static int +zfs_inval() +{ + dprintf("ZFS: Bad vnop: returning EINVAL\n"); + return (EINVAL); +} + +static int +zfs_isdir() +{ + dprintf("ZFS: Bad vnop: returning EISDIR\n"); + return (EISDIR); +} + + +#define VOPFUNC int (*)(void *) + +/* Directory vnode operations template */ +int (**zfs_dvnodeops) (void *); +struct vnodeopv_entry_desc zfs_dvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_lookup_desc, (VOPFUNC)zfs_vnop_lookup}, + {&vnop_create_desc, (VOPFUNC)zfs_vnop_create}, + {&vnop_whiteout_desc, (VOPFUNC)zfs_vnop_whiteout}, + {&vnop_mknod_desc, (VOPFUNC)zfs_vnop_mknod}, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_read_desc, (VOPFUNC)zfs_isdir}, + {&vnop_write_desc, (VOPFUNC)zfs_isdir}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_select_desc, (VOPFUNC)zfs_isdir}, + {&vnop_bwrite_desc, (VOPFUNC)zfs_isdir}, + {&vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync}, + {&vnop_remove_desc, (VOPFUNC)zfs_vnop_remove}, + {&vnop_link_desc, (VOPFUNC)zfs_vnop_link}, + {&vnop_rename_desc, (VOPFUNC)zfs_vnop_rename}, +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) + {&vnop_renamex_desc, (VOPFUNC)zfs_vnop_renamex}, +#endif + {&vnop_mkdir_desc, (VOPFUNC)zfs_vnop_mkdir}, + {&vnop_rmdir_desc, (VOPFUNC)zfs_vnop_rmdir}, + {&vnop_symlink_desc, (VOPFUNC)zfs_vnop_symlink}, + {&vnop_readdir_desc, (VOPFUNC)zfs_vnop_readdir}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {&vnop_revoke_desc, (VOPFUNC)zfs_vnop_revoke}, + {&vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + 
{&vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + {&vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + {&vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, +#ifdef WITH_READDIRATTR + {&vnop_readdirattr_desc, (VOPFUNC)zfs_vnop_readdirattr}, +#endif +#ifdef WITH_SEARCHFS + {&vnop_searchfs_desc, (VOPFUNC)zfs_vnop_searchfs}, +#endif + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_dvnodeop_opv_desc = +{ &zfs_dvnodeops, zfs_dvnodeops_template }; + +/* Regular file vnode operations template */ +int (**zfs_fvnodeops) (void *); +struct vnodeopv_entry_desc zfs_fvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_whiteout_desc, (VOPFUNC)zfs_vnop_whiteout}, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_read_desc, (VOPFUNC)zfs_vnop_read}, + {&vnop_write_desc, (VOPFUNC)zfs_vnop_write}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_select_desc, (VOPFUNC)zfs_vnop_select}, + {&vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {&vnop_bwrite_desc, (VOPFUNC)zfs_inval}, + {&vnop_pagein_desc, (VOPFUNC)zfs_vnop_pagein}, +#if HAVE_PAGEOUT_V2 + {&vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageoutv2}, +#else + {&vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageout}, +#endif + {&vnop_mmap_desc, (VOPFUNC)zfs_vnop_mmap}, + {&vnop_mnomap_desc, (VOPFUNC)zfs_vnop_mnomap}, + {&vnop_blktooff_desc, (VOPFUNC)zfs_vnop_blktooff}, + {&vnop_offtoblk_desc, (VOPFUNC)zfs_vnop_offtoblk}, + {&vnop_blockmap_desc, (VOPFUNC)zfs_vnop_blockmap}, + {&vnop_strategy_desc, (VOPFUNC)zfs_vnop_strategy}, + {&vnop_allocate_desc, (VOPFUNC)zfs_vnop_allocate}, + {&vnop_revoke_desc, (VOPFUNC)zfs_vnop_revoke}, + {&vnop_exchange_desc, (VOPFUNC)zfs_vnop_exchange}, + {&vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + {&vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + {&vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + {&vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, +#ifdef HAVE_NAMED_STREAMS + {&vnop_getnamedstream_desc, (VOPFUNC)zfs_vnop_getnamedstream}, + {&vnop_makenamedstream_desc, (VOPFUNC)zfs_vnop_makenamedstream}, + {&vnop_removenamedstream_desc, (VOPFUNC)zfs_vnop_removenamedstream}, +#endif +#ifdef WITH_SEARCHFS + {&vnop_searchfs_desc, (VOPFUNC)zfs_vnop_searchfs}, +#endif + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_fvnodeop_opv_desc = +{ &zfs_fvnodeops, zfs_fvnodeops_template }; + +/* Symbolic link vnode operations template */ +int (**zfs_symvnodeops) (void *); +struct vnodeopv_entry_desc zfs_symvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_readlink_desc, (VOPFUNC)zfs_vnop_readlink}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {&vnop_revoke_desc, (VOPFUNC)zfs_vnop_revoke}, + {&vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + {&vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + 
{&vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + {&vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_symvnodeop_opv_desc = +{ &zfs_symvnodeops, zfs_symvnodeops_template }; + +/* Extended attribtue directory vnode operations template */ +int (**zfs_xdvnodeops) (void *); +struct vnodeopv_entry_desc zfs_xdvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_lookup_desc, (VOPFUNC)zfs_vnop_lookup}, + {&vnop_create_desc, (VOPFUNC)zfs_vnop_create}, + {&vnop_whiteout_desc, (VOPFUNC)zfs_vnop_whiteout}, + {&vnop_mknod_desc, (VOPFUNC)zfs_inval}, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_read_desc, (VOPFUNC)zfs_vnop_read}, + {&vnop_write_desc, (VOPFUNC)zfs_vnop_write}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_select_desc, (VOPFUNC)zfs_vnop_select}, + {&vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync}, + {&vnop_remove_desc, (VOPFUNC)zfs_vnop_remove}, + {&vnop_link_desc, (VOPFUNC)zfs_vnop_link}, + {&vnop_rename_desc, (VOPFUNC)zfs_vnop_rename}, + {&vnop_mkdir_desc, (VOPFUNC)zfs_inval}, + {&vnop_rmdir_desc, (VOPFUNC)zfs_vnop_rmdir}, + {&vnop_symlink_desc, (VOPFUNC)zfs_inval}, + {&vnop_readdir_desc, (VOPFUNC)zfs_vnop_readdir}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_xdvnodeop_opv_desc = +{ &zfs_xdvnodeops, zfs_xdvnodeops_template }; + +/* Error vnode operations template */ +int (**zfs_evnodeops) (void *); +struct vnodeopv_entry_desc zfs_evnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_evnodeop_opv_desc = +{ &zfs_evnodeops, zfs_evnodeops_template }; + +int (**zfs_fifonodeops)(void *); +struct vnodeopv_entry_desc zfs_fifonodeops_template[] = { + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, + { &vnop_create_desc, (VOPFUNC)fifo_create }, + { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, + { &vnop_open_desc, (VOPFUNC)fifo_open }, + { &vnop_close_desc, (VOPFUNC)fifo_close }, + { &vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr }, + { &vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr }, + { &vnop_read_desc, (VOPFUNC)fifo_read }, + { &vnop_write_desc, (VOPFUNC)fifo_write }, + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, + { &vnop_select_desc, (VOPFUNC)fifo_select }, + { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, + { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, + { &vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync }, + { &vnop_remove_desc, (VOPFUNC)fifo_remove }, + { &vnop_link_desc, (VOPFUNC)fifo_link }, + { &vnop_rename_desc, (VOPFUNC)fifo_rename }, + { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, + { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, + { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, + { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, + { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, + { &vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive }, + { &vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim }, + { &vnop_strategy_desc, (VOPFUNC)fifo_strategy }, + { &vnop_pathconf_desc, 
(VOPFUNC)fifo_pathconf }, + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, + { &vnop_bwrite_desc, (VOPFUNC)zfs_inval }, + { &vnop_pagein_desc, (VOPFUNC)zfs_vnop_pagein }, +#if HAVE_PAGEOUT_V2 + { &vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageoutv2 }, +#else + { &vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageout }, +#endif + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, + { &vnop_blktooff_desc, (VOPFUNC)zfs_vnop_blktooff }, + { &vnop_offtoblk_desc, (VOPFUNC)zfs_vnop_offtoblk }, + { &vnop_blockmap_desc, (VOPFUNC)zfs_vnop_blockmap }, + { &vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + { &vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + { &vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, + { (struct vnodeop_desc *)NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_fifonodeop_opv_desc = + { &zfs_fifonodeops, zfs_fifonodeops_template }; + + +/* + * .zfs/snapdir vnops + */ +int (**zfs_ctldirops) (void *); +struct vnodeopv_entry_desc zfs_ctldir_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_lookup_desc, (VOPFUNC)zfsctl_vnop_lookup}, + {&vnop_getattr_desc, (VOPFUNC)zfsctl_vnop_getattr}, + {&vnop_readdir_desc, (VOPFUNC)zfsctl_vnop_readdir}, + {&vnop_mkdir_desc, (VOPFUNC)zfsctl_vnop_mkdir}, + {&vnop_rmdir_desc, (VOPFUNC)zfsctl_vnop_rmdir}, + /* We also need to define these for the top ones to work */ + {&vnop_open_desc, (VOPFUNC)zfsctl_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfsctl_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfsctl_vnop_access}, + {&vnop_inactive_desc, (VOPFUNC)zfsctl_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_vnop_reclaim}, + {&vnop_revoke_desc, (VOPFUNC)err_revoke}, + {&vnop_fsync_desc, (VOPFUNC)nop_fsync}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_ctldir_opv_desc = +{ &zfs_ctldirops, zfs_ctldir_template }; + +/* + * Get new vnode for znode. + * + * This function uses zp->z_zfsvfs, zp->z_mode, zp->z_flags, zp->z_id and sets + * zp->z_vnode and zp->z_vid. 
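+ *
+ * Typical use, an illustrative sketch based on zpl_obtain_xattr() later
+ * in this file: create the znode inside the transaction, commit, and only
+ * then attach the vnode:
+ *
+ *	zfs_mknode(dzp, &vattr, tx, cr, 0, &xzp, &acl_ids);
+ *	(void) zfs_link_create(dl, xzp, tx, ZNEW);
+ *	dmu_tx_commit(tx);
+ *	zfs_znode_getvnode(xzp, zfsvfs);	// attach after the tx commits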
+ */ +int +zfs_znode_getvnode(znode_t *zp, zfsvfs_t *zfsvfs) +{ + struct vnode_fsparam vfsp; + struct vnode *vp = NULL; + + dprintf("getvnode zp %p with vp %p zfsvfs %p vfs %p\n", zp, vp, + zfsvfs, zfsvfs->z_vfs); + + if (zp->z_vnode) + panic("zp %p vnode already set\n", zp->z_vnode); + + bzero(&vfsp, sizeof (vfsp)); + vfsp.vnfs_str = "zfs"; + vfsp.vnfs_mp = zfsvfs->z_vfs; + vfsp.vnfs_vtype = IFTOVT((mode_t)zp->z_mode); + vfsp.vnfs_fsnode = zp; + vfsp.vnfs_flags = VNFS_ADDFSREF; + + /* Tag root directory */ + if (zp->z_id == zfsvfs->z_root) { + vfsp.vnfs_markroot = 1; + } + + switch (vfsp.vnfs_vtype) { + case VDIR: + if (zp->z_pflags & ZFS_XATTR) { + vfsp.vnfs_vops = zfs_xdvnodeops; + } else { + vfsp.vnfs_vops = zfs_dvnodeops; + } + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VBLK: + case VCHR: + { + uint64_t rdev; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), + &rdev, sizeof (rdev)) == 0); + + vfsp.vnfs_rdev = zfs_cmpldev(rdev); + } + /* FALLTHROUGH */ + case VSOCK: + vfsp.vnfs_vops = zfs_fvnodeops; + break; + case VFIFO: + vfsp.vnfs_vops = zfs_fifonodeops; + break; + case VREG: + vfsp.vnfs_vops = zfs_fvnodeops; + vfsp.vnfs_filesize = zp->z_size; + break; + case VLNK: + vfsp.vnfs_vops = zfs_symvnodeops; +#if 0 + vfsp.vnfs_filesize = ???; +#endif + break; + default: + vfsp.vnfs_vops = zfs_fvnodeops; + printf("ZFS: Warning, error-vnops selected: vtype %d\n", + vfsp.vnfs_vtype); + break; + } + + /* + * vnode_create() has a habit of calling both vnop_reclaim() and + * vnop_fsync(), which can create havok as we are already holding locks. + */ + + while (vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp) != 0) { + kpreempt(KPREEMPT_SYNC); + } + atomic_inc_64(&vnop_num_vnodes); + + dprintf("Assigned zp %p with vp %p zfsvfs %p\n", zp, vp, zp->z_zfsvfs); + + /* + * Unfortunately, when it comes to IOCTL_GET_BOOT_INFO and getting + * the volume finderinfo, XNU checks the tags, and only acts on + * HFS. So we have to set it to HFS on the root. It is pretty gross + * but until XNU adds supporting code.. + * The only place we use tags in ZFS is ctldir checking for VT_OTHER + */ + if (zp->z_id == zfsvfs->z_root) + vnode_settag(vp, VT_HFS); + else + vnode_settag(vp, VT_ZFS); + + zp->z_vid = vnode_vid(vp); + zp->z_vnode = vp; + + /* + * OS X Finder is hardlink agnostic, so we need to mark vp's that + * are hardlinks, so that it forces a lookup each time, ignoring + * the name cache. + */ + if ((zp->z_links > 1) && (IFTOVT((mode_t)zp->z_mode) == VREG)) + vnode_setmultipath(vp); + + return (0); +} + + +/* + * Called by taskq, to call zfs_znode_getvnode( vnode_create( - and + * attach vnode to znode. + */ +void +zfs_znode_asyncgetvnode_impl(void *arg) +{ + znode_t *zp = (znode_t *)arg; + VERIFY3P(zp, !=, NULL); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + VERIFY3P(zfsvfs, !=, NULL); + + // Attach vnode, done as different thread + zfs_znode_getvnode(zp, zfsvfs); + + // Wake up anyone blocked on us + mutex_enter(&zp->z_attach_lock); + taskq_init_ent(&zp->z_attach_taskq); + cv_broadcast(&zp->z_attach_cv); + mutex_exit(&zp->z_attach_lock); + +} + + +/* + * If the znode's vnode is not yet attached (zp->z_vnode == NULL) + * we call taskq_wait to wait for it to complete. + * We guarantee znode has a vnode at the return of function only + * when return is "0". On failure to wait, it returns -1, and caller + * may consider waiting by other means. 
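+ *
+ * Illustrative caller pattern (a sketch only; the exact zfs_zget_ext()
+ * signature is assumed here, based on the ZGET_FLAG_ASYNC notes below):
+ *
+ *	znode_t *zp = NULL;
+ *	if (zfs_zget_ext(zfsvfs, objnum, &zp, ZGET_FLAG_ASYNC) == 0) {
+ *		// the vnode may still be attaching in the zrele taskq
+ *		zfs_znode_asyncput(zp);	// waits for the attach, then VN_RELE()s
+ *	}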
+ */ +int +zfs_znode_asyncwait(znode_t *zp) +{ + int ret = -1; + zfsvfs_t *zfsvfs; + + if (zp == NULL) + return (ret); + + zfsvfs = zp->z_zfsvfs; + if (zfsvfs == NULL) + return (ret); + + ZFS_ENTER_IFERROR(zfsvfs) + goto out; + + if (zfsvfs->z_os == NULL) + goto out; + + // Work out if we need to block, that is, we have + // no vnode AND a taskq was launched. Unsure if we should + // look inside taskqent node like this. + mutex_enter(&zp->z_attach_lock); + if (zp->z_vnode == NULL && + zp->z_attach_taskq.tqent_func != NULL) { + // We need to block and wait for taskq to finish. + cv_wait(&zp->z_attach_cv, &zp->z_attach_lock); + ret = 0; + } + mutex_exit(&zp->z_attach_lock); + +out: + ZFS_EXIT(zfsvfs); + return (ret); +} + +/* + * Called in place of VN_RELE() for the places that uses ZGET_FLAG_ASYNC. + */ +void +zfs_znode_asyncput_impl(znode_t *zp) +{ + // Make sure the other thread finished zfs_znode_getvnode(); + // This may block, if waiting is required. + zfs_znode_asyncwait(zp); + + // Safe to release now that it is attached. + VN_RELE(ZTOV(zp)); +} + +/* + * Called in place of VN_RELE() for the places that uses ZGET_FLAG_ASYNC, + * where we also taskq it - as we come from reclaim. + */ +void +zfs_znode_asyncput(znode_t *zp) +{ + dsl_pool_t *dp = dmu_objset_pool(zp->z_zfsvfs->z_os); + taskq_t *tq = dsl_pool_zrele_taskq(dp); + + VERIFY3P(tq, !=, NULL); + + VERIFY(taskq_dispatch( + (taskq_t *)tq, + (task_func_t *)zfs_znode_asyncput_impl, zp, TQ_SLEEP) != 0); +} + +/* + * Attach a new vnode to the znode asynchronically. We do this using + * a taskq to call it, and then wait to release the iocount. + * Called of zget_ext(..., ZGET_FLAG_ASYNC); will use + * zfs_znode_asyncput(zp) instead of VN_RELE(vp). + */ +int +zfs_znode_asyncgetvnode(znode_t *zp, zfsvfs_t *zfsvfs) +{ + VERIFY(zp != NULL); + VERIFY(zfsvfs != NULL); + + // We should not have a vnode here. + VERIFY3P(ZTOV(zp), ==, NULL); + + dsl_pool_t *dp = dmu_objset_pool(zfsvfs->z_os); + taskq_t *tq = dsl_pool_zrele_taskq(dp); + VERIFY3P(tq, !=, NULL); + + mutex_enter(&zp->z_attach_lock); + taskq_dispatch_ent(tq, + (task_func_t *)zfs_znode_asyncgetvnode_impl, + zp, + TQ_SLEEP, + &zp->z_attach_taskq); + mutex_exit(&zp->z_attach_lock); + return (0); +} + + + +/* + * Maybe these should live in vfsops + */ +int +zfs_vfsops_init(void) +{ + struct vfs_fsentry vfe; + + /* Start thread to notify Finder of changes */ + zfs_start_notify_thread(); + + vfe.vfe_vfsops = &zfs_vfsops_template; + vfe.vfe_vopcnt = ZFS_VNOP_TBL_CNT; + vfe.vfe_opvdescs = zfs_vnodeop_opv_desc_list; + + strlcpy(vfe.vfe_fsname, "zfs", MFSNAMELEN); + + /* + * Note: must set VFS_TBLGENERICMNTARGS with VFS_TBLLOCALVOL + * to suppress local mount argument handling. 
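+ * VFS_TBLNATIVEXATTR likewise tells XNU that extended attributes are
+ * stored natively, so the VFS layer does not fall back to AppleDouble
+ * ("._") files for xattr storage.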
+ */ + vfe.vfe_flags = VFS_TBLTHREADSAFE | VFS_TBLNOTYPENUM | VFS_TBLLOCALVOL | + VFS_TBL64BITREADY | VFS_TBLNATIVEXATTR | VFS_TBLGENERICMNTARGS | + VFS_TBLREADDIR_EXTENDED; + +#if HAVE_PAGEOUT_V2 + vfe.vfe_flags |= VFS_TBLVNOP_PAGEOUTV2; +#endif + +#ifdef VFS_TBLCANMOUNTROOT // From 10.12 + vfe.vfe_flags |= VFS_TBLCANMOUNTROOT; +#endif + + vfe.vfe_reserv[0] = 0; + vfe.vfe_reserv[1] = 0; + + if (vfs_fsadd(&vfe, &zfs_vfsconf) != 0) + return (KERN_FAILURE); + else + return (KERN_SUCCESS); +} + +int +zfs_vfsops_fini(void) +{ + + zfs_stop_notify_thread(); + + return (vfs_fsremove(zfs_vfsconf)); +} diff --git a/module/os/macos/zfs/zfs_vnops_osx_lib.c b/module/os/macos/zfs/zfs_vnops_osx_lib.c new file mode 100644 index 0000000000..7504d71013 --- /dev/null +++ b/module/os/macos/zfs/zfs_vnops_osx_lib.c @@ -0,0 +1,2232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013 Will Andrews + * Copyright (c) 2013, 2020 Jorgen Lundman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +extern int zfs_vnop_force_formd_normalized_output; /* disabled by default */ + +/* + * Unfortunately Apple defines "KAUTH_VNODE_ACCESS (1<<31)" which + * generates: "warning: signed shift result (0x80000000) sets the + * sign bit of the shift expression's type ('int') and becomes negative." + * So until they fix their define, we override it here. 
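+ * The replacement keeps the same bit position (bit 31); only the type of
+ * the constant changes, to an unsigned 64-bit value, which avoids the
+ * signed-shift warning without altering any mask comparisons.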
+ */ + +#if KAUTH_VNODE_ACCESS == 0x80000000 +#undef KAUTH_VNODE_ACCESS +#define KAUTH_VNODE_ACCESS (1ULL<<31) +#endif + + + +int zfs_hardlink_addmap(znode_t *zp, uint64_t parentid, uint32_t linkid); + +/* Originally from illumos:uts/common/sys/vfs.h */ +typedef uint64_t vfs_feature_t; +#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */ +#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */ +#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */ +#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */ +#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */ +#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */ +#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */ +#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */ +#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */ +#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 /* Supports loaning buffers */ + +#define ZFS_SUPPORTED_VATTRS \ + (VNODE_ATTR_va_mode | \ + VNODE_ATTR_va_uid | \ + VNODE_ATTR_va_gid | \ + VNODE_ATTR_va_fsid | \ + VNODE_ATTR_va_fileid | \ + VNODE_ATTR_va_nlink | \ + VNODE_ATTR_va_data_size | \ + VNODE_ATTR_va_total_size | \ + VNODE_ATTR_va_rdev | \ + VNODE_ATTR_va_gen | \ + VNODE_ATTR_va_create_time | \ + VNODE_ATTR_va_access_time | \ + VNODE_ATTR_va_modify_time | \ + VNODE_ATTR_va_change_time | \ + VNODE_ATTR_va_backup_time | \ + VNODE_ATTR_va_flags | \ + VNODE_ATTR_va_parentid | \ + VNODE_ATTR_va_iosize | \ + VNODE_ATTR_va_filerev | \ + VNODE_ATTR_va_type | \ + VNODE_ATTR_va_encoding | \ + 0) + +// VNODE_ATTR_va_uuuid | +// VNODE_ATTR_va_guuid | + + + + + + + + +/* + * fnv_32a_str - perform a 32 bit Fowler/Noll/Vo FNV-1a hash on a string + * + * input: + * str - string to hash + * hval - previous hash value or 0 if first call + * + * returns: + * 32 bit hash as a static hash type + * + * NOTE: To use the recommended 32 bit FNV-1a hash, use FNV1_32A_INIT as the + * hval arg on the first call to either fnv_32a_buf() or fnv_32a_str(). + */ +uint32_t +fnv_32a_str(const char *str, uint32_t hval) +{ + unsigned char *s = (unsigned char *)str; /* unsigned string */ + + /* + * FNV-1a hash each octet in the buffer + */ + while (*s) { + + /* xor the bottom with the current octet */ + hval ^= (uint32_t)*s++; + + /* multiply by the 32 bit FNV magic prime mod 2^32 */ +#if defined(NO_FNV_GCC_OPTIMIZATION) + hval *= FNV_32_PRIME; +#else + hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + + (hval<<24); +#endif + } + + /* return our new hash value */ + return (hval); +} + +/* + * fnv_32a_buf - perform a 32 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * + * input: + * buf- start of buffer to hash + * len- length of buffer in octets + * hval- previous hash value or 0 if first call + * + * returns: + * 32 bit hash as a static hash type + * + * NOTE: To use the recommended 32 bit FNV-1a hash, use FNV1_32A_INIT as the + * hval arg on the first call to either fnv_32a_buf() or fnv_32a_str(). 
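+ *
+ * Illustrative use (somestruct is a placeholder): hash a string, then
+ * chain a binary buffer into the same hash value:
+ *
+ *	uint32_t h;
+ *	h = fnv_32a_str("com.example.name", FNV1_32A_INIT);
+ *	h = fnv_32a_buf(&somestruct, sizeof (somestruct), h);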
+ */ +uint32_t +fnv_32a_buf(void *buf, size_t len, uint32_t hval) +{ + unsigned char *bp = (unsigned char *)buf; /* start of buffer */ + unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + * FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (uint32_t)*bp++; + + /* multiply by the 32 bit FNV magic prime mod 2^32 */ +#if defined(NO_FNV_GCC_OPTIMIZATION) + hval *= FNV_32_PRIME; +#else + hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + + (hval<<24); +#endif + } + + /* return our new hash value */ + return (hval); +} + +int +zfs_getattr_znode_unlocked(struct vnode *vp, vattr_t *vap) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint64_t parent; + sa_bulk_attr_t bulk[4]; + int count = 0; +#ifdef VNODE_ATTR_va_addedtime + uint64_t addtime[2] = { 0 }; +#endif + int ishardlink = 0; + + // printf("getattr_osx\n"); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (VATTR_IS_ACTIVE(vap, va_acl)) { + // printf("want acl\n"); + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + + // dprintf("Calling getacl\n"); + if ((error = zfs_getacl(zp, &vap->va_acl, B_FALSE, NULL))) { + // dprintf("zfs_getacl returned error %d\n", error); + error = 0; + } else { + + VATTR_SET_SUPPORTED(vap, va_acl); + /* va_acl implies va_uuuid & va_guuid are supported. */ + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + } + + } + + mutex_enter(&zp->z_lock); + + ishardlink = ((zp->z_links > 1) && + (IFTOVT((mode_t)zp->z_mode) == VREG)) ? 1 : 0; + if (zp->z_finder_hardlink == TRUE) + ishardlink = 1; + else if (ishardlink) + zp->z_finder_hardlink = TRUE; + + /* Work out which SA we need to fetch */ + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Unfortunately, sa_bulk_lookup does not let you handle optional + * SA entries - so have to look up the optionals individually. 
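+ * Optional attributes such as SA_ZPL_ADDTIME may simply be absent on a
+ * given file, so their individual sa_lookup() calls below deliberately
+ * ignore the return value; the ADDEDTIME handling further down falls back
+ * to crtime when no addtime value was found.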
+ */ + error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count); + if (error) { + dprintf("ZFS: Warning: getattr failed sa_bulk_lookup: %d, " + "parent %llu, flags %llu\n", error, parent, zp->z_pflags); + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef VNODE_ATTR_va_addedtime + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + sa_lookup(zp->z_sa_hdl, SA_ZPL_ADDTIME(zfsvfs), + &addtime, sizeof (addtime)); + } +#endif + + /* + * On Mac OS X we always export the root directory id as 2 + */ + vap->va_fileid = INO_ZFSTOXNU(zp->z_id, zfsvfs->z_root); + + vap->va_data_size = zp->z_size; + vap->va_total_size = zp->z_size; + // vap->va_gen = zp->z_gen; + vap->va_gen = 0; +#if defined(DEBUG) || defined(ZFS_DEBUG) + if (zp->z_gen != 0) + dprintf("%s: va_gen %lld -> 0\n", __func__, zp->z_gen); +#endif + + if (vnode_isdir(vp)) { + vap->va_nlink = zp->z_size; + } else { + vap->va_nlink = zp->z_links; + } + + + /* + * Carbon compatibility, pretend to support this legacy attribute + */ + if (VATTR_IS_ACTIVE(vap, va_backup_time)) { + vap->va_backup_time.tv_sec = 0; + vap->va_backup_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_backup_time); + } + vap->va_flags = zfs_getbsdflags(zp); + /* + * On Mac OS X we always export the root directory id as 2 + * and its parent as 1 + */ + if (zp->z_id == zfsvfs->z_root) + vap->va_parentid = 1; + else if (parent == zfsvfs->z_root) + vap->va_parentid = 2; + else + vap->va_parentid = parent; + + // Hardlinks: Return cached parentid, make it 2 if root. + if (ishardlink && zp->z_finder_parentid) + vap->va_parentid = (zp->z_finder_parentid == zfsvfs->z_root) ? + 2 : zp->z_finder_parentid; + + vap->va_iosize = zp->z_blksz ? zp->z_blksz : zfsvfs->z_max_blksz; + // vap->va_iosize = 512; + VATTR_SET_SUPPORTED(vap, va_iosize); + + /* Don't include '.' and '..' in the number of entries */ + if (VATTR_IS_ACTIVE(vap, va_nchildren) && vnode_isdir(vp)) { + VATTR_RETURN(vap, va_nchildren, vap->va_nlink - 2); + } + + /* + * va_dirlinkcount is the count of directory hard links. When a file + * system does not support ATTR_DIR_LINKCOUNT, xnu will default to 1. + * Since we claim to support ATTR_DIR_LINKCOUNT both as valid and as + * native, we'll just return 1. We set 1 for this value in dirattrpack + * as well. If in the future ZFS actually supports directory hard links, + * we can return a real value. + */ + if (VATTR_IS_ACTIVE(vap, va_dirlinkcount) && vnode_isdir(vp)) { + VATTR_RETURN(vap, va_dirlinkcount, 1); + } + + + if (VATTR_IS_ACTIVE(vap, va_data_alloc) || + VATTR_IS_ACTIVE(vap, va_total_alloc)) { + uint32_t blksize; + u_longlong_t nblks; + sa_object_size(zp->z_sa_hdl, &blksize, &nblks); + vap->va_data_alloc = (uint64_t)512LL * (uint64_t)nblks; + vap->va_total_alloc = vap->va_data_alloc; + vap->va_supported |= VNODE_ATTR_va_data_alloc | + VNODE_ATTR_va_total_alloc; + } + + if (VATTR_IS_ACTIVE(vap, va_name)) { + vap->va_name[0] = 0; + + if (!vnode_isvroot(vp)) { + + /* + * Finder (Carbon) relies on getattr returning the + * correct name for hardlinks to work, so we store the + * lookup name in vnop_lookup if file references are + * high, then set the return name here. + * If we also want ATTR_CMN_* lookups to work, we need + * to set a unique va_linkid for each entry, and based + * on the linkid in the lookup, return the correct name. + * It is set in zfs_vnop_lookup(). + * Since zap_value_search is a slow call, we only use + * it if we have not cached the name in vnop_lookup. 
+ */ + + // Cached name, from vnop_lookup + if (ishardlink && + zp->z_name_cache[0]) { + + strlcpy(vap->va_name, zp->z_name_cache, + MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + + } else if (zp->z_name_cache[0]) { + + strlcpy(vap->va_name, zp->z_name_cache, + MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + + } else { + + // Go find the name. + if (zap_value_search(zfsvfs->z_os, parent, + zp->z_id, ZFS_DIRENT_OBJ(-1ULL), + vap->va_name) == 0) { + VATTR_SET_SUPPORTED(vap, va_name); + // Might as well keep this name too. + strlcpy(zp->z_name_cache, vap->va_name, + MAXPATHLEN); + } // zap_value_search + + } + + dprintf("getattr: %p return name '%s':%04llx\n", vp, + vap->va_name, vap->va_linkid); + + + } else { + /* + * The vroot objects must return a unique name for + * Finder to be able to distringuish between mounts. + * For this reason we simply return the fullname, + * from the statfs mountedfrom + * + * dataset mountpoint + * foo /bar + * As we used to return "foo" to ATTR_CMN_NAME of + * "/bar" we change this to return "bar" as expected. + */ + char *r, *osname; + osname = vfs_statfs(zfsvfs->z_vfs)->f_mntonname; + r = strrchr(osname, '/'); + strlcpy(vap->va_name, + r ? &r[1] : osname, + MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + dprintf("getattr root returning '%s'\n", vap->va_name); + } + } + + if (VATTR_IS_ACTIVE(vap, va_linkid)) { + + /* + * Apple needs a little extra care with HARDLINKs. All hardlink + * targets return the same va_fileid (POSIX) but also return + * a unique va_linkid. This we generate by hashing the (unique) + * name and store as va_linkid. However, Finder will call + * vfs_vget() with linkid and expect to receive the correct link + * target, so we need to add it to the AVL z_hardlinks. + */ + if (ishardlink) { + hardlinks_t *searchnode, *findnode; + avl_index_t loc; + + // If we don't have a linkid, make one. + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = vap->va_parentid; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, zp->z_name_cache, + PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, + &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(searchnode, sizeof (hardlinks_t)); + + if (!findnode) { + static uint32_t zfs_hardlink_sequence = + 1ULL<<31; + uint32_t id; + + id = atomic_inc_32_nv(&zfs_hardlink_sequence); + + zfs_hardlink_addmap(zp, vap->va_parentid, id); + VATTR_RETURN(vap, va_linkid, id); + + } else { + VATTR_RETURN(vap, va_linkid, + findnode->hl_linkid); + } + + } else { // !ishardlink - use same as fileid + + VATTR_RETURN(vap, va_linkid, vap->va_fileid); + + } + + } // active linkid + + if (VATTR_IS_ACTIVE(vap, va_filerev)) { + VATTR_RETURN(vap, va_filerev, 0); + } + if (VATTR_IS_ACTIVE(vap, va_fsid)) { + VATTR_RETURN(vap, va_fsid, zfsvfs->z_rdev); + } + if (VATTR_IS_ACTIVE(vap, va_type)) { + VATTR_RETURN(vap, va_type, vnode_vtype(ZTOV(zp))); + } + if (VATTR_IS_ACTIVE(vap, va_encoding)) { + VATTR_RETURN(vap, va_encoding, kTextEncodingMacUnicode); + } +#ifdef VNODE_ATTR_va_addedtime + /* + * ADDEDTIME should come from finderinfo according to hfs_attrlist.c + * in ZFS we can use crtime, and add logic to getxattr finderinfo to + * copy the ADDEDTIME into the structure. 
See vnop_getxattr + */ + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + /* Lookup the ADDTIME if it exists, if not, use CRTIME */ + if ((addtime[0] == 0) && (addtime[1])) { + dprintf("ZFS: ADDEDTIME using crtime %llu (error %d)\n", + vap->va_crtime.tv_sec, error); + vap->va_addedtime.tv_sec = vap->va_crtime.tv_sec; + vap->va_addedtime.tv_nsec = vap->va_crtime.tv_nsec; + } else { + dprintf("ZFS: ADDEDTIME using addtime %llu\n", + addtime[0]); + ZFS_TIME_DECODE(&vap->va_addedtime, addtime); + } + VATTR_SET_SUPPORTED(vap, va_addedtime); + } +#endif +#ifdef VNODE_ATTR_va_fsid64 + if (VATTR_IS_ACTIVE(vap, va_fsid64)) { + vap->va_fsid64.val[0] = + vfs_statfs(zfsvfs->z_vfs)->f_fsid.val[0]; + vap->va_fsid64.val[1] = vfs_typenum(zfsvfs->z_vfs); + VATTR_SET_SUPPORTED(vap, va_fsid64); + } +#endif +#ifdef VNODE_ATTR_va_write_gencount + if (VATTR_IS_ACTIVE(vap, va_write_gencount)) { + if (!zp->z_write_gencount) + atomic_inc_64(&zp->z_write_gencount); + VATTR_RETURN(vap, va_write_gencount, + (uint32_t)zp->z_write_gencount); + } +#endif + +#ifdef VNODE_ATTR_va_document_id + if (VATTR_IS_ACTIVE(vap, va_document_id)) { + + if (!zp->z_document_id) { + zfs_setattr_generate_id(zp, parent, vap->va_name); + } + + VATTR_RETURN(vap, va_document_id, zp->z_document_id); + } +#endif /* VNODE_ATTR_va_document_id */ + + +#if 0 // Issue #192 + if (VATTR_IS_ACTIVE(vap, va_uuuid)) { + kauth_cred_uid2guid(zp->z_uid, &vap->va_uuuid); + } + if (VATTR_IS_ACTIVE(vap, va_guuid)) { + kauth_cred_gid2guid(zp->z_gid, &vap->va_guuid); + } +#endif + + if (ishardlink) { + dprintf("ZFS:getattr(%s,%llu,%llu) parent %llu: cache_parent " + "%llu: va_nlink %u\n", VATTR_IS_ACTIVE(vap, va_name) ? + vap->va_name : zp->z_name_cache, + vap->va_fileid, + VATTR_IS_ACTIVE(vap, va_linkid) ? vap->va_linkid : 0, + vap->va_parentid, + zp->z_finder_parentid, + vap->va_nlink); + } + + vap->va_supported |= ZFS_SUPPORTED_VATTRS; + uint64_t missing = 0; + missing = (vap->va_active ^ (vap->va_active & vap->va_supported)); + if (missing != 0) { + dprintf("vnop_getattr:: asked %08llx replied %08llx " + " missing %08llx\n", + vap->va_active, vap->va_supported, + missing); + } + + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + return (error); +} + +boolean_t +vfs_has_feature(vfs_t *vfsp, vfs_feature_t vfsft) +{ + + switch (vfsft) { + case VFSFT_CASEINSENSITIVE: + case VFSFT_NOCASESENSITIVE: + return (B_TRUE); + default: + return (B_FALSE); + } +} + +int +zfs_access_native_mode(struct vnode *vp, int *mode, cred_t *cr, + caller_context_t *ct) +{ + int accmode = *mode & (VREAD|VWRITE|VEXEC /* |VAPPEND */); + int error = 0; + int flag = 0; // FIXME + + if (accmode != 0) + error = zfs_access(vp, accmode, flag, cr); + + *mode &= ~(accmode); + + return (error); +} + +int +zfs_ioflags(int ap_ioflag) +{ + int flags = 0; + + if (ap_ioflag & IO_APPEND) + flags |= FAPPEND; + if (ap_ioflag & IO_NDELAY) + flags |= FNONBLOCK; + if (ap_ioflag & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +int +zfs_vnop_ioctl_fullfsync(struct vnode *vp, vfs_context_t ct, zfsvfs_t *zfsvfs) +{ + int error; + + error = zfs_fsync(VTOZ(vp), /* syncflag */ 0, NULL); + if (error) + return (error); + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + else + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + return (0); +} + +uint32_t +zfs_getbsdflags(znode_t *zp) +{ + uint32_t bsdflags = 0; + uint64_t zflags = zp->z_pflags; + + if (zflags & ZFS_NODUMP) + bsdflags |= UF_NODUMP; + if (zflags & ZFS_UIMMUTABLE) + bsdflags |= UF_IMMUTABLE; + if (zflags & 
ZFS_UAPPENDONLY) + bsdflags |= UF_APPEND; + if (zflags & ZFS_OPAQUE) + bsdflags |= UF_OPAQUE; + if (zflags & ZFS_HIDDEN) + bsdflags |= UF_HIDDEN; + if (zflags & ZFS_TRACKED) + bsdflags |= UF_TRACKED; + if (zflags & ZFS_COMPRESSED) + bsdflags |= UF_COMPRESSED; + + if (zflags & ZFS_SIMMUTABLE) + bsdflags |= SF_IMMUTABLE; + if (zflags & ZFS_SAPPENDONLY) + bsdflags |= SF_APPEND; + /* + * Due to every file getting archive set automatically, and OSX + * don't let you move/copy it as a user, we disable archive connection + * for now + * if (zflags & ZFS_ARCHIVE) + * bsdflags |= SF_ARCHIVED; + */ + dprintf("getbsd changing zfs %08lx to osx %08lx\n", + zflags, bsdflags); + return (bsdflags); +} + +void +zfs_setbsdflags(znode_t *zp, uint32_t bsdflags) +{ + uint64_t zflags; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), + &zflags, sizeof (zflags)) == 0); + + if (bsdflags & UF_NODUMP) + zflags |= ZFS_NODUMP; + else + zflags &= ~ZFS_NODUMP; + + if (bsdflags & UF_IMMUTABLE) + zflags |= ZFS_UIMMUTABLE; + else + zflags &= ~ZFS_UIMMUTABLE; + + if (bsdflags & UF_APPEND) + zflags |= ZFS_UAPPENDONLY; + else + zflags &= ~ZFS_UAPPENDONLY; + + if (bsdflags & UF_OPAQUE) + zflags |= ZFS_OPAQUE; + else + zflags &= ~ZFS_OPAQUE; + + if (bsdflags & UF_HIDDEN) + zflags |= ZFS_HIDDEN; + else + zflags &= ~ZFS_HIDDEN; + + if (bsdflags & UF_TRACKED) + zflags |= ZFS_TRACKED; + else + zflags &= ~ZFS_TRACKED; + + if (bsdflags & UF_COMPRESSED) + zflags |= ZFS_COMPRESSED; + else + zflags &= ~ZFS_COMPRESSED; + + /* + * if (bsdflags & SF_ARCHIVED) + * zflags |= ZFS_ARCHIVE; + * else + * zflags &= ~ZFS_ARCHIVE; + */ + if (bsdflags & SF_IMMUTABLE) + zflags |= ZFS_SIMMUTABLE; + else + zflags &= ~ZFS_SIMMUTABLE; + + if (bsdflags & SF_APPEND) + zflags |= ZFS_SAPPENDONLY; + else + zflags &= ~ZFS_SAPPENDONLY; + + zp->z_pflags = zflags; + dprintf("setbsd changing osx %08lx to zfs %08lx\n", + bsdflags, zflags); + + /* + * (void )sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), + * (void *)&zp->z_pflags, sizeof (uint64_t), tx); + */ +} + +/* + * Lookup/Create an extended attribute entry. + * + * Input arguments: + * dzp - znode for hidden attribute directory + * name - name of attribute + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * + * Output arguments: + * vpp - pointer to the vnode for the entry (NULL if there isn't one) + * + * Return value: 0 on success or errno value on failure. + */ +int +zpl_obtain_xattr(znode_t *dzp, const char *name, mode_t mode, cred_t *cr, + vnode_t **vpp, int flag) +{ + znode_t *xzp = NULL; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + struct vnode_attr vattr; + int error; + struct componentname cn = { 0 }; + zfs_acl_ids_t acl_ids; + + /* zfs_dirent_lock() expects a component name */ + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_type, VREG); + VATTR_SET(&vattr, va_mode, mode & ~S_IFMT); + + if ((error = zfs_acl_ids_create(dzp, 0, + &vattr, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + +top: + /* Lock the attribute entry name. */ + if ((error = zfs_dirent_lock(&dl, dzp, (char *)name, &xzp, flag, + NULL, &cn))) { + goto out; + } + /* If the name already exists, we're done. 
*/ + if (xzp != NULL) { + zfs_dirent_unlock(dl); + goto out; + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, (char *)name); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + +#if 1 // FIXME + if (dzp->z_pflags & ZFS_INHERIT_ACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + } +#endif + zfs_sa_upgrade_txholds(tx, dzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + goto out; + } + + zfs_mknode(dzp, &vattr, tx, cr, 0, &xzp, &acl_ids); + + /* + * ASSERT(xzp->z_id == zoid); + */ + (void) zfs_link_create(dl, xzp, tx, ZNEW); + zfs_log_create(zilog, tx, TX_CREATE, dzp, xzp, (char *)name, + NULL /* vsecp */, 0 /* acl_ids.z_fuidp */, &vattr); + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(xzp, zfsvfs); + + zfs_dirent_unlock(dl); +out: + zfs_acl_ids_free(&acl_ids); + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + + /* The REPLACE error if doesn't exist is ENOATTR */ + if ((flag & ZEXISTS) && (error == ENOENT)) + error = ENOATTR; + + if (xzp) + *vpp = ZTOV(xzp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. + */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. 
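+ * For example, an entry equivalent to "everyone@:read_acl:deny" makes the
+ * ACL non-trivial.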
+ */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permission is never set by default + */ + if (mask & ACE_DELETE) + return (1); + + /* + * Child delete permission should be accompanied by write + */ + if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + + return (0); +} + + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + if (isdir) + write_mask |= ACE_DELETE_CHILD; + + masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + +void commonattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp, + const char *name, ino64_t objnum, enum vtype vtype, + boolean_t user64) +{ + attrgroup_t commonattr = aip->ai_attrlist->commonattr; + void *attrbufptr = *aip->ai_attrbufpp; + void *varbufptr = *aip->ai_varbufpp; + struct mount *mp = zfsvfs->z_vfs; + cred_t *cr = (cred_t *)vfs_context_ucred(aip->ai_context); + finderinfo_t finderinfo; + + /* + * We should probably combine all the sa_lookup into a bulk + * lookup operand. 
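+	 * (that is, one sa_bulk_lookup() call, as zfs_znode_alloc() does,
+	 * instead of the individual sa_lookup() calls below)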
+ */ + + finderinfo.fi_flags = 0; + + if (ATTR_CMN_NAME & commonattr) { + nameattrpack(aip, name, strlen(name)); + attrbufptr = *aip->ai_attrbufpp; + varbufptr = *aip->ai_varbufpp; + } + if (ATTR_CMN_DEVID & commonattr) { + *((dev_t *)attrbufptr) = vfs_statfs(mp)->f_fsid.val[0]; + attrbufptr = ((dev_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FSID & commonattr) { + *((fsid_t *)attrbufptr) = vfs_statfs(mp)->f_fsid; + attrbufptr = ((fsid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJTYPE & commonattr) { + *((fsobj_type_t *)attrbufptr) = vtype; + attrbufptr = ((fsobj_type_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJTAG & commonattr) { + *((fsobj_tag_t *)attrbufptr) = VT_ZFS; + attrbufptr = ((fsobj_tag_t *)attrbufptr) + 1; + } + /* + * Note: ATTR_CMN_OBJID is lossy (only 32 bits). + */ + if ((ATTR_CMN_OBJID | ATTR_CMN_OBJPERMANENTID) & commonattr) { + u_int32_t fileid; + /* + * On Mac OS X we always export the root directory id as 2 + */ + fileid = (objnum == zfsvfs->z_root) ? 2 : objnum; + + if (ATTR_CMN_OBJID & commonattr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = fileid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJPERMANENTID & commonattr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = fileid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + } + /* + * Note: ATTR_CMN_PAROBJID is lossy (only 32 bits). + */ + if (ATTR_CMN_PAROBJID & commonattr) { + uint64_t parentid; + + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parentid, sizeof (parentid)) == 0); + + /* + * On Mac OS X we always export the root + * directory id as 2 and its parent as 1 + */ + if (zp && zp->z_id == zfsvfs->z_root) + parentid = 1; + else if (parentid == zfsvfs->z_root) + parentid = 2; + + ASSERT(parentid != 0); + + ((fsobj_id_t *)attrbufptr)->fid_objno = (uint32_t)parentid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_SCRIPT & commonattr) { + *((text_encoding_t *)attrbufptr) = kTextEncodingMacUnicode; + attrbufptr = ((text_encoding_t *)attrbufptr) + 1; + } + if (ATTR_CMN_CRTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)) == 0); + if (user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_MODTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_MTIME(zfsvfs), + times, sizeof (times)) == 0); + if (user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_CHGTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_CTIME(zfsvfs), + times, sizeof (times)) == 0); + if (user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_ACCTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + times, sizeof (times)) == 0); + if 
(user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_BKUPTIME & commonattr) { + /* legacy attribute -- just pass zero */ + if (user64) { + ((timespec_user64_t *)attrbufptr)->tv_sec = 0; + ((timespec_user64_t *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ((timespec_user32_t *)attrbufptr)->tv_sec = 0; + ((timespec_user32_t *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_FNDRINFO & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &val, sizeof (val)) == 0); + getfinderinfo(zp, cr, &finderinfo); + /* Shadow ZFS_HIDDEN to Finder Info's invisible bit */ + if (val & ZFS_HIDDEN) { + finderinfo.fi_flags |= + OSSwapHostToBigConstInt16(kIsInvisible); + } + bcopy(&finderinfo, attrbufptr, sizeof (finderinfo)); + attrbufptr = (char *)attrbufptr + 32; + } + if (ATTR_CMN_OWNERID & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_UID(zfsvfs), + &val, sizeof (val)) == 0); + *((uid_t *)attrbufptr) = val; + attrbufptr = ((uid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_GRPID & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs), + &val, sizeof (val)) == 0); + *((gid_t *)attrbufptr) = val; + attrbufptr = ((gid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_ACCESSMASK & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + &val, sizeof (val)) == 0); + *((u_int32_t *)attrbufptr) = val; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FLAGS & commonattr) { + // TODO, sa_lookup of ZPL_FLAGS + u_int32_t flags = zfs_getbsdflags(zp); + + /* Shadow Finder Info's invisible bit to UF_HIDDEN */ + if ((ATTR_CMN_FNDRINFO & commonattr) && + (OSSwapBigToHostInt16(finderinfo.fi_flags) & kIsInvisible)) + flags |= UF_HIDDEN; + + *((u_int32_t *)attrbufptr) = flags; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_USERACCESS & commonattr) { + u_int32_t user_access = 0; + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &val, sizeof (val)) == 0); + + user_access = getuseraccess(zp, aip->ai_context); + + /* Also consider READ-ONLY file system. 
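+	 * A read-only mount, like the ZFS_IMMUTABLE case just below, only
+	 * removes write access (W_OK).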
*/ + if (vfs_flags(mp) & MNT_RDONLY) { + user_access &= ~W_OK; + } + + /* Locked objects are not writable either */ + if ((val & ZFS_IMMUTABLE) && + (vfs_context_suser(aip->ai_context) != 0)) { + user_access &= ~W_OK; + } + + *((u_int32_t *)attrbufptr) = user_access; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FILEID & commonattr) { + /* + * On Mac OS X we always export the root directory id as 2 + */ + if (objnum == zfsvfs->z_root) + objnum = 2; + + *((u_int64_t *)attrbufptr) = objnum; + attrbufptr = ((u_int64_t *)attrbufptr) + 1; + } + if (ATTR_CMN_PARENTID & commonattr) { + uint64_t parentid; + + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parentid, sizeof (parentid)) == 0); + + /* + * On Mac OS X we always export the root + * directory id as 2 and its parent as 1 + */ + if (zp && zp->z_id == zfsvfs->z_root) + parentid = 1; + else if (parentid == zfsvfs->z_root) + parentid = 2; + + ASSERT(parentid != 0); + + *((u_int64_t *)attrbufptr) = parentid; + attrbufptr = ((u_int64_t *)attrbufptr) + 1; + } + + *aip->ai_attrbufpp = attrbufptr; + *aip->ai_varbufpp = varbufptr; +} + +void +dirattrpack(attrinfo_t *aip, znode_t *zp) +{ + attrgroup_t dirattr = aip->ai_attrlist->dirattr; + void *attrbufptr = *aip->ai_attrbufpp; + + if (ATTR_DIR_LINKCOUNT & dirattr) { + *((u_int32_t *)attrbufptr) = 1; /* no dir hard links */ + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_DIR_ENTRYCOUNT & dirattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &val, sizeof (val)) == 0); + *((u_int32_t *)attrbufptr) = (uint32_t)val; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_DIR_MOUNTSTATUS & dirattr && zp) { + vnode_t *vp = ZTOV(zp); + + if (vp != NULL && vnode_mountedhere(vp) != NULL) + *((u_int32_t *)attrbufptr) = DIR_MNTSTATUS_MNTPOINT; + else + *((u_int32_t *)attrbufptr) = 0; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + *aip->ai_attrbufpp = attrbufptr; +} + +void +fileattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp) +{ + attrgroup_t fileattr = aip->ai_attrlist->fileattr; + void *attrbufptr = *aip->ai_attrbufpp; + void *varbufptr = *aip->ai_varbufpp; + uint64_t allocsize = 0; + cred_t *cr = (cred_t *)vfs_context_ucred(aip->ai_context); + + if ((ATTR_FILE_ALLOCSIZE | ATTR_FILE_DATAALLOCSIZE) & fileattr && zp) { + uint32_t blksize; + u_longlong_t nblks; + + sa_object_size(zp->z_sa_hdl, &blksize, &nblks); + allocsize = (uint64_t)512LL * (uint64_t)nblks; + } + if (ATTR_FILE_LINKCOUNT & fileattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &val, sizeof (val)) == 0); + *((u_int32_t *)attrbufptr) = val; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_TOTALSIZE & fileattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &val, sizeof (val)) == 0); + *((off_t *)attrbufptr) = val; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_ALLOCSIZE & fileattr) { + *((off_t *)attrbufptr) = allocsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_IOBLOCKSIZE & fileattr && zp) { + *((u_int32_t *)attrbufptr) = + zp->z_blksz ? 
zp->z_blksz : zfsvfs->z_max_blksz; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DEVTYPE & fileattr) { + uint64_t mode, val = 0; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + &mode, sizeof (mode)) == 0); + sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), + &val, sizeof (val)); + if (S_ISBLK(mode) || S_ISCHR(mode)) + *((u_int32_t *)attrbufptr) = (u_int32_t)val; + else + *((u_int32_t *)attrbufptr) = 0; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DATALENGTH & fileattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &val, sizeof (val)) == 0); + *((off_t *)attrbufptr) = val; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DATAALLOCSIZE & fileattr) { + *((off_t *)attrbufptr) = allocsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if ((ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE) & fileattr) { + uint64_t rsrcsize = 0; + uint64_t xattr; + + if (!sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr, sizeof (xattr)) && + xattr) { + znode_t *xdzp = NULL, *xzp = NULL; + struct componentname cn = { 0 }; + char *name = NULL; + + name = spa_strdup(XATTR_RESOURCEFORK_NAME); + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Grab the hidden attribute directory vnode. */ + if (zfs_get_xattrdir(zp, &xdzp, cr, 0) == 0 && + zfs_dirlook(xdzp, name, &xzp, 0, NULL, + &cn) == 0) { + rsrcsize = xzp->z_size; + } + spa_strfree(name); + kmem_free(cn.cn_nameptr, cn.cn_namelen); + + if (xzp) + zrele(xzp); + if (xdzp) + zrele(xdzp); + } + if (ATTR_FILE_RSRCLENGTH & fileattr) { + *((off_t *)attrbufptr) = rsrcsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_RSRCALLOCSIZE & fileattr) { + *((off_t *)attrbufptr) = roundup(rsrcsize, 512); + attrbufptr = ((off_t *)attrbufptr) + 1; + } + } + *aip->ai_attrbufpp = attrbufptr; + *aip->ai_varbufpp = varbufptr; +} + +void +nameattrpack(attrinfo_t *aip, const char *name, int namelen) +{ + void *varbufptr; + struct attrreference *attr_refptr; + u_int32_t attrlen; + size_t nfdlen, freespace; + int force_formd_normalized_output; + + varbufptr = *aip->ai_varbufpp; + attr_refptr = (struct attrreference *)(*aip->ai_attrbufpp); + + freespace = (char *)aip->ai_varbufend - (char *)varbufptr; + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. + */ + + if (zfs_vnop_force_formd_normalized_output && + !is_ascii_str(name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + namelen = strlen(name); + if (!force_formd_normalized_output || + utf8_normalizestr((const u_int8_t *)name, namelen, + (u_int8_t *)varbufptr, &nfdlen, + freespace, UTF_DECOMPOSED) != 0) { + /* ASCII or normalization failed, just copy zap name. */ + strncpy((char *)varbufptr, name, MIN(freespace, namelen+1)); + } else { + /* Normalization succeeded (already in buffer). */ + namelen = nfdlen; + } + attrlen = namelen + 1; + attr_refptr->attr_dataoffset = (char *)varbufptr - (char *)attr_refptr; + attr_refptr->attr_length = attrlen; + /* + * Advance beyond the space just allocated and + * round up to the next 4-byte boundary: + */ + varbufptr = ((char *)varbufptr) + attrlen + ((4 - (attrlen & 3)) & 3); + ++attr_refptr; + + *aip->ai_attrbufpp = attr_refptr; + *aip->ai_varbufpp = varbufptr; +} + +int +getpackedsize(struct attrlist *alp, boolean_t user64) +{ + attrgroup_t attrs; + int timespecsize; + int size = 0; + + timespecsize = user64 ? 
sizeof (timespec_user64_t) : + sizeof (timespec_user32_t); + + if ((attrs = alp->commonattr) != 0) { + if (attrs & ATTR_CMN_NAME) + size += sizeof (struct attrreference); + if (attrs & ATTR_CMN_DEVID) + size += sizeof (dev_t); + if (attrs & ATTR_CMN_FSID) + size += sizeof (fsid_t); + if (attrs & ATTR_CMN_OBJTYPE) + size += sizeof (fsobj_type_t); + if (attrs & ATTR_CMN_OBJTAG) + size += sizeof (fsobj_tag_t); + if (attrs & ATTR_CMN_OBJID) + size += sizeof (fsobj_id_t); + if (attrs & ATTR_CMN_OBJPERMANENTID) + size += sizeof (fsobj_id_t); + if (attrs & ATTR_CMN_PAROBJID) + size += sizeof (fsobj_id_t); + if (attrs & ATTR_CMN_SCRIPT) + size += sizeof (text_encoding_t); + if (attrs & ATTR_CMN_CRTIME) + size += timespecsize; + if (attrs & ATTR_CMN_MODTIME) + size += timespecsize; + if (attrs & ATTR_CMN_CHGTIME) + size += timespecsize; + if (attrs & ATTR_CMN_ACCTIME) + size += timespecsize; + if (attrs & ATTR_CMN_BKUPTIME) + size += timespecsize; + if (attrs & ATTR_CMN_FNDRINFO) + size += 32 * sizeof (u_int8_t); + if (attrs & ATTR_CMN_OWNERID) + size += sizeof (uid_t); + if (attrs & ATTR_CMN_GRPID) + size += sizeof (gid_t); + if (attrs & ATTR_CMN_ACCESSMASK) + size += sizeof (u_int32_t); + if (attrs & ATTR_CMN_FLAGS) + size += sizeof (u_int32_t); + if (attrs & ATTR_CMN_USERACCESS) + size += sizeof (u_int32_t); + if (attrs & ATTR_CMN_FILEID) + size += sizeof (u_int64_t); + if (attrs & ATTR_CMN_PARENTID) + size += sizeof (u_int64_t); + /* + * Also add: + * ATTR_CMN_GEN_COUNT (|FSOPT_ATTR_CMN_EXTENDED) + * ATTR_CMN_DOCUMENT_ID (|FSOPT_ATTR_CMN_EXTENDED) + * ATTR_CMN_EXTENDED_SECURITY + * ATTR_CMN_UUID + * ATTR_CMN_GRPUUID + * ATTR_CMN_FULLPATH + * ATTR_CMN_ADDEDTIME + * ATTR_CMN_ERROR + * ATTR_CMN_DATA_PROTECT_FLAGS + */ + } + if ((attrs = alp->dirattr) != 0) { + if (attrs & ATTR_DIR_LINKCOUNT) + size += sizeof (u_int32_t); + if (attrs & ATTR_DIR_ENTRYCOUNT) + size += sizeof (u_int32_t); + if (attrs & ATTR_DIR_MOUNTSTATUS) + size += sizeof (u_int32_t); + } + if ((attrs = alp->fileattr) != 0) { + if (attrs & ATTR_FILE_LINKCOUNT) + size += sizeof (u_int32_t); + if (attrs & ATTR_FILE_TOTALSIZE) + size += sizeof (off_t); + if (attrs & ATTR_FILE_ALLOCSIZE) + size += sizeof (off_t); + if (attrs & ATTR_FILE_IOBLOCKSIZE) + size += sizeof (u_int32_t); + if (attrs & ATTR_FILE_DEVTYPE) + size += sizeof (u_int32_t); + if (attrs & ATTR_FILE_DATALENGTH) + size += sizeof (off_t); + if (attrs & ATTR_FILE_DATAALLOCSIZE) + size += sizeof (off_t); + if (attrs & ATTR_FILE_RSRCLENGTH) + size += sizeof (off_t); + if (attrs & ATTR_FILE_RSRCALLOCSIZE) + size += sizeof (off_t); + } + return (size); +} + + +void +getfinderinfo(znode_t *zp, cred_t *cr, finderinfo_t *fip) +{ + znode_t *xdzp = NULL; + znode_t *xzp = NULL; + struct uio *auio = NULL; + struct componentname cn = { 0 }; + int error; + uint64_t xattr = 0; + char *name = NULL; + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zp->z_zfsvfs), + &xattr, sizeof (xattr)) || + (xattr == 0)) { + goto nodata; + } + + auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + if (auio == NULL) { + goto nodata; + } + uio_addiov(auio, CAST_USER_ADDR_T(fip), sizeof (finderinfo_t)); + + /* + * Grab the hidden attribute directory vnode. 
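+	 * and read the XATTR_FINDERINFO_NAME entry from it with
+	 * dmu_read_uio().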
+ * + * XXX - switch to embedded Finder Info when it becomes available + */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, 0))) { + goto out; + } + + name = spa_strdup(XATTR_FINDERINFO_NAME); + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + if ((error = zfs_dirlook(xdzp, name, &xzp, 0, NULL, &cn))) { + goto out; + } + error = dmu_read_uio(zp->z_zfsvfs->z_os, xzp->z_id, auio, + sizeof (finderinfo_t)); +out: + if (name) + spa_strfree(name); + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + if (auio) + uio_free(auio); + if (xzp) + zrele(xzp); + if (xdzp) + zrele(xdzp); + if (error == 0) + return; +nodata: + bzero(fip, sizeof (finderinfo_t)); +} + +#define KAUTH_DIR_WRITE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | \ + KAUTH_VNODE_ADD_SUBDIRECTORY | \ + KAUTH_VNODE_DELETE_CHILD) + +#define KAUTH_DIR_READ (KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY) + +#define KAUTH_DIR_EXECUTE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH) + +#define KAUTH_FILE_WRITE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA) + +#define KAUTH_FILE_READ (KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA) + +#define KAUTH_FILE_EXECUTE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE) + +/* + * Compute the same user access value as getattrlist(2) + */ +u_int32_t +getuseraccess(znode_t *zp, vfs_context_t ctx) +{ + vnode_t *vp; + u_int32_t user_access = 0; + zfs_acl_phys_t acl_phys; + int error; + /* Only take the expensive vnode_authorize path when we have an ACL */ + + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys)); + + if (error || acl_phys.z_acl_count == 0) { + kauth_cred_t cred = vfs_context_ucred(ctx); + uint64_t obj_uid; + uint64_t obj_mode; + + /* User id 0 (root) always gets access. */ + if (!vfs_context_suser(ctx)) { + return (R_OK | W_OK | X_OK); + } + + sa_lookup(zp->z_sa_hdl, SA_ZPL_UID(zp->z_zfsvfs), + &obj_uid, sizeof (obj_uid)); + sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zp->z_zfsvfs), + &obj_mode, sizeof (obj_mode)); + + // obj_uid = pzp->zp_uid; + obj_mode = obj_mode & MODEMASK; + if (obj_uid == UNKNOWNUID) { + obj_uid = kauth_cred_getuid(cred); + } + if ((obj_uid == kauth_cred_getuid(cred)) || + (obj_uid == UNKNOWNUID)) { + return (((u_int32_t)obj_mode & S_IRWXU) >> 6); + } + /* Otherwise, settle for 'others' access. */ + return ((u_int32_t)obj_mode & S_IRWXO); + } + vp = ZTOV(zp); + if (vnode_isdir(vp)) { + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_WRITE, ctx) == 0) + user_access |= W_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_READ, ctx) == 0) + user_access |= R_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_EXECUTE, ctx) == 0) + user_access |= X_OK; + } else { + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_WRITE, ctx) == 0) + user_access |= W_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_READ, ctx) == 0) + user_access |= R_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_EXECUTE, ctx) == 0) + user_access |= X_OK; + } + return (user_access); +} + + + +static unsigned char fingerprint[] = {0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, + 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef}; + +/* + * Convert "Well Known" GUID to enum type. 
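+ *
+ * All well-known GUIDs share the 12-byte fingerprint above; the last
+ * four bytes (big-endian) select the identity:
+ *	0x0000000c -> KAUTH_WKG_EVERYBODY
+ *	0x0000000a -> KAUTH_WKG_OWNER
+ *	0x00000010 -> KAUTH_WKG_GROUP
+ *	0xfffffffe -> KAUTH_WKG_NOBODY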
+ */ +int +kauth_wellknown_guid(guid_t *guid) +{ + uint32_t last = 0; + + if (memcmp(fingerprint, guid->g_guid, sizeof (fingerprint))) + return (KAUTH_WKG_NOT); + + last = BE_32(*((u_int32_t *)&guid->g_guid[12])); + + switch (last) { + case 0x0c: + return (KAUTH_WKG_EVERYBODY); + case 0x0a: + return (KAUTH_WKG_OWNER); + case 0x10: + return (KAUTH_WKG_GROUP); + case 0xFFFFFFFE: + return (KAUTH_WKG_NOBODY); + } + + return (KAUTH_WKG_NOT); +} + + +/* + * Set GUID to "well known" guid, based on enum type + */ +void +nfsacl_set_wellknown(int wkg, guid_t *guid) +{ + /* + * All WKGs begin with the same 12 bytes. + */ + bcopy(fingerprint, (void *)guid, 12); + /* + * The final 4 bytes are our code (in network byte order). + */ + switch (wkg) { + case 4: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0x0000000c); + break; + case 3: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0xfffffffe); + break; + case 1: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0x0000000a); + break; + case 2: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0x00000010); + }; +} + + +/* + * Convert Darwin ACL list, into ZFS ACL "aces" list. + */ +void +aces_from_acl(ace_t *aces, int *nentries, struct kauth_acl *k_acl, + int *seen_type) +{ + int i; + ace_t *ace; + guid_t *guidp; + kauth_ace_rights_t ace_rights; + uid_t who; + uint32_t mask = 0; + uint16_t flags = 0; + uint16_t type = 0; + u_int32_t ace_flags; + int wkg; + int err = 0; + + *nentries = k_acl->acl_entrycount; + + // bzero(aces, sizeof (*aces) * *nentries); + + // *nentries = aclp->acl_cnt; + + for (i = 0; i < *nentries; i++) { + // entry = &(aclp->acl_entry[i]); + + flags = 0; + mask = 0; + + ace = &(aces[i]); + + /* Note Mac OS X GUID is a 128-bit identifier */ + guidp = &k_acl->acl_ace[i].ace_applicable; + + who = -1; + wkg = kauth_wellknown_guid(guidp); + + switch (wkg) { + case KAUTH_WKG_OWNER: + flags |= ACE_OWNER; + if (seen_type) *seen_type |= ACE_OWNER; + break; + case KAUTH_WKG_GROUP: + flags |= ACE_GROUP|ACE_IDENTIFIER_GROUP; + if (seen_type) *seen_type |= ACE_GROUP; + break; + case KAUTH_WKG_EVERYBODY: + flags |= ACE_EVERYONE; + if (seen_type) *seen_type |= ACE_EVERYONE; + break; + + case KAUTH_WKG_NOBODY: + default: + /* Try to get a uid from supplied guid */ + err = kauth_cred_guid2uid(guidp, &who); + if (err) { + err = kauth_cred_guid2gid(guidp, &who); + if (!err) { + flags |= ACE_IDENTIFIER_GROUP; + } + } + if (err) { + *nentries = 0; + return; + } + + } // switch + + ace->a_who = who; + + ace_rights = k_acl->acl_ace[i].ace_rights; + if (ace_rights & KAUTH_VNODE_READ_DATA) + mask |= ACE_READ_DATA; + if (ace_rights & KAUTH_VNODE_WRITE_DATA) + mask |= ACE_WRITE_DATA; + if (ace_rights & KAUTH_VNODE_APPEND_DATA) + mask |= ACE_APPEND_DATA; + if (ace_rights & KAUTH_VNODE_READ_EXTATTRIBUTES) + mask |= ACE_READ_NAMED_ATTRS; + if (ace_rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) + mask |= ACE_WRITE_NAMED_ATTRS; + if (ace_rights & KAUTH_VNODE_EXECUTE) + mask |= ACE_EXECUTE; + if (ace_rights & KAUTH_VNODE_DELETE_CHILD) + mask |= ACE_DELETE_CHILD; + if (ace_rights & KAUTH_VNODE_READ_ATTRIBUTES) + mask |= ACE_READ_ATTRIBUTES; + if (ace_rights & KAUTH_VNODE_WRITE_ATTRIBUTES) + mask |= ACE_WRITE_ATTRIBUTES; + if (ace_rights & KAUTH_VNODE_DELETE) + mask |= ACE_DELETE; + if (ace_rights & KAUTH_VNODE_READ_SECURITY) + mask |= ACE_READ_ACL; + if (ace_rights & KAUTH_VNODE_WRITE_SECURITY) + mask |= ACE_WRITE_ACL; + if (ace_rights & KAUTH_VNODE_TAKE_OWNERSHIP) + mask |= ACE_WRITE_OWNER; + if (ace_rights & KAUTH_VNODE_SYNCHRONIZE) + mask |= ACE_SYNCHRONIZE; + ace->a_access_mask = mask; 
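+	/* Next, translate inheritance bits and the allow/deny/audit/alarm kind. */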
+ + ace_flags = k_acl->acl_ace[i].ace_flags; + if (ace_flags & KAUTH_ACE_FILE_INHERIT) + flags |= ACE_FILE_INHERIT_ACE; + if (ace_flags & KAUTH_ACE_DIRECTORY_INHERIT) + flags |= ACE_DIRECTORY_INHERIT_ACE; + if (ace_flags & KAUTH_ACE_LIMIT_INHERIT) + flags |= ACE_NO_PROPAGATE_INHERIT_ACE; + if (ace_flags & KAUTH_ACE_ONLY_INHERIT) + flags |= ACE_INHERIT_ONLY_ACE; + ace->a_flags = flags; + + switch (ace_flags & KAUTH_ACE_KINDMASK) { + case KAUTH_ACE_PERMIT: + type = ACE_ACCESS_ALLOWED_ACE_TYPE; + break; + case KAUTH_ACE_DENY: + type = ACE_ACCESS_DENIED_ACE_TYPE; + break; + case KAUTH_ACE_AUDIT: + type = ACE_SYSTEM_AUDIT_ACE_TYPE; + break; + case KAUTH_ACE_ALARM: + type = ACE_SYSTEM_ALARM_ACE_TYPE; + break; + } + ace->a_type = type; + dprintf(" ACL: %d type %04x, mask %04x, flags %04x, who %d\n", + i, type, mask, flags, who); + } + +} + +void +finderinfo_update(uint8_t *finderinfo, znode_t *zp) +{ + u_int8_t *finfo = NULL; + struct timespec va_crtime; + + /* Advance finfo by 16 bytes to the 2nd half of the finderinfo */ + finfo = (u_int8_t *)finderinfo + 16; + + /* Don't expose a symlink's private type/creator. */ + if (IFTOVT((mode_t)zp->z_mode) == VLNK) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* hfs_xattr.c hfs_zero_hidden_fields() */ + if ((IFTOVT((mode_t)zp->z_mode) == VREG) || + (IFTOVT((mode_t)zp->z_mode) == VLNK)) { + struct FndrExtendedFileInfo *extinfo = + (struct FndrExtendedFileInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + + if (IFTOVT((mode_t)zp->z_mode) == VDIR) { + struct FndrExtendedDirInfo *extinfo = + (struct FndrExtendedDirInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + +} + + + +int +zpl_xattr_set_sa(struct vnode *vp, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + nvlist_t *nvl; + size_t sa_size; + int error; + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + if (value == NULL) { + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + if (error == -ENOENT) + return (error); + // error = zpl_xattr_set_dir(vp, name, NULL, 0, flags, cr); + } else { + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) + return (-EFBIG); + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + return (error); + + if (sa_size > DXATTR_MAX_SA_SIZE) + return (-EFBIG); + error = -nvlist_add_byte_array(nvl, name, + (uchar_t *)value, size); + if (error) + return (error); + } + + /* Update the SA for additions, modifications, and removals. 
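+	 * zfs_sa_set_xattr() pushes the cached nvlist back out to the
+	 * DXATTR system attribute.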
*/ + if (!error) + error = -zfs_sa_set_xattr(zp); + + ASSERT3S(error, <=, 0); + + return (error); +} + +int +zpl_xattr_get_sa(struct vnode *vp, const char *name, void *value, size_t size) +{ + znode_t *zp = VTOZ(vp); + uchar_t *nv_value; + uint_t nv_size; + int error = 0; + +#ifdef __LINUX__ + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); +#endif + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, + &nv_value, &nv_size); + if (error) + return (error); + + if (!size) + return (nv_size); + if (size < nv_size) + return (-ERANGE); + + memcpy(value, nv_value, nv_size); + + return (nv_size); +} + + + +/* + * Document ID. Persistant IDs that can survive "safe saving". + * 'revisiond' appears to use fchflags(UF_TRACKED) on files/dirs + * that it wishes to use DocumentIDs with. Here, we will lookup + * if an entry already has a DocumentID stored in SA, but if not, + * hash the DocumentID for (PARENTID + filename) and return it. + * In vnop_setattr for UF_TRACKED, we will store the DocumentID to + * disk. + * Although it is not entirely clear which situations we should handle + * we do handle: + * + * Case 1: + * "file.txt" gets chflag(UF_TRACKED) and DocumentID set. + * "file.txt" is renamed to "file.tmp". DocumentID is kept. + * "file.txt" is re-created, DocumentID remains same, but not saved. + * + * Case 2: + * "file.txt" gets chflag(UF_TRACKED) and DocumentID set. + * "file.txt" is moved to another directory. DocumentID is kept. + * + * It is interesting to note that HFS+ has "tombstones" which is + * created when a UF_TRACKED entry is unlinked, or, renamed. + * Then if a new entry is created with same PARENT+name, and matching + * tombstone is found, will inherit the DocumentID, and UF_TRACKED flag. + * + * We may need to implement this as well. + * + * If "name" or "parent" is known, pass it along, or it needs to look it up. + * + */ +void +zfs_setattr_generate_id(znode_t *zp, uint64_t val, char *name) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + char *nameptr = NULL; + char *filename = NULL; + uint64_t parent = val; + int error = 0; + uint64_t docid = 0; + + if (!zp->z_document_id && zp->z_sa_hdl) { + + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DOCUMENTID(zfsvfs), + &docid, sizeof (docid)); + if (!error && docid) { + zp->z_document_id = docid; + return; + } + + /* Have name? */ + if (name && *name) { + nameptr = name; + } else { + /* Do we have parent? */ + if (!parent) { + VERIFY(sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, + sizeof (parent)) == 0); + } + /* Lookup filename */ + filename = kmem_zalloc(MAXPATHLEN + 2, KM_SLEEP); + if (zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), filename) == 0) { + + nameptr = filename; + // Might as well keep this name too. + strlcpy(zp->z_name_cache, filename, + MAXPATHLEN); + } + } + + zp->z_document_id = fnv_32a_buf(&parent, sizeof (parent), + FNV1_32A_INIT); + if (nameptr) + zp->z_document_id = + fnv_32a_str(nameptr, zp->z_document_id); + + if (filename) + kmem_free(filename, MAXPATHLEN + 2); + } // !document_id +} + +/* + * setattr asked for UF_TRACKED to be set, which means we will make sure + * we have a hash made (includes getting filename) and stored in SA. 
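+ * The hash itself comes from zfs_setattr_generate_id() above, roughly:
+ *
+ *	docid = fnv_32a_buf(&parent, sizeof (parent), FNV1_32A_INIT);
+ *	docid = fnv_32a_str(name, docid);	(when a name is known)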
+ */ +int +zfs_setattr_set_documentid(znode_t *zp, boolean_t update_flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + dmu_tx_t *tx; + int count = 0; + sa_bulk_attr_t bulk[2]; + + dprintf("ZFS: vnop_setattr(UF_TRACKED) obj %llu : documentid %08u\n", + zp->z_id, + zp->z_document_id); + + /* Write the new documentid to SA */ + if ((zfsvfs->z_use_sa == B_TRUE) && + !vfs_isrdonly(zfsvfs->z_vfs) && + spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + + uint64_t docid = zp->z_document_id; // 32->64 + + if (update_flags == B_TRUE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DOCUMENTID(zfsvfs), NULL, + &docid, sizeof (docid)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + + if (error) + dprintf("ZFS: sa_update(SA_ZPL_DOCUMENTID) failed %d\n", + error); + + } // if z_use_sa && !readonly + + return (error); +} + +int +zfs_hardlink_addmap(znode_t *zp, uint64_t parentid, uint32_t linkid) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + hardlinks_t *searchnode, *findnode; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = parentid; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, zp->z_name_cache, PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_WRITER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + kmem_free(searchnode, sizeof (hardlinks_t)); + if (!findnode) { + // Add hash entry + zp->z_finder_hardlink = TRUE; + findnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + + findnode->hl_parent = parentid; + findnode->hl_fileid = zp->z_id; + strlcpy(findnode->hl_name, zp->z_name_cache, PATH_MAX); + + findnode->hl_linkid = linkid; + + avl_add(&zfsvfs->z_hardlinks, findnode); + avl_add(&zfsvfs->z_hardlinks_linkid, findnode); + dprintf("ZFS: Inserted new hardlink node (%llu,%llu,'%s') " + "<-> (%x,%u)\n", + findnode->hl_parent, + findnode->hl_fileid, findnode->hl_name, + findnode->hl_linkid, findnode->hl_linkid); + } + rw_exit(&zfsvfs->z_hardlinks_lock); + + return (findnode ? 
1 : 0); +} + +/* dst buffer must be at least UUID_PRINTABLE_STRING_LENGTH bytes */ + +int +zfs_vfs_uuid_unparse(uuid_t uuid, char *dst) +{ + if (!uuid || !dst) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + snprintf(dst, UUID_PRINTABLE_STRING_LENGTH, "%02X%02X%02X%02X-" + "%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); + + return (0); +} + +int +zfs_vfs_uuid_gen(const char *osname, uuid_t uuid) +{ + MD5_CTX md5c; + /* namespace (generated by uuidgen) */ + /* 50670853-FBD2-4EC3-9802-73D847BF7E62 */ + char namespace[16] = {0x50, 0x67, 0x08, 0x53, /* - */ + 0xfb, 0xd2, /* - */ 0x4e, 0xc3, /* - */ + 0x98, 0x02, /* - */ + 0x73, 0xd8, 0x47, 0xbf, 0x7e, 0x62}; + + /* Validate arguments */ + if (!osname || !uuid || strlen(osname) == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* + * UUID version 3 (MD5) namespace variant: + * hash namespace (uuid) together with name + */ + MD5Init(&md5c); + MD5Update(&md5c, &namespace, sizeof (namespace)); + MD5Update(&md5c, osname, strlen(osname)); + MD5Final(uuid, &md5c); + + /* + * To make UUID version 3, twiddle a few bits: + * xxxxxxxx-xxxx-Mxxx-Nxxx-xxxxxxxxxxxx + * [uint32]-[uin-t32]-[uin-t32][uint32] + * M should be 0x3 to indicate uuid v3 + * N should be 0x8, 0x9, 0xa, or 0xb + */ + uuid[6] = (uuid[6] & 0x0F) | 0x30; + uuid[8] = (uuid[8] & 0x3F) | 0x80; + + /* Print all caps */ + // dprintf("%s UUIDgen: [%s](%ld)->" + dprintf("%s UUIDgen: [%s](%ld) -> " + "[%02X%02X%02X%02X-%02X%02X-%02X%02X-" + "%02X%02X-%02X%02X%02X%02X%02X%02X]\n", + __func__, osname, strlen(osname), + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); + + return (0); +} + +int +uio_prefaultpages(ssize_t n, struct uio *uio) +{ + return (0); +} + +/* No #pragma weaks here! */ +void +dmu_buf_add_ref(dmu_buf_t *db, void *tag) +{ + dbuf_add_ref((dmu_buf_impl_t *)db, tag); +} + +boolean_t +dmu_buf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t object, + uint64_t blkid, void *tag) +{ + return (dbuf_try_add_ref(db, os, object, blkid, tag)); +} diff --git a/module/os/macos/zfs/zfs_znode.c b/module/os/macos/zfs/zfs_znode.c new file mode 100644 index 0000000000..f086caded1 --- /dev/null +++ b/module/os/macos/zfs/zfs_znode.c @@ -0,0 +1,2347 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2007-2009 Apple Inc. All rights reserved. + * Use is subject to license terms. 
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska */ +/* Portions Copyright 2013 Jorgen Lundman */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +#ifndef __APPLE__ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof (znode_t), + "sizeof (znode_t)"); +#endif +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag); + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +/* + * This is used by the test suite so that it can delay znodes from being + * freed in order to inspect the unlinked set. + */ +int zfs_unlink_suspend_progress = 0; + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ + +kmem_cache_t *znode_cache = NULL; +static kmem_cache_t *znode_hold_cache = NULL; +unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. + */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + +/*ARGSUSED*/ +#if 0 // unused function +static void +znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) +{ + /* + * We should never drop all dbuf refs without first clearing + * the eviction callback. + */ + panic("evicting znode %p\n", user_ptr); +} +#endif + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; + +/* + * XXX: We cannot use this function as a cache constructor, because + * there is one global cache for all file systems and we need + * to pass vfsp here, which is not possible, because argument + * 'cdrarg' is defined at kmem_cache_create() time. 
+ */ +/*ARGSUSED*/ +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + + mutex_init(&zp->z_attach_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zp->z_attach_cv, NULL, CV_DEFAULT, NULL); + + zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; + zp->z_xattr_cached = NULL; + zp->z_xattr_parent = 0; + zp->z_moved = 0; + zp->z_skip_truncate_undo_decmpfs = B_FALSE; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(ZTOV(zp) == NULL); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_lock); + rw_destroy(&zp->z_map_lock); + rw_destroy(&zp->z_parent_lock); + rw_destroy(&zp->z_name_lock); + mutex_destroy(&zp->z_acl_lock); + rw_destroy(&zp->z_xattr_lock); + zfs_rangelock_fini(&zp->z_rangelock); + mutex_destroy(&zp->z_attach_lock); + cv_destroy(&zp->z_attach_cv); + + ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); + ASSERT(zp->z_xattr_cached == NULL); +} + +static int +zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_hold_t *zh = buf; + + mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); + zfs_refcount_create(&zh->zh_refcount); + zh->zh_obj = ZFS_NO_OBJECT; + + return (0); +} + +static void +zfs_znode_hold_cache_destructor(void *buf, void *arg) +{ + znode_hold_t *zh = buf; + + mutex_destroy(&zh->zh_lock); + zfs_refcount_destroy(&zh->zh_refcount); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache. The KMC_SLAB hint is used in order that it be + * backed by kmalloc() when on the Linux slab in order that any + * wait_on_bit() operations on the related inode operate properly. + */ + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, + zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, + NULL, 0); + + ASSERT(znode_hold_cache == NULL); + znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", + sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, + zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + + if (znode_hold_cache) + kmem_cache_destroy(znode_hold_cache); + znode_hold_cache = NULL; +} + +/* + * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to + * serialize access to a znode and its SA buffer while the object is being + * created or destroyed. This kind of locking would normally reside in the + * znode itself but in this case that's impossible because the znode and SA + * buffer may not yet exist. Therefore the locking is handled externally + * with an array of mutexs and AVLs trees which contain per-object locks. + * + * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted + * in to the correct AVL tree and finally the per-object lock is held. In + * zfs_znode_hold_exit() the process is reversed. The per-object lock is + * released, removed from the AVL tree and destroyed if there are no waiters. 
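+ *
+ * A minimal usage sketch (this is what zfs_mknode() below does):
+ *
+ *	zh = zfs_znode_hold_enter(zfsvfs, obj);
+ *	... create or destroy the object and its SA buffer ...
+ *	zfs_znode_hold_exit(zfsvfs, zh);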
+ * + * This scheme has two important properties: + * + * 1) No memory allocations are performed while holding one of the z_hold_locks. + * This ensures evict(), which can be called from direct memory reclaim, will + * never block waiting on a z_hold_locks which just happens to have hashed + * to the same index. + * + * 2) All locks used to serialize access to an object are per-object and never + * shared. This minimizes lock contention without creating a large number + * of dedicated locks. + * + * On the downside it does require znode_lock_t structures to be frequently + * allocated and freed. However, because these are backed by a kmem cache + * and very short lived this cost is minimal. + */ +int +zfs_znode_hold_compare(const void *a, const void *b) +{ + const znode_hold_t *zh_a = (const znode_hold_t *)a; + const znode_hold_t *zh_b = (const znode_hold_t *)b; + + return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); +} + +boolean_t +zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t held; + + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; + mutex_exit(&zfsvfs->z_hold_locks[i]); + + return (held); +} + +static znode_hold_t * +zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, *zh_new, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t found = B_FALSE; + + zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); + zh_new->zh_obj = obj; + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + if (likely(zh == NULL)) { + zh = zh_new; + avl_add(&zfsvfs->z_hold_trees[i], zh); + } else { + ASSERT3U(zh->zh_obj, ==, obj); + found = B_TRUE; + } + zfs_refcount_add(&zh->zh_refcount, NULL); + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (found == B_TRUE) + kmem_cache_free(znode_hold_cache, zh_new); + + ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_enter(&zh->zh_lock); + + return (zh); +} + +static void +zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) +{ + int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); + boolean_t remove = B_FALSE; + + ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_exit(&zh->zh_lock); + + mutex_enter(&zfsvfs->z_hold_locks[i]); + if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { + avl_remove(&zfsvfs->z_hold_trees[i], zh); + remove = B_TRUE; + } + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (remove == B_TRUE) + kmem_cache_free(znode_hold_cache, zh); +} + +int +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + int error = 0; +#if 0 // FIXME, uses vnode struct, not ptr + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + struct vnode *vp, *vnode; + znode_t *zp; + + vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + sharezp->z_moved = 0; + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; + + sharezp->z_vnode = vnode; + vnode.v_data = sharezp; + + vp = ZTOV(sharezp); + vp->v_type = VDIR; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + 
zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + ZTOV(sharezp)->v_data = NULL; + ZTOV(sharezp)->v_count = 0; + ZTOV(sharezp)->v_holdcnt = 0; + zp->z_vnode = NULL; + sa_handle_destroy(sharezp->z_sa_hdl); + sharezp->z_vnode = NULL; + kmem_cache_free(znode_cache, sharezp); +#endif + return (error); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, + sa_handle_t *sa_hdl) +{ + ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + mutex_exit(&zp->z_lock); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || + RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +#if 0 // Until we need it ? +static void +zfs_vnode_destroy(struct vnode *vp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + if (vp != NULL) { + znode_t *zp = VTOZ(vp); + + if (zp != NULL) { + mutex_enter(&zfsvfs->z_znodes_lock); + if (list_link_active(&zp->z_link_node)) { + list_remove(&zfsvfs->z_all_znodes, zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); + } + + vnode_clearfsnode(vp); + vnode_put(vp); + vnode_recycle(vp); + } +} +#endif + +/* + * Construct a new znode/vnode and intialize. 
+ * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + struct vnode *vp; + uint64_t mode; + uint64_t parent; + sa_bulk_attr_t bulk[11]; + int count = 0; + uint64_t projid = ZFS_DEFAULT_PROJID; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + ASSERT(zp->z_dirlocks == NULL); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. + */ + zp->z_vnode = NULL; + zp->z_sa_hdl = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + zp->z_is_mapped = 0; + zp->z_is_ctldir = 0; + zp->z_vid = 0; + zp->z_uid = 0; + zp->z_gid = 0; + zp->z_size = 0; + zp->z_name_cache[0] = 0; + zp->z_finder_parentid = 0; + zp->z_finder_hardlink = FALSE; + + taskq_init_ent(&zp->z_attach_taskq); + + vp = ZTOV(zp); /* Does nothing in OSX */ + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, 8); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; + printf("znode_alloc: sa_bulk_lookup failed - aborting\n"); + kmem_cache_free(znode_cache, zp); + return (NULL); + } + + zp->z_projid = projid; + zp->z_mode = mode; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + return (zp); +} + + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. + * + * OUT: zpp - allocated znode + * + * OS X implementation notes: + * + * The caller of zfs_mknode() is expected to call zfs_znode_getvnode() + * AFTER the dmu_tx_commit() is performed. This prevents deadlocks + * since vnode_create can indirectly attempt to clean a dirty vnode. 
+ * + * The current list of callers includes: + * zfs_vnop_create + * zfs_vnop_mkdir + * zfs_vnop_symlink + * zfs_obtain_xattr + * zfs_make_xattrdir + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t projid = ZFS_DEFAULT_PROJID; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + dmu_buf_t *db; + timestruc_t now; + uint64_t gen, obj; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + int err = 0; + znode_hold_t *zh; + + ASSERT(vap && (vap->va_mask & (ATTR_TYPE|ATTR_MODE)) == + (ATTR_TYPE|ATTR_MODE)); + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + zh = zfs_znode_hold_enter(zfsvfs, obj); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { + /* + * With ZFS_PROJID flag, we can easily know whether there is + * project ID stored on disk or not. See zfs_space_delta_cb(). + */ + if (obj_type != DMU_OT_ZNODE && + dmu_objset_projectquota_enabled(zfsvfs->z_os)) + pflags |= ZFS_PROJID; + + /* + * Inherit project ID from parent if required. 
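+	 * (zfs_inherit_projid() is expected to return the parent's project
+	 * id when ZFS_PROJINHERIT is set, and ZFS_DEFAULT_PROJID otherwise)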
+ */ + projid = zfs_inherit_projid(dzp); + if (dzp->z_pflags & ZFS_PROJINHERIT) + pflags |= ZFS_PROJINHERIT; + } + + /* + * No execs denied will be deterimed when zfs_mode_compute() is called. + */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & ATTR_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & ATTR_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + pflags & ZFS_PROJID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), + NULL, &projid, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), 
NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + /* + * We must not hold any locks while calling vnode_create inside + * zfs_znode_alloc(), as it may call either of vnop_reclaim, or + * vnop_fsync. If it is not enough to just release ZFS_OBJ_HOLD + * we will have to attach the vnode after the dmu_commit like + * maczfs does, in each vnop caller. + */ + do { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + } while (*zpp == NULL); + + VERIFY(*zpp != NULL); + VERIFY(dzp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + (*zpp)->z_projid = projid; + + if (vap->va_mask & ATTR_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx); + ASSERT(err == 0); + } + + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + zfs_znode_hold_exit(zfsvfs, zh); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. 
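+ * Each optional attribute requested in xvap is applied to the znode and + * acknowledged with XVA_SET_RTN() so the caller can tell which attributes + * were handled.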
+ */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + &times, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + return (zfs_zget_ext(zfsvfs, obj_num, zpp, 0)); +} + +int +zfs_zget_ext(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp, + int flags) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + znode_hold_t *zh; + struct vnode *vp = NULL; + sa_handle_t *hdl; + uint32_t vid; + int err; + + dprintf("+zget %llu\n", obj_num); + + *zpp = NULL; + +again: + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + 
zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + ASSERT3P(zp, !=, NULL); + + mutex_enter(&zp->z_lock); + + /* + * Since zp may disappear after we unlock below, + * we save a copy of vp and it's vid + */ + vid = zp->z_vid; + vp = ZTOV(zp); + + /* + * Since we do immediate eviction of the z_dbuf, we + * should never find a dbuf with a znode that doesn't + * know about the dbuf. + */ + ASSERT3U(zp->z_id, ==, obj_num); + + /* + * OS X can return the znode when the file is unlinked + * in order to support the sync of open-unlinked files + */ + if (!(flags & ZGET_FLAG_UNLINKED) && zp->z_unlinked) { + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (ENOENT); + } + + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + + /* + * We are racing zfs_znode_getvnode() and we got here first, we + * need to let it get ahead + */ + if (!vp) { + + // Wait until attached, if we can. + if ((flags & ZGET_FLAG_ASYNC) && + zfs_znode_asyncwait(zp) == 0) { + dprintf("%s: waited on z_vnode OK\n", __func__); + } else { + dprintf("%s: async racing attach\n", __func__); + // Could be zp is being torn down, idle a bit, + // and retry. This branch is rarely executed. + kpreempt(KPREEMPT_SYNC); + } + goto again; + } + + /* + * Due to vnode_create() -> zfs_fsync() -> zil_commit() -> + * zget() -> vnode_getwithvid() -> deadlock. Unsure why + * vnode_getwithvid() ends up sleeping in msleep() but + * vnode_get() does not. + * As we can deadlock here using vnode_getwithvid() we will use + * the simpler vnode_get() in the ASYNC cases. We verify the + * vids match below. + */ + if ((flags & ZGET_FLAG_ASYNC)) + err = vnode_get(vp); + else + err = vnode_getwithvid(vp, vid); + + if (err != 0) { + dprintf("ZFS: vnode_get() returned %d\n", err); + kpreempt(KPREEMPT_SYNC); + goto again; + } + + /* + * Since we had to drop all of our locks above, make sure + * that we have the vnode and znode we had before. + */ + mutex_enter(&zp->z_lock); + if ((vid != zp->z_vid) || (vp != ZTOV(zp))) { + mutex_exit(&zp->z_lock); + /* + * Release the wrong vp from vnode_getwithvid(). + */ + VN_RELE(vp); + dprintf("ZFS: the vids do not match part 1\n"); + goto again; + } + if (vnode_vid(vp) != zp->z_vid) + dprintf("ZFS: the vids do not match\n"); + mutex_exit(&zp->z_lock); + + *zpp = zp; + + return (0); + } // if vnode != NULL + + /* + * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. 
+ */ + + zp = NULL; + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + zfs_znode_hold_exit(zfsvfs, zh); + dprintf("zget returning %d\n", err); + return (err); + } + + dprintf("zget create: %llu setting to %p\n", obj_num, zp); + *zpp = zp; + + // Spawn taskq to attach while we are locked + if (flags & ZGET_FLAG_ASYNC) { + zfs_znode_asyncgetvnode(zp, zfsvfs); + } + + zfs_znode_hold_exit(zfsvfs, zh); + + /* Attach a vnode to our new znode */ + if (!(flags & ZGET_FLAG_ASYNC)) { + zfs_znode_getvnode(zp, zfsvfs); + } + + dprintf("zget returning %d\n", err); + return (err); +} + + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + struct vnode *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + uint64_t projid = ZFS_DEFAULT_PROJID; + znode_hold_t *zh; + + if (zp->z_is_ctldir) + return (0); + + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + rw_exit(&zp->z_xattr_lock); + + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, + sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), + &projid, 8); + if (err != 0 && err != ENOENT) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(err)); + } + } + + zp->z_projid = projid; + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + /* + * XXXPJD: Not sure how is that possible, but under heavy + * zfs recv -F load it happens that z_gen is the same, but + * vnode type is different than znode type. 
This would mean + * that for example regular file was replaced with directory + * which has the same object number. + */ + vp = ZTOV(zp); + if (vp != NULL && + vnode_vtype(vp) != IFTOVT((mode_t)zp->z_mode)) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (EIO); + } + + zp->z_blksz = doi.doi_data_block_size; + if (vp != NULL) { + vn_pages_remove(vp, 0, 0); + if (zp->z_size != size) + vnode_pager_setsize(vp, zp->z_size); + } + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatical removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. + */ + zp->z_unlinked = (zp->z_links == 0); + if (zp->z_unlinked) + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + znode_hold_t *zh; + + zh = zfs_znode_hold_enter(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + znode_hold_t *zh; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + zh = zfs_znode_hold_enter(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. + */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + + if (!(vfs_isrdonly(zfsvfs->z_vfs)) && + !zfs_unlink_suspend_progress) { + mutex_exit(&zp->z_lock); + zfs_znode_hold_exit(zfsvfs, zh); + zfs_rmnode(zp); + return; + } + } + + mutex_exit(&zp->z_lock); + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + mutex_enter(&zfsvfs->z_znodes_lock); + zp->z_vnode = NULL; + zp->z_zfsvfs = NULL; + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); /* XXX */ + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + ASSERT(zp->z_sa_hdl == NULL); + + kmem_cache_free(znode_cache, zp); +} + + +/* + * Prepare to update znode time stamps. + * + * IN: zp - znode requiring timestamp update + * flag - ATTR_MTIME, ATTR_CTIME, ATTR_ATIME flags + * have_tx - true of caller is creating a new txg + * + * OUT: zp - new atime (via underlying inode's i_atime) + * mtime - new mtime + * ctime - new ctime + * + * NOTE: The arguments are somewhat redundant. 
The following condition + * is always true: + * + * have_tx == !(flag & ATTR_ATIME) + */ +void +zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) +{ + timestruc_t now; + + gethrestime(&now); + + if (have_tx) { /* will sa_bulk_update happen really soon? */ + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & ATTR_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } + + if (flag & ATTR_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & ATTR_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); +} + +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, + zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT(error == 0); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +#ifdef sun +/* + * This is a dummy interface used when pvn_vplist_dirty() should *not* + * be calling back into the fs for a putpage(). E.g.: when truncating + * a file, the pages being "thrown away* don't need to be written out. + */ +/* ARGSUSED */ +static int +zfs_no_putpage(struct vnode *vp, page_t *pp, u_offset_t *offp, size_t *lenp, + int flags, cred_t *cr) +{ + ASSERT(0); + return (0); +} +#endif /* sun */ + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
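+ * (Because z_blksz is not a power of 2 here, 1 << highbit64(z_blksz) + * rounds the current block size up to the next power of 2.)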
+ */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, + sizeof (zp->z_size), tx)); + + vnode_pager_setsize(ZTOV(zp), end); + + zfs_rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free block in the middle of a file, + * but only at the end of a file, so this code path should + * never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); + } + +#ifdef _LINUX + /* + * Zero partial page cache entries. This must be done under a + * range lock in order to keep the ARC and page cache in sync. + */ + if (zp->z_is_mapped) { + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + + /* first possible full page in hole */ + first_page = (off + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* last page of hole */ + last_page = (off + len) >> PAGE_CACHE_SHIFT; + + /* offset of first_page */ + first_page_offset = first_page << PAGE_CACHE_SHIFT; + /* offset of last_page */ + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* truncate whole pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(ZTOI(zp)->i_mapping, + first_page_offset, last_page_offset - 1); + } + + /* truncate sub-page ranges */ + if (first_page > last_page) { + /* entire punched area within a single page */ + zfs_zero_partial_page(zp, off, len); + } else { + /* beginning of punched area at the end of a page */ + page_len = first_page_offset - off; + if (page_len > 0) + zfs_zero_partial_page(zp, off, page_len); + + /* end of punched area at the beginning of a page */ + page_len = off + len - last_page_offset; + if (page_len > 0) + zfs_zero_partial_page(zp, last_page_offset, + page_len); + } + } +#endif + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct vnode *vp = ZTOV(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. 
+ */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ +// struct vnode *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if (vnode_isfifo(ZTOV(zp))) + return (0); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + goto out; + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + goto out; +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + + error = 0; + +out: + + return (error); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs; + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int size; + int error; + int i; + znode_t *rootzp = NULL; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). 
+ */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + VATTR_NULL(&vattr); + vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + rootzp->z_vnode = NULL; +#ifndef __APPLE__ + vnode.v_type = VDIR; + vnode.v_data = rootzp; + rootzp->z_vnode = &vnode; +#endif + + zfsvfs = kmem_alloc(sizeof (zfsvfs_t), KM_SLEEP); +#ifdef __APPLE__ + bzero(zfsvfs, sizeof (zfsvfs_t)); +#endif + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
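+ * (U8_TEXTPREP_TOUPPER makes the Unicode normalization code upper-case + * names before comparing them.)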
+ */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = kmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + rootzp->z_zfsvfs = zfsvfs; + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + rootzp->z_sa_hdl = NULL; + rootzp->z_vnode = NULL; + kmem_cache_free(znode_cache, rootzp); + + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT(error == 0); + + list_destroy(&zfsvfs->z_all_znodes); + mutex_destroy(&zfsvfs->z_znodes_lock); + + kmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + kmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if (((doi.doi_bonus_type != DMU_OT_SA) && + (doi.doi_bonus_type != DMU_OT_ZNODE)) || + ((doi.doi_bonus_type == DMU_OT_ZNODE) && + (doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. 
+ */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return ((EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + uint64_t pobj = 0; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir = 0; + + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) snprintf(component + 1, MAXNAMELEN+1, + "<xattrdir>"); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), + component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 
0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} diff --git a/module/os/macos/zfs/zio_crypt.c b/module/os/macos/zfs/zio_crypt.c new file mode 100644 index 0000000000..7523844c07 --- /dev/null +++ b/module/os/macos/zfs/zio_crypt.c @@ -0,0 +1,1995 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/zio_crypt.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/zil.h> +#include <sys/sha2.h> +#include <sys/hkdf.h> + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption / Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. 
This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authenication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. 
In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearranged + * the order of level 0 blocks. Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends. Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpreted. + * + * At the objset level, we maintain 2 separate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintained in the metadnode's bps. The second is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaindata as + * well, all that we end up with is a duplicate of the original ciphertext we + * already had. As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaindata. We use an HMAC + * here so that a reproducible checksum of the plaindata is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. 
+ */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + crypto_destroy_ctx_template(key->zk_hmac_tmpl); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uint_t keydata_len; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + 
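/* The HMAC key below is used for SHA512-HMAC object authentication and for deriving dedup IVs and salts from the plaintext (see the file comment above). */ +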
key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. + */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech; + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + /* destroy the old context template and create the new one */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the cphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. 
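+ * On decryption, a CRYPTO_INVALID_MAC failure from the ICP is returned to + * the caller as ECKSUM.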
+ */ +static int +zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, + crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, + uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) +{ + int ret; + crypto_data_t plaindata, cipherdata; + CK_AES_CCM_PARAMS ccmp; + CK_AES_GCM_PARAMS gcmp; + crypto_mechanism_t mech; + zio_crypt_info_t crypt_info; + uint_t plain_full_len; + uint64_t maclen; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); + + /* lookup the encryption info */ + crypt_info = zio_crypt_table[crypt]; + + /* the mac will always be the last iovec_t in the cipher uio */ + user_addr_t mac; + uio_getiov(cuio, uio_iovcnt(cuio) - 1, &mac, &maclen); + + ASSERT(maclen <= ZIO_DATA_MAC_LEN); + + /* setup encryption mechanism (same as crypt) */ + mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); + + /* + * Strangely, the ICP requires that plain_full_len must include + * the MAC length when decrypting, even though the UIO does not + * need to have the extra space allocated. + */ + if (encrypt) { + plain_full_len = datalen; + } else { + plain_full_len = datalen + maclen; + } + + /* + * setup encryption params (currently only AES CCM and AES GCM + * are supported) + */ + if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { + ccmp.ulNonceSize = ZIO_DATA_IV_LEN; + ccmp.ulAuthDataSize = auth_len; + ccmp.authData = authbuf; + ccmp.ulMACSize = maclen; + ccmp.nonce = ivbuf; + ccmp.ulDataSize = plain_full_len; + + mech.cm_param = (char *)(&ccmp); + mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); + } else { + gcmp.ulIvLen = ZIO_DATA_IV_LEN; + gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); + gcmp.ulAADLen = auth_len; + gcmp.pAAD = authbuf; + gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); + gcmp.pIv = ivbuf; + + mech.cm_param = (char *)(&gcmp); + mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + } + + /* populate the cipher and plain data structs. 
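+ * On the cipher side cd_length covers the trailing MAC iovec as well, i.e. + * datalen + maclen.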
*/ + plaindata.cd_format = CRYPTO_DATA_UIO; + plaindata.cd_offset = 0; + plaindata.cd_uio = puio; + plaindata.cd_miscdata = NULL; + plaindata.cd_length = plain_full_len; + + cipherdata.cd_format = CRYPTO_DATA_UIO; + cipherdata.cd_offset = 0; + cipherdata.cd_uio = cuio; + cipherdata.cd_miscdata = NULL; + cipherdata.cd_length = datalen + maclen; + + /* perform the actual encryption */ + if (encrypt) { + ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + } else { + ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); + ret = SET_ERROR(ECKSUM); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uio_t *puio = NULL, *cuio = NULL; + uint64_t aad[3]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + puio = uio_create(2, 0, UIO_SYSSPACE, UIO_READ); + if (puio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cuio = uio_create(3, 0, UIO_SYSSPACE, UIO_WRITE); + if (cuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + /* initialize uio_ts */ + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_master_keydata, + keydata_len)); + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_hmac_keydata, + SHA512_HMAC_KEYLEN)); + + VERIFY0(uio_addiov(cuio, (user_addr_t)keydata_out, keydata_len)); + VERIFY0(uio_addiov(cuio, (user_addr_t)hmac_keydata_out, + SHA512_HMAC_KEYLEN)); + VERIFY0(uio_addiov(cuio, (user_addr_t)mac, WRAPPING_MAC_LEN)); + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. 
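+ * The wrapping AAD is just the key guid for version 0 keys, and the
+ * guid, crypt and version (each encoded LE_64) for current keys,
+ * matching the layout built immediately below.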
+ */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, + puio, cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + + return (0); + +error: + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uio_t *puio = NULL, *cuio = NULL; + uint64_t aad[3]; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + puio = uio_create(2, 0, UIO_SYSSPACE, UIO_WRITE); + if (puio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cuio = uio_create(3, 0, UIO_SYSSPACE, UIO_READ); + if (cuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + /* initialize uio_ts */ + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_master_keydata, + keydata_len)); + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_hmac_keydata, + SHA512_HMAC_KEYLEN)); + + VERIFY0(uio_addiov(cuio, (user_addr_t)keydata, keydata_len)); + VERIFY0(uio_addiov(cuio, (user_addr_t)hmac_keydata, + SHA512_HMAC_KEYLEN)); + VERIFY0(uio_addiov(cuio, (user_addr_t)mac, WRAPPING_MAC_LEN)); + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, + puio, cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. 
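+ * A NULL template simply makes the ICP set up the context from the
+ * raw key on every call, so failing here only costs performance.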
+ */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + + return (0); + +error: + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + zio_crypt_key_destroy(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + int ret; + crypto_mechanism_t mech; + crypto_data_t in_data, digest_data; + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + /* initialize sha512-hmac mechanism and crypto data */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the crypto data */ + in_data.cd_format = CRYPTO_DATA_RAW; + in_data.cd_offset = 0; + in_data.cd_length = datalen; + in_data.cd_raw.iov_base = (char *)data; + in_data.cd_raw.iov_len = in_data.cd_length; + + digest_data.cd_format = CRYPTO_DATA_RAW; + digest_data.cd_offset = 0; + digest_data.cd_length = SHA512_DIGEST_LENGTH; + digest_data.cd_raw.iov_base = (char *)raw_digestbuf; + digest_data.cd_raw.iov_len = digest_data.cd_length; + + /* generate the hmac */ + ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, + &digest_data, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); + +error: + bzero(digestbuf, digestlen); + return (ret); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. 
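+ *
+ * For example, a pool created on a big-endian machine and imported on
+ * x86 will have BP_SHOULD_BYTESWAP() set, so the salt / IV / MAC words
+ * are swapped back to their on-disk order as they are copied in or out.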
+ */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. 
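+ * The MAC itself lives in words 2 and 3 of the zil_chain_t's embedded
+ * checksum, mirroring what zio_crypt_encode_mac_zil() stored above.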
+ */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpretted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. 
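+ * (zio_crypt_bp_zero_nonportable_blkprop() clears the byteorder bit
+ * on indirect blocks, so the MAC words must be pulled out first.)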
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + int ret; + uint_t bab_len; + blkptr_auth_buf_t bab; + crypto_data_t cd; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + cd.cd_length = bab_len; + cd.cd_raw.iov_base = (char *)&bab; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + return (0); + +error: + return (ret); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + crypto_data_t cd; + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + cd.cd_length = sizeof (tmp_dncore); + cd.cd_raw.iov_base = (char *)adnp; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACs for + * protecting the integrity of their data. The portable_mac protects + * the metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances.
During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + crypto_mechanism_t mech; + crypto_context_t ctx; + crypto_data_t cd; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* calculate the portable MAC from the portable fields and metadnode */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_portable_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user and group accounting. If these + * objects are not present, the local MAC is zeroed out. 
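+ * (That is, when both os_userused_dnode and os_groupused_dnode are
+ * still DMU_OT_NONE, as checked immediately below.)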
+ */ + if (osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the user accounting dnodes */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_local_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ +#ifdef _KERNEL + if (uio) uio_free(uio); +#endif +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. + */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. 
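+ * The retry only happens when the current-version check fails with
+ * ECKSUM, so the worst case is one extra SHA-512 pass over the
+ * block's embedded blkptrs.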
+ */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t **puio, + uio_t **cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uio_t *srcuio = NULL, *dstuio = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src == 0) + nr_src = 1; + if (nr_dst == 0) + nr_dst = 1; + + /* allocate the uio to hold iovecs */ + if (nr_src != 0) { + srcuio = uio_create(nr_src, 0, UIO_SYSSPACE, UIO_READ); + if (srcuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dstuio = uio_create(nr_dst, 0, UIO_SYSSPACE, UIO_WRITE); + if (dstuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. 
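+ * (Hence the sizeof (zil_chain_t) - sizeof (zio_eck_t) span below;
+ * the trailing zio_eck_t is where the embedded checksum / MAC lives.)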
+ */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + nr_iovecs = 0; + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if exists. If the bp does exist we want to + * authenticate it. + */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + + VERIFY0(uio_addiov(srcuio, + (user_addr_t)slrp + sizeof (lr_t), + crypt_len)); + VERIFY0(uio_addiov(dstuio, + (user_addr_t)dlrp + sizeof (lr_t), + crypt_len)); + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + + VERIFY0(uio_addiov(srcuio, + (user_addr_t)slrp + sizeof (lr_write_t), + crypt_len)); + VERIFY0(uio_addiov(dstuio, + (user_addr_t)dlrp + sizeof (lr_write_t), + crypt_len)); + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + VERIFY0(uio_addiov(srcuio, + (user_addr_t)slrp + sizeof (lr_t), + crypt_len)); + VERIFY0(uio_addiov(dstuio, + (user_addr_t)dlrp + sizeof (lr_t), + crypt_len)); + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + *puio = srcuio; + *cuio = dstuio; + } else { + *puio = dstuio; + *cuio = srcuio; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (srcuio) uio_free(srcuio); + if (dstuio) uio_free(dstuio); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks. 
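+ * Only the bonus buffers of encrypted object types are actually
+ * encrypted; the core dnode fields and block pointers are copied over
+ * in the clear and added to the authenticated data instead.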
+ */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t **puio, uio_t **cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + struct uio *src_uio = NULL, *dst_uio = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. + */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src == 0) + nr_src = 1; + if (nr_dst == 0) + nr_dst = 1; + + if (nr_src != 0) { + src_uio = uio_create(nr_src, 0, UIO_SYSSPACE, UIO_READ); + if (src_uio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + ASSERT(src_uio != NULL); + + if (nr_dst != 0) { + dst_uio = uio_create(nr_dst, 0, UIO_SYSSPACE, UIO_WRITE); + if (dst_uio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + ASSERT(dst_uio != NULL); + + nr_iovecs = 0; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those out values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t. 
The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). + */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_src); + ASSERT3U(nr_iovecs, <, nr_dst); + VERIFY0(uio_addiov(src_uio, (user_addr_t)DN_BONUS(dnp), + crypt_len)); + VERIFY0(uio_addiov(dst_uio, + (user_addr_t)DN_BONUS(&ddnp[i]), + crypt_len)); + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + *puio = src_uio; + *cuio = dst_uio; + } else { + *puio = dst_uio; + *cuio = src_uio; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + zio_crypt_destroy_uio(src_uio); + zio_crypt_destroy_uio(dst_uio); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + return (ret); +} + +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t **puio, uio_t **cuio, + uint_t *enc_len) +{ + int ret = 0; + + *puio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + *cuio = uio_create(2, 0, UIO_SYSSPACE, UIO_WRITE); + if (*puio == NULL || *cuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + VERIFY0(uio_addiov(*puio, (user_addr_t)plainbuf, datalen)); + VERIFY0(uio_addiov(*cuio, (user_addr_t)cipherbuf, datalen)); + + *enc_len = datalen; + + return (0); + +error: + zio_crypt_destroy_uio(*puio); + zio_crypt_destroy_uio(*cuio); + + *enc_len = 0; + return (ret); +} + + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. 
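+ *
+ * On macOS the MAC buffer is appended to the cuio here as one extra
+ * iovec (the __APPLE__ branch below), rather than being patched into
+ * a preallocated iovec as the non-Apple path does.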
+ */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t **puio, uio_t **cuio, uint_t *enc_len, + uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + ASSERT(puio != NULL); + ASSERT(cuio != NULL); + + /* populate the uios */ +#ifdef __APPLE__ + + VERIFY0(uio_addiov(*cuio, (user_addr_t)mac, ZIO_DATA_MAC_LEN)); + +#else // !APPLE + + puio->uio_segflg = UIO_SYSSPACE; + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + +#endif // !APPLE + + return (0); + +error: + return (ret); +} + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + /* + * We have to delay the allocation call uio_create() until we know + * how many iovecs we want (as max). + */ + uio_t *puio = NULL, *cuio = NULL; + uint_t enc_len, auth_len; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + crypto_ctx_template_t tmpl; + uint8_t *authbuf = NULL; + + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + + if (ret != 0) + return (ret); + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t. 
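+ * A temporary key is derived with hkdf_sha512() below; since it has
+ * no cached ICP context template, tmpl is passed as NULL in that case.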
+ */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = key->zk_current_tmpl; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + VERIFY(puio != NULL); + VERIFY(cuio != NULL); + + /* perform the encryption / decryption */ + ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, + puio, cuio, authbuf, auth_len); + + if (ret != 0) + goto error; + + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(puio); + zio_crypt_destroy_uio(cuio); + + return (0); + +error: + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + + zio_crypt_destroy_uio(puio); + zio_crypt_destroy_uio(cuio); + return (ret); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (ret); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* BEGIN CSTYLED */ +module_param(zfs_key_max_salt_uses, ulong, 0644); +MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " + "can be used for generating encryption keys before it is rotated"); +/* END CSTYLED */ +#endif diff --git a/module/os/macos/zfs/zvolIO.cpp b/module/os/macos/zfs/zvolIO.cpp new file mode 100644 index 0000000000..450020760a --- /dev/null +++ b/module/os/macos/zfs/zvolIO.cpp @@ -0,0 +1,1177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013-2020, Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + + +/* + * ZVOL Device + */ + +// Define the superclass +#define super IOBlockStorageDevice + +#define ZVOL_BSIZE DEV_BSIZE + +static const char *ZVOL_PRODUCT_NAME_PREFIX = "ZVOL "; + +/* Wrapper for zvol_state pointer to IOKit device */ +typedef struct zvol_iokit { + net_lundman_zfs_zvol_device *dev; +} zvol_iokit_t; + +OSDefineMetaClassAndStructors(net_lundman_zfs_zvol_device, IOBlockStorageDevice) + +bool +net_lundman_zfs_zvol_device::init(zvol_state_t *c_zv, + OSDictionary *properties) +{ + zvol_iokit_t *iokitdev = NULL; + + dprintf("zvolIO_device:init\n"); + + if (!c_zv || c_zv->zv_zso->zvo_iokitdev != NULL) { + dprintf("zvol %s invalid c_zv\n", __func__); + return (false); + } + + if ((iokitdev = (zvol_iokit_t *)kmem_alloc(sizeof (zvol_iokit_t), + KM_SLEEP)) == NULL) { + printf("zvol %s wrapper alloc failed\n", __func__); + return (false); + } + + if (super::init(properties) == false) { + printf("zvol %s super init failed\n", __func__); + kmem_free(iokitdev, sizeof (zvol_iokit_t)); + return (false); + } + + /* Store reference to zvol_state_t in the iokitdev */ + zv = c_zv; + /* Store reference to iokitdev in zvol_state_t */ + iokitdev->dev = this; + + /* Assign to zv once completely initialized */ + zv->zv_zso->zvo_iokitdev = iokitdev; + + /* Apply the name from the full dataset path */ + if (strlen(zv->zv_name) != 0) { + setName(zv->zv_name); + } + + return (true); +} + +bool +net_lundman_zfs_zvol_device::attach(IOService* provider) +{ + OSDictionary *protocolCharacteristics = 0; + OSDictionary *deviceCharacteristics = 0; + OSDictionary *storageFeatures = 0; + OSBoolean *unmapFeature = 0; + const OSSymbol *propSymbol = 0; + OSString *dataString = 0; + OSNumber *dataNumber = 0; + + char product_name[strlen(ZVOL_PRODUCT_NAME_PREFIX) + MAXPATHLEN + 1]; + + if (!provider) { + dprintf("ZVOL attach missing provider\n"); + return (false); + } + + if (super::attach(provider) == false) + return (false); + + /* + * We want to set some additional properties for ZVOLs, in + * particular, "Virtual Device", and type "File" + * (or is Internal better?) + * + * Finally "Generic" type. 
+ * + * These properties are defined in *protocol* characteristics + */ + + protocolCharacteristics = OSDictionary::withCapacity(3); + + if (!protocolCharacteristics) { + IOLog("failed to create dict for protocolCharacteristics.\n"); + return (true); + } + + propSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + + if (!propSymbol) { + IOLog("could not create interconnect type string\n"); + return (true); + } + protocolCharacteristics->setObject( + kIOPropertyPhysicalInterconnectTypeKey, propSymbol); + + propSymbol->release(); + propSymbol = 0; + + propSymbol = OSSymbol::withCString(kIOPropertyInterconnectFileKey); + if (!propSymbol) { + IOLog("could not create interconnect location string\n"); + return (true); + } + protocolCharacteristics->setObject( + kIOPropertyPhysicalInterconnectLocationKey, propSymbol); + + propSymbol->release(); + propSymbol = 0; + + setProperty(kIOPropertyProtocolCharacteristicsKey, + protocolCharacteristics); + + protocolCharacteristics->release(); + protocolCharacteristics = 0; + + /* + * We want to set some additional properties for ZVOLs, in + * particular, physical block size (volblocksize) of the + * underlying ZVOL, and 'logical' block size presented by + * the virtual disk. Also set physical bytes per sector. + * + * These properties are defined in *device* characteristics + */ + + deviceCharacteristics = OSDictionary::withCapacity(3); + + if (!deviceCharacteristics) { + IOLog("failed to create dict for deviceCharacteristics.\n"); + return (true); + } + + /* Set this device to be an SSD, for priority and VM paging */ + propSymbol = OSSymbol::withCString( + kIOPropertyMediumTypeSolidStateKey); + if (!propSymbol) { + IOLog("could not create medium type string\n"); + return (true); + } + deviceCharacteristics->setObject(kIOPropertyMediumTypeKey, + propSymbol); + + propSymbol->release(); + propSymbol = 0; + + /* Set logical block size to ZVOL_BSIZE (512b) */ + dataNumber = OSNumber::withNumber(ZVOL_BSIZE, + 8 * sizeof (ZVOL_BSIZE)); + + deviceCharacteristics->setObject(kIOPropertyLogicalBlockSizeKey, + dataNumber); + + dprintf("logicalBlockSize %llu\n", + dataNumber->unsigned64BitValue()); + + dataNumber->release(); + dataNumber = 0; + + /* Set physical block size to match volblocksize property */ + dataNumber = OSNumber::withNumber(zv->zv_volblocksize, + 8 * sizeof (zv->zv_volblocksize)); + + deviceCharacteristics->setObject(kIOPropertyPhysicalBlockSizeKey, + dataNumber); + + dprintf("physicalBlockSize %llu\n", + dataNumber->unsigned64BitValue()); + + dataNumber->release(); + dataNumber = 0; + + /* Set physical bytes per sector to match volblocksize property */ + dataNumber = OSNumber::withNumber((uint64_t)(zv->zv_volblocksize), + 8 * sizeof (uint64_t)); + + deviceCharacteristics->setObject(kIOPropertyBytesPerPhysicalSectorKey, + dataNumber); + + dprintf("physicalBytesPerSector %llu\n", + dataNumber->unsigned64BitValue()); + + dataNumber->release(); + dataNumber = 0; + + /* Publish the Device / Media name */ + (void) snprintf(product_name, sizeof (product_name), "%s%s", + ZVOL_PRODUCT_NAME_PREFIX, zv->zv_name); + dataString = OSString::withCString(product_name); + deviceCharacteristics->setObject(kIOPropertyProductNameKey, dataString); + dataString->release(); + dataString = 0; + + /* Apply these characteristics */ + setProperty(kIOPropertyDeviceCharacteristicsKey, + deviceCharacteristics); + + deviceCharacteristics->release(); + deviceCharacteristics = 0; + + /* + * ZVOL unmap support + * + * These properties are defined in 
IOStorageFeatures + */ + + storageFeatures = OSDictionary::withCapacity(1); + if (!storageFeatures) { + IOLog("failed to create dictionary for storageFeatures.\n"); + return (true); + } + + /* Set unmap feature */ + unmapFeature = OSBoolean::withBoolean(true); + storageFeatures->setObject(kIOStorageFeatureUnmap, unmapFeature); + unmapFeature->release(); + unmapFeature = 0; + + /* Apply these storage features */ + setProperty(kIOStorageFeaturesKey, storageFeatures); + storageFeatures->release(); + storageFeatures = 0; + + + /* + * Set transfer limits: + * + * Maximum transfer size (bytes) + * Maximum transfer block count + * Maximum transfer block size (bytes) + * Maximum transfer segment count + * Maximum transfer segment size (bytes) + * Minimum transfer segment size (bytes) + * + * We will need to establish safe defaults for all / per volblocksize + * + * Example: setProperty(kIOMinimumSegmentAlignmentByteCountKey, 1, 1); + */ + + /* + * Finally "Generic" type, set as a device property. Tried setting this + * to the string "ZVOL" however the OS does not recognize it as a block + * storage device. This would probably be possible by extending the + * IOBlockStorage Device / Driver relationship. + */ + + setProperty(kIOBlockStorageDeviceTypeKey, + kIOBlockStorageDeviceTypeGeneric); + + return (true); +} + +int +net_lundman_zfs_zvol_device::renameDevice(void) +{ + OSDictionary *deviceDict; + OSString *nameStr; + char *newstr; + int len; + + /* Length of string and null terminating character */ + len = strlen(ZVOL_PRODUCT_NAME_PREFIX) + strlen(zv->zv_name) + 1; + newstr = (char *)kmem_alloc(len, KM_SLEEP); + if (!newstr) { + dprintf("%s string alloc failed\n", __func__); + return (ENOMEM); + } + + /* Append prefix and dsl name */ + snprintf(newstr, len, "%s%s", ZVOL_PRODUCT_NAME_PREFIX, zv->zv_name); + nameStr = OSString::withCString(newstr); + kmem_free(newstr, len); + + if (!nameStr) { + dprintf("%s couldn't allocate name string\n", __func__); + return (ENOMEM); + } + + /* Fetch current device characteristics dictionary */ + deviceDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey)); + if (!deviceDict || (deviceDict = + OSDictionary::withDictionary(deviceDict)) == NULL) { + dprintf("couldn't clone device characteristics\n"); + /* Allocate new dict */ + if (!deviceDict && + (deviceDict = OSDictionary::withCapacity(1)) == NULL) { + dprintf("%s OSDictionary alloc failed\n", __func__); + nameStr->release(); + return (ENOMEM); + } + + } + + /* Add or replace the product name */ + if (deviceDict->setObject(kIOPropertyProductNameKey, + nameStr) == false) { + dprintf("%s couldn't set product name\n", __func__); + nameStr->release(); + deviceDict->release(); + return (ENXIO); + } + nameStr->release(); + nameStr = 0; + + /* Set IORegistry property */ + if (setProperty(kIOPropertyDeviceCharacteristicsKey, + deviceDict) == false) { + dprintf("%s couldn't set IORegistry property\n", __func__); + deviceDict->release(); + return (ENXIO); + } + deviceDict->release(); + deviceDict = 0; + + /* Apply the name from the full dataset path */ + setName(zv->zv_name); + + return (0); +} + +int +net_lundman_zfs_zvol_device::offlineDevice(void) +{ + IOService *client; + + if ((client = this->getClient()) == NULL) { + return (ENOENT); + } + + /* Ask IOBlockStorageDevice to offline media */ + if (client->message(kIOMessageMediaStateHasChanged, + this, (void *)kIOMediaStateOffline) != kIOReturnSuccess) { + dprintf("%s failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +int 
+net_lundman_zfs_zvol_device::onlineDevice(void) +{ + IOService *client; + + if ((client = this->getClient()) == NULL) { + return (ENOENT); + } + + /* Ask IOBlockStorageDevice to online media */ + if (client->message(kIOMessageMediaStateHasChanged, + this, (void *)kIOMediaStateOnline) != kIOReturnSuccess) { + dprintf("%s failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +int +net_lundman_zfs_zvol_device::refreshDevice(void) +{ + IOService *client; + + if ((client = this->getClient()) == NULL) { + return (ENOENT); + } + + /* Ask IOBlockStorageDevice to reset the media params */ + if (client->message(kIOMessageMediaParametersHaveChanged, + this) != kIOReturnSuccess) { + dprintf("%s failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +int +net_lundman_zfs_zvol_device::getBSDName(void) +{ + IORegistryEntry *ioregdevice = 0; + OSObject *bsdnameosobj = 0; + OSString* bsdnameosstr = 0; + + ioregdevice = OSDynamicCast(IORegistryEntry, this); + + if (!ioregdevice) + return (-1); + + bsdnameosobj = ioregdevice->getProperty(kIOBSDNameKey, + gIOServicePlane, kIORegistryIterateRecursively); + + if (!bsdnameosobj) + return (-1); + + bsdnameosstr = OSDynamicCast(OSString, bsdnameosobj); + + IOLog("zvol: bsd name is '%s'\n", + bsdnameosstr->getCStringNoCopy()); + + if (!zv) + return (-1); + + zv->zv_zso->zvo_bsdname[0] = 'r'; // for 'rdiskX'. + strlcpy(&zv->zv_zso->zvo_bsdname[1], + bsdnameosstr->getCStringNoCopy(), + sizeof (zv->zv_zso->zvo_bsdname)-1); + /* + * IOLog("name assigned '%s'\n", zv->zv_zso->zvo_bsdname); + */ + + return (0); +} + +void +net_lundman_zfs_zvol_device::detach(IOService *provider) +{ + super::detach(provider); +} + +void +net_lundman_zfs_zvol_device::clearState(void) +{ + zv = NULL; +} + +bool +net_lundman_zfs_zvol_device::handleOpen(IOService *client, + IOOptionBits options, void *argument) +{ + IOStorageAccess access = (uintptr_t)argument; + bool ret = false; + int openflags = 0; + + if (super::handleOpen(client, options, argument) == false) + return (false); + + /* Device terminating? */ + if (zv == NULL || + zv->zv_zso == NULL || + zv->zv_zso->zvo_iokitdev == NULL) + return (false); + + if (access & kIOStorageAccessReaderWriter) { + openflags = FWRITE | ZVOL_EXCL; + } else { + openflags = FREAD; + } + + /* + * Don't use 'zv' until it has been verified by zvol_os_open_zv() + * and returned as opened, then it holds an open count and can be + * used. + */ + if (zvol_os_open_zv(zv, zv->zv_zso->zvo_openflags, 0, NULL) == 0) { + ret = true; + } else { + openflags = FREAD; + if (zvol_os_open_zv(zv, FREAD /* ZVOL_EXCL */, 0, NULL) == 0) { + ret = true; + } + } + + if (ret) + zv->zv_zso->zvo_openflags = openflags; + + + dprintf("Open %s (openflags %llx)\n", (ret ? "done" : "failed"), + ret ? zv->zv_zso->zvo_openflags : 0); + + if (ret == false) + super::handleClose(client, options); + + return (ret); +} + +void +net_lundman_zfs_zvol_device::handleClose(IOService *client, + IOOptionBits options) +{ + super::handleClose(client, options); + + /* Terminating ? 
*/ + if (zv == NULL || + zv->zv_zso == NULL || + zv->zv_zso->zvo_iokitdev == NULL) + return; + + zvol_os_close_zv(zv, zv->zv_zso->zvo_openflags, 0, NULL); + +} + +IOReturn +net_lundman_zfs_zvol_device::doAsyncReadWrite( + IOMemoryDescriptor *buffer, UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, IOStorageCompletion *completion) +{ + IODirection direction; + IOByteCount actualByteCount; + struct iomem iomem; + iomem.buf = NULL; + + // Return errors for incoming I/O if we have been terminated. + if (isInactive() == true) { + dprintf("asyncReadWrite notActive fail\n"); + return (kIOReturnNotAttached); + } + + // These variables are set in zvol_first_open(), which should have been + // called already. + if (!zv->zv_dn) { + dprintf("asyncReadWrite no zvol dnode\n"); + return (kIOReturnNotAttached); + } + + // Ensure the start block is within the disk capacity. + if ((block)*(ZVOL_BSIZE) >= zv->zv_volsize) { + dprintf("asyncReadWrite start block outside volume\n"); + return (kIOReturnBadArgument); + } + + // Shorten the read, if beyond the end + if (((block + nblks)*(ZVOL_BSIZE)) > zv->zv_volsize) { + dprintf("asyncReadWrite block shortening needed\n"); + return (kIOReturnBadArgument); + } + + // Get the buffer direction, whether this is a read or a write. + direction = buffer->getDirection(); + if ((direction != kIODirectionIn) && (direction != kIODirectionOut)) { + dprintf("asyncReadWrite kooky direction\n"); + return (kIOReturnBadArgument); + } + + // dprintf("%s offset @block %llu numblocks %llu: blksz %u\n", + // direction == kIODirectionIn ? "Read" : "Write", + // block, nblks, (ZVOL_BSIZE)); + + /* Perform the read or write operation through the transport driver. */ + actualByteCount = (nblks*(ZVOL_BSIZE)); + + iomem.buf = buffer; + + /* Make sure we don't go away while the command is being executed */ + /* Open should be holding a retain */ + + if (direction == kIODirectionIn) { + + if (zvol_os_read_zv(zv, (block*(ZVOL_BSIZE)), + actualByteCount, &iomem)) { + + actualByteCount = 0; + } + + } else { + + if (zvol_os_write_zv(zv, (block*(ZVOL_BSIZE)), + actualByteCount, &iomem)) { + actualByteCount = 0; + } + + } + + /* Open should be holding a retain */ + iomem.buf = NULL; // overkill + + if (actualByteCount != nblks*(ZVOL_BSIZE)) + dprintf("Read/Write operation failed\n"); + + // Call the completion function. 
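+ // Note the status is always kIOReturnSuccess; a failed read or write
+ // is reported by passing a zero actualByteCount instead.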
+ (completion->action)(completion->target, completion->parameter, + kIOReturnSuccess, actualByteCount); + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doDiscard(UInt64 block, UInt64 nblks) +{ + dprintf("doDiscard called with block, nblks (%llu, %llu)\n", + block, nblks); + uint64_t bytes = 0; + uint64_t off = 0; + + /* Convert block/nblks to offset/bytes */ + off = block * ZVOL_BSIZE; + bytes = nblks * ZVOL_BSIZE; + dprintf("calling zvol_unmap with offset, bytes (%llu, %llu)\n", + off, bytes); + + if (zvol_os_unmap(zv, off, bytes) == 0) + return (kIOReturnSuccess); + else + return (kIOReturnError); +} + + +IOReturn +net_lundman_zfs_zvol_device::doUnmap(IOBlockStorageDeviceExtent *extents, + UInt32 extentsCount, UInt32 options = 0) +{ + UInt32 i = 0; + IOReturn result; + + dprintf("doUnmap called with (%u) extents and options (%u)\n", + (uint32_t)extentsCount, (uint32_t)options); + + if (options > 0 || !extents) { + return (kIOReturnUnsupported); + } + + for (i = 0; i < extentsCount; i++) { + + result = doDiscard(extents[i].blockStart, + extents[i].blockCount); + + if (result != kIOReturnSuccess) { + return (result); + } + } + + return (kIOReturnSuccess); +} + +UInt32 +net_lundman_zfs_zvol_device::doGetFormatCapacities(UInt64* capacities, + UInt32 capacitiesMaxCount) const +{ + dprintf("formatCap\n"); + + /* + * Ensure that the array is sufficient to hold all our formats + * (we require one element). + */ + if ((capacities != NULL) && (capacitiesMaxCount < 1)) + return (0); + /* Error, return an array size of 0. */ + + /* + * The caller may provide a NULL array if it wishes to query the number + * of formats that we support. + */ + if (capacities != NULL) + capacities[0] = zv->zv_volsize; + + dprintf("returning capacity[0] size %llu\n", zv->zv_volsize); + + return (1); +} + +char * +net_lundman_zfs_zvol_device::getProductString(void) +{ + dprintf("getProduct %p\n", zv); + + if (zv) + return (zv->zv_name); + + return ((char *)"ZVolume"); +} + +IOReturn +net_lundman_zfs_zvol_device::reportBlockSize(UInt64 *blockSize) +{ + dprintf("reportBlockSize %llu\n", *blockSize); + + if (blockSize) *blockSize = (ZVOL_BSIZE); + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportMaxValidBlock(UInt64 *maxBlock) +{ + dprintf("reportMaxValidBlock %llu\n", *maxBlock); + + if (maxBlock) *maxBlock = ((zv->zv_volsize / (ZVOL_BSIZE)) - 1); + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + dprintf("reportMediaState\n"); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportPollRequirements(bool *pollRequired, + bool *pollIsExpensive) +{ + dprintf("reportPollReq\n"); + if (pollRequired) *pollRequired = false; + if (pollIsExpensive) *pollIsExpensive = false; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportRemovability(bool *isRemovable) +{ + dprintf("reportRemova\n"); + if (isRemovable) *isRemovable = false; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doEjectMedia(void) +{ + dprintf("ejectMedia\n"); +/* XXX */ + // Only 10.6 needs special work to eject + // if ((version_major == 10) && (version_minor == 8)) + // destroyBlockStorageDevice(zvol); + // } + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doFormatMedia(UInt64 byteCapacity) +{ + dprintf("doFormat\n"); + return 
(kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doLockUnlockMedia(bool doLock) +{ + dprintf("doLockUnlock\n"); + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doSynchronizeCache(void) +{ + dprintf("doSync\n"); + if (zv && zv->zv_zilog) { + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } + return (kIOReturnSuccess); +} + +char * +net_lundman_zfs_zvol_device::getVendorString(void) +{ + dprintf("getVendor\n"); + return ((char *)"ZVOL"); +} + +char * +net_lundman_zfs_zvol_device::getRevisionString(void) +{ + dprintf("getRevision\n"); + return ((char *)ZFS_META_VERSION); +} + +char * +net_lundman_zfs_zvol_device::getAdditionalDeviceInfoString(void) +{ + dprintf("getAdditional\n"); + return ((char *)"ZFS Volume"); +} + +IOReturn +net_lundman_zfs_zvol_device::reportEjectability(bool *isEjectable) +{ + dprintf("reportEjecta\n"); + /* + * Which do we prefer? If you eject it, you can't get volume back until + * you import it again. + */ + + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* XXX deprecated function */ +IOReturn +net_lundman_zfs_zvol_device::reportLockability(bool *isLockable) +{ + dprintf("reportLocka\n"); + if (isLockable) *isLockable = true; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportWriteProtection(bool *isWriteProtected) +{ + dprintf("reportWritePro: %d\n", *isWriteProtected); + + if (!isWriteProtected) + return (kIOReturnSuccess); + + if (zv && (zv->zv_flags & ZVOL_RDONLY)) + *isWriteProtected = true; + else + *isWriteProtected = false; + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::getWriteCacheState(bool *enabled) +{ + dprintf("getCacheState\n"); + if (enabled) *enabled = true; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::setWriteCacheState(bool enabled) +{ + dprintf("setWriteCache\n"); + return (kIOReturnSuccess); +} + +extern "C" { + +/* C interfaces */ +int +zvolCreateNewDevice(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol; + ZFSPool *pool_proxy; + spa_t *spa; + dprintf("%s\n", __func__); + + /* We must have a valid zvol_state_t */ + if (!zv || !zv->zv_objset) { + dprintf("%s missing zv or objset\n", __func__); + return (EINVAL); + } + + /* We need the spa to get the pool proxy */ + if ((spa = dmu_objset_spa(zv->zv_objset)) == NULL) { + dprintf("%s couldn't get spa\n", __func__); + return (EINVAL); + } + if (spa->spa_iokit_proxy == NULL || + (pool_proxy = spa->spa_iokit_proxy->proxy) == NULL) { + dprintf("%s missing IOKit pool proxy\n", __func__); + return (EINVAL); + } + + zvol = new net_lundman_zfs_zvol_device; + + /* Validate creation, initialize and attach */ + if (!zvol || zvol->init(zv) == false || + zvol->attach(pool_proxy) == false) { + dprintf("%s device creation failed\n", __func__); + if (zvol) zvol->release(); + return (ENOMEM); + } + /* Start the service */ + if (zvol->start(pool_proxy) == false) { + dprintf("%s device start failed\n", __func__); + zvol->detach(pool_proxy); + zvol->release(); + return (ENXIO); + } + + /* Open pool_proxy provider */ + if (pool_proxy->open(zvol) == false) { + dprintf("%s open provider failed\n", __func__); + zvol->stop(pool_proxy); + zvol->detach(pool_proxy); + zvol->release(); + return (ENXIO); + } + /* Is retained by provider */ + zvol->release(); + zvol = 0; + + return (0); +} + +int +zvolRegisterDevice(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol; + OSDictionary *matching; + IOService *service = 0; + IOMedia *media = 0; + OSString *nameStr = 0, 
*bsdName = 0; + uint64_t timeout = (5ULL * kSecondScale); + bool ret = false; + + if (!zv || !zv->zv_zso->zvo_iokitdev || zv->zv_name[0] == 0) { + dprintf("%s missing zv, iokitdev, or name\n", __func__); + return (EINVAL); + } + + if ((zvol = zv->zv_zso->zvo_iokitdev->dev) == NULL) { + dprintf("%s couldn't get zvol device\n", __func__); + return (EINVAL); + } + + if (!zvol->getVendorString()) { + return (EINVAL); + } + + /* Create matching string and dictionary */ + { + char str[MAXNAMELEN]; + snprintf(str, MAXNAMELEN, "%s %s Media", + zvol->getVendorString(), zv->zv_name); + if ((nameStr = OSString::withCString(str)) == NULL) { + dprintf("%s problem with name string\n", __func__); + return (ENOMEM); + } + } + matching = IOService::serviceMatching("IOMedia"); + if (!matching || !matching->setObject(gIONameMatchKey, nameStr)) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (ENOMEM); + } + + /* Register device for service matching */ + zvol->registerService(kIOServiceAsynchronous); + + /* Wait for upper layer BSD client */ + dprintf("%s waiting for IOMedia\n", __func__); + /* Wait for up to 5 seconds */ + service = IOService::waitForMatchingService(matching, timeout); + dprintf("%s %s service\n", __func__, (service ? "got" : "no")); + + if (!service) { + dprintf("%s couldn't get matching service\n", __func__); + return (false); + } + + dprintf("%s casting to IOMedia\n", __func__); + media = OSDynamicCast(IOMedia, service); + + if (!media) { + dprintf("%s no IOMedia\n", __func__); + service->release(); + return (false); + } + + dprintf("%s getting IOBSDNameKey\n", __func__); + bsdName = OSDynamicCast(OSString, + media->getProperty(kIOBSDNameKey)); + + if (bsdName) { + const char *str = bsdName->getCStringNoCopy(); + dprintf("%s Got bsd name [%s]\n", + __func__, str); + zv->zv_zso->zvo_bsdname[0] = 'r'; + snprintf(zv->zv_zso->zvo_bsdname+1, + sizeof (zv->zv_zso->zvo_bsdname)-1, + "%s", str); + dprintf("%s zvol bsdname set to %s\n", __func__, + zv->zv_zso->zvo_bsdname); +// zvol_add_symlink(zv, zv->zv_zso->zvo_bsdname+1, +// zv->zv_zso->zvo_bsdname); + ret = true; + } else { + dprintf("%s couldn't get BSD Name\n", __func__); + } + + /* Release retain held by waitForMatchingService */ + service->release(); + + printf("%s complete\n", __func__); + return (ret); +} + +/* Struct passed in will be freed before returning */ +void * +zvolRemoveDevice(zvol_iokit_t *iokitdev) +{ + net_lundman_zfs_zvol_device *zvol; + dprintf("%s\n", __func__); + + if (!iokitdev) { + dprintf("%s missing argument\n", __func__); + return (NULL); + } + + zvol = iokitdev->dev; + /* Free the wrapper struct */ + kmem_free(iokitdev, sizeof (zvol_iokit_t)); + + if (zvol == NULL) { + dprintf("%s couldn't get IOKit handle\n", __func__); + return (NULL); + } + + /* Mark us as terminating */ + zvol->clearState(); + + return (zvol); +} + +/* + * zvolRemoveDevice continued.. + * terminate() will block and we can deadlock, so it is issued as a + * separate thread. Done from zvol_os.c as it is easier in C. 
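+ * The zvol pointer returned by zvolRemoveDevice() is passed in here
+ * from a taskq callback (see zvol_os_clear_private() in zvol_os.c).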
+ */ +int +zvolRemoveDeviceTerminate(void *arg) +{ + net_lundman_zfs_zvol_device *zvol = (net_lundman_zfs_zvol_device *)arg; + + /* Terminate */ + if (zvol->terminate(kIOServiceTerminate|kIOServiceAsynchronous| + kIOServiceRequired) == false) { + IOLog("%s terminate failed\n", __func__); + } + + return (0); +} + +/* Called with zv->zv_name already updated */ +int +zvolRenameDevice(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol = NULL; + int error; + + if (!zv || strnlen(zv->zv_name, 1) == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + if ((zvol = zv->zv_zso->zvo_iokitdev->dev) == NULL) { + dprintf("%s couldn't get zvol device\n", __func__); + return (EINVAL); + } + + /* Set IORegistry name and property */ + if ((error = zvol->renameDevice()) != 0) { + dprintf("%s renameDevice error %d\n", __func__, error); + return (error); + } + + /* + * XXX This works, but if there is a volume mounted on + * the zvol at the time it is uncleanly ejected. + * We just need to add diskutil unmount to `zfs rename`, + * like zpool export. + */ + /* Inform clients of this device that name has changed */ + if (zvol->offlineDevice() != 0 || + zvol->onlineDevice() != 0) { + dprintf("%s media reset failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +/* Called with zvol volsize already updated */ +int +zvolSetVolsize(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol; + int error; + + dprintf("%s\n", __func__); + + if (!zv || !zv->zv_zso->zvo_iokitdev) { + dprintf("%s invalid zvol\n", __func__); + return (EINVAL); + } + + /* Cast to correct type */ + if ((zvol = zv->zv_zso->zvo_iokitdev->dev) == NULL) { + dprintf("%s couldn't cast IOKit handle\n", __func__); + return (ENXIO); + } + /* + * XXX This works fine, even if volume is mounted, + * but only tested expanding the zvol and only with + * GPT/APM/MBR partition map (not volume on whole-zvol). + */ + /* Inform clients of this device that size has changed */ + if ((error = zvol->refreshDevice()) != 0) { + dprintf("%s refreshDevice error %d\n", __func__, error); + return (error); + } + + return (0); +} + +uint64_t +zvolIO_kit_read(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len) +{ + IOByteCount done; + // IOLog("zvolIO_kit_read offset %p count %llx to offset %llx\n", + // address, len, offset); + ASSERT(iomem && address && len > 0); + + done = iomem->buf->writeBytes(offset, (void *)address, len); + + return (done); +} + +uint64_t +zvolIO_kit_write(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len) +{ + IOByteCount done; + // IOLog("zvolIO_kit_write offset %p count %llx to offset %llx\n", + // address, len, offset); + ASSERT(iomem && address && len > 0); + + done = iomem->buf->readBytes(offset, (void *)address, len); + + return (done); +} + +} /* extern "C" */ diff --git a/module/os/macos/zfs/zvol_os.c b/module/os/macos/zfs/zvol_os.c new file mode 100644 index 0000000000..1b13ece6c7 --- /dev/null +++ b/module/os/macos/zfs/zvol_os.c @@ -0,0 +1,1024 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020 by Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static uint32_t zvol_major = ZVOL_MAJOR; + +unsigned int zvol_request_sync = 0; +unsigned int zvol_prefetch_bytes = (128 * 1024); +unsigned long zvol_max_discard_blocks = 16384; +unsigned int zvol_threads = 8; + +taskq_t *zvol_taskq; + +typedef struct zv_request { + zvol_state_t *zv; + + void (*zv_func)(void *); + void *zv_arg; + + taskq_ent_t ent; +} zv_request_t; + +int +dmu_read_iokit_dnode(dnode_t *dn, uint64_t *offset, + uint64_t position, uint64_t *size, struct iomem *iomem); +int +dmu_write_iokit_dnode(dnode_t *dn, uint64_t *offset, uint64_t position, + uint64_t *size, struct iomem *iomem, dmu_tx_t *tx); + + +static void +zvol_os_spawn_cb(void *param) +{ + zv_request_t *zvr = (zv_request_t *)param; + + zvr->zv_func(zvr->zv_arg); + + kmem_free(zvr, sizeof (zv_request_t)); +} + +static void +zvol_os_spawn(void (*func)(void *), void *arg) +{ + zv_request_t *zvr; + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv_arg = arg; + zvr->zv_func = func; + + taskq_init_ent(&zvr->ent); + + taskq_dispatch_ent(zvol_taskq, + zvol_os_spawn_cb, zvr, 0, &zvr->ent); +} + +/* + * Given a path, return TRUE if path is a ZVOL. + */ +static boolean_t +zvol_os_is_zvol(const char *device) +{ +#if 0 + struct stat stbf; + + // Stat device, get major/minor, match zv + if (stat(device, &stbf) == 0) { + if (S_ISBLK(stbf.st_mode) || S_ISCHR(stbf.st_mode)) { + dev_t dev = makedevice(stbf.st_major, stbf.st_minor); + + zvol_state_t *zv; + zv = zvol_find_by_dev(dev); + if (zv != NULL) { + mutex_exit(&zv->zv_state_lock); + return (B_TRUE); + } + } + } +#endif + return (B_FALSE); +} + +/* + * Make sure zv is still in the list (not freed) and if it is + * grab the locks in the correct order. + * Can we rely on list_link_active() instead of looping list? 
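+ * Returns 1 with both zv_suspend_lock (as reader) and zv_state_lock
+ * held; returns 0, with nothing held, if the zvol is no longer on
+ * zvol_state_list.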
+ */ +static int +zvol_os_verify_and_lock(zvol_state_t *node) +{ + zvol_state_t *zv; + + rw_enter(&zvol_state_lock, RW_READER); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + mutex_enter(&zv->zv_state_lock); + if (zv == node) { + + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + return (1); + } + mutex_exit(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + return (0); +} + +static void +zvol_os_register_device_cb(void *param) +{ + zvol_state_t *zv = (zvol_state_t *)param; + + if (zvol_os_verify_and_lock(zv) == 0) + return; + + zvolRegisterDevice(zv); + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); +} + +int +zvol_os_write(dev_t dev, struct uio *uio, int p) +{ + return (ENOTSUP); +} + +int +zvol_os_read(dev_t dev, struct uio *uio, int p) +{ + return (ENOTSUP); +} + +int +zvol_os_write_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem) +{ + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + boolean_t sync; + uint64_t offset = 0; + uint64_t bytes; + uint64_t off; + + if (zv == NULL) + return (ENXIO); + + /* Some requests are just for flush and nothing else. */ + if (count == 0) + return (0); + + volsize = zv->zv_volsize; + if (count > 0 && + (position >= volsize)) + return (EIO); + + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } + + dprintf("zvol_write_iokit(position %llu offset " + "0x%llx bytes 0x%llx)\n", position, offset, count); + + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + /* Lock the entire range */ + lr = zfs_rangelock_enter(&zv->zv_rangelock, position, count, + RL_WRITER); + + /* Iterate over (DMU_MAX_ACCESS/2) segments */ + while (count > 0 && (position + offset) < volsize) { + /* bytes for this segment */ + bytes = MIN(count, DMU_MAX_ACCESS >> 1); + off = offset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + /* don't write past the end */ + if (bytes > volsize - (position + off)) + bytes = volsize - (position + off); + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + + error = dmu_write_iokit_dnode(zv->zv_dn, &offset, + position, &bytes, iomem, tx); + + if (error == 0) { + count -= MIN(count, + (DMU_MAX_ACCESS >> 1)) + bytes; + zvol_log_write(zv, tx, offset, bytes, sync); + } + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + rw_exit(&zv->zv_suspend_lock); + + return (error); +} + +int +zvol_os_read_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem) +{ + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + uint64_t offset = 0; + + if (zv == NULL) + return (ENXIO); + + volsize = zv->zv_volsize; + if (count > 0 && + (position >= volsize)) + return (EIO); + + rw_enter(&zv->zv_suspend_lock, 
RW_READER); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, position, count, + RL_READER); + + while (count > 0 && (position+offset) < volsize) { + uint64_t bytes = MIN(count, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - (position + offset)) + bytes = volsize - (position + offset); + + dprintf("%s %llu offset %llu len %llu bytes %llu\n", + "zvol_read_iokit: position", + position, offset, count, bytes); + + error = dmu_read_iokit_dnode(zv->zv_dn, &offset, position, + &bytes, iomem); + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; + break; + } + count -= MIN(count, DMU_MAX_ACCESS >> 1) - bytes; + } + zfs_rangelock_exit(lr); + + rw_exit(&zv->zv_suspend_lock); + return (error); +} + +int +zvol_os_unmap(zvol_state_t *zv, uint64_t off, uint64_t bytes) +{ + zfs_locked_range_t *lr = NULL; + dmu_tx_t *tx = NULL; + int error = 0; + uint64_t end = off + bytes; + + if (zv == NULL) + return (ENXIO); + + /* + * XNU's wipefs_wipe() will issue one giant unmap for the entire + * device; + * zfs create -V 8g BOOM/vol + * zvolIO doDiscard calling zvol_unmap with offset, bytes (0, 858992) + * Which will both take too long, and is uneccessary. We will ignore + * any unmaps deemed "too large". + */ + if ((off == 0ULL) && + (zv->zv_volsize > (1ULL << 24)) && /* 16Mb slop */ + (bytes >= (zv->zv_volsize - (1ULL << 24)))) + return (0); + + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } + + off = P2ROUNDUP(off, zv->zv_volblocksize); + end = P2ALIGN(end, zv->zv_volblocksize); + + if (end > zv->zv_volsize) /* don't write past the end */ + end = zv->zv_volsize; + + if (off >= end) { + /* Return success- caller does not need to know */ + goto out; + } + + bytes = end - off; + lr = zfs_rangelock_enter(&zv->zv_rangelock, off, bytes, RL_WRITER); + + tx = dmu_tx_create(zv->zv_objset); + + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); + + if (error) { + dmu_tx_abort(tx); + } else { + + zvol_log_truncate(zv, tx, off, bytes, B_TRUE); + + dmu_tx_commit(tx); + + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, off, bytes); + } + + zfs_rangelock_exit(lr); + + if (error == 0) { + /* + * If the 'sync' property is set to 'always' then + * treat this as a synchronous operation + * (i.e. commit to zil). + */ + if (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } + } + +out: + rw_exit(&zv->zv_suspend_lock); + return (error); +} + +int +zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + zv->zv_volsize = volsize; + return (0); +} + +static void +zvol_os_clear_private_cb(void *param) +{ + zvolRemoveDeviceTerminate(param); +} + +static void +zvol_os_clear_private(zvol_state_t *zv) +{ + void *term; + + printf("%s\n", __func__); + /* We can do all removal work, except call terminate. 
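+	 * terminate() can block on outstanding I/O, so it is dispatched to
+	 * the zvol taskq below rather than called directly.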
*/ + term = zvolRemoveDevice(zv->zv_zso->zvo_iokitdev); + if (term == NULL) + return; + + zv->zv_zso->zvo_iokitdev = NULL; + + /* Call terminate in the background */ + zvol_os_spawn(zvol_os_clear_private_cb, term); + +} + +/* + * Find a zvol_state_t given the full major+minor dev_t. If found, + * return with zv_state_lock taken, otherwise, return (NULL) without + * taking zv_state_lock. + */ +static zvol_state_t * +zvol_os_find_by_dev(dev_t dev) +{ + zvol_state_t *zv; + + printf("%s\n", __func__); + + rw_enter(&zvol_state_lock, RW_READER); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + mutex_enter(&zv->zv_state_lock); + if (zv->zv_zso->zvo_dev == dev) { + rw_exit(&zvol_state_lock); + return (zv); + } + mutex_exit(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + + return (NULL); +} + +void +zvol_os_validate_dev(zvol_state_t *zv) +{ + ASSERT3U(MINOR(zv->zv_zso->zvo_dev) & ZVOL_MINOR_MASK, ==, 0); +} + +/* + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. + */ +static zvol_state_t * +zvol_os_alloc(dev_t dev, const char *name) +{ + zvol_state_t *zv; + struct zvol_state_os *zso; + uint64_t volmode; + + printf("%s\n", __func__); + if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) + return (NULL); + + printf("%s 2\n", __func__); + if (volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + + printf("%s 3\n", __func__); + if (volmode == ZFS_VOLMODE_NONE) + return (NULL); + + printf("%s 4\n", __func__); + zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); + zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_zso = zso; + + list_link_init(&zv->zv_next); + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + + zv->zv_open_count = 0; + strlcpy(zv->zv_name, name, MAXNAMELEN); + + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + + return (zv); +#if 0 +out_kmem: + kmem_free(zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + return (NULL); +#endif +} + +/* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + * At this time, the structure is not opened by anyone, is taken off + * the zvol_state_list, and has its private data set to NULL. + * The zvol_state_lock is dropped. + * + */ +static void +zvol_os_free(zvol_state_t *zv) +{ + printf("%s\n", __func__); + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count == 0); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + mutex_destroy(&zv->zv_state_lock); + + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); +} + + + +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. + */ +static int +zvol_os_create_minor(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + unsigned minor = 0; + int error = 0; + uint64_t hash = zvol_name_hash(name); + + printf("%s\n", __func__); + + if (zvol_inhibit_dev) + return (0); + + // minor? 
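+	// The minor number is left at 0 below; the BSD disk node is created
+	// by IOKit and its name is picked up later in zvolRegisterDevice().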
+ zv = zvol_find_by_name_hash(name, hash, RW_NONE); + if (zv) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(EEXIST)); + } + + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + zv = zvol_os_alloc(makedevice(zvol_major, minor), name); + if (zv == NULL) { + error = SET_ERROR(EAGAIN); + goto out_dmu_objset_disown; + } + zv->zv_hash = hash; + + if (dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + // set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); + + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(dmu_objset_zil(os), B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + + /* Create the IOKit zvol while owned */ + if ((error = zvolCreateNewDevice(zv)) != 0) { + dprintf("%s zvolCreateNewDevice error %d\n", + __func__, error); + } + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + rw_exit(&zvol_state_lock); + + /* Register (async) IOKit zvol after disown and unlock */ + /* The callback with release the mutex */ + zvol_os_spawn(zvol_os_register_device_cb, zv); + + } else { + + } + + printf("%s complete\n", __func__); + return (error); +} + + +static void zvol_os_rename_device_cb(void *param) +{ + zvol_state_t *zv = (zvol_state_t *)param; + if (zvol_os_verify_and_lock(zv) == 0) + return; + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + zvolRenameDevice(zv); +} + +static void +zvol_os_rename_minor(zvol_state_t *zv, const char *newname) +{ + // int readonly = get_disk_ro(zv->zv_zso->zvo_disk); + + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + zvol_os_spawn(zvol_os_rename_device_cb, zv); + + /* + * The block device's read-only state is briefly changed causing + * a KOBJ_CHANGE uevent to be issued. This ensures udev detects + * the name change and fixes the symlinks. This does not change + * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never + * changes. This would normally be done using kobject_uevent() but + * that is a GPL-only symbol which is why we need this workaround. 
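+	 * On macOS the rename is instead propagated to clients by
+	 * zvolRenameDevice() (media offline/online messages), so the
+	 * read-only toggle below stays commented out.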
+ */ + // set_disk_ro(zv->zv_zso->zvo_disk, !readonly); + // set_disk_ro(zv->zv_zso->zvo_disk, readonly); +} + +static void +zvol_os_set_disk_ro(zvol_state_t *zv, int flags) +{ + // set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) +{ + // set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +int +zvol_os_open_zv(zvol_state_t *zv, int flag, int otyp, struct proc *p) +{ + int error = 0; + + printf("%s\n", __func__); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zvol_os_verify_and_lock(zv) == 0) + return (SET_ERROR(ENOENT)); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); + + if (zv->zv_open_count == 0) { + error = zvol_first_open(zv, !(flag & FWRITE)); + if (error) + goto out_mutex; + } + + if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + error = EROFS; + goto out_open_count; + } + + zv->zv_open_count++; + + mutex_exit(&zv->zv_state_lock); + + rw_exit(&zv->zv_suspend_lock); + + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); + +out_mutex: + mutex_exit(&zv->zv_state_lock); + + rw_exit(&zv->zv_suspend_lock); + if (error == EINTR) { + error = ERESTART; + schedule(); + } + return (SET_ERROR(error)); +} + +int +zvol_os_open(dev_t devp, int flag, int otyp, struct proc *p) +{ + zvol_state_t *zv; + int error = 0; + + printf("%s\n", __func__); + + if (!getminor(devp)) + return (0); + + zv = zvol_os_find_by_dev(devp); + if (zv == NULL) { + return (SET_ERROR(ENXIO)); + } + + error = zvol_os_open_zv(zv, flag, otyp, p); + + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(error)); +} + +int +zvol_os_close_zv(zvol_state_t *zv, int flag, int otyp, struct proc *p) +{ + printf("%s\n", __func__); + + if (zvol_os_verify_and_lock(zv) == 0) + return (SET_ERROR(ENOENT)); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); + + zv->zv_open_count--; + + if (zv->zv_open_count == 0) + zvol_last_close(zv); + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + + return (0); +} + +int +zvol_os_close(dev_t dev, int flag, int otyp, struct proc *p) +{ + zvol_state_t *zv; + int error = 0; + + printf("%s\n", __func__); + + if (!getminor(dev)) + return (0); + + zv = zvol_os_find_by_dev(dev); + if (zv == NULL) { + return (SET_ERROR(-ENXIO)); + } + + error = zvol_os_close_zv(zv, flag, otyp, p); + + mutex_exit(&zv->zv_state_lock); + return (0); +} + +void +zvol_os_strategy(struct buf *bp) +{ + +} + +int +zvol_os_get_volume_blocksize(dev_t dev) +{ + /* XNU can only handle two sizes. 
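+	 * We always report DEV_BSIZE here; the zvol's volblocksize is
+	 * exposed separately through the DKIOCGETBLOCKSIZE ioctl below.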
*/ + return (DEV_BSIZE); +} + +int +zvol_os_ioctl(dev_t dev, unsigned long cmd, caddr_t data, int isblk, + cred_t *cr, int *rvalp) +{ + int error = 0; + u_int32_t *f; + u_int64_t *o; + zvol_state_t *zv = NULL; + + printf("%s\n", __func__); + + if (!getminor(dev)) + return (ENXIO); + + zv = zvol_os_find_by_dev(dev); + + if (zv == NULL) { + dprintf("zv is NULL\n"); + return (ENXIO); + } + + f = (u_int32_t *)data; + o = (u_int64_t *)data; + + switch (cmd) { + + case DKIOCGETMAXBLOCKCOUNTREAD: + dprintf("DKIOCGETMAXBLOCKCOUNTREAD\n"); + *o = 32; + break; + + case DKIOCGETMAXBLOCKCOUNTWRITE: + dprintf("DKIOCGETMAXBLOCKCOUNTWRITE\n"); + *o = 32; + break; + case DKIOCGETMAXSEGMENTCOUNTREAD: + dprintf("DKIOCGETMAXSEGMENTCOUNTREAD\n"); + *o = 32; + break; + + case DKIOCGETMAXSEGMENTCOUNTWRITE: + dprintf("DKIOCGETMAXSEGMENTCOUNTWRITE\n"); + *o = 32; + break; + + case DKIOCGETBLOCKSIZE: + dprintf("DKIOCGETBLOCKSIZE: %llu\n", + zv->zv_volblocksize); + *f = zv->zv_volblocksize; + break; + + case DKIOCSETBLOCKSIZE: + dprintf("DKIOCSETBLOCKSIZE %lu\n", *f); + + if (!isblk) { + /* We can only do this for a block device */ + error = ENODEV; + break; + } + + if (zvol_check_volblocksize(zv->zv_name, + (uint64_t)*f)) { + error = EINVAL; + break; + } + + /* set the new block size */ + zv->zv_volblocksize = (uint64_t)*f; + dprintf("setblocksize changed: %llu\n", + zv->zv_volblocksize); + break; + + case DKIOCISWRITABLE: + dprintf("DKIOCISWRITABLE\n"); + if (zv && (zv->zv_flags & ZVOL_RDONLY)) + *f = 0; + else + *f = 1; + break; +#ifdef DKIOCGETBLOCKCOUNT32 + case DKIOCGETBLOCKCOUNT32: + dprintf("DKIOCGETBLOCKCOUNT32: %lu\n", + (uint32_t)zv->zv_volsize / zv->zv_volblocksize); + *f = (uint32_t)zv->zv_volsize / zv->zv_volblocksize; + break; +#endif + + case DKIOCGETBLOCKCOUNT: + dprintf("DKIOCGETBLOCKCOUNT: %llu\n", + zv->zv_volsize / zv->zv_volblocksize); + *o = (uint64_t)zv->zv_volsize / zv->zv_volblocksize; + break; + + case DKIOCGETBASE: + dprintf("DKIOCGETBASE\n"); + /* + * What offset should we say? 
+ * 0 is ok for FAT but to HFS + */ + *o = zv->zv_volblocksize * 0; + break; + + case DKIOCGETPHYSICALBLOCKSIZE: + dprintf("DKIOCGETPHYSICALBLOCKSIZE\n"); + *f = zv->zv_volblocksize; + break; + +#ifdef DKIOCGETTHROTTLEMASK + case DKIOCGETTHROTTLEMASK: + dprintf("DKIOCGETTHROTTLEMASK\n"); + *o = 0; + break; +#endif + + case DKIOCGETMAXBYTECOUNTREAD: + *o = SPA_MAXBLOCKSIZE; + break; + + case DKIOCGETMAXBYTECOUNTWRITE: + *o = SPA_MAXBLOCKSIZE; + break; +#ifdef DKIOCUNMAP + case DKIOCUNMAP: + dprintf("DKIOCUNMAP\n"); + *f = 1; + break; +#endif + + case DKIOCGETFEATURES: + *f = 0; + break; + +#ifdef DKIOCISSOLIDSTATE + case DKIOCISSOLIDSTATE: + dprintf("DKIOCISSOLIDSTATE\n"); + *f = 0; + break; +#endif + + case DKIOCISVIRTUAL: + *f = 1; + break; + + case DKIOCGETMAXSEGMENTBYTECOUNTREAD: + *o = 32 * zv->zv_volblocksize; + break; + + case DKIOCGETMAXSEGMENTBYTECOUNTWRITE: + *o = 32 * zv->zv_volblocksize; + break; + + case DKIOCSYNCHRONIZECACHE: + dprintf("DKIOCSYNCHRONIZECACHE\n"); + break; + + default: + dprintf("unknown ioctl: ENOTTY\n"); + error = ENOTTY; + break; + } + + mutex_exit(&zv->zv_state_lock); + + return (SET_ERROR(error)); +} + +const static zvol_platform_ops_t zvol_macos_ops = { + .zv_free = zvol_os_free, + .zv_rename_minor = zvol_os_rename_minor, + .zv_create_minor = zvol_os_create_minor, + .zv_update_volsize = zvol_os_update_volsize, + .zv_clear_private = zvol_os_clear_private, + .zv_is_zvol = zvol_os_is_zvol, + .zv_set_disk_ro = zvol_os_set_disk_ro, + .zv_set_capacity = zvol_os_set_capacity, +}; + +int +zvol_init(void) +{ + int threads = MIN(MAX(zvol_threads, 1), 1024); + + zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, + threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (zvol_taskq == NULL) { + return (-ENOMEM); + } + + zvol_init_impl(); + zvol_register_ops(&zvol_macos_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); + taskq_destroy(zvol_taskq); +} diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 919f2aa5b9..528025094b 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -224,7 +224,8 @@ zfs_mod_supported_feature(const char *name) * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. 
*/ -#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) || defined(__APPLE__) +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || \ + defined(__FreeBSD__) || defined(__APPLE__) return (B_TRUE); #else return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name)); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index f2882a3146..98daf55f2b 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -606,8 +606,9 @@ zfs_prop_init(void) "RSNAPS"); #ifdef __APPLE__ - zprop_register_index(ZFS_PROP_BROWSE, "com.apple.browse", 1,PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "COM.APPLE.BROWSE", boolean_table); + zprop_register_index(ZFS_PROP_BROWSE, "com.apple.browse", 1, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "COM.APPLE.BROWSE", + boolean_table); zprop_register_index(ZFS_PROP_IGNOREOWNER, "com.apple.ignoreowner", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "COM.APPLE.IGNOREOWNER", boolean_table); diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c index bc02716cb0..debec63433 100644 --- a/module/zcommon/zprop_common.c +++ b/module/zcommon/zprop_common.c @@ -77,7 +77,8 @@ zfs_mod_supported_prop(const char *name, zfs_type_t type) * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. */ -#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) || defined(__APPLE__) +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || \ + defined(__FreeBSD__) || defined(__APPLE__) return (B_TRUE); #else return (zfs_mod_supported(type == ZFS_TYPE_POOL ? diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 283ab5a9d5..4735eb5ff2 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1387,7 +1387,7 @@ extern uint64_t zvolIO_kit_write(struct iomem *iomem, uint64_t offset, int dmu_read_iokit_dnode(dnode_t *dn, uint64_t *offset, - uint64_t position, uint64_t *size, struct iomem *iomem) + uint64_t position, uint64_t *size, struct iomem *iomem) { int err; @@ -1553,12 +1553,12 @@ dmu_write_iokit_dnode(dnode_t *dn, uint64_t *offset, uint64_t position, int err = 0; int i; - err = dmu_buf_hold_array_by_dnode(dn, *offset+position, *size, + err = dmu_buf_hold_array_by_dnode(dn, *offset+position, *size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - while(*size > 0) { + while (*size > 0) { for (i = 0; i < numbufs; i++) { int tocpy; @@ -1571,7 +1571,8 @@ dmu_write_iokit_dnode(dnode_t *dn, uint64_t *offset, uint64_t position, bufoff = (position + *offset) - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, *size); - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == numbufs-1 || + tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c26964cee5..9312c1369a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1255,7 +1255,7 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); -#if defined (_KERNEL) && defined (__APPLE__) +#if defined(_KERNEL) && defined(__APPLE__) spa_activate_os(spa); #endif @@ -1393,7 +1393,7 @@ spa_deactivate(spa_t *spa) spa->spa_did = 0; } -#if defined (_KERNEL) && defined (__APPLE__) +#if defined(_KERNEL) && defined(__APPLE__) spa_deactivate_os(spa); #endif @@ -5923,7 +5923,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = 
SPA_LOAD_NONE; -#if defined (__APPLE__) && defined (_KERNEL) +#if defined(__APPLE__) && defined(_KERNEL) spa_create_os(spa); #endif @@ -6112,7 +6112,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) zvol_create_minors_recursive(pool); -#if defined (__APPLE__) && defined (_KERNEL) +#if defined(__APPLE__) && defined(_KERNEL) spa_create_os(spa); #endif @@ -6352,7 +6352,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, } export_spa: -#if defined (__APPLE__) && defined (_KERNEL) +#if defined(__APPLE__) && defined(_KERNEL) spa_export_os(spa); #endif diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 8a5a078cab..e5b7ce1839 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1412,7 +1412,8 @@ zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) subclass, spa, NULL, NULL, NULL, 0, 0); - if (ereport == NULL) return; + if (ereport == NULL) + return; VERIFY0(nvlist_add_string(ereport, "snapshot_name", name)); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 10c1264053..377205dd74 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -257,14 +257,16 @@ zfs_xattr_owner_unlinked(znode_t *zp) if (tzp != zp) zrele(tzp); #elif __APPLE__ + VERIFY(ZTOV(zp) != NULL); if (VN_HOLD(ZTOV(zp)) == 0) { /* - * if zp is XATTR node, keep walking up via z_xattr_parent until we - * get the owner + * if zp is XATTR node, keep walking up via z_xattr_parent + * until we get the owner */ while (zp->z_pflags & ZFS_XATTR) { ASSERT3U(zp->z_xattr_parent, !=, 0); - if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) { + if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, + &dzp) != 0) { unlinked = 1; break; } diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 5b215c0470..83be2ab817 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -71,7 +71,7 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, bzero(vap, sizeof (*vap)); vap->va_mask = (uint_t)mask; vap->va_mode = mode; -#if defined (__FreeBSD__) || defined (__APPLE__) +#if defined(__FreeBSD__) || defined(__APPLE__) vap->va_type = IFTOVT(mode); #endif vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 9f67112148..e7ca31fcba 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -66,9 +66,9 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {"ZPL_DACL_ACES", 0, SA_ACL, 0}, {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0}, {"ZPL_PROJID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, -#if defined (__APPLE__) - {"ZPL_ADDTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, - {"ZPL_DOCUMENTID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, +#if defined(__APPLE__) + {"ZPL_ADDTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_DOCUMENTID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, #endif {NULL, 0, 0, 0} }; diff --git a/scripts/cmd-macos.sh b/scripts/cmd-macos.sh new file mode 100755 index 0000000000..4549e70d45 --- /dev/null +++ b/scripts/cmd-macos.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +if test x"$#" = "x0" ; then + printf "You need to supply a command.\n" + exit 1 +fi + +cmd=$1 +shift + +READLINK=$(which greadlink 2>/dev/null) +if test "x$READLINK" = "x" ; then + READLINK=$(which readlink 2>/dev/null) +fi + +if ! test "x$READLINK" = "x" ; then + $READLINK -f . > /dev/null 2>&1 + if ! test x$? 
= "x0" ; then + unset READLINK + else + CANONICALIZE="$READLINK -f" + fi +fi + +if test "x$READLINK" = "x" ; then + REALPATH=$(which grealpath 2>/dev/null) + if test "x$REALPATH" = "x" ; then + REALPATH=$(which realpath 2>/dev/null) + fi + if test "x$REALPATH" = "x" ; then + CANONICALIZE=readlink + else + CANONICALIZE=$REALPATH + fi +fi + +topdir=$(dirname "$($CANONICALIZE "$0")") + +if test "x$topdir" = x"." ; then + if ! test -f zfs.release.in ; then + printf "cd into the zfs source directory or install GNU readlink or realpath.\n" + printf "Homebrew: brew install coreutils\n" + printf "MacPorts: port install coreutils\n" + printf "Gentoo Prefix: emerge sys-apps/coreutils\n" + exit 1 + fi +fi + +topdir=$topdir/../ + +for lib in nvpair uutil zpool zfs zfs_core diskmgt; do + export DYLD_LIBRARY_PATH=$topdir/lib/lib${lib}/.libs:$DYLD_LIBRARY_PATH +done +for c in zdb zfs zpool ztest; do + export PATH=${topdir}/cmd/${c}/.libs:$PATH +done + +#echo PATH=$PATH +#echo DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH +exec "${topdir}/cmd/$cmd/.libs/$cmd" "$@" diff --git a/scripts/debug-macos.sh b/scripts/debug-macos.sh new file mode 100755 index 0000000000..cd8e7ec847 --- /dev/null +++ b/scripts/debug-macos.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +if test x"$#" = "x0" ; then + printf "You need to supply a command.\n" + exit 1 +fi + +cmd=$1 +shift + +READLINK=$(which greadlink 2>/dev/null) +if test "x$READLINK" = "x" ; then + READLINK=$(which readlink 2>/dev/null) +fi + +if ! test "x$READLINK" = "x" ; then + $READLINK -f . > /dev/null 2>&1 + if ! test x$? = "x0" ; then + unset READLINK + else + CANONICALIZE="$READLINK -f" + fi +fi + +if test "x$READLINK" = "x" ; then + REALPATH=$(which grealpath 2>/dev/null) + if test "x$REALPATH" = "x" ; then + REALPATH=$(which realpath 2>/dev/null) + fi + if test "x$REALPATH" = "x" ; then + CANONICALIZE=readlink + else + CANONICALIZE=$REALPATH + fi +fi + +topdir=$(dirname "$($CANONICALIZE "$0")") + +if test "x$topdir" = x"." ; then + if ! test -f zfs.release.in ; then + printf "cd into the zfs source directory or install GNU readlink or realpath.\n" + printf "Homebrew: brew install coreutils\n" + printf "MacPorts: port install coreutils\n" + printf "Gentoo Prefix: emerge sys-apps/coreutils\n" + exit 1 + fi +fi + +topdir=$topdir/../ + +for lib in nvpair uutil zpool zfs zfs_core diskmgt; do + export DYLD_LIBRARY_PATH=$topdir/lib/lib${lib}/.libs:$DYLD_LIBRARY_PATH +done +for c in zdb zfs zpool ztest; do + export PATH=${topdir}/cmd/${c}/.libs:$PATH +done + +#echo PATH=$PATH +#echo DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH +exec lldb "${topdir}/cmd/$cmd/.libs/$cmd" "$@" diff --git a/scripts/load_macos.sh b/scripts/load_macos.sh new file mode 100755 index 0000000000..2e8a1880b7 --- /dev/null +++ b/scripts/load_macos.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Expected to be run from the root of the source tree, as root; +# ./scripts/load_macos.sh +# +# Copies compiled zfs.kext to /tmp/ and prepares the requirements +# for load. +# + +rsync -ar module/os/macos/zfs/zfs.kext/ /tmp/zfs.kext/ + +chown -R root:wheel /tmp/zfs.kext + +kextload -v /tmp/zfs.kext || kextutil /tmp/zfs.kext + +# log stream --source --predicate 'sender == "zfs"' --style compact +