diff --git a/appveryor.yml b/appveryor.yml new file mode 100644 index 0000000000..725f8fd87e --- /dev/null +++ b/appveryor.yml @@ -0,0 +1,12 @@ +version: 1.0.{build} +branches: + only: + - macos +image: macOS +build_script: +- sh: >- + pwd + ls -l + sh autoconf.sh + ./configure CPPFLAGS="-I/usr/local/opt/gettext/include -I/usr/local/opt/openssl@1.1/include" LDFLAGS="-L/usr/local/opt/gettext/lib/ -L/usr/local/opt/openssl@1.1/lib" + make diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am index 8b2d0c2002..57d24aa206 100644 --- a/cmd/zed/zed.d/Makefile.am +++ b/cmd/zed/zed.d/Makefile.am @@ -20,6 +20,8 @@ dist_zedexec_SCRIPTS = \ scrub_finish-notify.sh \ statechange-led.sh \ statechange-notify.sh \ + snapshot_mount.sh \ + snapshot_unmount.sh \ vdev_clear-led.sh \ vdev_attach-led.sh \ pool_import-led.sh \ @@ -38,6 +40,8 @@ zedconfdefaults = \ scrub_finish-notify.sh \ statechange-led.sh \ statechange-notify.sh \ + snapshot_mount.sh \ + snapshot_unmount.sh \ vdev_clear-led.sh \ vdev_attach-led.sh \ pool_import-led.sh \ diff --git a/cmd/zed/zed.d/snapshot_mount.sh b/cmd/zed/zed.d/snapshot_mount.sh new file mode 100644 index 0000000000..5cf807aa99 --- /dev/null +++ b/cmd/zed/zed.d/snapshot_mount.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# +# Helper to mount and unmount snapshots when asked to by kernel. +# +# Mostly used in macOS. +# +set -ef + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_SNAPSHOT_NAME}" ] || exit 1 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 2 + +if [ "${ZEVENT_SUBCLASS}" = "snapshot_mount" ]; then + action="mount" +elif [ "${ZEVENT_SUBCLASS}" = "snapshot_unmount" ]; then + action="unmount" +else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 3 +fi + +zed_exit_if_ignoring_this_event +zed_check_cmd "${ZFS}" || exit 4 + +"${ZFS}" "${action}" "${ZEVENT_SNAPSHOT_NAME}" + +finished diff --git a/cmd/zed/zed.d/snapshot_unmount.sh b/cmd/zed/zed.d/snapshot_unmount.sh new file mode 120000 index 0000000000..9f74a29e61 --- /dev/null +++ b/cmd/zed/zed.d/snapshot_unmount.sh @@ -0,0 +1 @@ +snapshot_mount.sh \ No newline at end of file diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 2f4e075064..062bd625bf 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -766,11 +766,11 @@ zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); - ret = 1; + ret = 0; } else if (zfs_share(zhp) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); - ret = 1; + ret = 0; } zfs_commit_all_shares(); } @@ -6973,7 +6973,7 @@ share_mount(int op, int argc, char **argv) } } else { -#if defined (__APPLE__) +#if defined(__APPLE__) /* * OsX can not mount from kernel, users are expected to mount * by hand using "zfs mount dataset@snapshot". 
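The hunks above add a zedlet that reacts to snapshot_mount/snapshot_unmount events by shelling out to the zfs command, and the zfs_main.c comment notes that on macOS snapshots are mounted from userland rather than by the kernel. A minimal libzfs sketch of that userland path is shown below; the pool/snapshot name is a placeholder, and it assumes the macOS port accepts snapshot handles in zfs_mount()/zfs_unmount(), which is what the new "zfs mount dataset@snapshot" support implies.

#include <libzfs.h>

/*
 * Illustrative only: mount and later unmount a snapshot from userland,
 * roughly what "zfs mount tank/fs@backup" does on the macOS port.
 * "tank/fs@backup" is a hypothetical snapshot name.
 */
static int
mount_snapshot_example(void)
{
        libzfs_handle_t *hdl;
        zfs_handle_t *zhp;
        int err;

        if ((hdl = libzfs_init()) == NULL)
                return (1);

        zhp = zfs_open(hdl, "tank/fs@backup", ZFS_TYPE_SNAPSHOT);
        if (zhp == NULL) {
                libzfs_fini(hdl);
                return (1);
        }

        err = zfs_mount(zhp, NULL, 0);          /* mount the snapshot */
        if (err == 0)
                err = zfs_unmount(zhp, NULL, 0);

        zfs_close(zhp);
        libzfs_fini(hdl);
        return (err);
}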
@@ -6997,8 +6997,8 @@ share_mount(int op, int argc, char **argv) } else { - ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, - options); + ret = share_mount_one(zhp, op, flags, NULL, + B_TRUE, options); } zfs_close(zhp); @@ -7400,7 +7400,7 @@ unshare_unmount(int op, int argc, char **argv) return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); -#if defined (__APPLE__) +#if defined(__APPLE__) /* Temporarily, allow mounting snapshots on OS X */ if ((zhp = zfs_open(g_zfs, argv[0], diff --git a/cmd/zpool/os/macos/zpool_vdev_os.c b/cmd/zpool/os/macos/zpool_vdev_os.c new file mode 100644 index 0000000000..b95e4deea6 --- /dev/null +++ b/cmd/zpool/os/macos/zpool_vdev_os.c @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zpool_util.h" +#include + +#include +#include +#include +#include +#include + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + return (B_FALSE); +} + +void +zpool_vdev_enable_file(struct stat64 *statbuf, boolean_t *wholedisk) +{ + if (S_ISCHR(statbuf->st_mode)) { + statbuf->st_mode &= ~S_IFCHR; + statbuf->st_mode |= S_IFBLK; + *wholedisk = B_FALSE; + } +} + +int +check_device(const char *name, boolean_t force, + boolean_t isspare, boolean_t iswholedisk) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0) + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof (path)); + + return (check_file(path, force, isspare)); +} diff --git a/configure.ac b/configure.ac index 6e5a333c30..db5cb79269 100644 --- a/configure.ac +++ b/configure.ac @@ -139,6 +139,8 @@ AC_CONFIG_FILES([ include/os/macos/spl/Makefile include/os/macos/spl/rpc/Makefile include/os/macos/spl/sys/Makefile + include/os/macos/zfs/Makefile + include/os/macos/zfs/sys/Makefile include/sys/Makefile include/sys/crypto/Makefile include/sys/fm/Makefile @@ -161,6 +163,12 @@ AC_CONFIG_FILES([ lib/libspl/include/os/freebsd/sys/Makefile lib/libspl/include/os/linux/Makefile lib/libspl/include/os/linux/sys/Makefile + lib/libspl/include/os/macos/Makefile + lib/libspl/include/os/macos/ia32/Makefile + lib/libspl/include/os/macos/ia32/sys/Makefile + lib/libspl/include/os/macos/mach/Makefile + lib/libspl/include/os/macos/rpc/Makefile + lib/libspl/include/os/macos/sys/Makefile lib/libspl/include/rpc/Makefile lib/libspl/include/sys/Makefile lib/libspl/include/sys/dktp/Makefile diff --git a/include/os/macos/Makefile.am b/include/os/macos/Makefile.am new file mode 100644 index 0000000000..a9564c3e3c --- /dev/null +++ b/include/os/macos/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = spl zfs \ No newline at end of file diff --git 
a/include/os/macos/spl/Makefile.am b/include/os/macos/spl/Makefile.am new file mode 100644 index 0000000000..75cad0836e --- /dev/null +++ b/include/os/macos/spl/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys rpc diff --git a/include/os/macos/spl/ia32/sys/asm_linkage.h b/include/os/macos/spl/ia32/sys/asm_linkage.h new file mode 100644 index 0000000000..0009705ad6 --- /dev/null +++ b/include/os/macos/spl/ia32/sys/asm_linkage.h @@ -0,0 +1,297 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_ASM_LINKAGE_H +#define _IA32_SYS_ASM_LINKAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +/* + * D16 and A16 are used to insert instructions prefixes; the + * macros help the assembler code be slightly more portable. + */ +#if !defined(__GNUC_AS__) +/* + * /usr/ccs/bin/as prefixes are parsed as separate instructions + */ +#define D16 data16; +#define A16 addr16; + +/* + * (There are some weird constructs in constant expressions) + */ +#define _CONST(const) [const] +#define _BITNOT(const) -1!_CONST(const) +#define _MUL(a, b) _CONST(a \* b) + +#else +/* + * Why not use the 'data16' and 'addr16' prefixes .. well, the + * assembler doesn't quite believe in real mode, and thus argues with + * us about what we're trying to do. + */ +#define D16 .byte 0x66; +#define A16 .byte 0x67; + +#define _CONST(const) (const) +#define _BITNOT(const) ~_CONST(const) +#define _MUL(a, b) _CONST(a * b) + +#endif + +/* + * C pointers are different sizes between i386 and amd64. + * These constants can be used to compute offsets into pointer arrays. + */ +#if defined(__amd64) +#define CLONGSHIFT 3 +#define CLONGSIZE 8 +#define CLONGMASK 7 +#elif defined(__i386) +#define CLONGSHIFT 2 +#define CLONGSIZE 4 +#define CLONGMASK 3 +#endif + +/* + * Since we know we're either ILP32 or LP64 .. 
+ */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 4, 0x90 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 +#define XMM_ALIGN_LOG 4, 0x90 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. These are similar to the ANSI-C + * #pragma weak name = _name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ + .weak sym; \ +/* CSTYLED */ \ +sym = _/**/sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: + + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. 
+ */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/include/os/macos/spl/libkern/libkern.h b/include/os/macos/spl/libkern/libkern.h new file mode 100644 index 0000000000..5d4fa410b7 --- /dev/null +++ b/include/os/macos/spl/libkern/libkern.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2020 Jorgen Lundman + * + */ + +#ifndef _SPL_LIBKERN_H +#define _SPL_LIBKERN_H + +/* + * We wrap this header to handle that copyinstr()'s final argument is + * mandatory on OSX. Wrap it to call our ddi_copyinstr to make it optional. + */ +#include_next +#undef copyinstr +#define copyinstr(U, K, L, D) ddi_copyinstr((U), (K), (L), (D)) + +#endif diff --git a/include/os/macos/spl/linux/init.h b/include/os/macos/spl/linux/init.h new file mode 100644 index 0000000000..4ab1523c16 --- /dev/null +++ b/include/os/macos/spl/linux/init.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LINUX_INIT_H +#define _LINUX_INIT_H + + +#endif diff --git a/include/os/macos/spl/linux/kernel.h b/include/os/macos/spl/linux/kernel.h new file mode 100644 index 0000000000..73a2b2eaad --- /dev/null +++ b/include/os/macos/spl/linux/kernel.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LINUX_KERNEL_H +#define _LINUX_KERNEL_H + +#endif diff --git a/include/os/macos/spl/linux/module.h b/include/os/macos/spl/linux/module.h new file mode 100644 index 0000000000..264d6c058d --- /dev/null +++ b/include/os/macos/spl/linux/module.h @@ -0,0 +1,28 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LINUX_MODULE_H +#define _LINUX_MODULE_H + +#include +#include + +#endif diff --git a/include/os/macos/spl/rpc/Makefile.am b/include/os/macos/spl/rpc/Makefile.am new file mode 100644 index 0000000000..770d26812e --- /dev/null +++ b/include/os/macos/spl/rpc/Makefile.am @@ -0,0 +1,3 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/macos/spl/rpc/types.h \ + $(top_srcdir)/include/os/macos/spl/rpc/xdr.h diff --git a/include/os/macos/spl/rpc/types.h b/include/os/macos/spl/rpc/types.h new file mode 100644 index 0000000000..e089e0ed8c --- /dev/null +++ b/include/os/macos/spl/rpc/types.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H + +typedef int bool_t; + +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/macos/spl/rpc/xdr.h b/include/os/macos/spl/rpc/xdr.h new file mode 100644 index 0000000000..7b8074b05c --- /dev/null +++ b/include/os/macos/spl/rpc/xdr.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (c) 1989, 2011, Oracle and/or its affiliates. All rights reserved. + */ +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Portions of this source code were derived from Berkeley + * 4.3 BSD under license from the Regents of the University of + * California. + */ + +/* + * xdr.h, External Data Representation Serialization Routines. + * + */ + +#ifndef _SPL_RPC_XDR_H +#define _SPL_RPC_XDR_H + + +#include +#include + +/* + * XDR enums and types. + */ +enum xdr_op { + XDR_ENCODE, + XDR_DECODE +}; + +struct xdr_ops; + +typedef struct { + struct xdr_ops *x_ops; /* Also used to let caller know if */ + /* xdrmem_create() succeeds (sigh..) */ + caddr_t x_addr; /* Current buffer addr */ + caddr_t x_addr_end; /* End of the buffer */ + enum xdr_op x_op; /* Stream direction */ +} XDR; + +typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr); + +struct xdr_ops { + bool_t (*xdr_control)(XDR *, int, void *); + + bool_t (*xdr_char)(XDR *, char *); + bool_t (*xdr_u_short)(XDR *, unsigned short *); + bool_t (*xdr_u_int)(XDR *, unsigned *); + bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *); + + bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t); + bool_t (*xdr_string)(XDR *, char **, const uint_t); + bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t, + const uint_t, const xdrproc_t); +}; + +/* + * XDR control operator. + */ +#define XDR_GET_BYTES_AVAIL 1 + +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; + +/* + * XDR functions. + */ +void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op); +#define xdr_destroy(xdrs) ((void) 0) + +#define xdr_control(xdrs, req, info) \ + (xdrs)->x_ops->xdr_control((xdrs), (req), (info)) + +/* + * For precaution, the following are defined as static inlines instead of macros + * to get some amount of type safety. + * + * Also, macros wouldn't work in the case where typecasting is done, because it + * must be possible to reference the functions' addresses by these names. 
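The comment above explains why the xdr_* entry points are static inline functions rather than macros: callers get some type checking, and each function still has an address that can be stored in an xdrproc_t. A hedged sketch of how kernel code might drive this interface follows; the buffer and values are hypothetical, and only the declarations shown in this header are assumed.

/*
 * Illustrative only: encode two values into a buffer, then decode them
 * back, using the XDR stream and inline wrappers declared above.
 */
static int
xdr_roundtrip_example(void)
{
        char buf[64];
        XDR xw, xr;
        unsigned in_val = 42, out_val = 0;
        u_longlong_t in_big = 1ULL << 40, out_big = 0;

        xdrmem_create(&xw, buf, sizeof (buf), XDR_ENCODE);
        if (!xdr_u_int(&xw, &in_val) || !xdr_u_longlong_t(&xw, &in_big))
                return (-1);

        xdrmem_create(&xr, buf, sizeof (buf), XDR_DECODE);
        if (!xdr_u_int(&xr, &out_val) || !xdr_u_longlong_t(&xr, &out_big))
                return (-1);

        /* Because these are real functions, their addresses can be taken. */
        xdrproc_t proc = (xdrproc_t)xdr_u_int;
        (void) proc;

        return ((in_val == out_val && in_big == out_big) ? 0 : -1);
}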
+ */ +static inline bool_t +xdr_char(XDR *xdrs, char *cp) +{ + return (xdrs->x_ops->xdr_char(xdrs, cp)); +} + +static inline bool_t +xdr_u_short(XDR *xdrs, unsigned short *usp) +{ + return (xdrs->x_ops->xdr_u_short(xdrs, usp)); +} + +static inline bool_t +xdr_short(XDR *xdrs, short *sp) +{ + return (xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp)); +} + +static inline bool_t +xdr_u_int(XDR *xdrs, unsigned *up) +{ + return (xdrs->x_ops->xdr_u_int(xdrs, up)); +} + +static inline bool_t +xdr_int(XDR *xdrs, int *ip) +{ + return (xdrs->x_ops->xdr_u_int(xdrs, (unsigned *)ip)); +} + +static inline bool_t +xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp) +{ + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp)); +} + +static inline bool_t +xdr_longlong_t(XDR *xdrs, longlong_t *llp) +{ + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *)llp)); +} + +/* + * Fixed-length opaque data. + */ +static inline bool_t +xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + return (xdrs->x_ops->xdr_opaque(xdrs, cp, cnt)); +} + +/* + * Variable-length string. + * The *sp buffer must have (maxsize + 1) bytes. + */ +static inline bool_t +xdr_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + return (xdrs->x_ops->xdr_string(xdrs, sp, maxsize)); +} + +/* + * Variable-length arrays. + */ +static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, + const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc) +{ + return (xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize, + elproc)); +} + +#endif /* SPL_RPC_XDR_H */ diff --git a/include/os/macos/spl/sys/Makefile.am b/include/os/macos/spl/sys/Makefile.am new file mode 100644 index 0000000000..5703774569 --- /dev/null +++ b/include/os/macos/spl/sys/Makefile.am @@ -0,0 +1,49 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/macos/spl/sys/atomic.h \ + $(top_srcdir)/include/os/macos/spl/sys/byteorder.h \ + $(top_srcdir)/include/os/macos/spl/sys/callb.h \ + $(top_srcdir)/include/os/macos/spl/sys/cmn_err.h \ + $(top_srcdir)/include/os/macos/spl/sys/condvar.h \ + $(top_srcdir)/include/os/macos/spl/sys/console.h \ + $(top_srcdir)/include/os/macos/spl/sys/cred.h \ + $(top_srcdir)/include/os/macos/spl/sys/debug.h \ + $(top_srcdir)/include/os/macos/spl/sys/errno.h \ + $(top_srcdir)/include/os/macos/spl/sys/fcntl.h \ + $(top_srcdir)/include/os/macos/spl/sys/file.h \ + $(top_srcdir)/include/os/macos/spl/sys/inttypes.h \ + $(top_srcdir)/include/os/macos/spl/sys/isa_defs.h \ + $(top_srcdir)/include/os/macos/spl/sys/kmem.h \ + $(top_srcdir)/include/os/macos/spl/sys/kmem_impl.h \ + $(top_srcdir)/include/os/macos/spl/sys/kstat.h \ + $(top_srcdir)/include/os/macos/spl/sys/list.h \ + $(top_srcdir)/include/os/macos/spl/sys/mod_os.h \ + $(top_srcdir)/include/os/macos/spl/sys/mutex.h \ + $(top_srcdir)/include/os/macos/spl/sys/param.h \ + $(top_srcdir)/include/os/macos/spl/sys/policy.h \ + $(top_srcdir)/include/os/macos/spl/sys/priv.h \ + $(top_srcdir)/include/os/macos/spl/sys/proc.h \ + $(top_srcdir)/include/os/macos/spl/sys/processor.h \ + $(top_srcdir)/include/os/macos/spl/sys/random.h \ + $(top_srcdir)/include/os/macos/spl/sys/rwlock.h \ + $(top_srcdir)/include/os/macos/spl/sys/seg_kmem.h \ + $(top_srcdir)/include/os/macos/spl/sys/signal.h \ + $(top_srcdir)/include/os/macos/spl/sys/stropts.h \ + $(top_srcdir)/include/os/macos/spl/sys/sunddi.h \ + $(top_srcdir)/include/os/macos/spl/sys/sysmacros.h \ + $(top_srcdir)/include/os/macos/spl/sys/systeminfo.h \ + $(top_srcdir)/include/os/macos/spl/sys/systm.h \ + 
$(top_srcdir)/include/os/macos/spl/sys/taskq.h \ + $(top_srcdir)/include/os/macos/spl/sys/taskq_impl.h \ + $(top_srcdir)/include/os/macos/spl/sys/thread.h \ + $(top_srcdir)/include/os/macos/spl/sys/time.h \ + $(top_srcdir)/include/os/macos/spl/sys/timer.h \ + $(top_srcdir)/include/os/macos/spl/sys/tsd.h \ + $(top_srcdir)/include/os/macos/spl/sys/types.h \ + $(top_srcdir)/include/os/macos/spl/sys/utsname.h \ + $(top_srcdir)/include/os/macos/spl/sys/varargs.h \ + $(top_srcdir)/include/os/macos/spl/sys/vfs.h \ + $(top_srcdir)/include/os/macos/spl/sys/vmem.h \ + $(top_srcdir)/include/os/macos/spl/sys/vmem_impl.h \ + $(top_srcdir)/include/os/macos/spl/sys/vmsystm.h \ + $(top_srcdir)/include/os/macos/spl/sys/vnode.h \ + $(top_srcdir)/include/os/macos/spl/sys/zone.h diff --git a/include/os/macos/spl/sys/acl.h b/include/os/macos/spl/sys/acl.h new file mode 100644 index 0000000000..840ba7f43c --- /dev/null +++ b/include/os/macos/spl/sys/acl.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_ACL_H +#define _SPL_ACL_H + +#include + +typedef struct ace { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; +} ace_t; + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define MAX_ACL_ENTRIES 1024 + +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) + +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP) + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS|\ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES|\ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 + +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; + +#endif /* _SPL_ACL_H */ diff --git a/include/os/macos/spl/sys/atomic.h b/include/os/macos/spl/sys/atomic.h new file mode 100644 index 0000000000..0fcc072680 --- /dev/null +++ b/include/os/macos/spl/sys/atomic.h @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution 
License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * OSX Atomic functions using clang builtins. + * + * Jorgen Lundman + * + */ + +#ifndef _SPL_ATOMIC_H +#define _SPL_ATOMIC_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Increment target + */ +static inline void +atomic_inc_8(volatile uint8_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline void +atomic_inc_16(volatile uint16_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline void +atomic_inc_32(volatile uint32_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline void +atomic_inc_64(volatile uint64_t *target) +{ + __sync_fetch_and_add(target, 1); +} + +static inline int32_t +atomic_inc_32_nv(volatile uint32_t *target) +{ + return (__sync_add_and_fetch(target, 1)); +} + +static inline int64_t +atomic_inc_64_nv(volatile uint64_t *target) +{ + return (__sync_add_and_fetch(target, 1)); +} + + + +/* + * Decrement target + */ +static inline void +atomic_dec_8(volatile uint8_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline void +atomic_dec_16(volatile uint16_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline void +atomic_dec_32(volatile uint32_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline void +atomic_dec_64(volatile uint64_t *target) +{ + __sync_fetch_and_sub(target, 1); +} + +static inline int32_t +atomic_dec_32_nv(volatile uint32_t *target) +{ + return (__sync_sub_and_fetch(target, 1)); +} + +static inline int64_t +atomic_dec_64_nv(volatile uint64_t *target) +{ + return (__sync_sub_and_fetch(target, 1)); +} + +/* + * Add delta to target + */ +static inline void +atomic_add_8(volatile uint8_t *target, int8_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline void +atomic_add_16(volatile uint16_t *target, int16_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline void +atomic_add_32(volatile uint32_t *target, int32_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline uint32_t +atomic_add_32_nv(volatile uint32_t *target, int32_t delta) +{ + return (__sync_add_and_fetch(target, delta)); +} + +static inline void +atomic_add_64(volatile uint64_t *target, int64_t delta) +{ + __sync_add_and_fetch(target, delta); +} + +static inline uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + return (__sync_add_and_fetch(target, delta)); +} + + +/* + * Subtract delta to target + */ +static inline void +atomic_sub_8(volatile uint8_t *target, int8_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline void +atomic_sub_16(volatile uint16_t *target, int16_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline void +atomic_sub_32(volatile uint32_t *target, int32_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline void 
+atomic_sub_64(volatile uint64_t *target, int64_t delta) +{ + __sync_sub_and_fetch(target, delta); +} + +static inline uint64_t +atomic_sub_64_nv(volatile uint64_t *target, int64_t delta) +{ + return (__sync_sub_and_fetch(target, delta)); +} + +/* + * logical OR bits with target + */ +static inline void +atomic_or_8(volatile uint8_t *target, uint8_t mask) +{ + __sync_or_and_fetch(target, mask); +} + +static inline void +atomic_or_16(volatile uint16_t *target, uint16_t mask) +{ + __sync_or_and_fetch(target, mask); +} + +static inline void +atomic_or_32(volatile uint32_t *target, uint32_t mask) +{ + __sync_or_and_fetch(target, mask); +} + +/* + * logical AND bits with target + */ +static inline void +atomic_and_8(volatile uint8_t *target, uint8_t mask) +{ + __sync_and_and_fetch(target, mask); +} + +static inline void +atomic_and_16(volatile uint16_t *target, uint16_t mask) +{ + __sync_and_and_fetch(target, mask); +} + +static inline void +atomic_and_32(volatile uint32_t *target, uint32_t mask) +{ + __sync_and_and_fetch(target, mask); +} + +/* + * Compare And Set + * if *arg1 == arg2, then set *arg1 = arg3; return old value. + */ +static inline uint8_t +atomic_cas_8(volatile uint8_t *_target, uint8_t _cmp, uint8_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint16_t +atomic_cas_16(volatile uint16_t *_target, uint16_t _cmp, uint16_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint32_t +atomic_cas_32(volatile uint32_t *_target, uint32_t _cmp, uint32_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint64_t +atomic_cas_64(volatile uint64_t *_target, uint64_t _cmp, uint64_t _new) +{ + return (__sync_val_compare_and_swap(_target, _cmp, _new)); +} + +static inline uint32_t +atomic_swap_32(volatile uint32_t *_target, uint32_t _new) +{ + return (__sync_lock_test_and_set(_target, _new)); +} + +static inline uint64_t +atomic_swap_64(volatile uint64_t *_target, uint64_t _new) +{ + return (__sync_lock_test_and_set(_target, _new)); +} + +extern void *atomic_cas_ptr(volatile void *_target, void *_cmp, void *_new); + +static inline void +membar_producer(void) +{ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_ATOMIC_H */ diff --git a/include/os/macos/spl/sys/byteorder.h b/include/os/macos/spl/sys/byteorder.h new file mode 100644 index 0000000000..1a97baefc0 --- /dev/null +++ b/include/os/macos/spl/sys/byteorder.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
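The atomic.h shim above maps the Solaris atomic_*() API directly onto the compiler's __sync builtins. The difference between the void forms (fetch-and-add, old value discarded), the *_nv forms (add-and-fetch, new value returned) and the old-value return of atomic_cas_*() comes straight from those builtins, as this small standalone userland sketch shows (plain C, using the same builtins the wrappers rely on).

#include <stdio.h>
#include <stdint.h>

/* Demonstrates the builtin semantics the SPL wrappers above rely on. */
int
main(void)
{
        volatile uint64_t v = 5;

        /* atomic_inc_64(): fetch-and-add, old value is discarded. */
        (void) __sync_fetch_and_add(&v, 1);             /* v == 6 */

        /* atomic_inc_64_nv(): add-and-fetch, returns the new value. */
        uint64_t nv = __sync_add_and_fetch(&v, 1);      /* nv == 7 */

        /* atomic_cas_64(): swap only if *target == cmp, return old value. */
        uint64_t old = __sync_val_compare_and_swap(&v, 7, 100);

        printf("new=%llu old=%llu final=%llu\n",
            (unsigned long long)nv, (unsigned long long)old,
            (unsigned long long)v);                     /* new=7 old=7 final=100 */
        return (0);
}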
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_BYTEORDER_H +#define _SPL_BYTEORDER_H + +#include +#include + +#define LE_16(x) OSSwapHostToLittleInt16(x) +#define LE_32(x) OSSwapHostToLittleInt32(x) +#define LE_64(x) OSSwapHostToLittleInt64(x) +#define BE_16(x) OSSwapHostToBigInt16(x) +#define BE_32(x) OSSwapHostToBigInt32(x) +#define BE_64(x) OSSwapHostToBigInt64(x) + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + + +/* 10.8 is lacking in htonll */ +#if !defined(htonll) +#define htonll(x) __DARWIN_OSSwapInt64(x) +#endif +#if !defined(ntohll) +#define ntohll(x) __DARWIN_OSSwapInt64(x) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define _LITTLE_ENDIAN +#endif + +#ifdef __BIG_ENDIAN__ +#define _BIG_ENDIAN +#endif + +#endif /* SPL_BYTEORDER_H */ diff --git a/include/os/macos/spl/sys/callb.h b/include/os/macos/spl/sys/callb.h new file mode 100644 index 0000000000..3d86b9d41c --- /dev/null +++ b/include/os/macos/spl/sys/callb.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_CALLB_H +#define _SPL_CALLB_H + +#include + +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + + +#define CALLOUT_FLAG_ROUNDUP 0x1 +#define CALLOUT_FLAG_ABSOLUTE 0x2 +#define CALLOUT_FLAG_HRESTIME 0x4 +#define CALLOUT_FLAG_32BIT 0x8 + +/* Move me to more correct "sys/callo.h" file when convenient. 
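byteorder.h above expresses the Solaris LE_*/BE_* conversions in terms of the OSSwapHostTo*Int* primitives and builds BE_IN16/BE_IN32 by shifting individual bytes. A short hedged sketch of how a consumer typically uses them, for example when reading a big-endian on-disk field (the buffer contents are hypothetical):

/*
 * Illustrative only: read big-endian fields from a raw buffer and
 * convert a host value for a little-endian on-disk format.
 */
static void
byteorder_example(void)
{
        uint8_t raw[4] = { 0x12, 0x34, 0x56, 0x78 };

        uint16_t hi  = BE_IN16(raw);            /* 0x1234 */
        uint32_t all = BE_IN32(raw);            /* 0x12345678 */

        uint64_t host = 0xCAFEULL;
        uint64_t ondisk_le = LE_64(host);       /* swapped only on big-endian hosts */

        (void) hi; (void) all; (void) ondisk_le;
}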
*/ +#define CALLOUT_NORMAL 1 +typedef uint64_t callout_id_t; +callout_id_t timeout_generic(int, void (*)(void *), void *, hrtime_t, + hrtime_t, int); + +#endif /* _SPL_CALLB_H */ diff --git a/include/os/macos/spl/sys/cmn_err.h b/include/os/macos/spl/sys/cmn_err.h new file mode 100644 index 0000000000..e4343a97a7 --- /dev/null +++ b/include/os/macos/spl/sys/cmn_err.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SPL_CMN_ERR_H +#define _SPL_CMN_ERR_H + +#include +#include + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifdef _KERNEL + +extern void vcmn_err(int, const char *, __va_list); +extern void cmn_err(int, const char *, ...); + +#endif /* _KERNEL */ + +#define fm_panic panic + +#endif /* SPL_CMN_ERR_H */ diff --git a/include/os/macos/spl/sys/condvar.h b/include/os/macos/spl/sys/condvar.h new file mode 100644 index 0000000000..2955a4c581 --- /dev/null +++ b/include/os/macos/spl/sys/condvar.h @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef OSX_CONDVAR_H +#define OSX_CONDVAR_H + +#include +#include + +#define hz 10 /* frequency when using gethrtime() >> 23 for lbolt */ + +typedef enum { + CV_DEFAULT, + CV_DRIVER +} kcv_type_t; + + +struct cv { + uint64_t pad; +}; + +typedef struct cv kcondvar_t; + +void spl_cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg); +void spl_cv_destroy(kcondvar_t *cvp); +void spl_cv_signal(kcondvar_t *cvp); +void spl_cv_broadcast(kcondvar_t *cvp); +int spl_cv_wait(kcondvar_t *cvp, kmutex_t *mp, int flags, const char *msg); +int spl_cv_timedwait(kcondvar_t *, kmutex_t *, clock_t, int, const char *msg); +int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, + hrtime_t tim, hrtime_t res, int flag); + +/* + * Use these wrapper macros to obtain the CV variable + * name to make ZFS more gdb debugging friendly! + * This name shows up as a thread's wait_event string. + */ +#define cv_wait(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO, #cvp) + +#define cv_wait_io(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO, #cvp) + +#define cv_timedwait(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO, #cvp) + +#define cv_timedwait_io(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO, #cvp) + +#define cv_wait_interruptible(cvp, mp) \ + (void) spl_cv_wait((cvp), (mp), PRIBIO|PCATCH, #cvp) + +#define cv_timedwait_interruptible(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO|PCATCH, #cvp) + +/* cv_wait_sig is the correct name for cv_wait_interruptible */ +#define cv_wait_sig(cvp, mp) \ + spl_cv_wait((cvp), (mp), PRIBIO|PCATCH, #cvp) + +#define cv_wait_io_sig(cvp, mp) \ + spl_cv_wait((cvp), (mp), PRIBIO|PCATCH, #cvp) + +#define cv_timedwait_sig(cvp, mp, tim) \ + spl_cv_timedwait((cvp), (mp), (tim), PRIBIO|PCATCH, #cvp) + +#define TICK_TO_NSEC(tick) ((hrtime_t)(tick) * 1000000000 / hz) +#define cv_reltimedwait(cvp, mp, tim, type) \ + cv_timedwait_hires((cvp), (mp), TICK_TO_NSEC((tim)), 0, 0) + +#define cv_timedwait_sig_hires(cvp, mp, tim, res, flag) \ + cv_timedwait_hires(cvp, mp, tim, res, (flag)|PCATCH) + +#define cv_init spl_cv_init +#define cv_destroy spl_cv_destroy +#define cv_broadcast spl_cv_broadcast +#define cv_signal spl_cv_signal + +#endif diff --git a/include/os/macos/spl/sys/console.h b/include/os/macos/spl/sys/console.h new file mode 100644 index 0000000000..57c9622105 --- /dev/null +++ b/include/os/macos/spl/sys/console.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
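The condvar wrappers above deliberately pass the stringified variable name (#cvp) down to spl_cv_wait(), so a blocked thread's wait_event string names the condition variable it is sleeping on. A hedged sketch of the usual wait/signal pattern these macros appear in follows; the structure and field names are hypothetical, and mutex_enter()/mutex_exit() are assumed from the companion mutex.h listed in the Makefile.am above.

/*
 * Illustrative only: classic producer/consumer handshake using the SPL
 * kcondvar_t/kmutex_t API as wrapped above.
 */
typedef struct work_queue {
        kmutex_t        wq_lock;
        kcondvar_t      wq_cv;
        int             wq_ready;
} work_queue_t;

static void
consumer_wait(work_queue_t *wq)
{
        mutex_enter(&wq->wq_lock);
        while (!wq->wq_ready)
                cv_wait(&wq->wq_cv, &wq->wq_lock);  /* waits as "&wq->wq_cv" */
        wq->wq_ready = 0;
        mutex_exit(&wq->wq_lock);
}

static void
producer_post(work_queue_t *wq)
{
        mutex_enter(&wq->wq_lock);
        wq->wq_ready = 1;
        cv_signal(&wq->wq_cv);
        mutex_exit(&wq->wq_lock);
}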
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SPL_SYS_CONSOLE_H +#define _SPL_SYS_CONSOLE_H + +static inline void +console_vprintf(const char *fmt, va_list args) +{ + vprintf(fmt, args); +} + +static inline void +console_printf(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + console_vprintf(fmt, args); + va_end(args); +} + +#endif /* _SPL_SYS_CONSOLE_H */ diff --git a/include/os/macos/spl/sys/cred.h b/include/os/macos/spl/sys/cred.h new file mode 100644 index 0000000000..9dd1640b5b --- /dev/null +++ b/include/os/macos/spl/sys/cred.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_CRED_H +#define _SPL_CRED_H + +#include +#include +#include + +typedef struct ucred cred_t; + +#define kcred (cred_t *)NOCRED +#define CRED() (cred_t *)kauth_cred_get() +#define KUID_TO_SUID(x) (x) +#define KGID_TO_SGID(x) (x) + +#include + +// Older OSX API +#if !(MAC_OS_X_VERSION_MIN_REQUIRED >= 1070) +#define kauth_cred_getruid(x) (x)->cr_ruid +#define kauth_cred_getrgid(x) (x)->cr_rgid +#define kauth_cred_getsvuid(x) (x)->cr_svuid +#define kauth_cred_getsvgid(x) (x)->cr_svgid +#endif + + +extern void crhold(cred_t *cr); +extern void crfree(cred_t *cr); +extern uid_t crgetuid(const cred_t *cr); +extern uid_t crgetruid(const cred_t *cr); +extern uid_t crgetsuid(const cred_t *cr); +extern uid_t crgetfsuid(const cred_t *cr); +extern gid_t crgetgid(const cred_t *cr); +extern gid_t crgetrgid(const cred_t *cr); +extern gid_t crgetsgid(const cred_t *cr); +extern gid_t crgetfsgid(const cred_t *cr); +extern int crgetngroups(const cred_t *cr); +extern gid_t *crgetgroups(const cred_t *cr); +extern void crgetgroupsfree(gid_t *gids); +extern int spl_cred_ismember_gid(cred_t *cr, gid_t gid); + +#define crgetsid(cred, i) (NULL) + +#endif /* _SPL_CRED_H */ diff --git a/include/os/macos/spl/sys/ctype.h b/include/os/macos/spl/sys/ctype.h new file mode 100644 index 0000000000..7455487330 --- /dev/null +++ b/include/os/macos/spl/sys/ctype.h @@ -0,0 +1,27 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
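cred.h above maps cred_t onto the XNU ucred/kauth machinery and declares the usual crget*() accessors along with spl_cred_ismember_gid() for supplementary-group checks. A hedged sketch of a typical permission-style check using that API (the policy and parameter names are purely illustrative):

/*
 * Illustrative only: allow an operation for root or for members of a
 * given group, using the cred accessors declared above.
 */
static int
cred_allows_example(cred_t *cr, gid_t admin_gid)
{
        if (crgetuid(cr) == 0)
                return (1);             /* root is always allowed */

        if (crgetgid(cr) == admin_gid)
                return (1);             /* primary group matches */

        /* Fall back to the supplementary group list. */
        return (spl_cred_ismember_gid(cr, admin_gid));
}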
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_CTYPE_H +#define _SPL_CTYPE_H + +#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) + +#endif diff --git a/include/os/macos/spl/sys/debug.h b/include/os/macos/spl/sys/debug.h new file mode 100644 index 0000000000..61d39d035a --- /dev/null +++ b/include/os/macos/spl/sys/debug.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + +#include + +/* + * Common DEBUG functionality. 
+ */ +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +void spl_backtrace(char *thesignal); +int getpcstack(uintptr_t *pcstack, int pcstack_limit); +void print_symbol(uintptr_t symbol); + +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + boolean_t _verify3_left = (boolean_t)(LEFT); \ + boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ + } while (0) + +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + int64_t _verify3_left = (int64_t)(LEFT); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + uint64_t _verify3_left = (uint64_t)(LEFT); \ + uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + int64_t _verify3_left = (int64_t)(0); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left == _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) + +#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) +#define CTASSERT(x) { _CTASSERT(x, __LINE__); } +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) \ + typedef char __attribute__ ((unused)) \ + __compile_time_assertion__ ## y[(x) ? 
1 : -1] + + + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define ASSERT(x) ((void)0) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define ASSERTV(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY +#define ASSERTV(X) X +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + +#endif /* SPL_DEBUG_H */ diff --git a/include/os/macos/spl/sys/disp.h b/include/os/macos/spl/sys/disp.h new file mode 100644 index 0000000000..3b1bcbb25c --- /dev/null +++ b/include/os/macos/spl/sys/disp.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_DISP_H +#define _SPL_DISP_H + +#endif diff --git a/include/os/macos/spl/sys/dkio.h b/include/os/macos/spl/sys/dkio.h new file mode 100644 index 0000000000..d10314b3e4 --- /dev/null +++ b/include/os/macos/spl/sys/dkio.h @@ -0,0 +1,527 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _OPENSOLARIS_SYS_DKIO_H_ +#define _OPENSOLARIS_SYS_DKIO_H_ + +#include /* Needed for NDKMAP define */ +#include /* Needed for NDKMAP define */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_SUNOS_VTOC_16) +#define NDKMAP 16 /* # of logical partitions */ +#define DK_LABEL_LOC 1 /* location of disk label */ +#elif defined(_SUNOS_VTOC_8) +#define NDKMAP 8 /* # of logical partitions */ +#define DK_LABEL_LOC 0 /* location of disk label */ +#else +#error "No VTOC format defined." +#endif + +/* + * Structures and definitions for disk io control commands + */ + +/* + * Structures used as data by ioctl calls. + */ + +#define DK_DEVLEN 16 /* device name max length, including */ + /* unit # & NULL (ie - "xyc1") */ + +/* + * Used for controller info + */ +struct dk_cinfo { + char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */ + ushort_t dki_ctype; /* controller type */ + ushort_t dki_flags; /* flags */ + ushort_t dki_cnum; /* controller number */ + uint_t dki_addr; /* controller address */ + uint_t dki_space; /* controller bus type */ + uint_t dki_prio; /* interrupt priority */ + uint_t dki_vec; /* interrupt vector */ + char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ + uint_t dki_unit; /* unit number */ + uint_t dki_slave; /* slave number */ + ushort_t dki_partition; /* partition number */ + ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */ +}; + +/* + * Controller types + */ +#define DKC_UNKNOWN 0 +#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */ +#define DKC_WDC2880 2 +#define DKC_XXX_0 3 /* unassigned */ +#define DKC_XXX_1 4 /* unassigned */ +#define DKC_DSD5215 5 +#define DKC_ACB4000 7 +#define DKC_MD21 8 +#define DKC_XXX_2 9 /* unassigned */ +#define DKC_NCRFLOPPY 10 +#define DKC_SMSFLOPPY 12 +#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */ +#define DKC_INTEL82072 14 /* native floppy chip */ +#define DKC_MD 16 /* meta-disk (virtual-disk) driver */ +#define DKC_INTEL82077 19 /* 82077 floppy disk controller */ +#define DKC_DIRECT 20 /* Intel direct attached device i.e. 
IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ +#define DKC_VBD 23 /* virtual block device */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * partition headers: section 1 + * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I)) + */ +struct dk_map { + uint64_t dkl_cylno; /* starting cylinder */ + uint64_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * supported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. + * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. + * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. 
+ * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. + * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. + * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +/* bit flag definitions for dkc_flag */ +#define FLUSH_VOLATILE 0x1 /* Bit 0: if set, only flush */ + /* volatile cache; otherwise, flush */ + /* volatile and non-volatile cache */ + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. + */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + uint32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + uint64_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. + */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. + */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. 
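+ *
+ * Illustrative only (assumed userland caller, not part of this header):
+ *
+ *	struct dk_temperature dkt = { 0 };
+ *	if (ioctl(fd, DKIOCGTEMPERATURE, &dkt) == 0)
+ *		printf("current temp: %d\n", dkt.dkt_cur_temp);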
+ */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 +#define DKT_INVALID_TEMP 0xFFFF + + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. + */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. + * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. + */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. + * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. 
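+ *
+ * Illustrative only (assumed caller, not part of this header):
+ *
+ *	dk_disk_id_t id = { 0 };
+ *	if (ioctl(fd, DKIOC_GETDISKID, &id) == 0 &&
+ *	    id.dkd_dtype == DKD_SCSI_TYPE)
+ *		printf("%.*s\n", DKD_SCSI_PRODUCT,
+ *		    id.disk_id.scsi_disk_id.dkd_sproduct);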
+ */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. + */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + +#define DKIOC (0x04 << 8) +#define DKIOCTRIM (DKIOC | 35) + +/* + * ioctl to free space (e.g. SCSI UNMAP) off a disk. + * Pass a dkioc_free_list_t containing a list of extents to be freed. + */ +#define DKIOCFREE (DKIOC|50) + +#define DF_WAIT_SYNC 0x00000001 /* Wait for full write-out of free. */ + +typedef struct dkioc_free_list_ext_s { + uint64_t dfle_start; + uint64_t dfle_length; +} dkioc_free_list_ext_t; + +typedef struct dkioc_free_list_s { + uint64_t dfl_flags; + uint64_t dfl_num_exts; + uint64_t dfl_offset; + dkioc_free_list_ext_t dfl_exts[1]; +} dkioc_free_list_t; +#define DFL_SZ(num_exts) \ + (sizeof (dkioc_free_list_t) + \ + (num_exts - 1) * sizeof (dkioc_free_list_ext_t)) + +/* Frees a variable-length dkioc_free_list_t structure. */ +static inline void +dfl_free(dkioc_free_list_t *dfl) +{ + kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ diff --git a/include/os/macos/spl/sys/errno.h b/include/os/macos/spl/sys/errno.h new file mode 100644 index 0000000000..67574721cc --- /dev/null +++ b/include/os/macos/spl/sys/errno.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include_next + +#define EBADE EBADMACHO +#define ECKSUM EBADE +#define EFRAGS EIDRM +#define EREMOTEIO ENOLINK +#define ENOTACTIVE ENOPOLICY +#define ECHRNG EMULTIHOP diff --git a/include/os/macos/spl/sys/fcntl.h b/include/os/macos/spl/sys/fcntl.h new file mode 100644 index 0000000000..5f13d304fc --- /dev/null +++ b/include/os/macos/spl/sys/fcntl.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_FCNTL_H +#define _SPL_FCNTL_H + +#include_next + +#define F_FREESP 11 + +#define O_LARGEFILE 0 +#define O_RSYNC 0 +#define O_DIRECT 0 + +#endif /* _SPL_FCNTL_H */ diff --git a/include/os/macos/spl/sys/file.h b/include/os/macos/spl/sys/file.h new file mode 100644 index 0000000000..136e8f3bfb --- /dev/null +++ b/include/os/macos/spl/sys/file.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_FILE_H +#define _SPL_FILE_H + +#define FIGNORECASE 0x00080000 +#define FKIOCTL 0x80000000 +#define ED_CASE_CONFLICT 0x10 + +#include + +/* + * XNU has all the proc structs as opaque and with no functions we + * are allowed to call, so we implement file IO from within the kernel + * as vnode operations. + * The second mode is when we are given a "fd" from userland, which we + * map in here, using getf()/releasef(). + * When it comes to IO, if "fd" is set, we use it (fo_rdwr()) as it + * can handle both files, and pipes. + * In kernel space file ops, we use vn_rdwr on the vnode. 
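+ *
+ * Illustrative only (assumed in-kernel consumer, not part of this header):
+ *
+ *	struct spl_fileproc *fp = getf(fd);
+ *	if (fp != NULL) {
+ *		struct vnode *vp = getf_vnode(fp);
+ *		... read/write via fo_rdwr(), or vn_rdwr() against vp ...
+ *		releasef(fd);
+ *	}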
+ */ +struct spl_fileproc { + void *f_vnode; /* underlying vnode */ + list_node_t f_next; /* * next getf() link for releasef() */ + int f_fd; /* * userland file descriptor */ + off_t f_offset; /* offset for stateful IO */ + void *f_proc; /* opaque */ + void *f_fp; /* opaque */ + int f_writes; /* did write? for close sync */ + minor_t f_file; /* minor of the file */ + void *f_private; /* zfsdev_state_t */ +}; +/* Members with '*' are not used when 'fd' is not given */ + +void *getf(int fd); +void releasef(int fd); +struct vnode *getf_vnode(void *fp); + +#endif /* SPL_FILE_H */ diff --git a/include/os/macos/spl/sys/inttypes.h b/include/os/macos/spl/sys/inttypes.h new file mode 100644 index 0000000000..c9f6a316aa --- /dev/null +++ b/include/os/macos/spl/sys/inttypes.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_INTTYPES_H +#define _SPL_INTTYPES_H + +#endif /* SPL_INTTYPES_H */ diff --git a/include/os/macos/spl/sys/isa_defs.h b/include/os/macos/spl/sys/isa_defs.h new file mode 100644 index 0000000000..f702dc51e1 --- /dev/null +++ b/include/os/macos/spl/sys/isa_defs.h @@ -0,0 +1,690 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. 
+ * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). + * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. + * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. 
+ * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards is the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. + * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. + * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! + * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. + * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + * + * _OBP + * This indicates the firmware interface is OBP. + * + * _SOFT_HOSTID + * This indicates that the implementation obtains the hostid + * from the file /etc/hostid, rather than from hardware. 
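+ *
+ * Illustrative only: consumers key off these symbols with ordinary
+ * preprocessor tests, e.g. the sys/dkio.h header in this same patch
+ * selects the partition count from the VTOC form:
+ *
+ *	#if defined(_SUNOS_VTOC_16)
+ *	#define NDKMAP 16
+ *	#elif defined(_SUNOS_VTOC_8)
+ *	#define NDKMAP 8
+ *	#endif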
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. + */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__aarch64__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__riscv) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__arm__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__mips__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#if defined(__mips_n64) +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#else +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__powerpc__) + +#if defined(__BIG_ENDIAN__) +#define _BIT_FIELDS_HTOL +#else +#define _BIT_FIELDS_LTOH +#endif + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. + */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. + */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. 
+ */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _OBP + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. + */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. + */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#define ____cacheline_aligned __attribute__((aligned(64))) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/include/os/macos/spl/sys/kmem.h b/include/os/macos/spl/sys/kmem.h new file mode 100644 index 0000000000..86287060cc --- /dev/null +++ b/include/os/macos/spl/sys/kmem.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS Project + * Copyright (C) 2013 Jorgen Lundman + * Copyright (C) 2017 Sean Doran + * + */ + +#ifndef _SPL_KMEM_H +#define _SPL_KMEM_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// XNU total amount of memory +extern uint64_t physmem; + +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PANIC 0x0002 /* if memory cannot be allocated, panic */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_NORMALPRI 0x0008 /* with KM_NOSLEEP, lower priority allocation */ +#define KM_NODEBUG 0x0010 /* NOT IMPLEMENTED ON OSX */ +#define KM_NO_VBA 0x0020 /* OSX: don't descend to the bucket layer */ +#define KM_VMFLAGS 0x00ff /* flags that must match VM_* flags */ + +#define KM_FLAGS 0xffff /* all settable kmem flags */ + +/* + * Kernel memory allocator: DDI interfaces. + * See kmem_alloc(9F) for details. + */ + +// Work around symbol collisions in XNU +#define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) +#define kmem_zalloc(size, kmflags) zfs_kmem_zalloc((size), (kmflags)) +#define kmem_free(buf, size) zfs_kmem_free((buf), (size)) + +void *zfs_kmem_alloc(size_t size, int kmflags); +void *zfs_kmem_zalloc(size_t size, int kmflags); +void zfs_kmem_free(void *buf, size_t size); + +void spl_kmem_init(uint64_t); +void spl_kmem_thread_init(void); +void spl_kmem_mp_init(void); +void spl_kmem_thread_fini(void); +void spl_kmem_fini(void); + +size_t kmem_size(void); +size_t kmem_used(void); +int64_t kmem_avail(void); +size_t kmem_num_pages_wanted(void); +int spl_vm_pool_low(void); +int32_t spl_minimal_physmem_p(void); +int64_t spl_adjust_pressure(int64_t); +int64_t spl_free_wrapper(void); +int64_t spl_free_manual_pressure_wrapper(void); +boolean_t spl_free_fast_pressure_wrapper(void); +void spl_free_set_pressure(int64_t); +void spl_free_set_fast_pressure(boolean_t); +uint64_t spl_free_last_pressure_wrapper(void); + +#define KMC_NOTOUCH 0x00010000 +#define KMC_NODEBUG 0x00020000 +#define KMC_NOMAGAZINE 0x00040000 +#define KMC_NOHASH 0x00080000 +#define KMC_QCACHE 0x00100000 +#define KMC_KMEM_ALLOC 0x00200000 /* internal use only */ +#define KMC_IDENTIFIER 0x00400000 /* internal use only */ +#define KMC_PREFILL 0x00800000 +#define KMC_ARENA_SLAB 0x01000000 /* use a bigger kmem cache */ + +struct kmem_cache; + +typedef struct kmem_cache kmem_cache_t; + +/* Client response to kmem move callback */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW +} kmem_cbrc_t; + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), + void (*destructor)(void *, void *), + void (*reclaim)(void *), + void *_private, vmem_t *vmp, int cflags); +void kmem_cache_destroy(kmem_cache_t *cache); +void *kmem_cache_alloc(kmem_cache_t *cache, int flags); +void kmem_cache_free(kmem_cache_t *cache, void *buf); +void kmem_cache_free_to_slab(kmem_cache_t *cache, void *buf); +extern boolean_t 
kmem_cache_reap_active(void); +void kmem_cache_reap_now(kmem_cache_t *cache); +void kmem_depot_ws_zero(kmem_cache_t *cache); +void kmem_reap(void); +void kmem_reap_idspace(void); +kmem_cache_t *kmem_cache_buf_in_cache(kmem_cache_t *, void *); + +int kmem_debugging(void); +void kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); + +char *kmem_asprintf(const char *fmt, ...); +extern char *kmem_strdup(const char *str); +extern void kmem_strfree(char *str); +char *kmem_vasprintf(const char *fmt, va_list ap); +char *kmem_strstr(const char *in, const char *str); +void strident_canon(char *s, size_t n); + +boolean_t spl_arc_no_grow(size_t, boolean_t, kmem_cache_t **); + +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_KMEM_H */ diff --git a/include/os/macos/spl/sys/kmem_cache.h b/include/os/macos/spl/sys/kmem_cache.h new file mode 100644 index 0000000000..2dc08b1712 --- /dev/null +++ b/include/os/macos/spl/sys/kmem_cache.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#endif diff --git a/include/os/macos/spl/sys/kmem_impl.h b/include/os/macos/spl/sys/kmem_impl.h new file mode 100644 index 0000000000..2f3fa7f9da --- /dev/null +++ b/include/os/macos/spl/sys/kmem_impl.h @@ -0,0 +1,494 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_KMEM_IMPL_H +#define _SYS_KMEM_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * kernel memory allocator: implementation-private data structures + * + * Lock order: + * 1. cache_lock + * 2. cc_lock in order by CPU ID + * 3. 
cache_depot_lock + * + * Do not call kmem_cache_alloc() or taskq_dispatch() while holding any of the + * above locks. + */ + +#define KMF_AUDIT 0x00000001 /* transaction auditing */ +#define KMF_DEADBEEF 0x00000002 /* deadbeef checking */ +#define KMF_REDZONE 0x00000004 /* redzone checking */ +#define KMF_CONTENTS 0x00000008 /* freed-buffer content logging */ +#define KMF_STICKY 0x00000010 /* if set, override /etc/system */ +#define KMF_NOMAGAZINE 0x00000020 /* disable per-cpu magazines */ +#define KMF_FIREWALL 0x00000040 /* put all bufs before unmapped pages */ +#define KMF_LITE 0x00000100 /* lightweight debugging */ + +#define KMF_HASH 0x00000200 /* cache has hash table */ +#define KMF_RANDOMIZE 0x00000400 /* randomize other kmem_flags */ + +#define KMF_DUMPDIVERT 0x00001000 /* use alternate memory at dump time */ +#define KMF_DUMPUNSAFE 0x00002000 /* flag caches used at dump time */ +#define KMF_PREFILL 0x00004000 /* Prefill the slab when created. */ + +#define KMF_BUFTAG (KMF_DEADBEEF | KMF_REDZONE) +#define KMF_TOUCH (KMF_BUFTAG | KMF_LITE | KMF_CONTENTS) +#define KMF_RANDOM (KMF_TOUCH | KMF_AUDIT | KMF_NOMAGAZINE) +#define KMF_DEBUG (KMF_RANDOM | KMF_FIREWALL) + +#define KMEM_STACK_DEPTH 15 + +#define KMEM_FREE_PATTERN 0xdeadbeefdeadbeefULL +#define KMEM_UNINITIALIZED_PATTERN 0xbaddcafebaddcafeULL +#define KMEM_REDZONE_PATTERN 0xfeedfacefeedfaceULL +#define KMEM_REDZONE_BYTE 0xbb + +/* + * Upstream platforms handle size == 0 as valid alloc, we + * can not return NULL, as that invalidates KM_SLEEP. So + * we return a valid hardcoded address, instead of actually taking up + * memory by fudging size to 1 byte. If read/writes are + * attempted, we will get page fault (which is correct, they + * asked for zero bytes after all) + */ +#define KMEM_ZERO_SIZE_PTR ((void *)16) + +/* + * Redzone size encodings for kmem_alloc() / kmem_free(). We encode the + * allocation size, rather than storing it directly, so that kmem_free() + * can distinguish frees of the wrong size from redzone violations. + * + * A size of zero is never valid. + */ +#define KMEM_SIZE_ENCODE(x) (251 * (x) + 1) +#define KMEM_SIZE_DECODE(x) ((x) / 251) +#define KMEM_SIZE_VALID(x) ((x) % 251 == 1 && (x) != 1) + + +#define KMEM_ALIGN 8 /* min guaranteed alignment */ +#define KMEM_ALIGN_SHIFT 3 /* log2(KMEM_ALIGN) */ +#define KMEM_VOID_FRACTION 8 /* never waste more than 1/8 of slab */ + +#define KMEM_SLAB_IS_PARTIAL(sp) \ + ((sp)->slab_refcnt > 0 && (sp)->slab_refcnt < (sp)->slab_chunks) +#define KMEM_SLAB_IS_ALL_USED(sp) \ + ((sp)->slab_refcnt == (sp)->slab_chunks) + +/* + * The bufctl (buffer control) structure keeps some minimal information + * about each buffer: its address, its slab, and its current linkage, + * which is either on the slab's freelist (if the buffer is free), or + * on the cache's buf-to-bufctl hash table (if the buffer is allocated). + * In the case of non-hashed, or "raw", caches (the common case), only + * the freelist linkage is necessary: the buffer address is at a fixed + * offset from the bufctl address, and the slab is at the end of the page. + * + * NOTE: bc_next must be the first field; raw buffers have linkage only. + */ +typedef struct kmem_bufctl { + struct kmem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct kmem_slab *bc_slab; /* controlling slab */ +} kmem_bufctl_t; + +/* + * The KMF_AUDIT version of the bufctl structure. The beginning of this + * structure must be identical to the normal bufctl structure so that + * pointers are interchangeable. 
+ */ +typedef struct kmem_bufctl_audit { + struct kmem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct kmem_slab *bc_slab; /* controlling slab */ + kmem_cache_t *bc_cache; /* controlling cache */ + hrtime_t bc_timestamp; /* transaction time */ + kthread_t *bc_thread; /* thread doing transaction */ + struct kmem_bufctl *bc_lastlog; /* last log entry */ + void *bc_contents; /* contents at last free */ + int bc_depth; /* stack depth */ + pc_t bc_stack[KMEM_STACK_DEPTH]; /* pc stack */ +} kmem_bufctl_audit_t; + +/* + * A kmem_buftag structure is appended to each buffer whenever any of the + * KMF_BUFTAG flags (KMF_DEADBEEF, KMF_REDZONE, KMF_VERIFY) are set. + */ +typedef struct kmem_buftag { + uint64_t bt_redzone; /* 64-bit redzone pattern */ + kmem_bufctl_t *bt_bufctl; /* bufctl */ + intptr_t bt_bxstat; /* bufctl ^ (alloc/free) */ +} kmem_buftag_t; + +/* + * A variant of the kmem_buftag structure used for KMF_LITE caches. + * Previous callers are stored in reverse chronological order. (i.e. most + * recent first) + */ +typedef struct kmem_buftag_lite { + kmem_buftag_t bt_buftag; /* a normal buftag */ + pc_t bt_history[1]; /* zero or more callers */ +} kmem_buftag_lite_t; + +#define KMEM_BUFTAG_LITE_SIZE(f) \ + (offsetof(kmem_buftag_lite_t, bt_history[f])) + +#define KMEM_BUFTAG(cp, buf) \ + ((kmem_buftag_t *)((char *)(buf) + (cp)->cache_buftag)) + +#define KMEM_BUFCTL(cp, buf) \ + ((kmem_bufctl_t *)((char *)(buf) + (cp)->cache_bufctl)) + +#define KMEM_BUF(cp, bcp) \ + ((void *)((char *)(bcp) - (cp)->cache_bufctl)) + +#define KMEM_SLAB(cp, buf) \ + ((kmem_slab_t *)P2END((uintptr_t)(buf), (cp)->cache_slabsize) - 1) + +/* + * Test for using alternate memory at dump time. + */ +#define KMEM_DUMP(cp) ((cp)->cache_flags & KMF_DUMPDIVERT) +#define KMEM_DUMPCC(ccp) ((ccp)->cc_flags & KMF_DUMPDIVERT) + +/* + * The "CPU" macro loads a cpu_t that refers to the cpu that the current + * thread is running on at the time the macro is executed. A context switch + * may occur immediately after loading this data structure, leaving this + * thread pointing at the cpu_t for the previous cpu. This is not a problem; + * we'd just end up checking the previous cpu's per-cpu cache, and then check + * the other layers of the kmem cache if need be. + * + * It's not even a problem if the old cpu gets DR'ed out during the context + * switch. The cpu-remove DR operation bzero()s the cpu_t, but doesn't free + * it. So the cpu_t's cpu_cache_offset would read as 0, causing us to use + * cpu 0's per-cpu cache. + * + * So, there is no need to disable kernel preemption while using the CPU macro + * below since if we have been context switched, there will not be any + * correctness problem, just a momentary use of a different per-cpu cache. 
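+ *
+ * Simplified, assumed sketch of the allocation fast path this enables
+ * (illustrative, not the verbatim implementation):
+ *
+ *	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
+ *	mutex_enter(&ccp->cc_lock);
+ *	if (ccp->cc_rounds > 0) {
+ *		buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
+ *		ccp->cc_alloc++;
+ *	}
+ *	mutex_exit(&ccp->cc_lock);
+ *
+ * Even if the thread migrated right after KMEM_CPU_CACHE(), cc_lock
+ * still serializes access to whichever per-cpu cache was picked.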
+ */ + +#define KMEM_CPU_CACHE(cp) \ + (&cp->cache_cpu[cpu_number()]) + +#define KMOM_MAGAZINE_VALID(cp, mp) \ + (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ + (cp)->cache_magtype->mt_cache) + +#define KMEM_MAGAZINE_VALID(cp, mp) \ + (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ + (cp)->cache_magtype->mt_cache) + +#define KMEM_SLAB_OFFSET(sp, buf) \ + ((size_t)((uintptr_t)(buf) - (uintptr_t)((sp)->slab_base))) + +#define KMEM_SLAB_MEMBER(sp, buf) \ + (KMEM_SLAB_OFFSET(sp, buf) < (sp)->slab_cache->cache_slabsize) + +#define KMEM_BUFTAG_ALLOC 0xa110c8edUL +#define KMEM_BUFTAG_FREE 0xf4eef4eeUL + +/* slab_later_count thresholds */ +#define KMEM_DISBELIEF 3 + +/* slab_flags */ +#define KMEM_SLAB_NOMOVE 0x1 +#define KMEM_SLAB_MOVE_PENDING 0x2 + +typedef struct kmem_slab { + struct kmem_cache *slab_cache; /* controlling cache */ + void *slab_base; /* base of allocated memory */ + avl_node_t slab_link; /* slab linkage */ + struct kmem_bufctl *slab_head; /* first free buffer */ + long slab_refcnt; /* outstanding allocations */ + long slab_chunks; /* chunks (bufs) in this slab */ + uint32_t slab_stuck_offset; /* unmoved buffer offset */ + uint16_t slab_later_count; /* cf KMEM_CBRC_LATER */ + uint16_t slab_flags; /* bits to mark the slab */ + hrtime_t slab_create_time; /* when was slab created? */ +} kmem_slab_t; + +#define KMEM_HASH_INITIAL 64 + +#define KMEM_HASH(cp, buf) \ + ((cp)->cache_hash_table + \ + (((uintptr_t)(buf) >> (cp)->cache_hash_shift) & (cp)->cache_hash_mask)) + +#define KMEM_CACHE_NAMELEN 31 + +typedef struct kmem_magazine { + void *mag_next; + void *mag_round[1]; /* one or more rounds */ +} kmem_magazine_t; + +/* + * The magazine types for fast per-cpu allocation + */ +typedef struct kmem_magtype { + short mt_magsize; /* magazine size (number of rounds) */ + int mt_align; /* magazine alignment */ + size_t mt_minbuf; /* all smaller buffers qualify */ + size_t mt_maxbuf; /* no larger buffers qualify */ + kmem_cache_t *mt_cache; /* magazine cache */ +} kmem_magtype_t; + +#define KMEM_CPU_CACHE_SIZE 128 /* must be power of 2 */ +#define KMEM_CPU_PAD (KMEM_CPU_CACHE_SIZE - sizeof (kmutex_t) - \ + 2 * sizeof (uint64_t) - 2 * sizeof (void *) - sizeof (int) - \ + 5 * sizeof (short)) +#define KMEM_CACHE_SIZE(ncpus) \ + __builtin_offsetof(kmem_cache_t, cache_cpu[ncpus]) + + /* Offset from kmem_cache->cache_cpu for per cpu caches */ +#define KMEM_CPU_CACHE_OFFSET(cpuid) \ + __builtin_offsetof(kmem_cache_t, cache_cpu[cpuid]) - \ + __builtin_offsetof(kmem_cache_t, cache_cpu) + +/* + * Per CPU cache data + */ +typedef struct kmem_cpu_cache { + kmutex_t cc_lock; /* protects this cpu's local cache */ + uint64_t cc_alloc; /* allocations from this cpu */ + uint64_t cc_free; /* frees to this cpu */ + kmem_magazine_t *cc_loaded; /* the currently loaded magazine */ + kmem_magazine_t *cc_ploaded; /* the previously loaded magazine */ + int cc_flags; /* CPU-local copy of cache_flags */ + short cc_rounds; /* number of objects in loaded mag */ + short cc_prounds; /* number of objects in previous mag */ + short cc_magsize; /* number of rounds in a full mag */ + short cc_dump_rounds; /* dump time copy of cc_rounds */ + short cc_dump_prounds; /* dump time copy of cc_prounds */ + char cc_pad[KMEM_CPU_PAD]; /* for nice alignment */ +} kmem_cpu_cache_t; + +/* + * The magazine lists used in the depot. 
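For illustration of the offsetof-based sizing macros above (the arena handle, VM_SLEEP flag and example_* name are assumptions, not taken from the port): a single allocation sized by KMEM_CACHE_SIZE() covers the cache header plus one trailing kmem_cpu_cache_t per cpu, and KMEM_CPU_CACHE_OFFSET() then locates each cpu's slot without any further allocation.

static kmem_cache_t *
example_alloc_cache_struct(vmem_t *arena, int ncpus)
{
    /* header plus ncpus trailing kmem_cpu_cache_t entries */
    size_t sz = KMEM_CACHE_SIZE(ncpus);
    kmem_cache_t *cp = vmem_alloc(arena, sz, VM_SLEEP);

    /* cpu (ncpus - 1)'s cache sits at a fixed offset from cache_cpu */
    kmem_cpu_cache_t *last = (kmem_cpu_cache_t *)
        ((char *)&cp->cache_cpu + KMEM_CPU_CACHE_OFFSET(ncpus - 1));

    /* 'last' is the same address as &cp->cache_cpu[ncpus - 1] */
    (void) last;
    return (cp);
}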
+ */ +typedef struct kmem_maglist { + kmem_magazine_t *ml_list; /* magazine list */ + long ml_total; /* number of magazines */ + long ml_min; /* min since last update */ + long ml_reaplimit; /* max reapable magazines */ + uint64_t ml_alloc; /* allocations from this list */ +} kmem_maglist_t; + +typedef struct kmem_defrag { + /* + * Statistics + */ + uint64_t kmd_callbacks; /* move callbacks */ + uint64_t kmd_yes; /* KMEM_CBRC_YES responses */ + uint64_t kmd_no; /* NO responses */ + uint64_t kmd_later; /* LATER responses */ + uint64_t kmd_dont_need; /* DONT_NEED responses */ + uint64_t kmd_dont_know; /* DONT_KNOW responses */ + uint64_t kmd_slabs_freed; /* slabs freed by moves */ + uint64_t kmd_defrags; /* kmem_cache_defrag() */ + uint64_t kmd_scans; /* kmem_cache_scan() */ + + /* + * Consolidator fields + */ + avl_tree_t kmd_moves_pending; /* buffer moves pending */ + list_t kmd_deadlist; /* deferred slab frees */ + size_t kmd_deadcount; /* # of slabs in kmd_deadlist */ + uint8_t kmd_reclaim_numer; /* slab usage threshold */ + uint8_t kmd_pad1; /* compiler padding */ + uint16_t kmd_consolidate; /* triggers consolidator */ + uint32_t kmd_pad2; /* compiler padding */ + size_t kmd_slabs_sought; /* reclaimable slabs sought */ + size_t kmd_slabs_found; /* reclaimable slabs found */ + size_t kmd_tries; /* nth scan interval counter */ + /* + * Fields used to ASSERT that the client does not kmem_cache_free() + * objects passed to the move callback. + */ + void *kmd_from_buf; /* object to move */ + void *kmd_to_buf; /* move destination */ + kthread_t *kmd_thread; /* thread calling move */ +} kmem_defrag_t; + +/* + * Cache callback function types + */ +typedef int (*constructor_fn_t)(void*, void*, int); +typedef void (*destructor_fn_t)(void*, void*); +typedef void (*reclaim_fn_t)(void*); + +/* + * Cache + */ +struct kmem_cache { + +/* + * Statistics + */ + uint64_t cache_slab_create; /* slab creates */ + uint64_t cache_slab_destroy; /* slab destroys */ + uint64_t cache_slab_alloc; /* slab layer allocations */ + uint64_t cache_slab_free; /* slab layer frees */ + uint64_t cache_alloc_fail; /* total failed allocations */ + uint64_t cache_buftotal; /* total buffers */ + uint64_t cache_bufmax; /* max buffers ever */ + uint64_t cache_bufslab; /* buffers free in slab layer */ + uint64_t cache_reap; /* cache reaps */ + uint64_t cache_rescale; /* hash table rescales */ + uint64_t cache_lookup_depth; /* hash lookup depth */ + uint64_t cache_depot_contention; /* mutex contention count */ + uint64_t cache_depot_contention_prev; /* previous snapshot */ + uint64_t cache_alloc_count; /* Number of allocations in cache */ + /* successful calls with KM_NO_VBA flag set */ + uint64_t no_vba_success; + uint64_t no_vba_fail; + /* number of times we set arc growth suppression time */ + uint64_t arc_no_grow_set; + /* number of times spl_zio_is_suppressed returned true for this cache */ + uint64_t arc_no_grow; + + /* + * Cache properties + */ + char cache_name[KMEM_CACHE_NAMELEN + 1]; + size_t cache_bufsize; /* object size */ + size_t cache_align; /* object alignment */ + int (*cache_constructor)(void *, void *, int); + void (*cache_destructor)(void *, void *); + void (*cache_reclaim)(void *); + kmem_cbrc_t (*cache_move)(void *, void *, size_t, void *); + void *cache_private; /* opaque arg to callbacks */ + vmem_t *cache_arena; /* vmem source for slabs */ + int cache_cflags; /* cache creation flags */ + int cache_flags; /* various cache state info */ + uint32_t cache_mtbf; /* induced alloc failure rate */ + uint32_t 
cache_pad1; /* compiler padding */ + kstat_t *cache_kstat; /* exported statistics */ + list_node_t cache_link; /* cache linkage */ + + /* + * Slab layer + */ + kmutex_t cache_lock; /* protects slab layer */ + + size_t cache_chunksize; /* buf + alignment [+ debug] */ + size_t cache_slabsize; /* size of a slab */ + size_t cache_maxchunks; /* max buffers per slab */ + size_t cache_bufctl; /* buf-to-bufctl distance */ + size_t cache_buftag; /* buf-to-buftag distance */ + size_t cache_verify; /* bytes to verify */ + size_t cache_contents; /* bytes of saved content */ + size_t cache_color; /* next slab color */ + size_t cache_mincolor; /* maximum slab color */ + size_t cache_maxcolor; /* maximum slab color */ + size_t cache_hash_shift; /* get to interesting bits */ + size_t cache_hash_mask; /* hash table mask */ + list_t cache_complete_slabs; /* completely allocated slabs */ + size_t cache_complete_slab_count; + avl_tree_t cache_partial_slabs; /* partial slab freelist */ + size_t cache_partial_binshift; /* for AVL sort bins */ + kmem_cache_t *cache_bufctl_cache; /* source of bufctls */ + kmem_bufctl_t **cache_hash_table; /* hash table base */ + kmem_defrag_t *cache_defrag; /* slab consolidator fields */ + + /* + * Depot layer + */ + kmutex_t cache_depot_lock; /* protects depot */ + kmem_magtype_t *cache_magtype; /* magazine type */ + kmem_maglist_t cache_full; /* full magazines */ + kmem_maglist_t cache_empty; /* empty magazines */ + void *cache_dumpfreelist; /* heap during crash dump */ + void *cache_dumplog; /* log entry during dump */ + + /* + * Per CPU structures + */ + // XNU adjust to suit __builtin_offsetof + kmem_cpu_cache_t cache_cpu[1]; /* per-cpu data */ + +}; + +typedef struct kmem_cpu_log_header { + kmutex_t clh_lock; + char *clh_current; + size_t clh_avail; + int clh_chunk; + int clh_hits; +#if defined(SPL_DEBUG_MUTEX) + char clh_pad[128 - sizeof (kmutex_t) - sizeof (char *) - + sizeof (size_t) - 2 * sizeof (int)]; +#else + char clh_pad[64 - sizeof (kmutex_t) - sizeof (char *) - + sizeof (size_t) - 2 * sizeof (int)]; +#endif +} kmem_cpu_log_header_t; + +typedef struct kmem_log_header { + kmutex_t lh_lock; + char *lh_base; + int *lh_free; + size_t lh_chunksize; + int lh_nchunks; + int lh_head; + int lh_tail; + int lh_hits; + kmem_cpu_log_header_t lh_cpu[1]; /* ncpus actually allocated */ +} kmem_log_header_t; + +/* kmem_move kmm_flags */ +#define KMM_DESPERATE 0x1 +#define KMM_NOTIFY 0x2 +#define KMM_DEBUG 0x4 + +typedef struct kmem_move { + kmem_slab_t *kmm_from_slab; + void *kmm_from_buf; + void *kmm_to_buf; + avl_node_t kmm_entry; + int kmm_flags; +} kmem_move_t; + +/* + * In order to consolidate partial slabs, it must be possible for the cache to + * have partial slabs. + */ +#define KMEM_IS_MOVABLE(cp) \ + (((cp)->cache_chunksize * 2) <= (cp)->cache_slabsize) + +#endif diff --git a/include/os/macos/spl/sys/kstat.h b/include/os/macos/spl/sys/kstat.h new file mode 100644 index 0000000000..4463770e7a --- /dev/null +++ b/include/os/macos/spl/sys/kstat.h @@ -0,0 +1,188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H + +#include +#include +#include +#include +#include + +#define KSTAT_STRLEN 63 + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_TYPE_TXG 5 /* txg sync; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 6 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_UNSUPPORTED (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \ + KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT) +#define KSTAT_FLAG_INVALID 0x20 + +#define KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +typedef struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + int ks_instance; /* provider module instance */ + char ks_name[KSTAT_STRLEN+1]; /* kstat name */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of type-specific data records */ + size_t ks_data_size; /* size of kstat data section */ + struct proc_dir_entry *ks_proc; /* proc linkage */ + kstat_update_t *ks_update; /* dynamic updates */ + void *ks_private; /* private data */ + kmutex_t *ks_lock; /* kstat data lock */ +} kstat_t; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int */ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} 
kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len * time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +void spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern kstat_t *__kstat_create(char *ks_module, int ks_instance, + char *ks_name, char *ks_class, + uchar_t ks_type, ulong_t ks_ndata, + uchar_t ks_flags); +extern void __kstat_install(kstat_t *ksp); +extern void __kstat_delete(kstat_t *ksp); + +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) + +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); + +#define kstat_set_raw_ops(k, h, d, a) __kstat_set_raw_ops(k, h, d, a) + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/macos/spl/sys/list.h b/include/os/macos/spl/sys/list.h new file mode 100644 index 0000000000..c9a72a53a6 --- /dev/null +++ b/include/os/macos/spl/sys/list.h @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
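A typical consumer of the kstat interfaces declared above would look roughly like the following; the module/name/class strings and all example_* identifiers are invented for illustration, and the matching teardown would call kstat_delete() from the fini path.

typedef struct example_stats {
    kstat_named_t hits;
    kstat_named_t misses;
} example_stats_t;

static example_stats_t example_stats;
static kstat_t *example_ksp;

static void
example_kstat_init(void)
{
    kstat_named_init(&example_stats.hits, "hits", KSTAT_DATA_UINT64);
    kstat_named_init(&example_stats.misses, "misses", KSTAT_DATA_UINT64);

    example_ksp = kstat_create("example", 0, "stats", "misc",
        KSTAT_TYPE_NAMED, sizeof (example_stats) / sizeof (kstat_named_t),
        KSTAT_FLAG_VIRTUAL);
    if (example_ksp != NULL) {
        /* KSTAT_FLAG_VIRTUAL: the caller owns the data buffer */
        example_ksp->ks_data = &example_stats;
        kstat_install(example_ksp);
    }
}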
+ */ + + +#ifndef _SPL_LIST_H +#define _SPL_LIST_H + +#include +#include + +/* + * NOTE: I have implemented the Solaris list API in terms of the native + * linux API. This has certain advantages in terms of leveraging the linux + * list debugging infrastructure, but it also means that the internals of a + * list differ slightly than on Solaris. This is not a problem as long as + * all callers stick to the published API. The two major differences are: + * + * 1) A list_node_t is mapped to a linux list_head struct which changes + * the name of the list_next/list_prev pointers to next/prev respectively. + * + * 2) A list_node_t which is not attached to a list on Solaris is denoted + * by having its list_next/list_prev pointers set to NULL. Under linux + * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2 + * respectively. At this moment this only impacts the implementation + * of the list_link_init() and list_link_active() functions. + */ + +typedef struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +} list_node_t; + + + +typedef struct list { + size_t list_size; + size_t list_offset; + list_node_t list_head; +} list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); + +int list_link_active(list_node_t *); +int list_is_empty(list_t *); + +#define LIST_POISON1 NULL +#define LIST_POISON2 NULL + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + + +static inline void +list_link_init(list_node_t *node) +{ + node->list_next = LIST_POISON1; + node->list_prev = LIST_POISON2; +} + +static inline void +__list_del(list_node_t *prev, list_node_t *next) +{ + next->list_prev = prev; + prev->list_next = next; +} + +static inline void list_del(list_node_t *entry) +{ + __list_del(entry->list_prev, entry->list_next); + entry->list_next = LIST_POISON1; + entry->list_prev = LIST_POISON2; +} + +static inline void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + + list_del(head); + return (list_object(list, head)); +} + +static inline void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + + list_del(tail); + return (list_object(list, tail)); +} + +static inline void +list_link_replace(list_node_t *old_node, list_node_t *new_node) +{ + ASSERT(list_link_active(old_node)); + ASSERT(!list_link_active(new_node)); + + new_node->list_next = old_node->list_next; + new_node->list_prev = old_node->list_prev; + old_node->list_prev->list_next = new_node; + old_node->list_next->list_prev = new_node; + list_link_init(old_node); +} + +#endif /* SPL_LIST_H */ diff --git a/include/os/macos/spl/sys/mod_os.h b/include/os/macos/spl/sys/mod_os.h new file mode 100644 index 0000000000..f9c1d80ee1 --- /dev/null +++ b/include/os/macos/spl/sys/mod_os.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are 
subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SPL_MOD_H +#define _SPL_MOD_H + +#define MODULE_INIT(s) +#define MODULE_AUTHOR(s) +#define MODULE_LICENSE(s) +#define MODULE_VERSION(s) +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) + +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ + getfunc, perm, desc) + +#define __init __attribute__((unused)) +#define __exit __attribute__((unused)) + +/* + * The init/fini functions need to be called, but they are all static + */ +#define module_init(fn) \ + int wrap_ ## fn(void) \ + { \ + return (fn()); \ + } + +#define module_exit(fn) \ + void wrap_ ## fn(void) \ + { \ + fn(); \ + } + +#define ZFS_MODULE_PARAM_ARGS void + +#define ZFS_MODULE_PARAM(A, B, C, D, E, F) +#define module_param_call(a, b, c, d, e) +#define module_param_named(a, b, c, d) + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +kern_return_t spl_start(kmod_info_t *ki, void *d); +kern_return_t spl_stop(kmod_info_t *ki, void *d); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* SPL_MOD_H */ diff --git a/include/os/macos/spl/sys/mutex.h b/include/os/macos/spl/sys/mutex.h new file mode 100644 index 0000000000..11a4a89da6 --- /dev/null +++ b/include/os/macos/spl/sys/mutex.h @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * OSX mutex functions + * + * Jorgen Lundman + * + */ + +#ifndef OSX_MUTEX_H +#define OSX_MUTEX_H + +#include + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + MUTEX_ADAPTIVE = 0, /* spin if owner is running, otherwise block */ + MUTEX_SPIN = 1, /* block interrupts and spin */ + MUTEX_DRIVER = 4, /* driver (DDI) mutex */ + MUTEX_DEFAULT = 6 /* kernel default mutex */ +} kmutex_type_t; + +#define MUTEX_NOLOCKDEP 0 + +/* + * Alas lck_mtx_t; is opaque and not available at compile time, and we + * really want to embed them. 
Luckily, mutex size has not changed in + * many versions of OSX. We should possibly to a startup check of + * the size though. + */ +typedef struct { + uint32_t opaque[4]; +} wrapper_mutex_t; + +/* + * To enable watchdog to keep an eye on mutex being held for too long + * define this debug variable. + */ + +#define SPL_DEBUG_MUTEX + +#ifdef SPL_DEBUG_MUTEX +#define SPL_MUTEX_WATCHDOG_SLEEP 10 /* How long to sleep between checking */ +#define SPL_MUTEX_WATCHDOG_TIMEOUT 60 /* When is a mutex held too long? */ +#endif + +/* + * Solaris kmutex defined. + * + * and is embedded into ZFS structures (see dbuf) so we need to match the + * size carefully. It appears to be 32 bytes. Or rather, it needs to be + * aligned. + */ + +typedef struct kmutex { + void *m_owner; + wrapper_mutex_t m_lock; + +#ifdef SPL_DEBUG_MUTEX + void *leak; + uint64_t m_initialised; +#define MUTEX_INIT 0x123456789abcdef0ULL +#define MUTEX_DESTROYED 0xaabbccddaabbccddULL +#endif + +} kmutex_t; + +#include + +#define MUTEX_HELD(x) (mutex_owned(x)) +#define MUTEX_NOT_HELD(x) (!mutex_owned(x)) + +/* + * On OS X, CoreStorage provides these symbols, so we have to redefine them, + * preferably without having to modify SPL users. + */ +#ifdef SPL_DEBUG_MUTEX + +#define mutex_init(A, B, C, D) \ + spl_mutex_init(A, B, C, D, __FILE__, __FUNCTION__, __LINE__) +void spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, + void *ibc, const char *f, const char *fn, int l); + +#else + +#define mutex_init spl_mutex_init +void spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc); + +#endif + +#ifdef SPL_DEBUG_MUTEX +#define mutex_enter(X) spl_mutex_enter((X), __FILE__, __LINE__) +void spl_mutex_enter(kmutex_t *mp, char *file, int line); +#else +#define mutex_enter spl_mutex_enter +void spl_mutex_enter(kmutex_t *mp); +#endif + +#define mutex_enter_nested(A, B) mutex_enter(A) + +#define mutex_destroy spl_mutex_destroy +#define mutex_exit spl_mutex_exit +#define mutex_tryenter spl_mutex_tryenter +#define mutex_owned spl_mutex_owned +#define mutex_owner spl_mutex_owner + +void spl_mutex_destroy(kmutex_t *mp); +void spl_mutex_exit(kmutex_t *mp); +int spl_mutex_tryenter(kmutex_t *mp); +int spl_mutex_owned(kmutex_t *mp); + +struct thread *spl_mutex_owner(kmutex_t *mp); + +int spl_mutex_subsystem_init(void); +void spl_mutex_subsystem_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/os/macos/spl/sys/param.h b/include/os/macos/spl/sys/param.h new file mode 100644 index 0000000000..2819402be9 --- /dev/null +++ b/include/os/macos/spl/sys/param.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_PARAM_H +#define _SPL_PARAM_H + +#include_next +#include + +/* Pages to bytes and back */ +#define ptob(pages) (pages << PAGE_SHIFT) +#define btop(bytes) (bytes >> PAGE_SHIFT) + +#define MAXUID UINT32_MAX + +#endif /* SPL_PARAM_H */ diff --git a/include/os/macos/spl/sys/policy.h b/include/os/macos/spl/sys/policy.h new file mode 100644 index 0000000000..b8953209db --- /dev/null +++ b/include/os/macos/spl/sys/policy.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_POLICY_H +#define _SPL_POLICY_H + +#ifdef _KERNEL + +#include +#include + +struct vattr; + +int secpolicy_fs_unmount(cred_t *, struct mount *); +int secpolicy_nfs(const cred_t *); +int secpolicy_sys_config(const cred_t *, boolean_t); +int secpolicy_zfs(const cred_t *); +int secpolicy_zinject(const cred_t *); + +/* + * This function to be called from xxfs_setattr(). + * Must be called with the node's attributes read-write locked. + * + * cred_t * - acting credentials + * struct vnode * - vnode we're operating on + * struct vattr *va - new attributes, va_mask may be + * changed on return from a call + * struct vattr *oldva - old attributes, need include owner + * and mode only + * int flags - setattr flags + * int iaccess(void *node, int mode, cred_t *cr) + * - non-locking internal access function + * mode be checked + * w/ VREAD|VWRITE|VEXEC, not fs + * internal mode encoding. 
+ * + * void *node - internal node (inode, tmpnode) to + * pass as arg to iaccess + */ +int secpolicy_vnode_setattr(cred_t *, struct vnode *, vattr_t *, + const vattr_t *, int, int (void *, int, cred_t *), void *); + +int secpolicy_vnode_stky_modify(const cred_t *); +int secpolicy_setid_setsticky_clear(struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, cred_t *cr); + +int secpolicy_vnode_remove(struct vnode *, const cred_t *); +int secpolicy_vnode_create_gid(const cred_t *); +int secpolicy_vnode_setids_setgids(struct vnode *, const cred_t *, gid_t); +int secpolicy_vnode_setdac(struct vnode *, const cred_t *, uid_t); +int secpolicy_vnode_chown(struct vnode *, const cred_t *, uid_t); +int secpolicy_vnode_setid_retain(const cred_t *, boolean_t); +int secpolicy_xvattr(vattr_t *, uid_t, const cred_t *, mode_t); +int secpolicy_setid_clear(vattr_t *, const cred_t *); +int secpolicy_basic_link(const cred_t *); +int secpolicy_fs_mount_clearopts(const cred_t *, struct mount *); +int secpolicy_fs_mount(const cred_t *, struct vnode *, struct mount *); + +#endif /* _KERNEL */ + +#endif /* SPL_POLICY_H */ diff --git a/include/os/macos/spl/sys/priv.h b/include/os/macos/spl/sys/priv.h new file mode 100644 index 0000000000..a8da8f101e --- /dev/null +++ b/include/os/macos/spl/sys/priv.h @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2006 nCircle Network Security, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Privilege checking interface for BSD kernel. + */ +#ifndef _SPL_PRIV_H +#define _SPL_PRIV_H + +/* + * Privilege list, sorted loosely by kernel subsystem. + * + * Think carefully before adding or reusing one of these privileges -- are + * there existing instances referring to the same privilege? Third party + * vendors may request the assignment of privileges to be used in loadable + * modules. Particular numeric privilege assignments are part of the + * loadable kernel module ABI, and should not be changed across minor + * releases. 
+ * + * When adding a new privilege, remember to determine if it's appropriate for + * use in jail, and update the privilege switch in kern_jail.c as necessary. + */ + +/* + * Track beginning of privilege list. + */ +#define _PRIV_LOWEST 1 + +/* + * The remaining privileges typically correspond to one or a small + * number of specific privilege checks, and have (relatively) precise + * meanings. They are loosely sorted into a set of base system + * privileges, such as the ability to reboot, and then loosely by + * subsystem, indicated by a subsystem name. + */ +#define _PRIV_ROOT 1 /* Removed. */ +#define PRIV_ACCT 2 /* Manage process accounting. */ +#define PRIV_MAXFILES 3 /* Exceed system open files limit. */ +#define PRIV_MAXPROC 4 /* Exceed system processes limit. */ +#define PRIV_KTRACE 5 /* Set/clear KTRFAC_ROOT on ktrace. */ +#define PRIV_SETDUMPER 6 /* Configure dump device. */ +#define PRIV_REBOOT 8 /* Can reboot system. */ +#define PRIV_SWAPON 9 /* Can swapon(). */ +#define PRIV_SWAPOFF 10 /* Can swapoff(). */ +#define PRIV_MSGBUF 11 /* Can read kernel message buffer. */ +#define PRIV_IO 12 /* Can perform low-level I/O. */ +#define PRIV_KEYBOARD 13 /* Reprogram keyboard. */ +#define PRIV_DRIVER 14 /* Low-level driver privilege. */ +#define PRIV_ADJTIME 15 /* Set time adjustment. */ +#define PRIV_NTP_ADJTIME 16 /* Set NTP time adjustment. */ +#define PRIV_CLOCK_SETTIME 17 /* Can call clock_settime. */ +#define PRIV_SETTIMEOFDAY 18 /* Can call settimeofday. */ +#define _PRIV_SETHOSTID 19 /* Removed. */ +#define _PRIV_SETDOMAINNAME 20 /* Removed. */ + +/* + * Audit subsystem privileges. + */ +#define PRIV_AUDIT_CONTROL 40 /* Can configure audit. */ +#define PRIV_AUDIT_FAILSTOP 41 /* Can run during audit fail stop. */ +#define PRIV_AUDIT_GETAUDIT 42 /* Can get proc audit properties. */ +#define PRIV_AUDIT_SETAUDIT 43 /* Can set proc audit properties. */ +#define PRIV_AUDIT_SUBMIT 44 /* Can submit an audit record. */ + +/* + * Credential management privileges. + */ +#define PRIV_CRED_SETUID 50 /* setuid. */ +#define PRIV_CRED_SETEUID 51 /* seteuid to !ruid and !svuid. */ +#define PRIV_CRED_SETGID 52 /* setgid. */ +#define PRIV_CRED_SETEGID 53 /* setgid to !rgid and !svgid. */ +#define PRIV_CRED_SETGROUPS 54 /* Set process additional groups. */ +#define PRIV_CRED_SETREUID 55 /* setreuid. */ +#define PRIV_CRED_SETREGID 56 /* setregid. */ +#define PRIV_CRED_SETRESUID 57 /* setresuid. */ +#define PRIV_CRED_SETRESGID 58 /* setresgid. */ +#define PRIV_SEEOTHERGIDS 59 /* Exempt bsd.seeothergids. */ +#define PRIV_SEEOTHERUIDS 60 /* Exempt bsd.seeotheruids. */ + +/* + * Debugging privileges. + */ +#define PRIV_DEBUG_DIFFCRED 80 /* Exempt debugging other users. */ +#define PRIV_DEBUG_SUGID 81 /* Exempt debugging setuid proc. */ +#define PRIV_DEBUG_UNPRIV 82 /* Exempt unprivileged debug limit. */ +#define PRIV_DEBUG_DENIED 83 /* Exempt P2_NOTRACE. */ + +/* + * Dtrace privileges. + */ +#define PRIV_DTRACE_KERNEL 90 /* Allow use of DTrace on the kernel. */ +#define PRIV_DTRACE_PROC 91 /* Allow attaching DTrace to process. */ +#define PRIV_DTRACE_USER 92 /* Process may submit DTrace events. */ + +/* + * Firmware privilegs. + */ +#define PRIV_FIRMWARE_LOAD 100 /* Can load firmware. */ + +/* + * Jail privileges. + */ +#define PRIV_JAIL_ATTACH 110 /* Attach to a jail. */ +#define PRIV_JAIL_SET 111 /* Set jail parameters. */ +#define PRIV_JAIL_REMOVE 112 /* Remove a jail. */ + +/* + * Kernel environment priveleges. + */ +#define PRIV_KENV_SET 120 /* Set kernel env. variables. 
*/ +#define PRIV_KENV_UNSET 121 /* Unset kernel env. variables. */ + +/* + * Loadable kernel module privileges. + */ +#define PRIV_KLD_LOAD 130 /* Load a kernel module. */ +#define PRIV_KLD_UNLOAD 131 /* Unload a kernel module. */ + +/* + * Privileges associated with the MAC Framework and specific MAC policy + * modules. + */ +#define PRIV_MAC_PARTITION 140 /* Privilege in mac_partition policy. */ +#define PRIV_MAC_PRIVS 141 /* Privilege in the mac_privs policy. */ + +/* + * Process-related privileges. + */ +#define PRIV_PROC_LIMIT 160 /* Exceed user process limit. */ +#define PRIV_PROC_SETLOGIN 161 /* Can call setlogin. */ +#define PRIV_PROC_SETRLIMIT 162 /* Can raise resources limits. */ +#define PRIV_PROC_SETLOGINCLASS 163 /* Can call setloginclass(2). */ + +/* + * System V IPC privileges. + */ +#define PRIV_IPC_READ 170 /* Can override IPC read perm. */ +#define PRIV_IPC_WRITE 171 /* Can override IPC write perm. */ +#define PRIV_IPC_ADMIN 172 /* Can override IPC owner-only perm. */ +#define PRIV_IPC_MSGSIZE 173 /* Exempt IPC message queue limit. */ + +/* + * POSIX message queue privileges. + */ +#define PRIV_MQ_ADMIN 180 /* Can override msgq owner-only perm. */ + +/* + * Performance monitoring counter privileges. + */ +#define PRIV_PMC_MANAGE 190 /* Can administer PMC. */ +#define PRIV_PMC_SYSTEM 191 /* Can allocate a system-wide PMC. */ + +/* + * Scheduling privileges. + */ +#define PRIV_SCHED_DIFFCRED 200 /* Exempt scheduling other users. */ +#define PRIV_SCHED_SETPRIORITY 201 /* Can set lower nice value for proc. */ +#define PRIV_SCHED_RTPRIO 202 /* Can set real time scheduling. */ +#define PRIV_SCHED_SETPOLICY 203 /* Can set scheduler policy. */ +#define PRIV_SCHED_SET 204 /* Can set thread scheduler. */ +#define PRIV_SCHED_SETPARAM 205 /* Can set thread scheduler params. */ +#define PRIV_SCHED_CPUSET 206 /* Can manipulate cpusets. */ +#define PRIV_SCHED_CPUSET_INTR 207 /* Can adjust IRQ to CPU binding. */ + +/* + * POSIX semaphore privileges. + */ +#define PRIV_SEM_WRITE 220 /* Can override sem write perm. */ + +/* + * Signal privileges. + */ +#define PRIV_SIGNAL_DIFFCRED 230 /* Exempt signalling other users. */ +#define PRIV_SIGNAL_SUGID 231 /* Non-conserv signal setuid proc. */ + +/* + * Sysctl privileges. + */ +#define PRIV_SYSCTL_DEBUG 240 /* Can invoke sysctl.debug. */ +#define PRIV_SYSCTL_WRITE 241 /* Can write sysctls. */ +#define PRIV_SYSCTL_WRITEJAIL 242 /* Can write sysctls, jail permitted. */ + +/* + * TTY privileges. + */ +#define PRIV_TTY_CONSOLE 250 /* Set console to tty. */ +#define PRIV_TTY_DRAINWAIT 251 /* Set tty drain wait time. */ +#define PRIV_TTY_DTRWAIT 252 /* Set DTR wait on tty. */ +#define PRIV_TTY_EXCLUSIVE 253 /* Override tty exclusive flag. */ +#define _PRIV_TTY_PRISON 254 /* Removed. */ +#define PRIV_TTY_STI 255 /* Simulate input on another tty. */ +#define PRIV_TTY_SETA 256 /* Set tty termios structure. */ + +/* + * UFS-specific privileges. + */ +#define PRIV_UFS_EXTATTRCTL 270 /* Can configure EAs on UFS1. */ +#define PRIV_UFS_QUOTAOFF 271 /* quotaoff(). */ +#define PRIV_UFS_QUOTAON 272 /* quotaon(). */ +#define PRIV_UFS_SETUSE 273 /* setuse(). */ + +/* + * ZFS-specific privileges. + */ +#define PRIV_ZFS_POOL_CONFIG 280 /* Can configure ZFS pools. */ + +/* Can inject faults in the ZFS fault injection framework. */ +#define PRIV_ZFS_INJECT 281 + +/* Can attach/detach ZFS file systems to/from jails. */ +#define PRIV_ZFS_JAIL 282 + +/* + * NFS-specific privileges. + */ +#define PRIV_NFS_DAEMON 290 /* Can become the NFS daemon. 
*/ +#define PRIV_NFS_LOCKD 291 /* Can become NFS lock daemon. */ + +/* + * VFS privileges. + */ +#define PRIV_VFS_READ 310 /* Override vnode DAC read perm. */ +#define PRIV_VFS_WRITE 311 /* Override vnode DAC write perm. */ +#define PRIV_VFS_ADMIN 312 /* Override vnode DAC admin perm. */ +#define PRIV_VFS_EXEC 313 /* Override vnode DAC exec perm. */ +#define PRIV_VFS_LOOKUP 314 /* Override vnode DAC lookup perm. */ +#define PRIV_VFS_BLOCKRESERVE 315 /* Can use free block reserve. */ +#define PRIV_VFS_CHFLAGS_DEV 316 /* Can chflags() a device node. */ +#define PRIV_VFS_CHOWN 317 /* Can set user; group to non-member. */ +#define PRIV_VFS_CHROOT 318 /* chroot(). */ +#define PRIV_VFS_RETAINSUGID 319 /* Can retain sugid bits on change. */ +#define PRIV_VFS_EXCEEDQUOTA 320 /* Exempt from quota restrictions. */ +#define PRIV_VFS_EXTATTR_SYSTEM 321 /* Operate on system EA namespace. */ +#define PRIV_VFS_FCHROOT 322 /* fchroot(). */ +#define PRIV_VFS_FHOPEN 323 /* Can fhopen(). */ +#define PRIV_VFS_FHSTAT 324 /* Can fhstat(). */ +#define PRIV_VFS_FHSTATFS 325 /* Can fhstatfs(). */ +#define PRIV_VFS_GENERATION 326 /* stat() returns generation number. */ +#define PRIV_VFS_GETFH 327 /* Can retrieve file handles. */ +#define PRIV_VFS_GETQUOTA 328 /* getquota(). */ +#define PRIV_VFS_LINK 329 /* bsd.hardlink_check_uid */ +#define PRIV_VFS_MKNOD_BAD 330 /* Can mknod() to mark bad inodes. */ +#define PRIV_VFS_MKNOD_DEV 331 /* Can mknod() to create dev nodes. */ +#define PRIV_VFS_MKNOD_WHT 332 /* Can mknod() to create whiteout. */ +#define PRIV_VFS_MOUNT 333 /* Can mount(). */ +#define PRIV_VFS_MOUNT_OWNER 334 /* Can manage other users' fsystems. */ +#define PRIV_VFS_MOUNT_EXPORTED 335 /* Can set MNT_EXPORTED on mount. */ +#define PRIV_VFS_MOUNT_PERM 336 /* Override dev node perms at mount. */ +#define PRIV_VFS_MOUNT_SUIDDIR 337 /* Can set MNT_SUIDDIR on mount. */ +#define PRIV_VFS_MOUNT_NONUSER 338 /* Can perform a non-user mount. */ +#define PRIV_VFS_SETGID 339 /* Can setgid if not in group. */ +#define PRIV_VFS_SETQUOTA 340 /* setquota(). */ +#define PRIV_VFS_STICKYFILE 341 /* Can set sticky bit on file. */ +#define PRIV_VFS_SYSFLAGS 342 /* Can modify system flags. */ +#define PRIV_VFS_UNMOUNT 343 /* Can unmount(). */ +#define PRIV_VFS_STAT 344 /* Override vnode MAC stat perm. */ + +/* + * Virtual memory privileges. + */ +#define PRIV_VM_MADV_PROTECT 360 /* Can set MADV_PROTECT. */ +#define PRIV_VM_MLOCK 361 /* Can mlock(), mlockall(). */ +#define PRIV_VM_MUNLOCK 362 /* Can munlock(), munlockall(). */ +/* Can override the global swap reservation limits. */ +#define PRIV_VM_SWAP_NOQUOTA 363 +/* Can override the per-uid swap reservation limits. */ +#define PRIV_VM_SWAP_NORLIMIT 364 + +/* + * Device file system privileges. + */ +#define PRIV_DEVFS_RULE 370 /* Can manage devfs rules. */ +#define PRIV_DEVFS_SYMLINK 371 /* Can create symlinks in devfs. */ + +/* + * Random number generator privileges. + */ +#define PRIV_RANDOM_RESEED 380 /* Closing /dev/random reseeds. */ + +/* + * Network stack privileges. + */ +#define PRIV_NET_BRIDGE 390 /* Administer bridge. */ +#define PRIV_NET_GRE 391 /* Administer GRE. */ +#define _PRIV_NET_PPP 392 /* Removed. */ +#define _PRIV_NET_SLIP 393 /* Removed. */ +#define PRIV_NET_BPF 394 /* Monitor BPF. */ +#define PRIV_NET_RAW 395 /* Open raw socket. */ +#define PRIV_NET_ROUTE 396 /* Administer routing. */ +#define PRIV_NET_TAP 397 /* Can open tap device. */ +#define PRIV_NET_SETIFMTU 398 /* Set interface MTU. */ +#define PRIV_NET_SETIFFLAGS 399 /* Set interface flags. 
*/ +#define PRIV_NET_SETIFCAP 400 /* Set interface capabilities. */ +#define PRIV_NET_SETIFNAME 401 /* Set interface name. */ +#define PRIV_NET_SETIFMETRIC 402 /* Set interface metrics. */ +#define PRIV_NET_SETIFPHYS 403 /* Set interface physical layer prop. */ +#define PRIV_NET_SETIFMAC 404 /* Set interface MAC label. */ +#define PRIV_NET_ADDMULTI 405 /* Add multicast addr. to ifnet. */ +#define PRIV_NET_DELMULTI 406 /* Delete multicast addr. from ifnet. */ +#define PRIV_NET_HWIOCTL 407 /* Issue hardware ioctl on ifnet. */ +#define PRIV_NET_SETLLADDR 408 /* Set interface link-level address. */ +#define PRIV_NET_ADDIFGROUP 409 /* Add new interface group. */ +#define PRIV_NET_DELIFGROUP 410 /* Delete interface group. */ +#define PRIV_NET_IFCREATE 411 /* Create cloned interface. */ +#define PRIV_NET_IFDESTROY 412 /* Destroy cloned interface. */ +#define PRIV_NET_ADDIFADDR 413 /* Add protocol addr to interface. */ +#define PRIV_NET_DELIFADDR 414 /* Delete protocol addr on interface. */ +#define PRIV_NET_LAGG 415 /* Administer lagg interface. */ +#define PRIV_NET_GIF 416 /* Administer gif interface. */ +#define PRIV_NET_SETIFVNET 417 /* Move interface to vnet. */ +#define PRIV_NET_SETIFDESCR 418 /* Set interface description. */ +#define PRIV_NET_SETIFFIB 419 /* Set interface fib. */ +#define PRIV_NET_VXLAN 420 /* Administer vxlan. */ + +/* + * 802.11-related privileges. + */ +#define PRIV_NET80211_GETKEY 440 /* Query 802.11 keys. */ +#define PRIV_NET80211_MANAGE 441 /* Administer 802.11. */ + +/* + * Placeholder for AppleTalk privileges, not supported anymore. + */ +#define _PRIV_NETATALK_RESERVEDPORT 450 /* Bind low port number. */ + +/* + * ATM privileges. + */ +#define PRIV_NETATM_CFG 460 +#define PRIV_NETATM_ADD 461 +#define PRIV_NETATM_DEL 462 +#define PRIV_NETATM_SET 463 + +/* + * Bluetooth privileges. + */ +#define PRIV_NETBLUETOOTH_RAW 470 /* Open raw bluetooth socket. */ + +/* + * Netgraph and netgraph module privileges. + */ +#define PRIV_NETGRAPH_CONTROL 480 /* Open netgraph control socket. */ +#define PRIV_NETGRAPH_TTY 481 /* Configure tty for netgraph. */ + +/* + * IPv4 and IPv6 privileges. + */ +#define PRIV_NETINET_RESERVEDPORT 490 /* Bind low port number. */ +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ +#define PRIV_NETINET_DIVERT 492 /* Open IP divert socket. */ +#define PRIV_NETINET_PF 493 /* Administer pf firewall. */ +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ +#define PRIV_NETINET_CARP 495 /* Administer CARP. */ +#define PRIV_NETINET_MROUTE 496 /* Administer multicast routing. */ +#define PRIV_NETINET_RAW 497 /* Open netinet raw socket. */ +#define PRIV_NETINET_GETCRED 498 /* Query netinet pcb credentials. */ +#define PRIV_NETINET_ADDRCTRL6 499 /* Administer IPv6 address scopes. */ +#define PRIV_NETINET_ND6 500 /* Administer IPv6 neighbor disc. */ +#define PRIV_NETINET_SCOPE6 501 /* Administer IPv6 address scopes. */ +#define PRIV_NETINET_ALIFETIME6 502 /* Administer IPv6 address lifetimes. */ +#define PRIV_NETINET_IPSEC 503 /* Administer IPSEC. */ +#define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. */ +#define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ +#define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. */ +#define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6. */ + +/* + * Placeholders for IPX/SPX privileges, not supported any more. + */ +#define _PRIV_NETIPX_RESERVEDPORT 520 /* Bind low port number. */ +#define _PRIV_NETIPX_RAW 521 /* Open netipx raw socket. 
*/ + +/* + * NCP privileges. + */ +#define PRIV_NETNCP 530 /* Use another user's connection. */ + +/* + * SMB privileges. + */ +#define PRIV_NETSMB 540 /* Use another user's connection. */ + +/* + * VM86 privileges. + */ +#define PRIV_VM86_INTCALL 550 /* Allow invoking vm86 int handlers. */ + +/* + * Set of reserved privilege values, which will be allocated to code as + * needed, in order to avoid renumbering later privileges due to insertion. + */ +#define _PRIV_RESERVED0 560 +#define _PRIV_RESERVED1 561 +#define _PRIV_RESERVED2 562 +#define _PRIV_RESERVED3 563 +#define _PRIV_RESERVED4 564 +#define _PRIV_RESERVED5 565 +#define _PRIV_RESERVED6 566 +#define _PRIV_RESERVED7 567 +#define _PRIV_RESERVED8 568 +#define _PRIV_RESERVED9 569 +#define _PRIV_RESERVED10 570 +#define _PRIV_RESERVED11 571 +#define _PRIV_RESERVED12 572 +#define _PRIV_RESERVED13 573 +#define _PRIV_RESERVED14 574 +#define _PRIV_RESERVED15 575 + +/* + * Define a set of valid privilege numbers that can be used by loadable + * modules that don't yet have privilege reservations. Ideally, these should + * not be used, since their meaning is opaque to any policies that are aware + * of specific privileges, such as jail, and as such may be arbitrarily + * denied. + */ +#define PRIV_MODULE0 600 +#define PRIV_MODULE1 601 +#define PRIV_MODULE2 602 +#define PRIV_MODULE3 603 +#define PRIV_MODULE4 604 +#define PRIV_MODULE5 605 +#define PRIV_MODULE6 606 +#define PRIV_MODULE7 607 +#define PRIV_MODULE8 608 +#define PRIV_MODULE9 609 +#define PRIV_MODULE10 610 +#define PRIV_MODULE11 611 +#define PRIV_MODULE12 612 +#define PRIV_MODULE13 613 +#define PRIV_MODULE14 614 +#define PRIV_MODULE15 615 + +/* + * DDB(4) privileges. + */ +#define PRIV_DDB_CAPTURE 620 /* Allow reading of DDB capture log. */ + +/* + * Arla/nnpfs privileges. + */ +#define PRIV_NNPFS_DEBUG 630 /* Perforn ARLA_VIOC_NNPFSDEBUG. */ + +/* + * cpuctl(4) privileges. + */ +#define PRIV_CPUCTL_WRMSR 640 /* Write model-specific register. */ +#define PRIV_CPUCTL_UPDATE 641 /* Update cpu microcode. */ + +/* + * Capi4BSD privileges. + */ +#define PRIV_C4B_RESET_CTLR 650 /* Load firmware, reset controller. */ +#define PRIV_C4B_TRACE 651 /* Unrestricted CAPI message tracing. */ + +/* + * OpenAFS privileges. + */ +#define PRIV_AFS_ADMIN 660 /* Can change AFS client settings. */ +#define PRIV_AFS_DAEMON 661 /* Can become the AFS daemon. */ + +/* + * Resource Limits privileges. + */ +#define PRIV_RCTL_GET_RACCT 670 +#define PRIV_RCTL_GET_RULES 671 +#define PRIV_RCTL_GET_LIMITS 672 +#define PRIV_RCTL_ADD_RULE 673 +#define PRIV_RCTL_REMOVE_RULE 674 + +/* + * mem(4) privileges. + */ +#define PRIV_KMEM_READ 680 /* Open mem/kmem for reading. */ +#define PRIV_KMEM_WRITE 681 /* Open mem/kmem for writing. */ + +/* + * Track end of privilege list. + */ +#define _PRIV_HIGHEST 682 + +/* + * Validate that a named privilege is known by the privilege system. Invalid + * privileges presented to the privilege system by a priv_check interface + * will result in a panic. This is only approximate due to sparse allocation + * of the privilege space. + */ +#define PRIV_VALID(x) ((x) > _PRIV_LOWEST && (x) < _PRIV_HIGHEST) + +#ifdef _KERNEL +/* + * Privilege check interfaces, modeled after historic suser() interfaces, but + * with the addition of a specific privilege name. No flags are currently + * defined for the API. Historically, flags specified using the real uid + * instead of the effective uid, and whether or not the check should be + * allowed in jail. 
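As an illustration of how these checks are consumed (a sketch only; the port's real secpolicy_* implementations live elsewhere and may differ), a policy wrapper built on the interface declared just below might look like this:

static int
example_secpolicy_zfs(struct ucred *cred)
{
    /* a non-zero return means the privilege was denied */
    return (priv_check_cred(cred, PRIV_ZFS_POOL_CONFIG, 0));
}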
+ */ +struct thread; +struct ucred; +int priv_check(struct thread *td, int priv); +int priv_check_cred(struct ucred *cred, int priv, int flags); +#endif + +#endif /* _SPL_PRIV_H */ diff --git a/include/os/macos/spl/sys/proc.h b/include/os/macos/spl/sys/proc.h new file mode 100644 index 0000000000..132964d9c1 --- /dev/null +++ b/include/os/macos/spl/sys/proc.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#ifndef _SPL_PROC_H +#define _SPL_PROC_H + +#include +#include +#include_next +#include +#include + +#define proc_t struct proc + +extern proc_t p0; /* process 0 */ + +static inline boolean_t +zfs_proc_is_caller(proc_t *p) +{ + return (p == curproc); +} + +#endif /* SPL_PROC_H */ diff --git a/include/os/macos/spl/sys/processor.h b/include/os/macos/spl/sys/processor.h new file mode 100644 index 0000000000..d077817e94 --- /dev/null +++ b/include/os/macos/spl/sys/processor.h @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_PROCESSOR_H +#define _SPL_PROCESSOR_H + +#include + +extern uint32_t getcpuid(void); + +typedef int processorid_t; + +#endif /* _SPL_PROCESSOR_H */ diff --git a/include/os/macos/spl/sys/procfs_list.h b/include/os/macos/spl/sys/procfs_list.h new file mode 100644 index 0000000000..b90d44da2b --- /dev/null +++ b/include/os/macos/spl/sys/procfs_list.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_PROCFS_LIST_H +#define _SPL_PROCFS_LIST_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct seq_file { + void *dummy; /* warning: empty struct has size 0 in C, size 1 in C++ */ +}; + +void seq_printf(struct seq_file *m, const char *fmt, ...); + +typedef struct procfs_list { + void *pl_private; + kmutex_t pl_lock; + list_t pl_list; + uint64_t pl_next_id; + size_t pl_node_offset; +} procfs_list_t; + +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); +void procfs_list_add(procfs_list_t *procfs_list, void *p); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/os/macos/spl/sys/random.h b/include/os/macos/spl/sys/random.h new file mode 100644 index 0000000000..c69184cc84 --- /dev/null +++ b/include/os/macos/spl/sys/random.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_RANDOM_H +#define _SPL_RANDOM_H + +#include_next + + +static inline int +random_get_bytes(uint8_t *ptr, size_t len) +{ + read_random(ptr, len); + return (0); +} + +static inline int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + read_random(ptr, len); + return (0); +} + +#endif /* _SPL_RANDOM_H */ diff --git a/include/os/macos/spl/sys/rwlock.h b/include/os/macos/spl/sys/rwlock.h new file mode 100644 index 0000000000..aa69db9bd3 --- /dev/null +++ b/include/os/macos/spl/sys/rwlock.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
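For context, a hypothetical consumer of the procfs_list interface declared in procfs_list.h above might register a list as follows; every example_* name is invented, error handling is omitted, and the entry type simply embeds a procfs_list_node_t so the show callback can render one entry per call.

typedef struct example_entry {
    procfs_list_node_t  ee_node;    /* linkage used by procfs_list */
    uint64_t            ee_value;
} example_entry_t;

static int
example_show(struct seq_file *f, void *p)
{
    example_entry_t *e = p;

    seq_printf(f, "%llu\n", (u_longlong_t)e->ee_value);
    return (0);
}

static int
example_show_header(struct seq_file *f)
{
    seq_printf(f, "value\n");
    return (0);
}

static int
example_clear(procfs_list_t *pl)
{
    return (0);     /* nothing cached to discard in this sketch */
}

static procfs_list_t example_list;

static void
example_register(void)
{
    procfs_list_install("zfs", "example", 0644, &example_list,
        example_show, example_show_header, example_clear,
        offsetof(example_entry_t, ee_node));
}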
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_RWLOCK_H +#define _SPL_RWLOCK_H + +#include +#include + +typedef enum { + RW_DRIVER = 2, + RW_DEFAULT = 4 +} krw_type_t; + +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; + +#define RW_NOLOCKDEP 0 + +struct krwlock { + uint32_t rw_lock[4]; /* opaque lck_rw_t data */ + void *rw_owner; /* writer (exclusive) lock only */ + int rw_readers; /* reader lock only */ + int rw_pad; /* */ +#ifdef SPL_DEBUG_RWLOCK + void *leak; +#endif +}; +typedef struct krwlock krwlock_t; + +#define RW_WRITE_HELD(x) (rw_write_held((x))) +#define RW_LOCK_HELD(x) (rw_lock_held((x))) + +#ifdef SPL_DEBUG_RWLOCK +#define rw_init(A, B, C, D) \ + rw_initx(A, B, C, D, __FILE__, __FUNCTION__, __LINE__) +extern void rw_initx(krwlock_t *, char *, krw_type_t, void *, + const char *, const char *, int); +#else +extern void rw_init(krwlock_t *, char *, krw_type_t, void *); +#endif +extern void rw_destroy(krwlock_t *); +extern void rw_enter(krwlock_t *, krw_t); +extern int rw_tryenter(krwlock_t *, krw_t); +extern void rw_exit(krwlock_t *); +extern void rw_downgrade(krwlock_t *); +extern int rw_tryupgrade(krwlock_t *); +extern int rw_write_held(krwlock_t *); +extern int rw_lock_held(krwlock_t *); +extern int rw_isinit(krwlock_t *); + +int spl_rwlock_init(void); +void spl_rwlock_fini(void); + +#endif /* _SPL_RWLOCK_H */ diff --git a/include/os/macos/spl/sys/seg_kmem.h b/include/os/macos/spl/sys/seg_kmem.h new file mode 100644 index 0000000000..8ef014c129 --- /dev/null +++ b/include/os/macos/spl/sys/seg_kmem.h @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
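As declared above, the krwlock_t wrapper keeps the illumos calling convention rather than the raw XNU lck_rw_t one. A minimal usage sketch, with hypothetical my_ names and ASSERT taken from the SPL debug headers:

static krwlock_t my_lock;
static uint64_t my_counter;

static void
my_setup(void)
{
	rw_init(&my_lock, NULL, RW_DEFAULT, NULL);
}

static uint64_t
my_read(void)
{
	uint64_t v;

	rw_enter(&my_lock, RW_READER);	/* shared */
	v = my_counter;
	rw_exit(&my_lock);
	return (v);
}

static void
my_bump(void)
{
	rw_enter(&my_lock, RW_WRITER);	/* exclusive */
	ASSERT(RW_WRITE_HELD(&my_lock));
	my_counter++;
	rw_exit(&my_lock);
}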
+ */ + +#ifndef _VM_SEG_KMEM_H +#define _VM_SEG_KMEM_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * VM - Kernel Segment Driver + */ + +#if defined(_KERNEL) + +extern uint64_t segkmem_total_allocated; + +/* qcaching for zio arenas and abd arena */ +extern vmem_t *zio_arena_parent; +/* arena for zio caches for file blocks */ +extern vmem_t *zio_arena; +/* arena for zio caches for (zfs) metadata blocks */ +extern vmem_t *zio_metadata_arena; + +/* + * segkmem page vnodes + */ +#define kvp (kvps[KV_KVP]) +#define zvp (kvps[KV_ZVP]) +#if defined(__sparc) +#define mpvp (kvps[KV_MPVP]) +#define promvp (kvps[KV_PROMVP]) +#endif /* __sparc */ + +void *segkmem_alloc(vmem_t *, size_t, int); +extern void segkmem_free(vmem_t *, void *, size_t); +extern void kernelheap_init(void); +extern void kernelheap_fini(void); + +extern void *segkmem_zio_alloc(vmem_t *, size_t, int); +extern void segkmem_zio_free(vmem_t *, void *, size_t); +extern void segkmem_zio_init(void); +extern void segkmem_zio_fini(void); + +/* + * Flags for segkmem_xalloc(). + * + * SEGKMEM_SHARELOCKED requests pages which are locked SE_SHARED to be + * returned rather than unlocked which is now the default. Note that + * memory returned by SEGKMEM_SHARELOCKED cannot be freed by segkmem_free(). + * This is a hack for seg_dev that should be cleaned up in the future. + */ +#define SEGKMEM_SHARELOCKED 0x20000 + +#define SEGKMEM_USE_LARGEPAGES (segkmem_lpsize > PAGESIZE) + +#define IS_KMEM_VA_LARGEPAGE(vaddr) \ + (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KMEM_H */ diff --git a/include/os/macos/spl/sys/sha2.h b/include/os/macos/spl/sys/sha2.h new file mode 100644 index 0000000000..9039835f18 --- /dev/null +++ b/include/os/macos/spl/sys/sha2.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. 
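A sketch of how the zio arenas above might be used, assuming the vmem shim provides the usual illumos VM_SLEEP flag; the helper names are illustrative only:

static void *
my_meta_alloc(size_t size)
{
	/* carve a buffer out of the metadata qcache arena */
	return (segkmem_zio_alloc(zio_metadata_arena, size, VM_SLEEP));
}

static void
my_meta_free(void *buf, size_t size)
{
	segkmem_zio_free(zio_metadata_arena, buf, size);
}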
*/ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#ifdef _KERNEL +#include /* for uint_* */ +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ +#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ + +#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ +#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ +#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ + +/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ +#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ +#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ + +#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ +#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* + * SHA2 context. + * The contents of this structure are a private interface between the + * Init/Update/Final calls of the functions defined below. + * Callers must never attempt to read or write any of the fields + * in this structure directly. + */ +typedef struct { + uint32_t algotype; /* Algorithm Type */ + + /* state (ABCDEFGH) */ + union { + uint32_t s32[8]; /* for SHA256 */ + uint64_t s64[8]; /* for SHA384/512 */ + } state; + /* number of bits */ + union { + uint32_t c32[2]; /* for SHA256 , modulo 2^64 */ + uint64_t c64[2]; /* for SHA384/512, modulo 2^128 */ + } count; + union { + uint8_t buf8[128]; /* undigested input */ + uint32_t buf32[32]; /* realigned input */ + uint64_t buf64[16]; /* realigned input */ + } buf_un; +} SHA2_CTX; + +typedef SHA2_CTX SHA256_CTX; +typedef SHA2_CTX SHA384_CTX; +typedef SHA2_CTX SHA512_CTX; + +extern void SHA2Init(uint64_t mech, SHA2_CTX *); + +extern void SHA2Update(SHA2_CTX *, const void *, size_t); + +extern void SHA2Final(void *, SHA2_CTX *); + +extern void SHA256Init(SHA256_CTX *); + +extern void SHA256Update(SHA256_CTX *, const void *, size_t); + +extern void SHA256Final(void *, SHA256_CTX *); + +extern void SHA384Init(SHA384_CTX *); + +extern void SHA384Update(SHA384_CTX *, const void *, size_t); + +extern void SHA384Final(void *, SHA384_CTX *); + +extern void SHA512Init(SHA512_CTX *); + +extern void SHA512Update(SHA512_CTX *, const void *, size_t); + +extern void SHA512Final(void *, SHA512_CTX *); + +#ifdef _SHA2_IMPL +/* + * The following types/functions are all private to the implementation + * of the SHA2 functions and must not be used by consumers of the interface + */ + +/* + * List of support mechanisms in this module. 
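The context above is only ever driven through the generic Init/Update/Final calls; a one-shot SHA-256 helper might look like this sketch (my_sha256 is a hypothetical name, SHA256 is the mechanism constant defined above):

static void
my_sha256(const void *buf, size_t len, uint8_t digest[SHA256_DIGEST_LENGTH])
{
	SHA2_CTX ctx;

	SHA2Init(SHA256, &ctx);
	SHA2Update(&ctx, buf, len);
	SHA2Final(digest, &ctx);
}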
+ * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or additional mechanisms should be done + * carefully + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/os/macos/spl/sys/sid.h b/include/os/macos/spl/sys/sid.h new file mode 100644 index 0000000000..75b6847096 --- /dev/null +++ b/include/os/macos/spl/sys/sid.h @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#ifndef _SPL_SID_H +#define _SPL_SID_H + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct ksiddomain { + char *kd_name; +} ksiddomain_t; + +typedef enum ksid_index { + KSID_USER, + KSID_GROUP, + KSID_OWNER, + KSID_COUNT +} ksid_index_t; + +typedef int ksid_t; + +/* Should be in kidmap.h */ +typedef int32_t idmap_stat; + +static inline ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + int len = strlen(dom); + + kd = (ksiddomain_t *)kmem_zalloc(sizeof (ksiddomain_t), KM_SLEEP); + kd->kd_name = (char *)kmem_zalloc(len + 1, KM_SLEEP); + memcpy(kd->kd_name, dom, len); + + return (kd); +} + +static inline void +ksiddomain_rele(ksiddomain_t *ksid) +{ + kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1); + kmem_free(ksid, sizeof (ksiddomain_t)); +} + +#define UID_NOBODY 65534 +#define GID_NOBODY 65534 + +static __inline uint_t +ksid_getid(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return (0); +} + +static __inline const char * +ksid_getdomain(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return (0); +} + +static __inline uint_t +ksid_getrid(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return (0); +} + +#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1) +#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_SID_H */ diff --git a/include/os/macos/spl/sys/signal.h b/include/os/macos/spl/sys/signal.h new file mode 100644 index 0000000000..f4bf1845e9 --- /dev/null +++ b/include/os/macos/spl/sys/signal.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
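Callers are expected to pair the lookup and release helpers above; a minimal sketch with a hypothetical domain string:

static void
my_domain_example(void)
{
	ksiddomain_t *kd;

	kd = ksid_lookupdomain("BUILTIN");
	/* kd->kd_name holds a private copy of the string until released */
	ksiddomain_rele(kd);
}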
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2020 Jorgen Lundman + * + */ + +#ifndef _SPL_SYS_SIGNAL_H +#define _SPL_SYS_SIGNAL_H + +#include +#include_next +#include + +#define FORREAL 0 /* Usual side-effects */ +#define JUSTLOOKING 1 /* Don't stop the process */ + +struct proc; + +extern int thread_issignal(struct proc *, thread_t, sigset_t); + +#define THREADMASK (sigmask(SIGILL)|sigmask(SIGTRAP)|\ + sigmask(SIGIOT)|sigmask(SIGEMT)|\ + sigmask(SIGFPE)|sigmask(SIGBUS)|\ + sigmask(SIGSEGV)|sigmask(SIGSYS)|\ + sigmask(SIGPIPE)|sigmask(SIGKILL)|\ + sigmask(SIGTERM)|sigmask(SIGINT)) + +static __inline__ int +issig(int why) +{ + return (thread_issignal(current_proc(), current_thread(), + THREADMASK)); +} + +/* Always called with curthread */ +#define signal_pending(p) issig(0) + +#endif /* SPL_SYS_SIGNAL_H */ diff --git a/include/os/macos/spl/sys/simd.h b/include/os/macos/spl/sys/simd.h new file mode 100644 index 0000000000..37c06fc78a --- /dev/null +++ b/include/os/macos/spl/sys/simd.h @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Neskovic . + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_begin() + * kfpu_end() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_sse_available() + * zfs_sse2_available() + * zfs_sse3_available() + * zfs_ssse3_available() + * zfs_sse4_1_available() + * zfs_sse4_2_available() + * + * zfs_avx_available() + * zfs_avx2_available() + * + * zfs_bmi1_available() + * zfs_bmi2_available() + * + * zfs_avx512f_available() + * zfs_avx512cd_available() + * zfs_avx512er_available() + * zfs_avx512pf_available() + * zfs_avx512bw_available() + * zfs_avx512dq_available() + * zfs_avx512vl_available() + * zfs_avx512ifma_available() + * zfs_avx512vbmi_available() + * + * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers + * also add zfs_avx512vl_available() to feature check. 
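The call pattern the comment above describes is: test every instruction set the routine needs, then bracket the SIMD body with kfpu_begin()/kfpu_end(). A hedged sketch, where my_fletcher_avx2 and both branch bodies are placeholders:

static void
my_fletcher_avx2(const uint8_t *src, size_t len)
{
	(void) src;
	(void) len;

	if (zfs_avx2_available()) {
		kfpu_begin();	/* defined as a no-op below on this platform */
		/* ... AVX2 implementation ... */
		kfpu_end();
	} else {
		/* ... scalar fallback ... */
	}
}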
+ */ + +#ifndef _SIMD_X86_H +#define _SIMD_X86_H + +#include + +/* only for __x86 */ +#if defined(__x86) + +#include + +#if defined(_KERNEL) +#include +#include + +#ifdef __APPLE__ +// XNU fpu.h +static inline uint64_t +xgetbv(uint32_t c) +{ + uint32_t mask_hi, mask_lo; + __asm__ __volatile__("xgetbv" : "=a"(mask_lo), "=d"(mask_hi) : "c" (c)); + return (((uint64_t)mask_hi<<32) + (uint64_t)mask_lo); +} + +#endif + +extern uint64_t spl_cpuid_features(void); +extern uint64_t spl_cpuid_leaf7_features(void); + +#define ZFS_ASM_BUG() { ASSERT(0); } break + +#define kfpu_allowed() 1 + +#endif + +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +#define kfpu_begin() ((void)0) +#define kfpu_end() ((void)0) + +/* + * CPUID feature tests for user-space. Linux kernel provides an interface for + * CPU feature testing. + */ +#if !defined(_KERNEL) + +#include + +#define ZFS_ASM_BUG() { assert(0); } break + +/* + * x86 registers used implicitly by CPUID + */ +typedef enum cpuid_regs { + EAX = 0, + EBX, + ECX, + EDX, + CPUID_REG_CNT = 4 +} cpuid_regs_t; + +/* + * List of instruction sets identified by CPUID + */ +typedef enum cpuid_inst_sets { + SSE = 0, + SSE2, + SSE3, + SSSE3, + SSE4_1, + SSE4_2, + OSXSAVE, + AVX, + AVX2, + BMI1, + BMI2, + AVX512F, + AVX512CD, + AVX512DQ, + AVX512BW, + AVX512IFMA, + AVX512VBMI, + AVX512PF, + AVX512ER, + AVX512VL, + AES, + PCLMULQDQ +} cpuid_inst_sets_t; + +/* + * Instruction set descriptor. + */ +typedef struct cpuid_feature_desc { + uint32_t leaf; /* CPUID leaf */ + uint32_t subleaf; /* CPUID sub-leaf */ + uint32_t flag; /* bit mask of the feature */ + cpuid_regs_t reg; /* which CPUID return register to test */ +} cpuid_feature_desc_t; + +#define _AVX512F_BIT (1U << 16) +#define _AVX512CD_BIT (_AVX512F_BIT | (1U << 28)) +#define _AVX512DQ_BIT (_AVX512F_BIT | (1U << 17)) +#define _AVX512BW_BIT (_AVX512F_BIT | (1U << 30)) +#define _AVX512IFMA_BIT (_AVX512F_BIT | (1U << 21)) +#define _AVX512VBMI_BIT (1U << 1) /* AVX512F_BIT is on another leaf */ +#define _AVX512PF_BIT (_AVX512F_BIT | (1U << 26)) +#define _AVX512ER_BIT (_AVX512F_BIT | (1U << 27)) +#define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ +#define _AES_BIT (1U << 25) +#define _PCLMULQDQ_BIT (1U << 1) + +/* + * Descriptions of supported instruction sets + */ +static const cpuid_feature_desc_t spl_cpuid_features[] = { + [SSE] = {1U, 0U, 1U << 25, EDX }, + [SSE2] = {1U, 0U, 1U << 26, EDX }, + [SSE3] = {1U, 0U, 1U << 0, ECX }, + [SSSE3] = {1U, 0U, 1U << 9, ECX }, + [SSE4_1] = {1U, 0U, 1U << 19, ECX }, + [SSE4_2] = {1U, 0U, 1U << 20, ECX }, + [OSXSAVE] = {1U, 0U, 1U << 27, ECX }, + [AVX] = {1U, 0U, 1U << 28, ECX }, + [AVX2] = {7U, 0U, 1U << 5, EBX }, + [BMI1] = {7U, 0U, 1U << 3, EBX }, + [BMI2] = {7U, 0U, 1U << 8, EBX }, + [AVX512F] = {7U, 0U, _AVX512F_BIT, EBX }, + [AVX512CD] = {7U, 0U, _AVX512CD_BIT, EBX }, + [AVX512DQ] = {7U, 0U, _AVX512DQ_BIT, EBX }, + [AVX512BW] = {7U, 0U, _AVX512BW_BIT, EBX }, + [AVX512IFMA] = {7U, 0U, _AVX512IFMA_BIT, EBX }, + [AVX512VBMI] = {7U, 0U, _AVX512VBMI_BIT, ECX }, + [AVX512PF] = {7U, 0U, _AVX512PF_BIT, EBX }, + [AVX512ER] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AVX512VL] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AES] = {1U, 0U, _AES_BIT, ECX }, + [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, +}; + +/* + * Check if OS supports AVX and AVX2 by checking XCR0 + * Only call this function if CPUID indicates that AVX feature is + * supported by the CPU, otherwise it might be an illegal instruction. 
+ */ +static inline uint64_t +xgetbv(uint32_t index) +{ + uint32_t eax, edx; + /* xgetbv - instruction byte code */ + __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (index)); + + return ((((uint64_t)edx)<<32) | (uint64_t)eax); +} + +/* + * Check if CPU supports a feature + */ +static inline boolean_t +__cpuid_check_feature(const cpuid_feature_desc_t *desc) +{ + uint32_t r[CPUID_REG_CNT]; + + if (__get_cpuid_max(0, NULL) >= desc->leaf) { + /* + * __cpuid_count is needed to properly check + * for AVX2. It is a macro, so return parameters + * are passed by value. + */ + __cpuid_count(desc->leaf, desc->subleaf, + r[EAX], r[EBX], r[ECX], r[EDX]); + return ((r[desc->reg] & desc->flag) == desc->flag); + } + return (B_FALSE); +} + +#define CPUID_FEATURE_CHECK(name, id) \ +static inline boolean_t \ +__cpuid_has_ ## name(void) \ +{ \ + return (__cpuid_check_feature(&spl_cpuid_features[id])); \ +} + +/* + * Define functions for user-space CPUID features testing + */ +CPUID_FEATURE_CHECK(sse, SSE); +CPUID_FEATURE_CHECK(sse2, SSE2); +CPUID_FEATURE_CHECK(sse3, SSE3); +CPUID_FEATURE_CHECK(ssse3, SSSE3); +CPUID_FEATURE_CHECK(sse4_1, SSE4_1); +CPUID_FEATURE_CHECK(sse4_2, SSE4_2); +CPUID_FEATURE_CHECK(avx, AVX); +CPUID_FEATURE_CHECK(avx2, AVX2); +CPUID_FEATURE_CHECK(osxsave, OSXSAVE); +CPUID_FEATURE_CHECK(bmi1, BMI1); +CPUID_FEATURE_CHECK(bmi2, BMI2); +CPUID_FEATURE_CHECK(avx512f, AVX512F); +CPUID_FEATURE_CHECK(avx512cd, AVX512CD); +CPUID_FEATURE_CHECK(avx512dq, AVX512DQ); +CPUID_FEATURE_CHECK(avx512bw, AVX512BW); +CPUID_FEATURE_CHECK(avx512ifma, AVX512IFMA); +CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI); +CPUID_FEATURE_CHECK(avx512pf, AVX512PF); +CPUID_FEATURE_CHECK(avx512er, AVX512ER); +CPUID_FEATURE_CHECK(avx512vl, AVX512VL); +CPUID_FEATURE_CHECK(aes, AES); +CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); + +#endif /* !defined(_KERNEL) */ + + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + +#if defined(_KERNEL) + has_osxsave = !!(spl_cpuid_features() & CPUID_FEATURE_OSXSAVE); +#elif !defined(_KERNEL) + has_osxsave = __cpuid_has_osxsave(); +#endif + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse()); +#endif +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE2)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse2()); +#endif +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE3)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse3()); +#endif +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSSE3)); +#elif !defined(_KERNEL) + return 
(__cpuid_has_ssse3()); +#endif +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE4_1)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse4_1()); +#endif +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_SSE4_2)); +#elif !defined(_KERNEL) + return (__cpuid_has_sse4_2()); +#endif +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + boolean_t has_avx; +#if defined(_KERNEL) + return (!!(spl_cpuid_features() & CPUID_FEATURE_AVX1_0)); +#elif !defined(_KERNEL) + has_avx = __cpuid_has_avx(); +#endif + + return (has_avx && __ymm_enabled()); +} + +/* + * Check if AVX2 instruction set is available + */ +static inline boolean_t +zfs_avx2_available(void) +{ + boolean_t has_avx2; +#if defined(_KERNEL) +#if defined(HAVE_AVX2) + has_avx2 = (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX2)); +#else + has_avx2 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx2 = __cpuid_has_avx2(); +#endif + + return (has_avx2 && __ymm_enabled()); +} + +/* + * Check if BMI1 instruction set is available + */ +static inline boolean_t +zfs_bmi1_available(void) +{ +#if defined(_KERNEL) +#if defined(CPUID_LEAF7_FEATURE_BMI1) + return (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_BMI1)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_bmi1()); +#endif +} + +/* + * Check if BMI2 instruction set is available + */ +static inline boolean_t +zfs_bmi2_available(void) +{ +#if defined(_KERNEL) +#if defined(CPUID_LEAF7_FEATURE_BMI2) + return (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_BMI2)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_bmi2()); +#endif +} + +/* + * Check if AES instruction set is available + */ +static inline boolean_t +zfs_aes_available(void) +{ +#if defined(_KERNEL) +#if defined(HAVE_AES) + return (!!(spl_cpuid_features() & CPUID_FEATURE_AES)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_aes()); +#endif +} + +/* + * Check if PCLMULQDQ instruction set is available + */ +static inline boolean_t +zfs_pclmulqdq_available(void) +{ +#if defined(_KERNEL) +#if defined(HAVE_PCLMULQDQ) + return (!!(spl_cpuid_features() & CPUID_FEATURE_PCLMULQDQ)); +#else + return (B_FALSE); +#endif +#elif !defined(_KERNEL) + return (__cpuid_has_pclmulqdq()); +#endif +} + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + + +/* Check if AVX512F instruction set is available */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512F) && defined(CPUID_LEAF7_FEATURE_AVX512F) + return (!!(spl_cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F)); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512f(); +#endif + + return (has_avx512 
&& __zmm_enabled()); +} + +/* Check if AVX512CD instruction set is available */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512CD) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512CD) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512CD)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512CD); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512cd(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512ER instruction set is available */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512ER) && \ + defined(CPUID_LEAF7_FEATURE_AVX512ER) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512ER)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512ER); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512er(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512PF instruction set is available */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512PF) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512PF) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512PF)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512PF); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512pf(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512BW instruction set is available */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512BW) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512BW) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512BW)) == + (CPUID_LEAF7_FEATURE_AVX512F | CPUID_LEAF7_FEATURE_AVX512BW); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512bw(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512DQ instruction set is available */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512DQ) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512DQ) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512DQ)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512DQ); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512dq(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VL instruction set is available */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512VL) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512VL) + has_avx512 = (spl_cpuid_leaf7_features() & + 
(CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VL)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VL); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512vl(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512IFMA instruction set is available */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512IFMA) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512IFMA) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512IFMA)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512IFMA); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512ifma(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VBMI instruction set is available */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) +#if defined(HAVE_AVX512VBMI) && defined(HAVE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512F) && \ + defined(CPUID_LEAF7_FEATURE_AVX512VBMI) + has_avx512 = (spl_cpuid_leaf7_features() & + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VBMI)) == + (CPUID_LEAF7_FEATURE_AVX512F|CPUID_LEAF7_FEATURE_AVX512VBMI); +#else + has_avx512 = B_FALSE; +#endif +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512f() && + __cpuid_has_avx512vbmi(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +#endif /* defined(__x86) */ + +#endif /* _SIMD_X86_H */ diff --git a/include/os/macos/spl/sys/strings.h b/include/os/macos/spl/sys/strings.h new file mode 100644 index 0000000000..af643259c8 --- /dev/null +++ b/include/os/macos/spl/sys/strings.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_STRINGS_H +#define _SPL_STRINGS_H + + +#endif diff --git a/include/os/macos/spl/sys/stropts.h b/include/os/macos/spl/sys/stropts.h new file mode 100644 index 0000000000..bdce60f327 --- /dev/null +++ b/include/os/macos/spl/sys/stropts.h @@ -0,0 +1,247 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + + +#ifndef _SPL_STROPTS_H +#define _SPL_STROPTS_H + +#define LOCORE +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define isprint(c) ((c) >= ' ' && (c) <= '~') + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + * High order bit is 31 (or 63 in _LP64 kernel). + */ +static inline int +highbit64(unsigned long long i) +{ + register int h = 1; + if (i == 0) + return (0); + if (i & 0xffffffff00000000ull) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +static inline int +highbit(unsigned long long i) +{ + register int h = 1; + if (i == 0) + return (0); + if (i & 0xffffffff00000000ull) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * Low order bit is 0. + */ +static inline int +lowbit(unsigned long long i) +{ + register int h = 1; + + if (i == 0) + return (0); + + if (!(i & 0xffffffff)) { + h += 32; i >>= 32; + } + if (!(i & 0xffff)) { + h += 16; i >>= 16; + } + if (!(i & 0xff)) { + h += 8; i >>= 8; + } + if (!(i & 0xf)) { + h += 4; i >>= 4; + } + if (!(i & 0x3)) { + h += 2; i >>= 2; + } + if (!(i & 0x1)) { + h += 1; + } + return (h); +} + +static inline int +isdigit(char c) +{ + return (c >= ' ' && c <= '9'); +} + + +static inline char * +strpbrk(const char *s, const char *b) +{ + const char *p; + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + return (NULL); +} + + +static inline char * +strrchr(const char *p, int ch) +{ + union { + const char *cp; + char *p; + } u; + char *save; + + u.cp = p; + for (save = NULL; /* empty */; ++u.p) { + if (*u.p == ch) + save = u.p; + if (*u.p == '\0') + return (save); + } + /* NOTREACHED */ +} + +static inline int +is_ascii_str(const char *str) +{ + unsigned char ch; + + while ((ch = (unsigned char)*str++) != '\0') { + if (ch >= 0x80) + return (0); + } + return (1); +} + + +static inline void * +kmemchr(const void *s, int c, size_t n) +{ + if (n != 0) { + const unsigned char *p = (const unsigned char *)s; + do { + if (*p++ == (unsigned char)c) + return ((void *)(uintptr_t)(p - 1)); + } while (--n != 0); + } + return (NULL); +} + +#ifndef memchr +#define memchr kmemchr +#endif + +#define IDX(c) ((unsigned char)(c) / LONG_BIT) +#define BIT(c) ((unsigned long)1 << ((unsigned char)(c) % LONG_BIT)) + +static inline size_t +strcspn(const char *__restrict s, const char *__restrict charset) +{ + /* + * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to + * generate better code. Without them, gcc gets a little confused. 
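Because both scans return "bit number plus one" (with 0 reserved for no bits set), a few concrete values follow directly from the definitions above:

/*
 *   highbit64(0)      == 0
 *   highbit64(1)      == 1	(only bit 0 is set)
 *   highbit64(0x8000) == 16	(bit 15 is the highest set bit)
 *   lowbit(0x8000)    == 16	(bit 15 is also the lowest set bit)
 *   lowbit(0x18)      == 4	(lowest set bit is bit 3)
 */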
+ */ + const char *s1; + unsigned long bit; + unsigned long tbl[(UCHAR_MAX + 1) / LONG_BIT]; + int idx; + + if (*s == '\0') + return (0); + + tbl[0] = 1; + tbl[3] = tbl[2] = tbl[1] = 0; + + for (; *charset != '\0'; charset++) { + idx = IDX(*charset); + bit = BIT(*charset); + tbl[idx] |= bit; + } + + for (s1 = s; ; s1++) { + idx = IDX(*s1); + bit = BIT(*s1); + if ((tbl[idx] & bit) != 0) + break; + } + return (s1 - s); +} + +#ifdef __cplusplus +} +#endif + +#endif /* SPL_STROPTS_H */ diff --git a/include/os/macos/spl/sys/sunddi.h b/include/os/macos/spl/sys/sunddi.h new file mode 100644 index 0000000000..ff299eeaf9 --- /dev/null +++ b/include/os/macos/spl/sys/sunddi.h @@ -0,0 +1,203 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Garrett D'Amore . All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + + + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include +#include +#include +#include +#include +#include +#include + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL) +#define ddi_prop_free(x) (void)0 +#define ddi_root_node() (void)0 + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define isalpha(ch) (isupper(ch) || islower(ch)) +#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') +#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') +#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ + ((ch) == '\t') || ((ch) == '\f')) +#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z') +#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ + ((ch) >= 'A' && (ch) <= 'F')) +#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C)) +#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? 
(C) - 'a' + 'A': (C)) +#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) +#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \ + ((C) >= 0x3A && (C) <= 0x40) || \ + ((C) >= 0x5B && (C) <= 0x60) || \ + ((C) >= 0x7B && (C) <= 0x7E)) + +// Define proper Solaris API calls, and clean ZFS up to use +int ddi_copyin(const void *from, void *to, size_t len, int flags); +int ddi_copyout(const void *from, void *to, size_t len, int flags); +int ddi_copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done); + +static inline int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + *result = strtol(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == LONG_MIN || *result == LONG_MAX) + return (ERANGE); + return (0); +} + +static inline int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + *result = strtoul(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULONG_MAX) + return (ERANGE); + return (0); +} + +static inline int +ddi_strtoull(const char *str, char **nptr, int base, + unsigned long long *result) +{ + *result = (unsigned long long)strtouq(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULLONG_MAX) + return (ERANGE); + return (0); +} + +static inline int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + *result = (unsigned long long)strtoq(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULLONG_MAX) + return (ERANGE); + return (0); +} + +#ifndef OTYPCNT +#define OTYPCNT 5 +#define OTYP_BLK 0 +#define OTYP_MNT 1 +#define OTYP_CHR 2 +#define OTYP_SWP 3 +#define OTYP_LYR 4 +#endif + +#define P2END(x, align) (-(~(x) & -(align))) + +#define ddi_name_to_major(name) devsw_name2blk(name, NULL, 0) + +struct dev_info { + dev_t dev; // Major / Minor + void *devc; + void *devb; +}; +typedef struct dev_info dev_info_t; + + +int ddi_strtoul(const char *, char **, int, unsigned long *); +int ddi_strtol(const char *, char **, int, long *); +int ddi_soft_state_init(void **, size_t, size_t); +int ddi_soft_state_zalloc(void *, int); +void *ddi_get_soft_state(void *, int); +void ddi_soft_state_free(void *, int); +void ddi_soft_state_fini(void **); +int ddi_create_minor_node(dev_info_t *, char *, int, + minor_t, char *, int); +void ddi_remove_minor_node(dev_info_t *, char *); + +int ddi_driver_major(dev_info_t *); + +typedef void *ldi_ident_t; + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define DDI_PSEUDO "" + +#define ddi_prop_update_int64(a, b, c, d) DDI_SUCCESS +#define ddi_prop_update_string(a, b, c, d) DDI_SUCCESS + +#define bioerror(bp, er) (buf_seterror((bp), (er))) +#define biodone(bp) buf_biodone(bp) + +#define ddi_ffs ffs +static inline long ddi_fls(long mask) { \ + /* Algorithm courtesy of Steve Chessin. */ \ + while (mask) { \ + long nx; \ + if ((nx = (mask & (mask - 1))) == 0) \ + break; \ + mask = nx; \ + } \ + return (ffs(mask)); \ +} + +#define getminor(X) minor((X)) + + + +/* + * This data structure is entirely private to the soft state allocator. 
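The soft-state declarations above are intended to mirror the illumos DDI lifecycle. A sketch, assuming ddi_soft_state_zalloc() reports DDI_SUCCESS on success as on illumos; all my_ names are hypothetical:

typedef struct my_state {
	int	ms_open;
} my_state_t;

static void *my_statep;

static void
my_attach(void)
{
	(void) ddi_soft_state_init(&my_statep, sizeof (my_state_t), 0);
	if (ddi_soft_state_zalloc(my_statep, 0 /* instance */) == DDI_SUCCESS) {
		my_state_t *ms = ddi_get_soft_state(my_statep, 0);
		ms->ms_open = 1;
	}
}

static void
my_detach(void)
{
	ddi_soft_state_free(my_statep, 0);
	ddi_soft_state_fini(&my_statep);
}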
+ */ +struct i_ddi_soft_state { + void **array; /* the array of pointers */ + kmutex_t lock; /* serialize access to this struct */ + size_t size; /* how many bytes per state struct */ + size_t n_items; /* how many structs herein */ + struct i_ddi_soft_state *next; /* 'dirty' elements */ +}; + +#define MIN_N_ITEMS 8 /* 8 void *'s == 32 bytes */ + +extern int strspn(const char *string, register char *charset); + + +#endif /* SPL_SUNDDI_H */ diff --git a/include/os/macos/spl/sys/sysmacros.h b/include/os/macos/spl/sys/sysmacros.h new file mode 100644 index 0000000000..8858f8ae4c --- /dev/null +++ b/include/os/macos/spl/sys/sysmacros.h @@ -0,0 +1,265 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSMACROS_H +#define _SPL_SYSMACROS_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL +#define _KERNEL __KERNEL__ +#endif + +#define FALSE 0 +#define TRUE 1 + +#if 0 +#define INT8_MAX (127) +#define INT8_MIN (-128) +#define UINT8_MAX (255) +#define UINT8_MIN (0) + +#define INT16_MAX (32767) +#define INT16_MIN (-32768) +#define UINT16_MAX (65535) +#define UINT16_MIN (0) + +#define INT32_MAX INT_MAX +#define INT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX +#define UINT32_MIN UINT_MIN + +#define INT64_MAX LLONG_MAX +#define INT64_MIN LLONG_MIN +#define UINT64_MAX ULLONG_MAX +#define UINT64_MIN ULLONG_MIN + +#define NBBY 8 +#define MAXBSIZE 8192 +#endif + +#define MAXMSGLEN 256 +#define MAXNAMELEN 256 +#define MAXPATHLEN PATH_MAX +#define MAXOFFSET_T LLONG_MAX +#define DEV_BSIZE 512 +#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ + +#define proc_pageout NULL +#define curproc (struct proc *)current_proc() + +extern int cpu_number(void); +#define CPU_SEQID (cpu_number()) +#define is_system_labeled() 0 + +extern unsigned int max_ncpus; +#define boot_ncpus max_ncpus + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +/* + * 0..MAX_PRIO-1: Process priority + * 0..MAX_RT_PRIO-1: RT priority tasks + * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks + * + * Treat shim tasks as SCHED_NORMAL tasks + */ + +/* + * In OSX, the kernel thread priorities start at 81 and goes to + * 95 MAXPRI_KERNEL. BASEPRI_REALTIME starts from 96. Since + * swap priority is at 92. Most ZFS priorities should probably + * stay below this, but kmem_reap needs to be higher. 
+ */ +#define minclsyspri 81 /* BASEPRI_KERNEL */ +#define defclsyspri 81 /* BASEPRI_KERNEL */ +#define maxclsyspri 95 + + +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) + +/* + * Missing macros + */ +#define PAGESIZE PAGE_SIZE + +/* from Solaris sys/byteorder.h */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + + +/* Dtrace probes do not exist in the linux kernel */ +#ifdef DTRACE_PROBE +#undef DTRACE_PROBE +#endif /* DTRACE_PROBE */ +#define DTRACE_PROBE(a) ((void)0) + +#ifdef DTRACE_PROBE1 +#undef DTRACE_PROBE1 +#endif /* DTRACE_PROBE1 */ +#define DTRACE_PROBE1(a, b, c) ((void)0) + +#ifdef DTRACE_PROBE2 +#undef DTRACE_PROBE2 +#endif /* DTRACE_PROBE2 */ +#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) + +#ifdef DTRACE_PROBE3 +#undef DTRACE_PROBE3 +#endif /* DTRACE_PROBE3 */ +#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) + +#ifdef DTRACE_PROBE4 +#undef DTRACE_PROBE4 +#endif /* DTRACE_PROBE4 */ +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) + +/* Missing globals */ +extern char spl_version[32]; +extern unsigned long spl_hostid; +extern char hw_serial[11]; + +/* Missing misc functions */ +extern uint32_t zone_get_hostid(void *zone); +extern void spl_setup(void); +extern void spl_cleanup(void); + +#define makedevice(maj, min) makedev(maj, min) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. 
For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +/* + * P2* Macros from Illumos + */ + +/* + * return x rounded up to the next phase (offset) within align. + * phase should be < align. + * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase) + * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase) + */ +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) + +/* + * Return TRUE if they have the same highest bit set. + * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000) + * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000) + */ +#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y))) + +/* + * End Illumos copy-fest + */ + +/* avoid any possibility of clashing with version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) +/* + * Use the correct builtin mechanism. The Traditional macro is + * not safe on this platform. + */ +#define offsetof(s, m) __builtin_offsetof(s, m) +#endif + +#define SET_ERROR(X) (X) + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_SYSMACROS_H */ diff --git a/include/os/macos/spl/sys/systeminfo.h b/include/os/macos/spl/sys/systeminfo.h new file mode 100644 index 0000000000..d1c15744ec --- /dev/null +++ b/include/os/macos/spl/sys/systeminfo.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
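A few worked values for the base alignment macros (align must be a power of two); these follow mechanically from the definitions above:

/*
 *   P2ALIGN(0x1234, 0x100)   == 0x1200	(round down)
 *   P2ROUNDUP(0x1234, 0x100) == 0x1300	(round up)
 *   P2PHASE(0x1234, 0x100)   == 0x34	(offset within the alignment)
 *   ISP2(0x1000) == 1, ISP2(0x1200) == 0
 */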
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSTEMINFO_H +#define _SPL_SYSTEMINFO_H + +#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ + +const char *spl_panicstr(void); +int spl_system_inshutdown(void); + + +#endif /* SPL_SYSTEMINFO_H */ diff --git a/include/os/macos/spl/sys/systm.h b/include/os/macos/spl/sys/systm.h new file mode 100644 index 0000000000..54b75e29b5 --- /dev/null +++ b/include/os/macos/spl/sys/systm.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSTM_H +#define _SPL_SYSTM_H + +#include_next +#include + +typedef uintptr_t pc_t; + +#endif /* SPL_SYSTM_H */ diff --git a/include/os/macos/spl/sys/taskq.h b/include/os/macos/spl/sys/taskq.h new file mode 100644 index 0000000000..9dd6776108 --- /dev/null +++ b/include/os/macos/spl/sys/taskq.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * Copyright (C) 2015 Jorgen Lundman + */ + +#ifndef _SYS_TASKQ_H +#define _SYS_TASKQ_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +typedef struct taskq taskq_t; +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +struct proc; +struct taskq_ent; + +/* New ZFS expects to find taskq_ent_t as well */ +#include + +/* + * Public flags for taskq_create(): bit range 0-15 + */ +#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. + */ +#define TQ_SLEEP 0x00 /* Can block for memory */ +#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ + +#define TASKQID_INVALID ((taskqid_t)0) + +#ifdef _KERNEL + +extern taskq_t *system_taskq; +/* Global dynamic task queue for long delay */ +extern taskq_t *system_delay_taskq; + +extern int spl_taskq_init(void); +extern void spl_taskq_fini(void); +extern void taskq_mp_init(void); + +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, + int, uint_t); +extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + proc_t *, uint_t); +extern taskq_t *taskq_create_sysdc(const char *, int, int, int, + proc_t *, uint_t, uint_t); +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern void nulltask(void *); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait(taskq_t *); +#define HAVE_TASKQ_WAIT_ID +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_suspend(taskq_t *); +extern int taskq_suspended(taskq_t *); +extern void taskq_resume(taskq_t *); +extern int taskq_member(taskq_t *, kthread_t *); +extern boolean_t taskq_empty(taskq_t *tq); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern taskq_t *taskq_of_curthread(void); +extern int taskq_empty_ent(struct taskq_ent *); + +#define taskq_wait_outstanding(T, D) taskq_wait((T)) + +extern void system_taskq_init(void); +extern void system_taskq_fini(void); + +#endif /* _KERNEL */ + +extern int EMPTY_TASKQ(taskq_t *tq); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_H */ diff --git a/include/os/macos/spl/sys/taskq_impl.h b/include/os/macos/spl/sys/taskq_impl.h new file mode 100644 index 0000000000..60e86b3673 --- /dev/null +++ b/include/os/macos/spl/sys/taskq_impl.h @@ -0,0 +1,181 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
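A sketch of the usual create/dispatch/wait/destroy flow against the interface declared above; the thread count, alloc limits, and my_ names are illustrative:

static void
my_task(void *arg)
{
	/* ... do one unit of work ... */
}

static void
my_run_tasks(void)
{
	taskq_t *tq;
	taskqid_t id;

	tq = taskq_create("my_taskq", 4, minclsyspri, 4, 16,
	    TASKQ_PREPOPULATE);

	id = taskq_dispatch(tq, my_task, NULL, TQ_SLEEP);
	if (id == TASKQID_INVALID)
		my_task(NULL);		/* dispatch failed; run inline */

	taskq_wait(tq);
	taskq_destroy(tq);
}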
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright (C) 2015 Jorgen Lundman + */ + + +#ifndef _SYS_TASKQ_IMPL_H +#define _SYS_TASKQ_IMPL_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct taskq_bucket taskq_bucket_t; + +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + union { + taskq_bucket_t *tqent_bucket; + uintptr_t tqent_flags; + } tqent_un; + kthread_t *tqent_thread; + kcondvar_t tqent_cv; +#ifdef __APPLE__ + /* Used to simulate TS_STOPPED */ + kmutex_t tqent_thread_lock; + kcondvar_t tqent_thread_cv; +#endif +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 + +/* + * Taskq Statistics fields are not protected by any locks. + */ +typedef struct tqstat { + uint_t tqs_hits; + uint_t tqs_misses; + uint_t tqs_overflow; /* no threads to allocate */ + uint_t tqs_tcreates; /* threads created */ + uint_t tqs_tdeaths; /* threads died */ + uint_t tqs_maxthreads; /* max # of alive threads */ + uint_t tqs_nomem; /* # of times there were no memory */ + uint_t tqs_disptcreates; +} tqstat_t; + +/* + * Per-CPU hash bucket manages taskq_bent_t structures using freelist. + */ +struct taskq_bucket { + kmutex_t tqbucket_lock; + taskq_t *tqbucket_taskq; /* Enclosing taskq */ + taskq_ent_t tqbucket_freelist; + uint_t tqbucket_nalloc; /* # of allocated entries */ + uint_t tqbucket_nfree; /* # of free entries */ + kcondvar_t tqbucket_cv; + ushort_t tqbucket_flags; + hrtime_t tqbucket_totaltime; + tqstat_t tqbucket_stat; +}; + +/* + * Bucket flags. + */ +#define TQBUCKET_CLOSE 0x01 +#define TQBUCKET_SUSPEND 0x02 + +#define TASKQ_INTERFACE_FLAGS 0x0000ffff /* defined in */ + +/* + * taskq implementation flags: bit range 16-31 + */ +#define TASKQ_CHANGING 0x00010000 /* nthreads != target */ +#define TASKQ_SUSPENDED 0x00020000 /* taskq is suspended */ +#define TASKQ_NOINSTANCE 0x00040000 /* no instance number */ +#define TASKQ_THREAD_CREATED 0x00080000 /* a thread has been created */ +#define TASKQ_DUTY_CYCLE 0x00100000 /* using the SDC class */ + +struct taskq { + char tq_name[TASKQ_NAMELEN + 1]; + kmutex_t tq_lock; + krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; + kcondvar_t tq_exit_cv; + pri_t tq_pri; /* Scheduling priority */ + uint_t tq_flags; + int tq_active; + int tq_nthreads; + int tq_nthreads_target; + int tq_nthreads_max; + int tq_threads_ncpus_pct; + int tq_nalloc; + int tq_minalloc; + int tq_maxalloc; + kcondvar_t tq_maxalloc_cv; + int tq_maxalloc_wait; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; + int tq_maxsize; + taskq_bucket_t *tq_buckets; /* Per-cpu array of buckets */ + int tq_instance; + uint_t tq_nbuckets; /* # of buckets (2^n) */ + union { + kthread_t *_tq_thread; + kthread_t **_tq_threadlist; + } tq_thr; + + list_node_t tq_cpupct_link; /* linkage for taskq_cpupct_list */ + proc_t *tq_proc; /* process for taskq threads */ + int tq_cpupart; /* cpupart id bound to */ + uint_t tq_DC; /* duty cycle for SDC */ + + /* + * Statistics. 
+ */ + kstat_t *tq_kstat; /* Exported statistics */ + hrtime_t tq_totaltime; /* Time spent processing tasks */ + uint64_t tq_tasks; /* Total # of tasks posted */ + uint64_t tq_executed; /* Total # of tasks executed */ + int tq_maxtasks; /* Max number of tasks in the queue */ + int tq_tcreates; + int tq_tdeaths; +}; + +/* Special form of taskq dispatch that uses preallocated entries. */ +void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); + + +#define tq_thread tq_thr._tq_thread +#define tq_threadlist tq_thr._tq_threadlist + +/* The MAX guarantees we have at least one thread */ +#define TASKQ_THREADS_PCT(ncpus, pct) MAX(((ncpus) * (pct)) / 100, 1) + +/* Extra ZOL / Apple */ +extern void taskq_init_ent(taskq_ent_t *t); +extern taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_IMPL_H */ diff --git a/include/os/macos/spl/sys/thread.h b/include/os/macos/spl/sys/thread.h new file mode 100644 index 0000000000..b07dca09df --- /dev/null +++ b/include/os/macos/spl/sys/thread.h @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_THREAD_H +#define _SPL_THREAD_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * OsX thread type is + * typedef struct thread *thread_t; + * + * Map that to the ZFS thread type: kthread_t + */ +#define kthread thread +#define kthread_t struct kthread + +/* + * Thread interfaces + */ +#define TP_MAGIC 0x53535353 + +#define TS_FREE 0x00 /* Thread at loose ends */ +#define TS_SLEEP 0x01 /* Awaiting an event */ +#define TS_RUN 0x02 /* Runnable, but not yet on a processor */ +#define TS_ONPROC 0x04 /* Thread is being run on a processor */ +#define TS_ZOMB 0x08 /* Thread has died but hasn't been reaped */ +#define TS_STOPPED 0x10 /* Stopped, initial state */ +#define TS_WAIT 0x20 /* Waiting to become runnable */ + + +typedef void (*thread_func_t)(void *); + + +#define curthread ((kthread_t *)current_thread()) /* current thread pointer */ +#define curproj (ttoproj(curthread)) /* current project pointer */ + +#define thread_join(t) VERIFY(0) + +// Drop the p0 argument, not used. 
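To make that argument mapping concrete: illumos-style callers pass a proc_t pointer as the sixth argument, and the thread_create() macro above drops it textually, so it never reaches spl_thread_create(). A small sketch of the usual common-code call site follows; my_thread_func, p0, and minclsyspri are the customary illumos-side names and are assumptions of the sketch.

    /*
     * Argument F (&p0 below) is discarded by the macro expansion, so no
     * per-process thread ownership is needed on XNU.
     */
    static void
    my_thread_func(void *arg)
    {
        /* ... thread body ... */
        thread_exit();              /* maps to spl_thread_exit() */
    }

    static void
    start_example_thread(void *arg)
    {
        (void) thread_create(NULL, 0, my_thread_func, arg, 0, &p0,
            TS_RUN, minclsyspri);
    }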
+ +#ifdef SPL_DEBUG_THREAD + +#define thread_create(A, B, C, D, E, F, G, H) \ + spl_thread_create(A, B, C, D, E, G, __FILE__, __LINE__, H) +extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, + void (*proc)(void *), void *arg, size_t len, /* proc_t *pp, */ int state, + char *, int, pri_t pri); + +#else + +#define thread_create(A, B, C, D, E, F, G, H) \ + spl_thread_create(A, B, C, D, E, G, H) +extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, + void (*proc)(void *), void *arg, size_t len, /* proc_t *pp, */ int state, + pri_t pri); + +#endif + +#define thread_exit spl_thread_exit +extern void spl_thread_exit(void); + +extern kthread_t *spl_current_thread(void); + +#define delay osx_delay +extern void osx_delay(int); + +#define KPREEMPT_SYNC 0 +static inline void kpreempt(int flags) +{ + (void) thread_block(THREAD_CONTINUE_NULL); +} + + + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_THREAD_H */ diff --git a/include/os/macos/spl/sys/time.h b/include/os/macos/spl/sys/time.h new file mode 100644 index 0000000000..c3a33f7e06 --- /dev/null +++ b/include/os/macos/spl/sys/time.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TIME_H +#define _SPL_TIME_H + +#include +#include_next +#include +#include + +#if defined(CONFIG_64BIT) +#define TIME_MAX INT64_MAX +#define TIME_MIN INT64_MIN +#else +#define TIME_MAX INT32_MAX +#define TIME_MIN INT32_MIN +#endif + +#define SEC 1 +#define MILLISEC 1000 +#define MICROSEC 1000000 +#define NANOSEC 1000000000 + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + +/* Already defined in include/linux/time.h */ +#undef CLOCK_THREAD_CPUTIME_ID +#undef CLOCK_REALTIME +#undef CLOCK_MONOTONIC +#undef CLOCK_PROCESS_CPUTIME_ID + +typedef enum clock_type { + __CLOCK_REALTIME0 = 0, /* obsolete; same as CLOCK_REALTIME */ + CLOCK_VIRTUAL = 1, /* thread's user-level CPU clock */ + CLOCK_THREAD_CPUTIME_ID = 2, /* thread's user+system CPU clock */ + CLOCK_REALTIME = 3, /* wall clock */ + CLOCK_MONOTONIC = 4, /* high resolution monotonic clock */ + CLOCK_PROCESS_CPUTIME_ID = 5, /* process's user+system CPU clock */ + CLOCK_HIGHRES = CLOCK_MONOTONIC, /* alternate name */ + CLOCK_PROF = CLOCK_THREAD_CPUTIME_ID, /* alternate name */ +} clock_type_t; + +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX) + +typedef long long hrtime_t; + +extern hrtime_t gethrtime(void); +extern void gethrestime(struct timespec *); +extern time_t gethrestime_sec(void); +extern void hrt2ts(hrtime_t hrt, struct timespec *tsp); + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) 
+#define USEC2NSEC(u) ((hrtime_t)(u) * (NANOSEC / MICROSEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) +#define SEC_TO_TICK(sec) ((sec) * hz) +#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) + +#endif /* _SPL_TIME_H */ diff --git a/include/os/macos/spl/sys/timer.h b/include/os/macos/spl/sys/timer.h new file mode 100644 index 0000000000..ccdf64f5df --- /dev/null +++ b/include/os/macos/spl/sys/timer.h @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TIMER_H +#define _SPL_TIMER_H + +#include + +/* Open Solaris lbolt is in hz */ +static inline uint64_t +zfs_lbolt(void) +{ + struct timeval tv; + uint64_t lbolt_hz; + microuptime(&tv); + lbolt_hz = ((uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10000; + return (lbolt_hz); +} + + +#define lbolt zfs_lbolt() +#define lbolt64 zfs_lbolt() + +#define ddi_get_lbolt() (zfs_lbolt()) +#define ddi_get_lbolt64() (zfs_lbolt()) + +#define typecheck(type, x) \ + ( \ + { type __dummy; \ + typeof(x) __dummy2; \ + (void) (&__dummy == &__dummy2); \ + 1; \ + }) + + + +#define ddi_time_before(a, b) (typecheck(clock_t, a) && \ + typecheck(clock_t, b) && \ + ((a) - (b) < 0)) +#define ddi_time_after(a, b) ddi_time_before(b, a) + +#define ddi_time_before64(a, b) (typecheck(int64_t, a) && \ + typecheck(int64_t, b) && \ + ((a) - (b) < 0)) +#define ddi_time_after64(a, b) ddi_time_before64(b, a) + + + +extern void delay(clock_t ticks); + +#define usleep_range(wakeup, whocares) \ + do { \ + hrtime_t delta = wakeup - gethrtime(); \ + if (delta > 0) { \ + struct timespec ts; \ + ts.tv_sec = delta / NANOSEC; \ + ts.tv_nsec = delta % NANOSEC; \ + (void) msleep(NULL, NULL, PWAIT, "usleep_range", &ts); \ + } \ + } while (0) + + +#endif /* _SPL_TIMER_H */ diff --git a/include/os/macos/spl/sys/trace.h b/include/os/macos/spl/sys/trace.h new file mode 100644 index 0000000000..7b72d3a98d --- /dev/null +++ b/include/os/macos/spl/sys/trace.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
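The tick helpers in the timer header above are easiest to see with a concrete deadline check. Below is a minimal sketch of a wrap-safe timeout loop; condition_met() and the 5-second timeout are illustrative, and hz is assumed to be the kernel tick rate exported elsewhere in the SPL.

    extern boolean_t condition_met(void);   /* illustrative predicate */

    static int
    wait_for_condition(void)
    {
        /* deadline roughly 5 seconds from now, measured in ticks */
        clock_t deadline = (clock_t)ddi_get_lbolt() + SEC_TO_TICK(5);

        while (!condition_met()) {
            /* ddi_time_after() stays correct even if the tick counter wraps */
            if (ddi_time_after((clock_t)ddi_get_lbolt(), deadline))
                return (ETIMEDOUT);
            delay(1);               /* sleep one tick between polls */
        }
        return (0);
    }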
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_TRACE_H +#define _SPL_TRACE_H + + +#endif diff --git a/include/os/macos/spl/sys/tsd.h b/include/os/macos/spl/sys/tsd.h new file mode 100644 index 0000000000..cfc48000a5 --- /dev/null +++ b/include/os/macos/spl/sys/tsd.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + + +#ifndef _SPL_TSD_H +#define _SPL_TSD_H + +#include + +#define TSD_HASH_TABLE_BITS_DEFAULT 9 +#define TSD_KEYS_MAX 32768 +#define DTOR_PID (PID_MAX_LIMIT+1) +#define PID_KEY (TSD_KEYS_MAX+1) + +typedef void (*dtor_func_t)(void *); + +extern int tsd_set(uint_t, void *); +extern void *tsd_get(uint_t); +extern void *tsd_get_by_thread(uint_t, thread_t); +extern void tsd_create(uint_t *, dtor_func_t); +extern void tsd_destroy(uint_t *); +extern void tsd_exit(void); + +uint64_t spl_tsd_size(void); +void tsd_thread_exit(void); +int spl_tsd_init(void); +void spl_tsd_fini(void); + +#endif /* _SPL_TSD_H */ diff --git a/include/os/macos/spl/sys/types.h b/include/os/macos/spl/sys/types.h new file mode 100644 index 0000000000..38faab8367 --- /dev/null +++ b/include/os/macos/spl/sys/types.h @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TYPES_H +#define _SPL_TYPES_H + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#include_next +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Avoid kcdata.h header error */ +extern unsigned long strnlen(const char *, unsigned long); + +#ifdef __cplusplus +} +#endif + +#include + +#include + +#ifndef ULLONG_MAX +#define ULLONG_MAX (~0ULL) +#endif + +#ifndef LLONG_MAX +#define LLONG_MAX ((long long)(~0ULL>>1)) +#endif + +enum { B_FALSE = 0, B_TRUE = 1 }; +typedef short pri_t; +typedef unsigned long ulong_t; +typedef unsigned long long u_longlong_t; +typedef unsigned long long rlim64_t; +typedef unsigned long long loff_t; +typedef long long longlong_t; +typedef unsigned char uchar_t; +typedef unsigned int uint_t; +typedef unsigned short ushort_t; +typedef void *spinlock_t; +typedef long long offset_t; +typedef struct timespec timestruc_t; /* definition per SVr4 */ +typedef struct timespec timespec_t; +typedef ulong_t pgcnt_t; +typedef unsigned int umode_t; +#define NODEV32 (dev32_t)(-1) +typedef uint32_t dev32_t; +typedef uint_t minor_t; +typedef short index_t; + +#include +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FEXCL O_EXCL +#define FNOCTTY O_NOCTTY +#define FNOFOLLOW O_NOFOLLOW + +#ifdef __APPLE__ +#define FSYNC O_SYNC /* file (data+inode) integrity while writing */ +#define FDSYNC O_DSYNC /* file data only integrity while writing */ +#define FOFFMAX 0x0000 /* not used */ +#define FRSYNC 0x0000 /* not used */ +#else +#define FRSYNC 0x8000 /* sync read operations at same level of */ + /* integrity as specified for writes by */ + /* FSYNC and FDSYNC flags */ +#define FOFFMAX 0x2000 /* large file */ +#endif + +#define EXPORT_SYMBOL(X) +#define module_param(X, Y, Z) +#define MODULE_PARM_DESC(X, Y) + +#ifdef __GNUC__ +#define member_type(type, member) __typeof__(((type *)0)->member) +#else +#define member_type(type, member) void +#endif + +#define container_of(ptr, type, member) ((type *) \ + ((char *)(member_type(type, member) *) \ + { ptr } - offsetof(type, member))) + +typedef struct timespec inode_timespec_t; + +#endif /* _SPL_TYPES_H */ diff --git a/include/os/macos/spl/sys/types32.h b/include/os/macos/spl/sys/types32.h new file mode 100644 index 0000000000..799a956b5e --- /dev/null +++ b/include/os/macos/spl/sys/types32.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
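The container_of() definition above recovers the address of an enclosing structure from a pointer to one of its embedded members, without relying on GNU statement expressions. A brief self-contained sketch (the struct names are illustrative):

    struct example_inner {
        int                  dummy;
    };

    struct example_outer {
        uint64_t             id;
        struct example_inner inner;     /* embedded member */
    };

    static struct example_outer *
    outer_from_inner(struct example_inner *ip)
    {
        /* steps back by offsetof(struct example_outer, inner) */
        return (container_of(ip, struct example_outer, inner));
    }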
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* SPL_TYPE32_H */ diff --git a/include/os/macos/spl/sys/uio.h b/include/os/macos/spl/sys/uio.h new file mode 100644 index 0000000000..2327de209d --- /dev/null +++ b/include/os/macos/spl/sys/uio.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + + +#ifndef _SPL_UIO_H +#define _SPL_UIO_H + + +// OSX defines "uio_t" as "struct uio *" +// ZFS defines "uio_t" as "struct uio" +#undef uio_t +#include_next +#define uio_t struct uio + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct iovec iovec_t; + +typedef enum uio_seg uio_seg_t; +typedef enum uio_rw uio_rw_t; + +typedef struct aio_req { + uio_t *aio_uio; + void *aio_private; +} aio_req_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t *xu_uio; + enum xuio_type xu_type; + union { + struct { + uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +#define uio_segflg(U) \ + (uio_isuserspace((struct uio *)(U))?UIO_USERSPACE:UIO_SYSSPACE) +#define uio_advance(U, N) uio_update((struct uio *)(U), (N)) + +static inline uint64_t +uio_iovlen(const struct uio *u, unsigned int i) +{ + user_size_t iov_len; + uio_getiov((struct uio *)u, i, NULL, &iov_len); + return (iov_len); +} + +static inline void * +uio_iovbase(const struct uio *u, unsigned int i) +{ + user_addr_t iov_base; + uio_getiov((struct uio *)u, i, &iov_base, NULL); + return ((void *)iov_base); +} + +static inline void +uio_iov_at_index(uio_t *uio, unsigned int idx, void **base, uint64_t *len) +{ + (void) uio_getiov(uio, idx, (user_addr_t *)base, len); +} + +static inline long long +uio_index_at_offset(struct uio *uio, long long off, unsigned int *vec_idx) +{ + uint64_t len; + *vec_idx = 0; + while (*vec_idx < uio_iovcnt(uio) && off >= + (len = uio_iovlen(uio, *vec_idx))) { + off -= len; + (*vec_idx)++; + } + return (off); +} + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +static inline int +uiocopy(const char *p, size_t n, enum uio_rw rw, struct uio *uio, + size_t *cbytes) +{ + int result; + struct uio *nuio = uio_duplicate(uio); + unsigned long long x = uio_resid(uio); + if (!nuio) + return (ENOMEM); + uio_setrw(nuio, rw); + result = uiomove(p, n, nuio); + *cbytes = x-uio_resid(nuio); + uio_free(nuio); + return (result); +} + + +// Apple's uiomove puts the uio_rw in uio_create +#define uiomove(A, B, C, D) uiomove((A), (B), (D)) +#define uioskip(A, B) uio_update((A), (B)) + +extern int uio_prefaultpages(ssize_t, uio_t *); + +#ifdef __cplusplus +} +#endif +#endif /* SPL_UIO_H */ diff --git a/include/os/macos/spl/sys/utsname.h b/include/os/macos/spl/sys/utsname.h new file mode 100644 index 0000000000..b6bcab77bb --- /dev/null +++ b/include/os/macos/spl/sys/utsname.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
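As noted in its comment, uiocopy() above duplicates the XNU uio with uio_duplicate() and advances only the duplicate, so the caller's offset and residual are untouched. A short usage sketch, with the buffer size chosen purely for illustration:

    static int
    peek_write_data(struct uio *uio)
    {
        char buf[128];
        size_t copied;
        int error;

        /* rw = UIO_WRITE: data flows from the uio into buf */
        error = uiocopy(buf, sizeof (buf), UIO_WRITE, uio, &copied);
        if (error == 0) {
            /* uio_resid(uio) is unchanged; copied <= sizeof (buf) */
        }
        return (error);
    }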
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _SPL_UTSNAME_H +#define _SPL_UTSNAME_H + +#define _SYS_NMLN 257 +struct opensolaris_utsname { + char sysname[_SYS_NMLN]; + char nodename[_SYS_NMLN]; + char release[_SYS_NMLN]; + char version[_SYS_NMLN]; + char machine[_SYS_NMLN]; +}; + +typedef struct opensolaris_utsname utsname_t; + +extern utsname_t *utsname(void); + +#endif /* SPL_UTSNAME_H */ diff --git a/include/os/macos/spl/sys/varargs.h b/include/os/macos/spl/sys/varargs.h new file mode 100644 index 0000000000..b7371a1f2a --- /dev/null +++ b/include/os/macos/spl/sys/varargs.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ +#ifndef _SPL_VARARGS_H +#define _SPL_VARARGS_H + +#define __va_list va_list + +#endif /* SPL_VARARGS_H */ diff --git a/include/os/macos/spl/sys/vfs.h b/include/os/macos/spl/sys/vfs.h new file mode 100644 index 0000000000..aa78dc4347 --- /dev/null +++ b/include/os/macos/spl/sys/vfs.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. 
+ */ + +#ifndef _SPL_ZFS_H +#define _SPL_ZFS_H + +#include +#include + +#define MAXFIDSZ 64 + +typedef struct mount vfs_t; + +#define vn_vfswlock(vp) (0) +#define vn_vfsunlock(vp) +#define VFS_HOLD(vfsp) +#define VFS_RELE(vfsp) + + + +/* + * File identifier. Should be unique per filesystem on a single + * machine. This is typically called by a stateless file server + * in order to generate "file handles". + * + * Do not change the definition of struct fid ... fid_t without + * letting the CacheFS group know about it! They will have to do at + * least two things, in the same change that changes this structure: + * 1. change CFSVERSION in usr/src/uts/common/sys/fs/cachefs_fs.h + * 2. put the old version # in the canupgrade array + * in cachfs_upgrade() in usr/src/cmd/fs.d/cachefs/fsck/fsck.c + * This is necessary because CacheFS stores FIDs on disk. + * + * Many underlying file systems cast a struct fid into other + * file system dependent structures which may require 4 byte alignment. + * Because a fid starts with a short it may not be 4 byte aligned, the + * fid_pad will force the alignment. + */ +#define MAXFIDSZ 64 +#define OLD_MAXFIDSZ 16 + +typedef struct fid { + union { + long fid_pad; + struct { + ushort_t len; /* length of data in bytes */ + char data[MAXFIDSZ]; /* data (variable len) */ + } _fid; + } un; +} fid_t; + + +extern void (*mountroot_post_hook)(void); + +#endif /* SPL_ZFS_H */ diff --git a/include/os/macos/spl/sys/vmem.h b/include/os/macos/spl/sys/vmem.h new file mode 100644 index 0000000000..6ff8a6e146 --- /dev/null +++ b/include/os/macos/spl/sys/vmem.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VMEM_H +#define _SYS_VMEM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +#define KMEM_QUANTUM (PAGESIZE) + + + /* + * Per-allocation flags + */ +#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */ +#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */ +#define VM_PANIC 0x00000002 /* same as KM_PANIC */ +#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */ +#define VM_NORMALPRI 0x00000008 /* same as KM_NORMALPRI */ +#define VM_NODEBUG 0x00000010 /* matches KM_NODE~BUG, */ + /* not implemented on OSX */ +#define VM_NO_VBA 0x00000020 /* OSX: do not descend to the bucket layer */ +#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */ + +#define VM_BESTFIT 0x00000100 +#define VM_FIRSTFIT 0x00000200 +#define VM_NEXTFIT 0x00000400 + +/* + * The following flags are restricted for use only within the kernel. + * VM_MEMLOAD is for use by the HAT to avoid infinite recursion. 
+ * VM_NORELOC is used by the kernel when static VA->PA mappings are required. + */ +#define VM_MEMLOAD 0x00000800 +#define VM_NORELOC 0x00001000 + +/* + * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags + * and forgo reaping if the allocation or attempted import, fails. This + * flag is a segkmem-specific flag, and should not be used by anyone else. + */ +#define VM_ABORT 0x00002000 + +/* + * VM_ENDALLOC requests that large addresses be preferred in allocations. + * Has no effect if VM_NEXTFIT is active. + */ +#define VM_ENDALLOC 0x00004000 + +#define VM_FLAGS 0x0000FFFF + +/* + * Arena creation flags + */ +#define VMC_POPULATOR 0x00010000 +#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */ +#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */ +// VMC_XALLOC 0x00080000 below +// VMC_XALIGN 0x00100000 below +#define VMC_DUMPSAFE 0x00200000 /* can use alternate dump memory */ +// KMC_IDENTIFIER == 0x00400000 +// KMC_PREFILL == 0x00800000 +#define VMC_TIMEFREE 0x01000000 /* keep span creation time, */ + /* newest spans to front */ +#define VMC_OLDFIRST 0x02000000 /* must accompany VMC_TIMEFREE, */ + /* oldest spans to front */ + +/* + * internal use only; the import function uses the vmem_ximport_t interface + * and may increase the request size if it so desires. + * VMC_XALIGN, for use with vmem_xcreate, specifies that + * the address returned by the import function will be + * aligned according to the alignment argument. + */ +#define VMC_XALLOC 0x00080000 +#define VMC_XALIGN 0x00100000 +#define VMC_FLAGS 0xFFFF0000 + +/* + * Public segment types + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +/* + * Implementation-private segment types + */ +#define VMEM_SPAN 0x10 +#define VMEM_ROTOR 0x20 +#define VMEM_WALKER 0x40 + +/* + * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may + * call back into the arena being walked, so vmem_walk() must drop the + * arena lock before each callback. The caveat is that since the arena + * isn't locked, its state can change. Therefore it is up to the callback + * routine to handle cases where the segment isn't of the expected type. + * For example, we use this to walk heap_arena when generating a crash dump; + * see segkmem_dump() for sample usage. + */ +#define VMEM_REENTRANT 0x80000000 + +struct vmem; + +typedef struct vmem vmem_t; +typedef void *(vmem_alloc_t)(vmem_t *, size_t, int); +typedef void (vmem_free_t)(vmem_t *, void *, size_t); + +/* + * Alternate import style; the requested size is passed in a pointer, + * which can be increased by the import function if desired. 
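For orientation, here is a minimal sketch of how the arena interface declared just below is typically used: create a sub-arena that imports spans from a parent, carve allocations out of it with VM_SLEEP, and free them back. The parent arena and arena name are assumptions of the sketch, not defined by this header.

    static vmem_t *
    create_example_arena(vmem_t *parent)    /* 'parent' supplied by the caller */
    {
        return (vmem_create("example_arena",
            NULL, 0,                /* no initial span; import on demand */
            KMEM_QUANTUM,           /* allocation quantum */
            vmem_alloc, vmem_free,  /* import from / release to parent */
            parent,                 /* source arena */
            0,                      /* qcache_max: no quantum caching */
            VM_SLEEP));
    }

    static void
    use_example_arena(vmem_t *arena)
    {
        void *buf = vmem_alloc(arena, 4096, VM_SLEEP);
        /* ... use buf ... */
        vmem_free(arena, buf, 4096);
    }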
+ */ +typedef void *(vmem_ximport_t)(vmem_t *, size_t *, size_t, int); + +#ifdef _KERNEL +extern vmem_t *vmem_init(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *); +extern void vmem_fini(vmem_t *); +extern void vmem_update(void *); +extern int vmem_is_populator(void); +extern size_t vmem_seg_size; +#endif + +extern vmem_t *vmem_create(const char *, void *, size_t, size_t, + vmem_alloc_t *, vmem_free_t *, vmem_t *, size_t, int); +extern vmem_t *vmem_xcreate(const char *, void *, size_t, size_t, + vmem_ximport_t *, vmem_free_t *, vmem_t *, size_t, int); +extern void vmem_destroy(vmem_t *); +extern void *vmem_alloc(vmem_t *, size_t, int); +extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t, + void *, void *, int); +extern void vmem_free(vmem_t *, void *, size_t); +extern void vmem_xfree(vmem_t *, void *, size_t); +extern void *vmem_add(vmem_t *, void *, size_t, int); +extern int vmem_contains(vmem_t *, void *, size_t); +extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), void *); +extern size_t vmem_size(vmem_t *, int); +extern size_t vmem_size_locked(vmem_t *, int); +extern size_t vmem_size_semi_atomic(vmem_t *, int); +extern void vmem_qcache_reap(vmem_t *vmp); +extern int64_t vmem_buckets_size(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_H */ diff --git a/include/os/macos/spl/sys/vmem_impl.h b/include/os/macos/spl/sys/vmem_impl.h new file mode 100644 index 0000000000..233a6b33ae --- /dev/null +++ b/include/os/macos/spl/sys/vmem_impl.h @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1999-2001, 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VMEM_IMPL_H +#define _SYS_VMEM_IMPL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vmem_seg vmem_seg_t; + +#define VMEM_STACK_DEPTH 20 + +struct vmem_seg { + /* + * The first four fields must match vmem_freelist_t exactly. + */ + uintptr_t vs_start; /* start of segment (inclusive) */ + uintptr_t vs_end; /* end of segment (exclusive) */ + vmem_seg_t *vs_knext; /* next of kin (alloc, free, span) */ + vmem_seg_t *vs_kprev; /* prev of kin */ + + vmem_seg_t *vs_anext; /* next in arena */ + vmem_seg_t *vs_aprev; /* prev in arena */ + uint8_t vs_type; /* alloc, free, span */ + uint8_t vs_import; /* non-zero if segment was imported */ + uint8_t vs_depth; /* stack depth if KMF_AUDIT active */ + /* + * if VM_FREESORT is set on the arena, then + * this field is set at span creation time. + */ + hrtime_t vs_span_createtime; + /* + * The following fields are present only when KMF_AUDIT is set. 
+ */ + kthread_t *vs_thread; + hrtime_t vs_timestamp; + pc_t vs_stack[VMEM_STACK_DEPTH]; +}; + +typedef struct vmem_freelist { + uintptr_t vs_start; /* always zero */ + uintptr_t vs_end; /* segment size */ + vmem_seg_t *vs_knext; /* next of kin */ + vmem_seg_t *vs_kprev; /* prev of kin */ +} vmem_freelist_t; + +#define VS_SIZE(vsp) ((vsp)->vs_end - (vsp)->vs_start) + +/* + * Segment hashing + */ +#define VMEM_HASH_INDEX(a, s, q, m) \ + ((((a) + ((a) >> (s)) + ((a) >> ((s) << 1))) >> (q)) & (m)) + +#define VMEM_HASH(vmp, addr) \ + (&(vmp)->vm_hash_table[VMEM_HASH_INDEX(addr, \ + (vmp)->vm_hash_shift, (vmp)->vm_qshift, (vmp)->vm_hash_mask)]) + +#define VMEM_QCACHE_SLABSIZE(max) \ + MAX(1 << highbit(3 * (max)), 64) + +#define VMEM_NAMELEN 30 +#define VMEM_HASH_INITIAL 16 +#define VMEM_NQCACHE_MAX 16 +#define VMEM_FREELISTS (sizeof (void *) * 8) + +typedef struct vmem_kstat { + kstat_named_t vk_mem_inuse; /* memory in use */ + kstat_named_t vk_mem_import; /* memory imported */ + kstat_named_t vk_mem_total; /* total memory in arena */ + kstat_named_t vk_source_id; /* vmem id of vmem source */ + kstat_named_t vk_alloc; /* number of allocations */ + kstat_named_t vk_free; /* number of frees */ + kstat_named_t vk_wait; /* number of allocations that waited */ + kstat_named_t vk_fail; /* number of allocations that failed */ + kstat_named_t vk_lookup; /* hash lookup count */ + kstat_named_t vk_search; /* freelist search count */ + kstat_named_t vk_populate_fail; /* populates that failed */ + kstat_named_t vk_contains; /* vmem_contains() calls */ + kstat_named_t vk_contains_search; /* vmem_contains() search cnt */ + kstat_named_t vk_parent_alloc; /* called the source allocator */ + kstat_named_t vk_parent_free; /* called the source free function */ + kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */ + /* allocator function */ + kstat_named_t vk_excess; /* count of retained excess imports */ +} vmem_kstat_t; + +struct vmem { + char vm_name[VMEM_NAMELEN]; /* arena name */ + kcondvar_t vm_cv; /* cv for blocking allocations */ + kmutex_t vm_lock; /* arena lock */ + uint32_t vm_id; /* vmem id */ + hrtime_t vm_createtime; + uint32_t vm_mtbf; /* induced alloc failure rate */ + int vm_cflags; /* arena creation flags */ + int vm_qshift; /* log2(vm_quantum) */ + size_t vm_quantum; /* vmem quantum */ + size_t vm_qcache_max; /* maximum size to front by kmem */ + size_t vm_min_import; /* smallest amount to import */ + void *(*vm_source_alloc)(vmem_t *, size_t, int); + void (*vm_source_free)(vmem_t *, void *, size_t); + vmem_t *vm_source; /* vmem source for imported memory */ + vmem_t *vm_next; /* next in vmem_list */ + kstat_t *vm_ksp; /* kstat */ + ssize_t vm_nsegfree; /* number of free vmem_seg_t's */ + vmem_seg_t *vm_segfree; /* free vmem_seg_t list */ + vmem_seg_t **vm_hash_table; /* allocated-segment hash table */ + size_t vm_hash_mask; /* hash_size - 1 */ + size_t vm_hash_shift; /* log2(vm_hash_mask + 1) */ + ulong_t vm_freemap; /* bitmap of non-empty freelists */ + vmem_seg_t vm_seg0; /* anchor segment */ + vmem_seg_t vm_rotor; /* rotor for VM_NEXTFIT allocations */ + vmem_seg_t *vm_hash0[VMEM_HASH_INITIAL]; /* initial hash table */ + void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */ + vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */ + vmem_kstat_t vm_kstat; /* kstat data */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_IMPL_H */ diff --git a/include/os/macos/spl/sys/vmsystm.h b/include/os/macos/spl/sys/vmsystm.h new file mode 100644 index 
0000000000..421e26364c --- /dev/null +++ b/include/os/macos/spl/sys/vmsystm.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_VMSYSTM_H +#define _SPL_VMSYSTM_H + +#include + +#define xcopyout copyout + +#endif /* SPL_VMSYSTM_H */ diff --git a/include/os/macos/spl/sys/vnode.h b/include/os/macos/spl/sys/vnode.h new file mode 100644 index 0000000000..37b7e41e75 --- /dev/null +++ b/include/os/macos/spl/sys/vnode.h @@ -0,0 +1,258 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_VNODE_H +#define _SPL_VNODE_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +// Be aware that Apple defines "typedef struct vnode *vnode_t" and +// ZFS uses "typedef struct vnode vnode_t". 
+#undef uio_t +#undef vnode_t +#include_next +#define vnode_t struct vnode +#define uio_t struct uio + + +struct caller_context; +typedef struct caller_context caller_context_t; +typedef int vcexcl_t; + +enum vcexcl { NONEXCL, EXCL }; + +#define B_INVAL 0x01 +#define B_TRUNC 0x02 + +#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ + +#define IS_DEVVP(vp) \ + (vnode_ischr(vp) || vnode_isblk(vp) || vnode_isfifo(vp)) + +enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ +enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ + +#define va_mask va_active +#define va_nodeid va_fileid +#define va_nblocks va_filerev + +/* + * vnode attr translations + */ +#define ATTR_TYPE VNODE_ATTR_va_type +#define ATTR_MODE VNODE_ATTR_va_mode +#define ATTR_ACL VNODE_ATTR_va_acl +#define ATTR_UID VNODE_ATTR_va_uid +#define ATTR_GID VNODE_ATTR_va_gid +#define ATTR_ATIME VNODE_ATTR_va_access_time +#define ATTR_MTIME VNODE_ATTR_va_modify_time +#define ATTR_CTIME VNODE_ATTR_va_change_time +#define ATTR_CRTIME VNODE_ATTR_va_create_time +#define ATTR_SIZE VNODE_ATTR_va_data_size +#define ATTR_NOSET 0 +/* + * OSX uses separate vnop getxattr and setxattr to deal with XATTRs, so + * we never get vop&XVATTR set from VFS. All internal checks for it in + * ZFS is not required. + */ +#define ATTR_XVATTR 0 +#define AT_XVATTR ATTR_XVATTR + +#define va_size va_data_size +#define va_atime va_access_time +#define va_mtime va_modify_time +#define va_ctime va_change_time +#define va_crtime va_create_time +#define va_bytes va_data_size + +typedef struct vnode_attr vattr; +typedef struct vnode_attr vattr_t; + +/* vsa_mask values */ +#define VSA_ACL 0x0001 +#define VSA_ACLCNT 0x0002 +#define VSA_DFACL 0x0004 +#define VSA_DFACLCNT 0x0008 +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ + + +extern struct vnode *vn_alloc(int flag); + +extern int vn_open(char *pnamep, enum uio_seg seg, int filemode, + int createmode, struct vnode **vpp, enum create crwhy, mode_t umask); +extern int vn_openat(char *pnamep, enum uio_seg seg, int filemode, + int createmode, struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp); + +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) +#define vn_pages_remove(vp, fl, op) do { } while (0) + +/* XNU is a vn_rdwr, so we work around it to match arguments */ +/* This should be deprecated, if not now, soon. */ +extern int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, + ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, + rlim64_t ulimit, cred_t *cr, ssize_t *residp); + +#define vn_rdwr(rw, vp, b, l, o, s, flg, li, cr, resid) \ + zfs_vn_rdwr((rw), (vp), (b), (l), (o), (s), (flg), (li), (cr), (resid)) + +/* Other vn_rdwr for zfs_file_t ops */ +struct spl_fileproc; +extern int spl_vn_rdwr(enum uio_rw rw, struct spl_fileproc *, caddr_t base, + ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, + rlim64_t ulimit, cred_t *cr, ssize_t *residp); + +extern int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag); +extern int vn_rename(char *from, char *to, enum uio_seg seg); + +#define LK_RETRY 0 +#define LK_SHARED 0 +#define VN_UNLOCK(vp) +static inline int +vn_lock(struct vnode *vp, int fl) +{ + return (0); +} + +/* + * XNU reserves fileID 1-15, so we remap them high. + * 2 is root-of-the-mount. + * If ID is same as root, return 2. 
Otherwise, if it is 0-15, return + * adjusted, otherwise, return as-is. + * See hfs_format.h: kHFSRootFolderID, kHFSExtentsFileID, ... + */ +#define INO_ROOT 2ULL +#define INO_RESERVED 16ULL /* [0-15] reserved. */ +#define INO_ISRESERVED(ID) ((ID) < (INO_RESERVED)) +/* 0xFFFFFFFFFFFFFFF0 */ +#define INO_MAP ((uint64_t)-INO_RESERVED) /* -16, -15, .., -1 */ + +#define INO_ZFSTOXNU(ID, ROOT) \ + ((ID) == (ROOT)?INO_ROOT:(INO_ISRESERVED(ID)?INO_MAP+(ID):(ID))) + +/* + * This macro relies on *unsigned*. + * If asking for 2, return rootID. If in special range, adjust to + * normal, otherwise, return as-is. + */ +#define INO_XNUTOZFS(ID, ROOT) \ + ((ID) == INO_ROOT)?(ROOT): \ + (INO_ISRESERVED((ID)-INO_MAP))?((ID)-INO_MAP):(ID) + +#define VN_HOLD(vp) vnode_getwithref(vp) +#define VN_RELE(vp) vnode_put(vp) + +void spl_rele_async(void *arg); +void vn_rele_async(struct vnode *vp, void *taskq); + +extern int vnode_iocount(struct vnode *); + +#define VN_RELE_ASYNC(vp, tq) vn_rele_async((vp), (tq)) + +#define vn_exists(vp) +#define vn_is_readonly(vp) vnode_vfsisrdonly(vp) + +#define vnode_pager_setsize(vp, sz) ubc_setsize((vp), (sz)) + +#define VATTR_NULL(v) do { } while (0) + +extern int VOP_CLOSE(struct vnode *vp, int flag, int count, + offset_t off, void *cr, void *); +extern int VOP_FSYNC(struct vnode *vp, int flags, void* unused, void *); +extern int VOP_SPACE(struct vnode *vp, int cmd, struct flock *fl, + int flags, offset_t off, cred_t *cr, void *ctx); + +extern int VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, + void *x3, void *x4); + +#define VOP_UNLOCK(vp, fl) do { } while (0) + +void vfs_mountedfrom(struct mount *vfsp, char *osname); + +#define build_path(A, B, C, D, E, F) spl_build_path(A, B, C, D, E, F) +extern int spl_build_path(struct vnode *vp, char *buff, int buflen, + int *outlen, int flags, vfs_context_t ctx); + +extern struct vnode *rootdir; + +static inline int chklock(struct vnode *vp, int iomode, + unsigned long long offset, ssize_t len, int fmode, void *ct) +{ + return (0); +} + +#define vn_ismntpt(vp) (vnode_mountedhere(vp) != NULL) + +extern errno_t VOP_LOOKUP (struct vnode *, struct vnode **, + struct componentname *, vfs_context_t); +extern errno_t VOP_MKDIR (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + vfs_context_t); +extern errno_t VOP_REMOVE (struct vnode *, struct vnode *, + struct componentname *, int, vfs_context_t); +extern errno_t VOP_SYMLINK (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + char *, vfs_context_t); + +void spl_vnode_fini(void); +int spl_vnode_init(void); + + +extern int spl_vfs_root(mount_t mount, struct vnode **vp); +#define VFS_ROOT(V, L, VP) spl_vfs_root((V), (VP)) + +extern void cache_purgevfs(mount_t mp); + +vfs_context_t vfs_context_kernel(void); +vfs_context_t spl_vfs_context_kernel(void); +extern int spl_vnode_notify(struct vnode *vp, uint32_t type, + struct vnode_attr *vap); +extern int spl_vfs_get_notify_attributes(struct vnode_attr *vap); +extern void spl_hijack_mountroot(void *func); +extern void spl_setrootvnode(struct vnode *vp); + +struct vnode *getrootdir(void); +void spl_vfs_start(void); + +#endif /* SPL_VNODE_H */ diff --git a/include/os/macos/spl/sys/zmod.h b/include/os/macos/spl/sys/zmod.h new file mode 100644 index 0000000000..6965c91f3d --- /dev/null +++ b/include/os/macos/spl/sys/zmod.h @@ -0,0 +1,122 @@ +/* + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 
Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly + * Mark Adler + */ + +#ifndef _SPL_ZMOD_H +#define _SPL_ZMOD_H + + +#include +#include +#include + +struct _zmemheader { + uint64_t length; + char data[0]; +}; + +static inline void * +zfs_zalloc(void* opaque, uInt items, uInt size) +{ + struct _zmemheader *hdr; + size_t alloc_size = (items * size) + sizeof (uint64_t); + hdr = kmem_zalloc(alloc_size, KM_SLEEP); + hdr->length = alloc_size; + return (&hdr->data); +} + +static inline void +zfs_zfree(void *opaque, void *addr) +{ + struct _zmemheader *hdr; + hdr = addr; + hdr--; + kmem_free(hdr, hdr->length); +} + +/* + * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store + * the expected decompressed data size externally so it can be passed in. + * The resulting decompressed size is then returned through dstlen. This + * function return Z_OK on success, or another error code on failure. + */ +static inline int + z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + zs.zalloc = zfs_zalloc; + zs.zfree = zfs_zfree; + if ((err = inflateInit(&zs)) != Z_OK) + return (err); + if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) inflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *dstlen = zs.total_out; + return (inflateEnd(&zs)); +} + +static inline int +z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, + int level) +{ + z_stream zs; + int err; + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + zs.zalloc = zfs_zalloc; + zs.zfree = zfs_zfree; + if ((err = deflateInit(&zs, level)) != Z_OK) + return (err); + if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) deflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *dstlen = zs.total_out; + return (deflateEnd(&zs)); +} + +static inline int +z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + return (z_compress_level(dst, dstlen, src, srclen, + Z_DEFAULT_COMPRESSION)); +} + + +int spl_zlib_init(void); +void spl_zlib_fini(void); + +#endif /* SPL_ZMOD_H */ diff --git a/include/os/macos/spl/sys/zone.h b/include/os/macos/spl/sys/zone.h new file mode 100644 index 0000000000..fdd4c2754f --- /dev/null +++ b/include/os/macos/spl/sys/zone.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
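As the comment above stresses, z_uncompress() relies on the caller remembering the expected decompressed size out-of-band. A round-trip sketch using the wrappers above; the buffer sizes are illustrative and the sketch assumes srclen fits in the restore buffer.

    static int
    compress_roundtrip(const void *src, size_t srclen)
    {
        char packed[8192], restored[4096];
        size_t packedlen = sizeof (packed);
        size_t restoredlen = srclen;        /* caller-remembered original size */
        int err;

        err = z_compress(packed, &packedlen, src, srclen);
        if (err != Z_OK)
            return (err);

        err = z_uncompress(restored, &restoredlen, packed, packedlen);
        if (err != Z_OK)
            return (err);

        /* restoredlen now holds the actual decompressed byte count */
        return (0);
    }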
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_ZONE_H +#define _SPL_ZONE_H + +#include + +#define GLOBAL_ZONEID 0 + +#define zone_dataset_visible(x, y) (1) +#define INGLOBALZONE(z) (1) +#define crgetzoneid(x) (GLOBAL_ZONEID) + +#endif /* SPL_ZONE_H */ diff --git a/include/os/macos/zfs/Makefile.am b/include/os/macos/zfs/Makefile.am new file mode 100644 index 0000000000..081839c48c --- /dev/null +++ b/include/os/macos/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/macos/zfs/sys/Makefile.am b/include/os/macos/zfs/sys/Makefile.am new file mode 100644 index 0000000000..9779de08e0 --- /dev/null +++ b/include/os/macos/zfs/sys/Makefile.am @@ -0,0 +1,10 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/macos/zfs/sys/kstat_osx.h \ + $(top_srcdir)/include/os/macos/spl/sys/ldi_buf.h \ + $(top_srcdir)/include/os/macos/spl/sys/ldi_impl_osx.h \ + $(top_srcdir)/include/os/macos/spl/sys/ldi_osx.h \ + $(top_srcdir)/include/os/macos/spl/sys/trace_zfs.h \ + $(top_srcdir)/include/os/macos/spl/sys/vdev_disk_os.h \ + $(top_srcdir)/include/os/macos/spl/sys/zfs_ioctl_compat.h \ + $(top_srcdir)/include/os/macos/spl/sys/zfs_vfsops.h \ + $(top_srcdir)/include/os/macos/spl/sys/zfs_znode_impl.h diff --git a/include/os/macos/zfs/sys/ZFSDataset.h b/include/os/macos/zfs/sys/ZFSDataset.h new file mode 100644 index 0000000000..06fa8fdfb0 --- /dev/null +++ b/include/os/macos/zfs/sys/ZFSDataset.h @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. 
+ */ + +#ifndef ZFSDATASET_H_INCLUDED +#define ZFSDATASET_H_INCLUDED + +#ifdef __cplusplus + +#include +#include + +#ifdef super +#undef super +#endif +#define super IOMedia + +// #define kZFSContentHint "6A898CC3-1DD2-11B2-99A6-080020736631" +#define kZFSContentHint "ZFS_Dataset" + +#define kZFSIOMediaPrefix "ZFS " +#define kZFSIOMediaSuffix " Media" +#define kZFSDatasetNameKey "ZFS Dataset" +#define kZFSDatasetClassKey "ZFSDataset" + +class ZFSDataset : public IOMedia +{ + OSDeclareDefaultStructors(ZFSDataset) +public: +#if 0 + /* XXX Only for debug tracing */ + virtual bool open(IOService *client, + IOOptionBits options, IOStorageAccess access = 0); + virtual bool isOpen(const IOService *forClient = 0) const; + virtual void close(IOService *client, + IOOptionBits options); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *access); + virtual bool handleIsOpen(const IOService *client) const; + virtual void handleClose(IOService *client, + IOOptionBits options); + + virtual bool attach(IOService *provider); + virtual void detach(IOService *provider); + + virtual bool start(IOService *provider); + virtual void stop(IOService *provider); +#endif + + virtual bool init(UInt64 base, UInt64 size, + UInt64 preferredBlockSize, + IOMediaAttributeMask attributes, + bool isWhole, bool isWritable, + const char *contentHint = 0, + OSDictionary *properties = 0); + virtual void free(); + + static ZFSDataset * withDatasetNameAndSize(const char *name, + uint64_t size); + + virtual void read(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + virtual void write(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + virtual IOReturn synchronize(IOService *client, + UInt64 byteStart, UInt64 byteCount, + IOStorageSynchronizeOptions options = 0); +#else + virtual IOReturn synchronizeCache(IOService *client); +#endif + + virtual IOReturn unmap(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options = 0); +#else + UInt32 options = 0); +#endif + + virtual bool lockPhysicalExtents(IOService *client); + virtual IOStorage *copyPhysicalExtent(IOService *client, + UInt64 *byteStart, UInt64 *byteCount); + virtual void unlockPhysicalExtents(IOService *client); + +#if defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) + virtual IOReturn setPriority(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, + IOStoragePriority priority); +#endif + + virtual UInt64 getPreferredBlockSize() const; + virtual UInt64 getSize() const; + virtual UInt64 getBase() const; + + virtual bool isEjectable() const; + virtual bool isFormatted() const; + virtual bool isWhole() const; + virtual bool isWritable() const; + + virtual const char *getContent() const; + virtual const char *getContentHint() const; + virtual IOMediaAttributeMask getAttributes() const; + +protected: +private: + bool setDatasetName(const char *); +}; + +#endif /* __cplusplus */ + +#endif /* ZFSDATASET_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/ZFSDatasetProxy.h b/include/os/macos/zfs/sys/ZFSDatasetProxy.h new file mode 100644 index 0000000000..e220cdcf9a --- 
/dev/null +++ b/include/os/macos/zfs/sys/ZFSDatasetProxy.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFSDATASETPROXY_H_INCLUDED +#define ZFSDATASETPROXY_H_INCLUDED + +#include + +class ZFSDatasetProxy : public IOBlockStorageDevice +{ + OSDeclareDefaultStructors(ZFSDatasetProxy); +public: + + virtual void free(void); + virtual bool init(OSDictionary *properties); + virtual bool start(IOService *provider); + + /* IOBlockStorageDevice */ + virtual IOReturn doSynchronizeCache(void); + virtual IOReturn doAsyncReadWrite(IOMemoryDescriptor *, + UInt64, UInt64, IOStorageAttributes *, + IOStorageCompletion *); + virtual UInt32 doGetFormatCapacities(UInt64 *, + UInt32) const; + virtual IOReturn doFormatMedia(UInt64 byteCapacity); + virtual IOReturn doEjectMedia(); + virtual char *getVendorString(); + virtual char *getProductString(); + virtual char *getRevisionString(); + virtual char *getAdditionalDeviceInfoString(); + virtual IOReturn reportWriteProtection(bool *); + virtual IOReturn reportRemovability(bool *); + virtual IOReturn reportMediaState(bool *, bool *); + virtual IOReturn reportBlockSize(UInt64 *); + virtual IOReturn reportEjectability(bool *); + virtual IOReturn reportMaxValidBlock(UInt64 *); + + virtual IOReturn setWriteCacheState(bool enabled); + virtual IOReturn getWriteCacheState(bool *enabled); +#if 0 + virtual void read(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); + virtual void write(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); +#endif + +protected: +private: + /* These are declared class static to share across instances */ + const char *vendorString; + const char *revisionString; + const char *infoString; + /* These are per-instance */ + const char *productString; + uint64_t _pool_bcount; + bool isReadOnly; +}; + +#endif /* ZFSDATASETPROXY_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/ZFSDatasetScheme.h b/include/os/macos/zfs/sys/ZFSDatasetScheme.h new file mode 100644 index 0000000000..eaa8bb368d --- /dev/null +++ b/include/os/macos/zfs/sys/ZFSDatasetScheme.h @@ -0,0 +1,126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFSDATASETSCHEME_H_INCLUDED +#define ZFSDATASETSCHEME_H_INCLUDED + +#define kZFSDatasetSchemeClass "ZFSDatasetScheme" + +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +int zfs_osx_proxy_get_osname(const char *bsdname, + char *osname, int len); +int zfs_osx_proxy_get_bsdname(const char *osname, + char *bsdname, int len); + + +void zfs_osx_proxy_remove(const char *osname); +int zfs_osx_proxy_create(const char *osname); + +#ifdef __cplusplus +} /* extern "C" */ + +/* Not C external */ +ZFSDataset * zfs_osx_proxy_get(const char *osname); + +class ZFSDatasetScheme : public IOPartitionScheme +{ + OSDeclareDefaultStructors(ZFSDatasetScheme); +public: + + virtual void free(void); + virtual bool init(OSDictionary *properties); + virtual bool start(IOService *provider); + virtual IOService *probe(IOService *provider, SInt32 *score); + + bool addDataset(const char *osname); + bool removeDataset(const char *osname, bool force); + + /* Compatibility shims */ + virtual void read(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + + virtual void write(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + virtual IOReturn synchronize(IOService *client, + UInt64 byteStart, + UInt64 byteCount, + IOStorageSynchronizeOptions options = 0); +#else + virtual IOReturn synchronizeCache(IOService *client); +#endif + + virtual IOReturn unmap(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options = 0); +#else + UInt32 options = 0); +#endif + + virtual bool lockPhysicalExtents(IOService *client); + + virtual IOStorage *copyPhysicalExtent(IOService *client, + UInt64 * byteStart, + UInt64 * byteCount); + + virtual void unlockPhysicalExtents(IOService *client); + +#if defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) + virtual IOReturn setPriority(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, + IOStoragePriority priority); +#endif + +protected: +private: + OSSet *_datasets; + OSOrderedSet *_holes; + uint64_t _max_id; + + uint32_t getNextPartitionID(); + void returnPartitionID(uint32_t part_id); +}; + +#endif /* __cplusplus */ +#endif /* ZFSDATASETSCHEME_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/ZFSPool.h b/include/os/macos/zfs/sys/ZFSPool.h new file mode 100644 index 0000000000..56a190a0c6 --- /dev/null +++ b/include/os/macos/zfs/sys/ZFSPool.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFSPOOL_H_INCLUDED +#define ZFSPOOL_H_INCLUDED + +#ifdef __cplusplus +#include + +#pragma mark - ZFSPool + +#define kZFSPoolNameKey "ZFS Pool Name" +#define kZFSPoolSizeKey "ZFS Pool Size" +#define kZFSPoolGUIDKey "ZFS Pool GUID" +#define kZFSPoolReadOnlyKey "ZFS Pool Read-Only" + +typedef struct spa spa_t; + +class ZFSPool : public IOService { + OSDeclareDefaultStructors(ZFSPool); + +protected: +#if 0 + /* XXX Only for debug tracing */ + virtual bool open(IOService *client, + IOOptionBits options, void *arg = 0); + virtual bool isOpen(const IOService *forClient = 0) const; + virtual void close(IOService *client, + IOOptionBits options); +#endif + + bool setPoolName(const char *name); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *arg); + virtual bool handleIsOpen(const IOService *client) const; + virtual void handleClose(IOService *client, + IOOptionBits options); + + virtual bool init(OSDictionary *properties, spa_t *spa); + virtual void free(); + +#if 0 + /* IOBlockStorageDevice */ + virtual IOReturn doSynchronizeCache(void); + virtual IOReturn doAsyncReadWrite(IOMemoryDescriptor *, + UInt64, UInt64, IOStorageAttributes *, + IOStorageCompletion *); + virtual UInt32 doGetFormatCapacities(UInt64 *, + UInt32) const; + virtual IOReturn doFormatMedia(UInt64 byteCapacity); + virtual IOReturn doEjectMedia(); + virtual char *getVendorString(); + virtual char *getProductString(); + virtual char *getRevisionString(); + virtual char *getAdditionalDeviceInfoString(); + virtual IOReturn reportWriteProtection(bool *); + virtual IOReturn reportRemovability(bool *); + virtual IOReturn reportMediaState(bool *, bool *); + virtual IOReturn reportBlockSize(UInt64 *); + virtual IOReturn reportEjectability(bool *); + virtual IOReturn reportMaxValidBlock(UInt64 *); + +public: + virtual void read(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); + virtual void write(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion); +#endif +public: + static ZFSPool * withProviderAndPool(IOService *, spa_t *); + +private: + OSSet *_openClients; + spa_t *_spa; + +#if 0 + /* These are declared class static to share across instances */ + static const char *vendorString; + static const char *revisionString; + static const char *infoString; + /* These are per-instance */ + const char *productString; + bool isReadOnly; +#endif +}; + +/* C++ wrapper, C uses opaque pointer reference */ +typedef struct spa_iokit { + ZFSPool *proxy; +} spa_iokit_t; + +extern "C" { +#endif /* __cplusplus */ + +/* C functions */ +void spa_iokit_pool_proxy_destroy(spa_t *spa); +int spa_iokit_pool_proxy_create(spa_t *spa); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* 
ZFSPOOL_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/finderinfo.h b/include/os/macos/zfs/sys/finderinfo.h new file mode 100644 index 0000000000..ee3b48017b --- /dev/null +++ b/include/os/macos/zfs/sys/finderinfo.h @@ -0,0 +1,36 @@ +#ifndef FINDERINFO_H +#define FINDERINFO_H + + +struct FndrExtendedDirInfo { + u_int32_t document_id; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved3; + u_int32_t write_gen_counter; +} __attribute__((aligned(2), packed)); + +struct FndrExtendedFileInfo { + u_int32_t document_id; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved2; + u_int32_t write_gen_counter; +} __attribute__((aligned(2), packed)); + +/* Finder information */ +struct FndrFileInfo { + u_int32_t fdType; + u_int32_t fdCreator; + u_int16_t fdFlags; + struct { + int16_t v; + int16_t h; + } fdLocation; + int16_t opaque; +} __attribute__((aligned(2), packed)); +typedef struct FndrFileInfo FndrFileInfo; + + + +#endif diff --git a/include/os/macos/zfs/sys/hfs_internal.h b/include/os/macos/zfs/sys/hfs_internal.h new file mode 100644 index 0000000000..db8c76f7a3 --- /dev/null +++ b/include/os/macos/zfs/sys/hfs_internal.h @@ -0,0 +1,183 @@ + +#ifndef HFS_INTERNAL_H +#define HFS_INTERNAL_H + +// BGH - Definitions of HFS vnops that we will need to emulate +// including supporting structures. + +struct hfs_journal_info { + off_t jstart; + off_t jsize; +}; + +struct user32_access_t { + uid_t uid; + short flags; + short num_groups; + int num_files; + user32_addr_t file_ids; + user32_addr_t groups; + user32_addr_t access; +}; + +struct user64_access_t { + uid_t uid; + short flags; + short num_groups; + int num_files; + user64_addr_t file_ids; + user64_addr_t groups; + user64_addr_t access; +}; + +struct user32_ext_access_t { + uint32_t flags; + uint32_t num_files; + uint32_t map_size; + user32_addr_t file_ids; + user32_addr_t bitmap; + user32_addr_t access; + uint32_t num_parents; + user32_addr_t parents; +}; + +struct user64_ext_access_t { + uint32_t flags; + uint32_t num_files; + uint32_t map_size; + user64_addr_t file_ids; + user64_addr_t bitmap; + user64_addr_t access; + uint32_t num_parents; + user64_addr_t parents; +}; + +/* + * HFS specific fcntl()'s + */ +#define HFS_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00001) +#define HFS_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) +#define HFS_GET_LAST_MTIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) +#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) +#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) + +/* HFS FS CONTROL COMMANDS */ + +#define HFSIOC_RESIZE_PROGRESS _IOR('h', 1, u_int32_t) +#define HFS_RESIZE_PROGRESS IOCBASECMD(HFSIOC_RESIZE_PROGRESS) + +#define HFSIOC_RESIZE_VOLUME _IOW('h', 2, u_int64_t) +#define HFS_RESIZE_VOLUME IOCBASECMD(HFSIOC_RESIZE_VOLUME) + +#define HFSIOC_CHANGE_NEXT_ALLOCATION _IOWR('h', 3, u_int32_t) +#define HFS_CHANGE_NEXT_ALLOCATION IOCBASECMD(HFSIOC_CHANGE_NEXT_ALLOCATION) +/* + * Magic value for next allocation to use with fcntl to set next allocation + * to zero and never update it again on new block allocation. 
+ */ +#define HFS_NO_UPDATE_NEXT_ALLOCATION 0xffffFFFF + +#define HFSIOC_GETCREATETIME _IOR('h', 4, time_t) +#define HFS_GETCREATETIME IOCBASECMD(HFSIOC_GETCREATETIME) + +#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) +#define HFS_SETBACKINGSTOREINFO IOCBASECMD(HFSIOC_SETBACKINGSTOREINFO) + +#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) +#define HFS_CLRBACKINGSTOREINFO IOCBASECMD(HFSIOC_CLRBACKINGSTOREINFO) + +#define HFSIOC_BULKACCESS _IOW('h', 9, struct user32_access_t) +#define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS) + +#define HFSIOC_SETACLSTATE _IOW('h', 10, int32_t) +#define HFS_SETACLSTATE IOCBASECMD(HFSIOC_SETACLSTATE) + +#define HFSIOC_PREV_LINK _IOWR('h', 11, u_int32_t) +#define HFS_PREV_LINK IOCBASECMD(HFSIOC_PREV_LINK) + +#define HFSIOC_NEXT_LINK _IOWR('h', 12, u_int32_t) +#define HFS_NEXT_LINK IOCBASECMD(HFSIOC_NEXT_LINK) + +#define HFSIOC_GETPATH _IOWR('h', 13, pathname_t) +#define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH) +#define HFS_GETPATH_VOLUME_RELATIVE 0x1 + +/* This define is deemed secret by Apple */ +#define BUILDPATH_VOLUME_RELATIVE 0x8 + +/* Enable/disable extent-based extended attributes */ +#define HFSIOC_SET_XATTREXTENTS_STATE _IOW('h', 14, u_int32_t) +#define HFS_SET_XATTREXTENTS_STATE IOCBASECMD(HFSIOC_SET_XATTREXTENTS_STATE) + +#define HFSIOC_EXT_BULKACCESS _IOW('h', 15, struct user32_ext_access_t) +#define HFS_EXT_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_EXT_BULKACCESS) + +#define HFSIOC_MARK_BOOT_CORRUPT _IO('h', 16) +#define HFS_MARK_BOOT_CORRUPT IOCBASECMD(HFSIOC_MARK_BOOT_CORRUPT) + +#define HFSIOC_GET_JOURNAL_INFO _IOR('h', 17, struct hfs_journal_info) +#define HFS_FSCTL_GET_JOURNAL_INFO IOCBASECMD(HFSIOC_GET_JOURNAL_INFO) + +#define HFSIOC_SET_VERY_LOW_DISK _IOW('h', 20, u_int32_t) +#define HFS_FSCTL_SET_VERY_LOW_DISK IOCBASECMD(HFSIOC_SET_VERY_LOW_DISK) + +#define HFSIOC_SET_LOW_DISK _IOW('h', 21, u_int32_t) +#define HFS_FSCTL_SET_LOW_DISK IOCBASECMD(HFSIOC_SET_LOW_DISK) + +#define HFSIOC_SET_DESIRED_DISK _IOW('h', 22, u_int32_t) +#define HFS_FSCTL_SET_DESIRED_DISK IOCBASECMD(HFSIOC_SET_DESIRED_DISK) + +#define HFSIOC_SET_ALWAYS_ZEROFILL _IOW('h', 23, int32_t) +#define HFS_SET_ALWAYS_ZEROFILL IOCBASECMD(HFSIOC_SET_ALWAYS_ZEROFILL) + +#define HFSIOC_VOLUME_STATUS _IOR('h', 24, u_int32_t) +#define HFS_VOLUME_STATUS IOCBASECMD(HFSIOC_VOLUME_STATUS) + +/* Disable metadata zone for given volume */ +#define HFSIOC_DISABLE_METAZONE _IO('h', 25) +#define HFS_DISABLE_METAZONE IOCBASECMD(HFSIOC_DISABLE_METAZONE) + +/* Change the next CNID value */ +#define HFSIOC_CHANGE_NEXTCNID _IOWR('h', 26, u_int32_t) +#define HFS_CHANGE_NEXTCNID IOCBASECMD(HFSIOC_CHANGE_NEXTCNID) + +/* Get the low disk space values */ +#define HFSIOC_GET_VERY_LOW_DISK _IOR('h', 27, u_int32_t) +#define HFS_FSCTL_GET_VERY_LOW_DISK IOCBASECMD(HFSIOC_GET_VERY_LOW_DISK) + +#define HFSIOC_GET_LOW_DISK _IOR('h', 28, u_int32_t) +#define HFS_FSCTL_GET_LOW_DISK IOCBASECMD(HFSIOC_GET_LOW_DISK) + +#define HFSIOC_GET_DESIRED_DISK _IOR('h', 29, u_int32_t) +#define HFS_FSCTL_GET_DESIRED_DISK IOCBASECMD(HFSIOC_GET_DESIRED_DISK) + +/* + * revisiond only uses this when something transforms in a way + * the kernel can't track such as "foo.rtf" -> "foo.rtfd" + */ +#define HFSIOC_TRANSFER_DOCUMENT_ID _IOW('h', 32, u_int32_t) +#define HFS_TRANSFER_DOCUMENT_ID IOCBASECMD(HFSIOC_TRANSFER_DOCUMENT_ID) + + +/* fcntl.h */ +#define F_MAKECOMPRESSED 80 + +/* Get file system information for the given volume */ +// #define HFSIOC_GET_FSINFO _IOWR('h', 45, hfs_fsinfo) +// #define 
HFS_GET_FSINFO IOCBASECMD(HFSIOC_GET_FSINFO) + +/* Re-pin hotfile data; argument controls what state gets repinned */ +#define HFSIOC_REPIN_HOTFILE_STATE _IOWR('h', 46, u_int32_t) +#define HFS_REPIN_HOTFILE_STATE IOCBASECMD(HFSIOC_REPIN_HOTFILE_STATE) + +/* Mark a directory or file as worth caching on any underlying "fast" device */ +#define HFSIOC_SET_HOTFILE_STATE _IOWR('h', 47, u_int32_t) +#define HFS_SET_HOTFILE_STATE IOCBASECMD(HFSIOC_SET_HOTFILE_STATE) + +#define APFSIOC_SET_NEAR_LOW_DISK _IOW('J', 17, u_int32_t) +#define APFSIOC_GET_NEAR_LOW_DISK _IOR('J', 18, u_int32_t) + + +// END of definitions + +#endif diff --git a/include/os/macos/zfs/sys/kstat_osx.h b/include/os/macos/zfs/sys/kstat_osx.h new file mode 100644 index 0000000000..1787ad87d5 --- /dev/null +++ b/include/os/macos/zfs/sys/kstat_osx.h @@ -0,0 +1,370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014, 2016 Jorgen Lundman + */ + +#ifndef KSTAT_OSX_INCLUDED +#define KSTAT_OSX_INCLUDED + +typedef struct osx_kstat { + kstat_named_t spa_version; + kstat_named_t zpl_version; + + kstat_named_t darwin_active_vnodes; + kstat_named_t darwin_debug; + kstat_named_t darwin_reclaim_nodes; + kstat_named_t darwin_ignore_negatives; + kstat_named_t darwin_ignore_positives; + kstat_named_t darwin_create_negatives; + kstat_named_t darwin_force_formd_normalized; + kstat_named_t darwin_skip_unlinked_drain; + kstat_named_t darwin_use_system_sync; + + kstat_named_t arc_zfs_arc_max; + kstat_named_t arc_zfs_arc_min; + kstat_named_t arc_zfs_arc_meta_limit; + kstat_named_t arc_zfs_arc_meta_min; + kstat_named_t arc_zfs_arc_grow_retry; + kstat_named_t arc_zfs_arc_shrink_shift; + kstat_named_t arc_zfs_arc_p_min_shift; + kstat_named_t arc_zfs_arc_average_blocksize; + + kstat_named_t l2arc_write_max; + kstat_named_t l2arc_write_boost; + kstat_named_t l2arc_headroom; + kstat_named_t l2arc_headroom_boost; + kstat_named_t l2arc_feed_secs; + kstat_named_t l2arc_feed_min_ms; + + kstat_named_t zfs_vdev_max_active; + kstat_named_t zfs_vdev_sync_read_min_active; + kstat_named_t zfs_vdev_sync_read_max_active; + kstat_named_t zfs_vdev_sync_write_min_active; + kstat_named_t zfs_vdev_sync_write_max_active; + kstat_named_t zfs_vdev_async_read_min_active; + kstat_named_t zfs_vdev_async_read_max_active; + kstat_named_t zfs_vdev_async_write_min_active; + kstat_named_t zfs_vdev_async_write_max_active; + kstat_named_t zfs_vdev_scrub_min_active; + kstat_named_t zfs_vdev_scrub_max_active; + kstat_named_t zfs_vdev_async_write_active_min_dirty_percent; + kstat_named_t zfs_vdev_async_write_active_max_dirty_percent; + kstat_named_t zfs_vdev_aggregation_limit; + kstat_named_t zfs_vdev_read_gap_limit; + kstat_named_t zfs_vdev_write_gap_limit; + + kstat_named_t 
arc_lotsfree_percent; + kstat_named_t zfs_dirty_data_max; + kstat_named_t zfs_delay_max_ns; + kstat_named_t zfs_delay_min_dirty_percent; + kstat_named_t zfs_delay_scale; + kstat_named_t spa_asize_inflation; + kstat_named_t zfs_prefetch_disable; + kstat_named_t zfetch_max_streams; + kstat_named_t zfetch_min_sec_reap; + kstat_named_t zfetch_array_rd_sz; + kstat_named_t zfs_default_bs; + kstat_named_t zfs_default_ibs; + kstat_named_t metaslab_aliquot; + kstat_named_t spa_max_replication_override; + kstat_named_t spa_mode_global; + kstat_named_t zfs_flags; + kstat_named_t zfs_txg_timeout; + kstat_named_t zfs_vdev_cache_max; + kstat_named_t zfs_vdev_cache_size; + kstat_named_t zfs_vdev_cache_bshift; + kstat_named_t vdev_mirror_shift; + kstat_named_t zfs_scrub_limit; + kstat_named_t zfs_no_scrub_io; + kstat_named_t zfs_no_scrub_prefetch; + kstat_named_t fzap_default_block_shift; + kstat_named_t zfs_immediate_write_sz; + kstat_named_t zfs_read_chunk_size; + kstat_named_t zfs_nocacheflush; + kstat_named_t zil_replay_disable; + kstat_named_t metaslab_df_alloc_threshold; + kstat_named_t metaslab_df_free_pct; + kstat_named_t zio_injection_enabled; + kstat_named_t zvol_immediate_write_sz; + + kstat_named_t l2arc_noprefetch; + kstat_named_t l2arc_feed_again; + kstat_named_t l2arc_norw; + + kstat_named_t zfs_recover; + + kstat_named_t zfs_free_bpobj_enabled; + + kstat_named_t zfs_send_corrupt_data; + kstat_named_t zfs_send_queue_length; + kstat_named_t zfs_recv_queue_length; + + kstat_named_t zvol_inhibit_dev; + kstat_named_t zfs_send_set_freerecords_bit; + + kstat_named_t zfs_write_implies_delete_child; + kstat_named_t zfs_send_holes_without_birth_time; + + kstat_named_t dbuf_cache_max_bytes; + + kstat_named_t zfs_vdev_queue_depth_pct; + kstat_named_t zio_dva_throttle_enabled; + + kstat_named_t zfs_lua_max_instrlimit; + kstat_named_t zfs_lua_max_memlimit; + + kstat_named_t zfs_trim_extent_bytes_max; + kstat_named_t zfs_trim_extent_bytes_min; + kstat_named_t zfs_trim_metaslab_skip; + kstat_named_t zfs_trim_txg_batch; + kstat_named_t zfs_trim_queue_limit; + + kstat_named_t zfs_send_unmodified_spill_blocks; + kstat_named_t zfs_special_class_metadata_reserve_pct; + + kstat_named_t zfs_vdev_raidz_impl; + kstat_named_t icp_gcm_impl; + kstat_named_t icp_aes_impl; + kstat_named_t zfs_fletcher_4_impl; + + kstat_named_t zfs_expire_snapshot; + kstat_named_t zfs_admin_snapshot; + kstat_named_t zfs_auto_snapshot; + + kstat_named_t zfs_spa_discard_memory_limit; + kstat_named_t zfs_async_block_max_blocks; + kstat_named_t zfs_initialize_chunk_size; + kstat_named_t zfs_scan_suspend_progress; + kstat_named_t zfs_removal_suspend_progress; + kstat_named_t zfs_livelist_max_entries; + + kstat_named_t zfs_allow_redacted_dataset_mount; + kstat_named_t zfs_checksum_events_per_second; + kstat_named_t zfs_commit_timeout_pct; + kstat_named_t zfs_compressed_arc_enabled; + kstat_named_t zfs_condense_indirect_commit_entry_delay_ms; + kstat_named_t zfs_condense_min_mapping_bytes; + kstat_named_t zfs_deadman_checktime_ms; + kstat_named_t zfs_deadman_failmode; + kstat_named_t zfs_deadman_synctime_ms; + kstat_named_t zfs_deadman_ziotime_ms; + kstat_named_t zfs_disable_ivset_guid_check; + kstat_named_t zfs_initialize_value; + kstat_named_t zfs_keep_log_spacemaps_at_export; + kstat_named_t l2arc_rebuild_blocks_min_l2size; + kstat_named_t l2arc_rebuild_enabled; + kstat_named_t l2arc_trim_ahead; + kstat_named_t zfs_livelist_condense_new_alloc; + kstat_named_t zfs_livelist_condense_sync_cancel; + kstat_named_t 
zfs_livelist_condense_sync_pause; + kstat_named_t zfs_livelist_condense_zthr_cancel; + kstat_named_t zfs_livelist_condense_zthr_pause; + kstat_named_t zfs_livelist_min_percent_shared; + kstat_named_t zfs_max_dataset_nesting; + kstat_named_t zfs_max_missing_tvds; + kstat_named_t metaslab_debug_load; + kstat_named_t metaslab_force_ganging; + kstat_named_t zfs_multihost_fail_intervals; + kstat_named_t zfs_multihost_import_intervals; + kstat_named_t zfs_multihost_interval; + kstat_named_t zfs_override_estimate_recordsize; + kstat_named_t zfs_remove_max_segment; + kstat_named_t zfs_resilver_min_time_ms; + kstat_named_t zfs_scan_legacy; + kstat_named_t zfs_scan_vdev_limit; + kstat_named_t zfs_slow_io_events_per_second; + kstat_named_t spa_load_verify_data; + kstat_named_t spa_load_verify_metadata; + kstat_named_t zfs_unlink_suspend_progress; + kstat_named_t zfs_vdev_min_ms_count; + kstat_named_t vdev_validate_skip; + kstat_named_t zfs_zevent_len_max; + kstat_named_t zio_slow_io_ms; +} osx_kstat_t; + +extern unsigned int debug_vnop_osx_printf; +extern unsigned int zfs_vnop_ignore_negatives; +extern unsigned int zfs_vnop_ignore_positives; +extern unsigned int zfs_vnop_create_negatives; +extern unsigned int zfs_vnop_skip_unlinked_drain; +extern uint64_t zfs_vfs_sync_paranoia; +extern uint64_t vnop_num_vnodes; +extern uint64_t vnop_num_reclaims; + +extern uint64_t zfs_arc_max; +extern uint64_t zfs_arc_min; +extern unsigned long zfs_arc_meta_limit; +extern uint64_t zfs_arc_meta_min; +extern int zfs_arc_grow_retry; +extern int zfs_arc_shrink_shift; +extern int zfs_arc_p_min_shift; +extern int zfs_arc_average_blocksize; + +extern uint64_t l2arc_write_max; +extern uint64_t l2arc_write_boost; +extern uint64_t l2arc_headroom; +extern uint64_t l2arc_headroom_boost; +extern uint64_t l2arc_feed_secs; +extern uint64_t l2arc_feed_min_ms; + +extern uint32_t zfs_vdev_max_active; +extern uint32_t zfs_vdev_sync_read_min_active; +extern uint32_t zfs_vdev_sync_read_max_active; +extern uint32_t zfs_vdev_sync_write_min_active; +extern uint32_t zfs_vdev_sync_write_max_active; +extern uint32_t zfs_vdev_async_read_min_active; +extern uint32_t zfs_vdev_async_read_max_active; +extern uint32_t zfs_vdev_async_write_min_active; +extern uint32_t zfs_vdev_async_write_max_active; +extern uint32_t zfs_vdev_scrub_min_active; +extern uint32_t zfs_vdev_scrub_max_active; +extern int zfs_vdev_async_write_active_min_dirty_percent; +extern int zfs_vdev_async_write_active_max_dirty_percent; +extern int zfs_vdev_aggregation_limit; +extern int zfs_vdev_read_gap_limit; +extern int zfs_vdev_write_gap_limit; + +extern uint_t arc_reduce_dnlc_percent; +extern int arc_lotsfree_percent; +extern hrtime_t zfs_delay_max_ns; +extern int spa_asize_inflation; +extern unsigned int zfetch_max_streams; +extern unsigned int zfetch_min_sec_reap; +extern int zfs_default_bs; +extern int zfs_default_ibs; +extern uint64_t metaslab_aliquot; +extern int zfs_vdev_cache_max; +extern int spa_max_replication_override; +extern int zfs_no_scrub_io; +extern int zfs_no_scrub_prefetch; +extern ssize_t zfs_immediate_write_sz; +extern offset_t zfs_read_chunk_size; +extern uint64_t metaslab_df_alloc_threshold; +extern int metaslab_df_free_pct; +extern ssize_t zvol_immediate_write_sz; + +extern boolean_t l2arc_noprefetch; +extern boolean_t l2arc_feed_again; +extern boolean_t l2arc_norw; + +extern int zfs_top_maxinflight; +extern int zfs_resilver_delay; +extern int zfs_scrub_delay; +extern int zfs_scan_idle; + +extern int64_t zfs_free_bpobj_enabled; + +extern int 
zfs_send_corrupt_data; +extern int zfs_send_queue_length; +extern int zfs_recv_queue_length; + +extern uint64_t zvol_inhibit_dev; +extern uint64_t zfs_send_set_freerecords_bit; + +extern uint64_t zfs_write_implies_delete_child; +extern uint64_t send_holes_without_birth_time; +extern uint64_t zfs_send_holes_without_birth_time; + +extern uint64_t dbuf_cache_max_bytes; + +extern int zfs_vdev_queue_depth_pct; +extern boolean_t zio_dva_throttle_enabled; + +extern uint64_t zfs_lua_max_instrlimit; +extern uint64_t zfs_lua_max_memlimit; + + +extern uint64_t zfs_trim_extent_bytes_max; +extern uint64_t zfs_trim_extent_bytes_min; +extern unsigned int zfs_trim_metaslab_skip; +extern uint64_t zfs_trim_txg_batch; +extern uint64_t zfs_trim_queue_limit; + +extern uint64_t zfs_send_unmodified_spill_blocks; +extern uint64_t zfs_special_class_metadata_reserve_pct; + +extern int zfs_vnop_force_formd_normalized_output; + +extern int zfs_arc_min_prefetch_ms; +extern int zfs_arc_min_prescient_prefetch_ms; + +extern int zfs_expire_snapshot; +extern int zfs_admin_snapshot; +extern int zfs_auto_snapshot; + +extern unsigned long zfs_spa_discard_memory_limit; +extern unsigned long zfs_async_block_max_blocks; +extern unsigned long zfs_initialize_chunk_size; +extern int zfs_scan_suspend_progress; +extern int zfs_removal_suspend_progress; +extern unsigned long zfs_livelist_max_entries; + +extern int zfs_allow_redacted_dataset_mount; +extern unsigned int zfs_checksum_events_per_second; +extern int zfs_commit_timeout_pct; +extern int zfs_compressed_arc_enabled; +extern int zfs_condense_indirect_commit_entry_delay_ms; +extern unsigned long zfs_condense_min_mapping_bytes; +extern unsigned long zfs_deadman_checktime_ms; +extern char *zfs_deadman_failmode; +extern unsigned long zfs_deadman_synctime_ms; +extern unsigned long zfs_deadman_ziotime_ms; +extern int zfs_disable_ivset_guid_check; +extern unsigned long zfs_initialize_value; +extern int zfs_keep_log_spacemaps_at_export; +extern unsigned long l2arc_rebuild_blocks_min_l2size; +extern int l2arc_rebuild_enabled; +extern unsigned long l2arc_trim_ahead; +extern int zfs_livelist_condense_new_alloc; +extern int zfs_livelist_condense_sync_cancel; +extern int zfs_livelist_condense_sync_pause; +extern int zfs_livelist_condense_zthr_cancel; +extern int zfs_livelist_condense_zthr_pause; +extern int zfs_livelist_min_percent_shared; +extern int zfs_max_dataset_nesting; +extern unsigned long zfs_max_missing_tvds; +extern int metaslab_debug_load; +extern unsigned long metaslab_force_ganging; +extern unsigned int zfs_multihost_fail_intervals; +extern unsigned int zfs_multihost_import_intervals; +extern unsigned long zfs_multihost_interval; +extern int zfs_override_estimate_recordsize; +extern int zfs_remove_max_segment; +extern int zfs_resilver_min_time_ms; +extern int zfs_scan_legacy; +extern unsigned long zfs_scan_vdev_limit; +extern unsigned int zfs_slow_io_events_per_second; +extern int spa_load_verify_data; +extern int spa_load_verify_metadata; +extern int zfs_unlink_suspend_progress; +extern int zfs_vdev_min_ms_count; +extern int vdev_validate_skip; +extern int zfs_zevent_len_max; +extern int zio_slow_io_ms; + +int kstat_osx_init(void); +void kstat_osx_fini(void); + +int arc_kstat_update(kstat_t *ksp, int rw); +int arc_kstat_update_osx(kstat_t *ksp, int rw); + +#endif diff --git a/include/os/macos/zfs/sys/ldi_buf.h b/include/os/macos/zfs/sys/ldi_buf.h new file mode 100644 index 0000000000..9b69b0610a --- /dev/null +++ b/include/os/macos/zfs/sys/ldi_buf.h @@ -0,0 +1,77 @@ +/* + * 
CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + * + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +#ifndef _SYS_LDI_BUF_H +#define _SYS_LDI_BUF_H + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* + * Buffer context for LDI strategy + */ +typedef struct ldi_buf { + /* For client use */ + int (*b_iodone)(struct ldi_buf *); /* Callback */ + union { + void *b_addr; /* Passed buffer address */ + } b_un; /* Union to match illumos */ + uint64_t b_bcount; /* Size of IO */ + uint64_t b_bufsize; /* Size of buffer */ + uint64_t b_lblkno; /* logical block number */ + uint64_t b_resid; /* Remaining IO size */ + int b_flags; /* Read or write, options */ + int b_error; /* IO error code */ + uint64_t pad; /* Pad to 64 bytes */ +} ldi_buf_t; /* XXX Currently 64b */ + +ldi_buf_t *ldi_getrbuf(int); +void ldi_freerbuf(ldi_buf_t *); +void ldi_bioinit(ldi_buf_t *); + +/* Define macros to get and release a buffer */ +#define getrbuf(flags) ldi_getrbuf(flags) +#define freerbuf(lbp) ldi_freerbuf(lbp) +#define bioinit(lbp) ldi_bioinit(lbp) +#define geterror(lbp) (lbp->b_error) +#define biowait(lbp) (0) + +#define lbtodb(bytes) \ + (bytes >> DEV_BSHIFT) +#define dbtolb(blkno) \ + (blkno << DEV_BSHIFT) +#define ldbtob(blkno) dbtolb(blkno) + +/* Redefine B_BUSY */ +#define B_BUSY B_PHYS + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _SYS_LDI_BUF_H */ diff --git a/include/os/macos/zfs/sys/ldi_impl_osx.h b/include/os/macos/zfs/sys/ldi_impl_osx.h new file mode 100644 index 0000000000..68c8d121ab --- /dev/null +++ b/include/os/macos/zfs/sys/ldi_impl_osx.h @@ -0,0 +1,226 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +#ifndef _SYS_LDI_IMPL_OSX_H +#define _SYS_LDI_IMPL_OSX_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* + * OS X + */ +#define LDI_TYPE_INVALID 0x0 /* uninitialized */ +#define LDI_TYPE_IOKIT 0x1 /* IOMedia device */ +#define LDI_TYPE_VNODE 0x2 /* vnode (bdev) device */ + +/* + * OS X + */ +#define LDI_STATUS_OFFLINE 0x0 /* device offline (dead-end) */ +#define LDI_STATUS_CLOSED 0x1 /* just initialized or closed */ +#define LDI_STATUS_CLOSING 0x2 /* close in-progress */ +#define LDI_STATUS_OPENING 0x3 /* open in-progress */ +#define LDI_STATUS_ONLINE 0x4 /* device is open and active */ +typedef uint_t ldi_status_t; + +/* + * LDI hash definitions + */ +#define LH_HASH_SZ 32 /* number of hash lists */ + +/* + * Flag for LDI handle's lh_flags field + */ +#define LH_FLAGS_NOTIFY 0x0001 /* invoked in context of a notify */ + + +/* + * LDI handle (OS X) + */ +typedef struct _handle_iokit *handle_iokit_t; +typedef struct _handle_vnode *handle_vnode_t; +typedef struct _handle_notifier *handle_notifier_t; + +struct ldi_handle { + /* protected by ldi_handle_hash_lock */ + list_node_t lh_node; /* list membership */ + uint_t lh_ref; /* active references */ + uint_t lh_flags; /* for notify event */ + + /* protected by handle lh_lock */ + kmutex_t lh_lock; /* internal lock */ + kcondvar_t lh_cv; /* for concurrent open */ + ldi_status_t lh_status; /* Closed, Offline, Online */ + uint_t lh_openref; /* open client count */ + + /* unique/static fields in the handle */ + union ldi_handle_tsd { + handle_iokit_t iokit_tsd; + handle_vnode_t vnode_tsd; + } lh_tsd; /* union */ + handle_notifier_t lh_notifier; /* pointer */ + uint_t lh_type; /* IOKit or vnode */ + uint_t lh_fmode; /* FREAD | FWRITE */ + dev_t lh_dev; /* device number */ + uint_t pad; /* pad to 96 bytes */ +}; /* XXX Currently 96b */ + +/* Shared functions */ +struct ldi_handle *handle_alloc_common(uint_t, dev_t, int); +struct ldi_handle *handle_find(dev_t, int, boolean_t); +struct ldi_handle *handle_add(struct ldi_handle *); +int handle_status_change(struct ldi_handle *, int); +void handle_hold(struct ldi_handle *); +void handle_release(struct ldi_handle *); +ldi_status_t handle_open_start(struct ldi_handle *); +void handle_open_done(struct ldi_handle *, ldi_status_t); + +/* Handle IOKit functions */ +void handle_free_iokit(struct ldi_handle *); +struct ldi_handle *handle_alloc_iokit(dev_t, int); +int handle_register_notifier(struct ldi_handle *); +int handle_close_iokit(struct ldi_handle *); +int handle_free_ioservice(struct ldi_handle *); +int handle_alloc_ioservice(struct ldi_handle *); +int handle_remove_notifier(struct ldi_handle *); +int handle_set_wce_iokit(struct ldi_handle *, int *); +int handle_get_size_iokit(struct ldi_handle *, uint64_t *); +int handle_get_dev_path_iokit(struct ldi_handle *lh, + char *path, int len); +int handle_get_media_info_iokit(struct ldi_handle *, + struct dk_minfo *); +int handle_get_media_info_ext_iokit(struct ldi_handle *, + struct dk_minfo_ext *); +int handle_check_media_iokit(struct ldi_handle *, int *); +int handle_is_solidstate_iokit(struct ldi_handle *, int *); +int handle_sync_iokit(struct ldi_handle *); +int 
buf_strategy_iokit(ldi_buf_t *, struct ldi_handle *); +int ldi_open_media_by_dev(dev_t, int, ldi_handle_t *); +int ldi_open_media_by_path(char *, int, ldi_handle_t *); +int handle_get_bootinfo_iokit(struct ldi_handle *, + struct io_bootinfo *); +int handle_features_iokit(struct ldi_handle *, + uint32_t *); +int handle_unmap_iokit(struct ldi_handle *, + dkioc_free_list_ext_t *); + +/* Handle vnode functions */ +dev_t dev_from_path(char *); +void handle_free_vnode(struct ldi_handle *); +struct ldi_handle *handle_alloc_vnode(dev_t, int); +int handle_close_vnode(struct ldi_handle *); +int handle_get_size_vnode(struct ldi_handle *, uint64_t *); +int handle_get_dev_path_vnode(struct ldi_handle *lh, + char *path, int len); +int handle_get_media_info_vnode(struct ldi_handle *, + struct dk_minfo *); +int handle_get_media_info_ext_vnode(struct ldi_handle *, + struct dk_minfo_ext *); +int handle_check_media_vnode(struct ldi_handle *, int *); +int handle_is_solidstate_vnode(struct ldi_handle *, int *); +int handle_sync_vnode(struct ldi_handle *); +int buf_strategy_vnode(ldi_buf_t *, struct ldi_handle *); +int ldi_open_vnode_by_path(char *, dev_t, int, ldi_handle_t *); +int handle_get_bootinfo_vnode(struct ldi_handle *, + struct io_bootinfo *); +int handle_features_vnode(struct ldi_handle *, + uint32_t *); +int handle_unmap_vnode(struct ldi_handle *, + dkioc_free_list_ext_t *); + +/* + * LDI event information + */ +typedef struct ldi_ev_callback_impl { + struct ldi_handle *lec_lhp; +#ifdef illumos + dev_info_t *lec_dip; +#endif + dev_t lec_dev; + int lec_spec; + int (*lec_notify)(ldi_handle_t, ldi_ev_cookie_t, void *, void *); + void (*lec_finalize)(ldi_handle_t, ldi_ev_cookie_t, int, + void *, void *); + void *lec_arg; + void *lec_cookie; + void *lec_id; + list_node_t lec_list; +} ldi_ev_callback_impl_t; /* XXX Currently 72b */ + +/* + * Members of "struct ldi_ev_callback_list" are protected by their le_lock + * member. The struct is currently only used once, as a file-level global, + * and the locking protocol is currently implemented in ldi_ev_lock() and + * ldi_ev_unlock(). + * + * When delivering events to subscribers, ldi_invoke_notify() and + * ldi_invoke_finalize() will walk the list of callbacks: le_head. It is + * possible that an invoked callback function will need to unregister an + * arbitrary number of callbacks from this list. + * + * To enable ldi_ev_remove_callbacks() to remove elements from the list + * without breaking the walk-in-progress, we store the next element in the + * walk direction on the struct as le_walker_next and le_walker_prev. 
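 + * + * A rough illustration only (not in the original source; it assumes the + * usual SPL list_next()/list_remove() calls and hypothetical locals: 'cbl' + * pointing at this struct and 'lecp' being the entry to drop): a callback + * that unregisters an entry mid-walk would first advance the saved walker + * past it before unlinking it, e.g. + * + *	if (lecp == cbl->le_walker_next) + *		cbl->le_walker_next = list_next(&cbl->le_head, lecp); + *	list_remove(&cbl->le_head, lecp);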
+ */ +struct ldi_ev_callback_list { + kmutex_t le_lock; + kcondvar_t le_cv; + uint64_t le_busy; + void *le_thread; + list_t le_head; + ldi_ev_callback_impl_t *le_walker_next; + ldi_ev_callback_impl_t *le_walker_prev; +}; /* XXX Currently 96b, but only used once */ + +int ldi_invoke_notify(dev_info_t *, dev_t, int, char *, void *); +void ldi_invoke_finalize(dev_info_t *, dev_t, int, char *, int, void *); +int e_ddi_offline_notify(dev_info_t *); +void e_ddi_offline_finalize(dev_info_t *, int); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _SYS_LDI_IMPL_OSX_H */ diff --git a/include/os/macos/zfs/sys/ldi_osx.h b/include/os/macos/zfs/sys/ldi_osx.h new file mode 100644 index 0000000000..2d78017c42 --- /dev/null +++ b/include/os/macos/zfs/sys/ldi_osx.h @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +#ifndef _SYS_LDI_OSX_H +#define _SYS_LDI_OSX_H + +#include + +/* + * OS X - The initialization/destructor functions are available + * for zfs-osx.cpp to call during zfs_init/zfs_fini. + */ +#ifdef __cplusplus +extern "C" { + +int ldi_init(void *); /* passes IOService provider */ +void ldi_fini(); /* teardown */ +#endif /* __cplusplus */ + +/* + * Opaque layered driver data structures. 
+ * vdev_disk and other C callers may use these LDI interfaces + * ldi_ident_t is already defined as typedef void* by spl sunddi.h + */ +typedef struct __ldi_handle *ldi_handle_t; +typedef struct __ldi_callback_id *ldi_callback_id_t; +typedef struct __ldi_ev_cookie *ldi_ev_cookie_t; + +/* + * LDI event interface related + */ +#define LDI_EV_SUCCESS 0 +#define LDI_EV_FAILURE (-1) +#define LDI_EV_NONE (-2) /* no matching callbacks registered */ +#define LDI_EV_OFFLINE "LDI:EVENT:OFFLINE" +#define LDI_EV_DEGRADE "LDI:EVENT:DEGRADE" +#define LDI_EV_DEVICE_REMOVE "LDI:EVENT:DEVICE_REMOVE" + +#define LDI_EV_CB_VERS_1 1 +#define LDI_EV_CB_VERS LDI_EV_CB_VERS_1 + +typedef struct ldi_ev_callback { + uint_t cb_vers; + int (*cb_notify)(ldi_handle_t, ldi_ev_cookie_t, void *, void *); + void (*cb_finalize)(ldi_handle_t, ldi_ev_cookie_t, int, + void *, void *); +} ldi_ev_callback_t; + +/* Structs passed to media_get_info */ +struct dk_minfo { + uint32_t dki_capacity; /* Logical block count */ + uint32_t dki_lbsize; /* Logical block size */ +}; /* (8b) */ + +struct dk_minfo_ext { + uint64_t dki_capacity; /* Logical block count */ + uint32_t dki_lbsize; /* Logical block size */ + uint32_t dki_pbsize; /* Physical block size */ +}; /* (16b) */ + +struct io_bootinfo { + char dev_path[MAXPATHLEN]; /* IODeviceTree path */ + uint64_t dev_size; /* IOMedia device size */ +}; + +/* + * XXX This struct is defined in spl but was unused until now. + * There is a reference in zvol.c zvol_ioctl, commented out. + */ +#if 0 +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; /* XXX Currently 20b */ +#endif + +/* XXX Already defined in spl dkio.h (used elsewhere) */ +#if 0 +#define DKIOCFLUSHWRITECACHE (DKIOC | 34) +#endif + +#define FLUSH_VOLATILE 0x1 +#define DKIOCGMEDIAINFOEXT (DKIOC | 48) + +/* XXX Created this additional ioctl */ +#define DKIOCGETBOOTINFO (DKIOC | 99) + +/* + * LDI Handle manipulation functions + */ +int ldi_open_by_dev(dev_t, int, int, cred_t *, + ldi_handle_t *, __unused ldi_ident_t); +int ldi_open_by_name(char *, int, cred_t *, + ldi_handle_t *, __unused ldi_ident_t); + +int ldi_close(ldi_handle_t, int, cred_t *); + +int ldi_sync(ldi_handle_t); +int ldi_get_size(ldi_handle_t, uint64_t *); +int ldi_ioctl(ldi_handle_t, int, intptr_t, int, cred_t *, int *); +int ldi_strategy(ldi_handle_t, ldi_buf_t *); + +/* + * LDI events related declarations + */ +extern int ldi_ev_get_cookie(ldi_handle_t, char *, ldi_ev_cookie_t *); +extern char *ldi_ev_get_type(ldi_ev_cookie_t); +extern int ldi_ev_register_callbacks(ldi_handle_t, ldi_ev_cookie_t, + ldi_ev_callback_t *, void *, ldi_callback_id_t *); +extern int ldi_ev_remove_callbacks(ldi_callback_id_t); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _SYS_LDI_OSX_H */ diff --git a/include/os/macos/zfs/sys/trace_zfs.h b/include/os/macos/zfs/sys/trace_zfs.h new file mode 100644 index 0000000000..f32ba529ec --- /dev/null +++ b/include/os/macos/zfs/sys/trace_zfs.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#if !defined(_TRACE_ZFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZFS_H + +#include +#include + +/* + * The sys/trace_dbgmsg.h header defines tracepoint events for + * dprintf(), dbgmsg(), and SET_ERROR(). + */ +#define _SYS_TRACE_DBGMSG_INDIRECT +#include +#undef _SYS_TRACE_DBGMSG_INDIRECT + +/* + * Redefine the DTRACE_PROBE* functions to use Linux tracepoints + */ +#undef DTRACE_PROBE1 +#define DTRACE_PROBE1(name, t1, arg1) \ + trace_zfs_##name((arg1)) + +#undef DTRACE_PROBE2 +#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \ + trace_zfs_##name((arg1), (arg2)) + +#undef DTRACE_PROBE3 +#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \ + trace_zfs_##name((arg1), (arg2), (arg3)) + +#undef DTRACE_PROBE4 +#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \ + trace_zfs_##name((arg1), (arg2), (arg3), (arg4)) + +#endif /* _TRACE_ZFS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace +#include + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/os/macos/zfs/sys/vdev_disk_os.h b/include/os/macos/zfs/sys/vdev_disk_os.h new file mode 100644 index 0000000000..79b68c7ee6 --- /dev/null +++ b/include/os/macos/zfs/sys/vdev_disk_os.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _ZFS_VDEV_DISK_OS_H +#define _ZFS_VDEV_DISK_OS_H + +#include + +typedef struct vdev_disk { + ldi_handle_t vd_lh; + list_t vd_ldi_cbs; + boolean_t vd_ldi_offline; +} vdev_disk_t; + +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + ldi_buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; + + +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); + +#endif diff --git a/include/os/macos/zfs/sys/zfs_boot.h b/include/os/macos/zfs/sys/zfs_boot.h new file mode 100644 index 0000000000..cad5c0bdfd --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_boot.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#ifndef ZFS_BOOT_H_INCLUDED +#define ZFS_BOOT_H_INCLUDED + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* Link data vdevs to virtual devices */ +int zfs_boot_update_bootinfo(spa_t *spa); + +int zfs_attach_devicedisk(zfsvfs_t *zfsvfs); +int zfs_detach_devicedisk(zfsvfs_t *zfsvfs); +int zfs_devdisk_get_path(void *, char *, int); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + + + +#ifdef __cplusplus +#include +bool zfs_boot_init(IOService *); +void zfs_boot_fini(); +#endif /* __cplusplus */ + + +#endif /* ZFS_BOOT_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/zfs_context_os.h b/include/os/macos/zfs/sys/zfs_context_os.h new file mode 100644 index 0000000000..097152f26e --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_context_os.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SPL_ZFS_CONTEXT_OS_H +#define _SPL_ZFS_CONTEXT_OS_H + +#include +#include +#include + +#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz)) + +#define KMALLOC_MAX_SIZE MAXPHYS + +#define MNTTYPE_ZFS_SUBTYPE ('Z'<<24|'F'<<16|'S'<<8) + +#ifndef MAX_UPL_TRANSFER +#define MAX_UPL_TRANSFER 256 +#endif + +#define flock64_t struct flock + +struct spa_iokit; +typedef struct spa_iokit spa_iokit_t; + +#define noinline __attribute__((noinline)) + +/* really? 
*/ +#define kpreempt_disable() ((void)0) +#define kpreempt_enable() ((void)0) +#define cond_resched() (void)thread_block(THREAD_CONTINUE_NULL); +#define schedule() (void)thread_block(THREAD_CONTINUE_NULL); + +#define current curthread + +extern boolean_t ml_set_interrupts_enabled(boolean_t); + +/* Make sure kmem and vmem are already included */ +#include +#include + +/* Since Linux code uses vmem_free() and we already have one: */ +#define vmem_free(A, B) zfs_kmem_free((A), (B)) +#define vmem_alloc(A, B) zfs_kmem_alloc((A), (B)) +#define vmem_zalloc(A, B) zfs_kmem_zalloc((A), (B)) + +typedef int fstrans_cookie_t; +#define spl_fstrans_mark() (0) +#define spl_fstrans_unmark(x) (x = 0) + +#ifdef _KERNEL + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +typedef struct { + volatile int counter; +} atomic_t; + +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define smp_rmb() barrier() + +#define READ_ONCE(x) ( \ +{ \ + __typeof(x) __var = ( \ + { \ + barrier(); \ + ACCESS_ONCE(x); \ + }); \ + barrier(); \ + __var; \ + }) + +#define WRITE_ONCE(x, v) do { \ + barrier(); \ + ACCESS_ONCE(x) = (v); \ + barrier(); \ + } while (0) + +/* BEGIN CSTYLED */ +#define hlist_for_each(p, head) \ + for (p = (head)->first; p; p = (p)->next) + +#define hlist_entry(ptr, type, field) container_of(ptr, type, field) +/* END CSTYLED */ + +static inline void +hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + n->next = h->first; + if (h->first != NULL) + h->first->pprev = &n->next; + WRITE_ONCE(h->first, n); + n->pprev = &h->first; +} + +static inline void +hlist_del(struct hlist_node *n) +{ + WRITE_ONCE(*(n->pprev), n->next); + if (n->next != NULL) + n->next->pprev = n->pprev; +} + + +#define HLIST_HEAD_INIT { } +#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT +#define INIT_HLIST_HEAD(head) (head)->first = NULL + +/* BEGIN CSTYLED */ +#define INIT_HLIST_NODE(node) \ + do { \ + (node)->next = NULL; \ + (node)->pprev = NULL; \ + } while (0) + +/* END CSTYLED */ + +static inline int +atomic_read(const atomic_t *v) +{ + return (READ_ONCE(v->counter)); +} + +static inline int +atomic_inc(atomic_t *v) +{ + return (__sync_fetch_and_add(&v->counter, 1) + 1); +} + +static inline int +atomic_dec(atomic_t *v) +{ + return (__sync_fetch_and_add(&v->counter, -1) - 1); +} + +extern void kx_qsort(void *array, size_t nm, size_t member_size, + int (*cmpf)(const void *, const void *)); +#define qsort kx_qsort + +#define strstr kmem_strstr + +void spa_create_os(void *spa); +void spa_export_os(void *spa); +void spa_activate_os(void *spa); +void spa_deactivate_os(void *spa); + +#endif // _KERNEL + +#endif diff --git a/include/os/macos/zfs/sys/zfs_ctldir.h b/include/os/macos/zfs/sys/zfs_ctldir.h new file mode 100644 index 0000000000..4cacf1aefe --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_ctldir.h @@ -0,0 +1,124 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri + * Brian Behlendorf + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#include +#include +#include +#include + +#define ZFS_CTLDIR_NAME ".zfs" +#define ZFS_SNAPDIR_NAME "snapshot" +#define ZFS_SHAREDIR_NAME "shares" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == ZTOZSB(zdp)->z_root && \ + (ZTOZSB(zdp)->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + (ZTOZSB(zdp)->z_show_ctldir)) + +struct path; + +extern int zfs_expire_snapshot; + +/* zfsctl generic functions */ +extern int zfsctl_create(zfsvfs_t *); +extern void zfsctl_destroy(zfsvfs_t *); +extern struct vnode *zfsctl_root(znode_t *); +extern void zfsctl_init(void); +extern void zfsctl_fini(void); +extern boolean_t zfsctl_is_node(struct vnode *ip); +extern boolean_t zfsctl_is_snapdir(struct vnode *ip); +extern int zfsctl_fid(struct vnode *ip, fid_t *fidp); + +/* zfsctl '.zfs' functions */ +extern int zfsctl_root_lookup(struct vnode *dip, char *name, + struct vnode **ipp, int flags, cred_t *cr, int *direntflags, + struct componentname *realpnp); + +/* zfsctl '.zfs/snapshot' functions */ +extern int zfsctl_snapdir_lookup(struct vnode *dip, char *name, + struct vnode **ipp, int flags, cred_t *cr, int *direntflags, + struct componentname *realpnp); +extern int zfsctl_snapdir_rename(struct vnode *sdip, char *sname, + struct vnode *tdip, char *tname, cred_t *cr, int flags); +extern int zfsctl_snapdir_remove(struct vnode *dip, char *name, cred_t *cr, + int flags); +extern int zfsctl_snapdir_mkdir(struct vnode *dip, char *dirname, vattr_t *vap, + struct vnode **ipp, cred_t *cr, int flags); +extern int zfsctl_snapshot_mount(struct vnode *, int flags); +extern int zfsctl_snapshot_unmount(const char *, int flags); +extern int zfsctl_snapshot_unmount_node(struct vnode *, const char *, + int flags); +extern int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, + int delay); +extern int zfsctl_snapdir_vget(struct mount *sb, uint64_t objsetid, + int gen, struct vnode **ipp); + +/* zfsctl '.zfs/shares' functions */ +extern int zfsctl_shares_lookup(struct vnode *dip, char *name, + struct vnode **ipp, int flags, cred_t *cr, int *direntflags, + struct componentname *realpnp); + +extern int zfsctl_vnop_lookup(struct vnop_lookup_args *); +extern int zfsctl_vnop_getattr(struct vnop_getattr_args *); +extern int zfsctl_vnop_readdir(struct vnop_readdir_args *); +extern int zfsctl_vnop_mkdir(struct vnop_mkdir_args *); +extern int zfsctl_vnop_rmdir(struct vnop_rmdir_args *); +extern int zfsctl_vnop_access(struct vnop_access_args *); +extern int zfsctl_vnop_open(struct vnop_open_args *); +extern int zfsctl_vnop_close(struct vnop_close_args *); +extern int zfsctl_vnop_inactive(struct vnop_inactive_args *); +extern int zfsctl_vnop_reclaim(struct vnop_reclaim_args *); + +extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, + const char 
*name); + +extern void zfsctl_mount_signal(char *, boolean_t); + + +/* + * These vnode numbers are reserved for the .zfs control directory. + * It is important that they be no larger than 48 bits because only + * 6 bytes are reserved in the NFS file handle for the object number. + * However, they should be as large as possible to avoid conflicts + * with the objects which are assigned monotonically by the dmu. + */ +#define ZFSCTL_INO_ROOT 0x0000FFFFFFFFFFFFULL +#define ZFSCTL_INO_SHARES 0x0000FFFFFFFFFFFEULL +#define ZFSCTL_INO_SNAPDIR 0x0000FFFFFFFFFFFDULL +#define ZFSCTL_INO_SNAPDIRS 0x0000FFFFFFFFFFFCULL + +#define ZFSCTL_EXPIRE_SNAPSHOT 300 + +#endif /* _ZFS_CTLDIR_H */ diff --git a/include/os/macos/zfs/sys/zfs_dir.h b/include/os/macos/zfs/sys/zfs_dir.h new file mode 100644 index 0000000000..cfee82308a --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_dir.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_REPLAY 0x04 /* we are replaying intent log */ + +extern int zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, + znode_t **zpp, int flag, int *direntflags, + struct componentname *realpnp); + +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, + boolean_t *); + +extern int zfs_dirlook(znode_t *, char *name, znode_t **, int, + int *deflg, struct componentname *rpnp); + +extern void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids); + +extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); + +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, znode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/include/os/macos/zfs/sys/zfs_ioctl_compat.h b/include/os/macos/zfs/sys/zfs_ioctl_compat.h new file mode 100644 index 0000000000..15f12b34fe --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_ioctl_compat.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Jorgen Lundan . All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ZFS_IOCTL_COMPAT_H +#define _SYS_ZFS_IOCTL_COMPAT_H + +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Backwards ioctl compatibility + */ + +/* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 +#define ZFS_IOCVER_1_9_4 1 +#define ZFS_IOCVER_ZOF 15 + +/* compatibility conversion flag */ +#define ZFS_CMD_COMPAT_NONE 0 +#define ZFS_CMD_COMPAT_V15 1 +#define ZFS_CMD_COMPAT_V28 2 + +#define ZFS_IOC_COMPAT_PASS 254 +#define ZFS_IOC_COMPAT_FAIL 255 + +#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) + +typedef struct zfs_iocparm { + uint32_t zfs_ioctl_version; + uint64_t zfs_cmd; + uint64_t zfs_cmd_size; + + /* + * ioctl() return codes can not be used to communicate - + * as XNU will skip copyout() if there is an error, so it + * is passed along in this wrapping structure. + */ + int zfs_ioc_error; /* ioctl error value */ +} zfs_iocparm_t; + +typedef struct zfs_cmd_1_9_4 +{ + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad3[3]; + boolean_t zc_resumable; + uint32_t zc_pad4; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; + int zc_ioc_error; /* ioctl error value */ + uint64_t zc_dev; /* OSX doesn't have ddi_driver_major */ +} zfs_cmd_1_9_4_t; + +// Figure this out +unsigned static long zfs_ioctl_1_9_4[] = +{ + // ZFS_IOC_POOL_CREATE = _IOWR('Z', 0, struct zfs_cmd), + + 0, /* 0 ZFS_IOC_POOL_CREATE */ + 1, /* 1 ZFS_IOC_POOL_DESTROY */ + 2, /* 2 ZFS_IOC_POOL_IMPORT */ + 3, /* 3 ZFS_IOC_POOL_EXPORT */ + 4, /* 4 ZFS_IOC_POOL_CONFIGS */ + 5, /* 5 ZFS_IOC_POOL_STATS */ + 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ + 7, /* 7 ZFS_IOC_POOL_SCRUB */ + 8, /* 8 ZFS_IOC_POOL_FREEZE */ + 9, /* 9 ZFS_IOC_POOL_UPGRADE */ + 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ + 11, /* 11 ZFS_IOC_VDEV_ADD */ + 12, /* 12 ZFS_IOC_VDEV_REMOVE */ + 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ + 14, /* 14 ZFS_IOC_VDEV_ATTACH */ + 15, /* 15 ZFS_IOC_VDEV_DETACH */ + 16, /* 16 ZFS_IOC_VDEV_SETPATH */ + 18, /* 17 ZFS_IOC_OBJSET_STATS */ + 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ + 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ + 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ + 22, /* 21 ZFS_IOC_SET_PROP */ + ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ + ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ + 23, /* 24 ZFS_IOC_CREATE */ + 24, /* 25 ZFS_IOC_DESTROY */ + 25, /* 26 ZFS_IOC_ROLLBACK */ + 26, /* 27 ZFS_IOC_RENAME */ + 27, 
/* 28 ZFS_IOC_RECV */ + 28, /* 29 ZFS_IOC_SEND */ + 29, /* 30 ZFS_IOC_INJECT_FAULT */ + 30, /* 31 ZFS_IOC_CLEAR_FAULT */ + 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ + 32, /* 33 ZFS_IOC_ERROR_LOG */ + 33, /* 34 ZFS_IOC_CLEAR */ + 34, /* 35 ZFS_IOC_PROMOTE */ + 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ + 36, /* 37 ZFS_IOC_SNAPSHOT */ + 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */ + 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ + 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ + 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ + 41, /* 42 ZFS_IOC_SET_FSACL */ + 42, /* 43 ZFS_IOC_GET_FSACL */ + ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ + 43, /* 45 ZFS_IOC_SHARE */ + 44, /* 46 ZFS_IOC_IHNERIT_PROP */ + 58, /* 47 ZFS_IOC_JAIL */ + 59, /* 48 ZFS_IOC_UNJAIL */ + 45, /* 49 ZFS_IOC_SMB_ACL */ + 46, /* 50 ZFS_IOC_USERSPACE_ONE */ + 47, /* 51 ZFS_IOC_USERSPACE_MANY */ + 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ + 17, /* 53 ZFS_IOC_SETFRU */ +}; + +#ifdef _KERNEL +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +#endif /* _KERNEL */ +void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); +void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); + +int wrap_avl_init(void); +int wrap_unicode_init(void); +int wrap_nvpair_init(void); +int wrap_zcommon_init(void); +int wrap_icp_init(void); +int wrap_lua_init(void); +void wrap_avl_fini(void); +void wrap_unicode_fini(void); +void wrap_nvpair_fini(void); +void wrap_zcommon_fini(void); +void wrap_icp_fini(void); +void wrap_lua_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/include/os/macos/zfs/sys/zfs_mount.h b/include/os/macos/zfs/sys/zfs_mount.h new file mode 100644 index 0000000000..b69229c929 --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_mount.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZFS_MOUNT_H_ +#define _SYS_ZFS_MOUNT_H_ + +struct zfs_mount_args { + const char *fspec; + int mflag; + char *optptr; + int optlen; + int struct_size; +}; + +/* + * Flag bits passed to mount(2). 
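+ * For example (an illustrative combination, not taken from this change), a
+ * read-only, setuid-disabled mount corresponds to (MS_RDONLY | MS_NOSUID);
+ * MS_MASK below lists the bits accepted from mount(2).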
+ */ +#define MS_RDONLY 0x0001 /* Read-only */ +#define MS_FSS 0x0002 /* Old (4-argument) mount (compatibility) */ +#define MS_DATA 0x0004 /* 6-argument mount */ +#define MS_NOSUID 0x0010 /* Setuid programs disallowed */ +#define MS_REMOUNT 0x0020 /* Remount */ +#define MS_NOTRUNC 0x0040 /* Return ENAMETOOLONG for long filenames */ +#define MS_OVERLAY 0x0080 /* Allow overlay mounts */ +#define MS_OPTIONSTR 0x0100 /* Data is an in/out option string */ +#define MS_GLOBAL 0x0200 /* Clustering: Mount into global name space */ +#define MS_FORCE 0x0400 /* Forced unmount */ +#define MS_NOMNTTAB 0x0800 /* Don't show mount in mnttab */ +/* + * Additional flag bits that domount() is prepared to interpret, but that + * can't be passed through mount(2). + */ +#define MS_SYSSPACE 0x0008 /* Mounta already in kernel space */ +#define MS_NOSPLICE 0x1000 /* Don't splice fs instance into name space */ +#define MS_NOCHECK 0x2000 /* Clustering: suppress mount busy checks */ +/* + * Mask to sift out flag bits allowable from mount(2). + */ +#define MS_MASK \ + (MS_RDONLY|MS_FSS|MS_DATA|MS_NOSUID|MS_REMOUNT|MS_NOTRUNC|MS_OVERLAY|\ + MS_OPTIONSTR|MS_GLOBAL|MS_NOMNTTAB) + +/* + * Mask to sift out flag bits allowable from umount2(2). + */ + +#define MS_UMOUNT_MASK (MS_FORCE) + +/* + * Maximum option string length accepted or returned by mount(2). + */ +#define MAX_MNTOPT_STR 1024 /* max length of mount options string */ + + +#endif /* _SYS_ZFS_MOUNT_H_ */ diff --git a/include/os/macos/zfs/sys/zfs_vfsops.h b/include/os/macos/zfs/sys/zfs_vfsops.h new file mode 100644 index 0000000000..0e36b2fad4 --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_vfsops.h @@ -0,0 +1,291 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfs_sb; +struct znode; + +#ifdef __APPLE__ +#define APPLE_SA_RECOVER +/* #define WITH_SEARCHFS */ +/* #define WITH_READDIRATTR */ +#define HAVE_NAMED_STREAMS 1 +#define HAVE_PAGEOUT_V2 1 +#define HIDE_TRIVIAL_ACL 1 +#endif + +/* + * Status of the zfs_unlinked_drain thread. 
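+ * A rough reading of the states below: ZFS_DRAIN_SHUTDOWN means no drain
+ * task is running, ZFS_DRAIN_RUNNING means the async unlinked-drain task is
+ * active, and ZFS_DRAIN_SHUTDOWN_REQ means a shutdown has been requested
+ * (coordinated via z_drain_lock, z_drain_cv and z_drain_state below).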
+ */ +typedef enum drain_state { + ZFS_DRAIN_SHUTDOWN = 0, + ZFS_DRAIN_RUNNING, + ZFS_DRAIN_SHUTDOWN_REQ +} drain_state_t; + + +typedef struct zfsvfs zfsvfs_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrmlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + struct vnode *z_ctldir; /* .zfs directory pointer */ + uint64_t z_ctldir_startid; /* Start of snapdir range */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ + uint64_t z_version; + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + + /* for controlling async zfs_unlinked_drain */ + kmutex_t z_drain_lock; + kcondvar_t z_drain_cv; + drain_state_t z_drain_state; + + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; + uint64_t z_userobjquota_obj; + uint64_t z_groupobjquota_obj; + uint64_t z_projectquota_obj; + uint64_t z_projectobjquota_obj; + +#ifdef __APPLE__ + dev_t z_rdev; /* proxy device for mount */ + boolean_t z_rdonly; /* is mount read-only? 
*/ + time_t z_mount_time; /* mount timestamp (for Spotlight) */ + time_t z_last_unmount_time; /* unmount timestamp (for Spotlight) */ + boolean_t z_xattr; /* enable atimes mount option */ + + avl_tree_t z_hardlinks; /* linkid hash avl tree for vget */ + avl_tree_t z_hardlinks_linkid; /* sorted on linkid */ + krwlock_t z_hardlinks_lock; /* lock to access z_hardlinks */ + + uint64_t z_notification_conditions; /* HFSIOC_VOLUME_STATUS */ + uint64_t z_freespace_notify_warninglimit; + uint64_t z_freespace_notify_dangerlimit; + uint64_t z_freespace_notify_desiredlevel; + + void *z_devdisk; /* Hold fake disk if prop devdisk is on */ + + uint64_t z_findernotify_space; + +#endif + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ + + uint64_t z_hold_size; /* znode hold array size */ + avl_tree_t *z_hold_trees; /* znode hold trees */ + kmutex_t *z_hold_locks; /* znode hold locks */ + taskqid_t z_drain_task; /* task id for the unlink drain task */ +}; +#define ZFS_OBJ_MTX_SZ 64 + +#ifdef __APPLE__ +struct hardlinks_struct { + avl_node_t hl_node; + avl_node_t hl_node_linkid; + uint64_t hl_parent; // parentid of entry + uint64_t hl_fileid; // the fileid (z_id) for vget + uint32_t hl_linkid; // the linkid, persistent over renames + char hl_name[PATH_MAX]; // cached name for vget +}; +typedef struct hardlinks_struct hardlinks_t; + +int zfs_vfs_uuid_unparse(uuid_t uuid, char *dst); +int zfs_vfs_uuid_gen(const char *osname, uuid_t uuid); +#endif + + +#define ZFS_SUPER_MAGIC 0x2fc12fc1 + +#define ZSB_XATTR 0x0001 /* Enable user xattrs */ + +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. 
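+ * For illustration (hypothetical value only): the bytes are packed
+ * least-significant first, i.e. zf_object[i] = (uint8_t)(object >> (8 * i)),
+ * so an object number of 0xABCDEF123456 would be stored as
+ * { 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB }.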
+ */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, + boolean_t isgroup); +extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, + uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); + +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, + uint64_t *value); + +extern int zfs_sb_create(const char *name, zfsvfs_t **zfsvfsp); +extern int zfs_sb_setup(zfsvfs_t *zfsvfs, boolean_t mounting); +extern void zfs_sb_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); + + + + +extern int zfs_vfs_init(struct vfsconf *vfsp); +extern int zfs_vfs_start(struct mount *mp, int flags, vfs_context_t context); +extern int zfs_vfs_mount(struct mount *mp, vnode_t *devvp, + user_addr_t data, vfs_context_t context); +extern int zfs_vfs_unmount(struct mount *mp, int mntflags, + vfs_context_t context); +extern int zfs_vfs_root(struct mount *mp, vnode_t **vpp, + vfs_context_t context); +extern int zfs_vfs_vget(struct mount *mp, ino64_t ino, vnode_t **vpp, + vfs_context_t context); +extern int zfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, + vfs_context_t context); +extern int zfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, + vfs_context_t context); +extern int zfs_vfs_sync(struct mount *mp, int waitfor, vfs_context_t context); +extern int zfs_vfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, + vnode_t **vpp, vfs_context_t context); +extern int zfs_vfs_vptofh(vnode_t *vp, int *fhlenp, unsigned char *fhp, + vfs_context_t context); +extern int zfs_vfs_sysctl(int *name, uint_t namelen, user_addr_t oldp, + size_t *oldlenp, user_addr_t newp, size_t newlen, vfs_context_t context); +extern int zfs_vfs_quotactl(struct mount *mp, int cmds, uid_t uid, + caddr_t datap, vfs_context_t context); +extern int zfs_vfs_mountroot(struct mount *mp, struct vnode *vp, + vfs_context_t context); + +extern void zfs_init(void); +extern void zfs_fini(void); + +extern int zfs_vnode_lock(vnode_t *vp, int flags); +extern void zfs_freevfs(struct mount *vfsp); + +extern int zfsvfs_create(const char *name, boolean_t rd, zfsvfs_t **zfvp); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); + +extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, + uint64_t *val, char *setpoint); + +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/include/os/macos/zfs/sys/zfs_vnops.h b/include/os/macos/zfs/sys/zfs_vnops.h new file mode 100644 index 
0000000000..6d7df203ff --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_vnops.h @@ -0,0 +1,255 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VNOPS_H +#define _SYS_FS_ZFS_VNOPS_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Spotlight specific fcntl()'s + */ + +// Older defines +#define SPOTLIGHT_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) +#define SPOTLIGHT_GET_UNMOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) + +// Newer defines, will these need a OSX version test to compile on older? +#define SPOTLIGHT_IOC_GET_MOUNT_TIME _IOR('h', 18, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_MOUNT_TIME \ + IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME) +#define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_LAST_MTIME \ + IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) + +/* + * Account for user timespec structure differences + */ +#ifdef ZFS_LEOPARD_ONLY +typedef struct timespec timespec_user32_t; +typedef struct user_timespec timespec_user64_t; +#else +typedef struct user32_timespec timespec_user32_t; +typedef struct user64_timespec timespec_user64_t; +#endif + +#define UNKNOWNUID ((uid_t)99) +#define UNKNOWNGID ((gid_t)99) + +#define DTTOVT(dtype) (iftovt_tab[(dtype)]) +#define kTextEncodingMacUnicode 0x7e +#define ZAP_AVENAMELEN (ZAP_MAXNAMELEN / 4) + +/* Finder information */ +struct finderinfo { + u_int32_t fi_type; /* files only */ + u_int32_t fi_creator; /* files only */ + u_int16_t fi_flags; + struct { + int16_t v; + int16_t h; + } fi_location; + int8_t fi_opaque[18]; +} __attribute__((aligned(2), packed)); +typedef struct finderinfo finderinfo_t; + +enum { + /* Finder Flags */ + kHasBeenInited = 0x0100, + kHasCustomIcon = 0x0400, + kIsStationery = 0x0800, + kNameLocked = 0x1000, + kHasBundle = 0x2000, + kIsInvisible = 0x4000, + kIsAlias = 0x8000 +}; + +/* Attribute packing information */ +typedef struct attrinfo { + struct attrlist *ai_attrlist; + void **ai_attrbufpp; + void **ai_varbufpp; + void *ai_varbufend; + vfs_context_t ai_context; +} attrinfo_t; + +/* + * Attributes that we can get for free from the zap (ie without a znode) + */ +#define ZFS_DIR_ENT_ATTRS ( \ + ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | \ + ATTR_CMN_OBJTYPE | ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | \ + ATTR_CMN_OBJPERMANENTID | ATTR_CMN_SCRIPT | \ + ATTR_CMN_FILEID) + +/* + * Attributes that we support + */ +#define ZFS_ATTR_BIT_MAP_COUNT 5 + +#define ZFS_ATTR_CMN_VALID ( \ + ATTR_CMN_NAME | ATTR_CMN_DEVID | ATTR_CMN_FSID | \ + ATTR_CMN_OBJTYPE | ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | \ + ATTR_CMN_OBJPERMANENTID | ATTR_CMN_PAROBJID | \ + 
ATTR_CMN_SCRIPT | ATTR_CMN_CRTIME | ATTR_CMN_MODTIME | \ + ATTR_CMN_CHGTIME | ATTR_CMN_ACCTIME | \ + ATTR_CMN_BKUPTIME | ATTR_CMN_FNDRINFO | \ + ATTR_CMN_OWNERID | ATTR_CMN_GRPID | \ + ATTR_CMN_ACCESSMASK | ATTR_CMN_FLAGS | \ + ATTR_CMN_USERACCESS | ATTR_CMN_FILEID | \ + ATTR_CMN_PARENTID) + +#define ZFS_ATTR_DIR_VALID ( \ + ATTR_DIR_LINKCOUNT | ATTR_DIR_ENTRYCOUNT | \ + ATTR_DIR_MOUNTSTATUS) + +#define ZFS_ATTR_FILE_VALID ( \ + ATTR_FILE_LINKCOUNT |ATTR_FILE_TOTALSIZE | \ + ATTR_FILE_ALLOCSIZE | ATTR_FILE_IOBLOCKSIZE | \ + ATTR_FILE_DEVTYPE | ATTR_FILE_DATALENGTH | \ + ATTR_FILE_DATAALLOCSIZE | ATTR_FILE_RSRCLENGTH | \ + ATTR_FILE_RSRCALLOCSIZE) + +extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, + char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, + char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_link(znode_t *tdzp, znode_t *sp, + char *name, cred_t *cr, int flags); +extern int zfs_space(znode_t *zp, int cmd, struct flock *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); + +extern int zfs_open(struct vnode *ip, int mode, int flag, cred_t *cr); +extern int zfs_close(struct vnode *ip, int flag, cred_t *cr); +extern int zfs_read(struct vnode *ip, uio_t *uio, int ioflag, cred_t *cr); +extern int zfs_write(struct vnode *ip, uio_t *uio, int ioflag, cred_t *cr); +extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, + int flags, cred_t *cr, int *direntflags, struct componentname *realpnp); +extern int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, + cred_t *cred, int *rvalp, caller_context_t *ct); +extern int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + int flags, int *a_numdirent); +extern int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr); +extern int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct); +extern int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr); + +extern int zfs_access(struct vnode *ip, int mode, int flag, cred_t *cr); +extern void zfs_inactive(vnode_t *vp); + +/* zfs_vops_osx.c calls */ +extern int zfs_znode_getvnode(znode_t *zp, zfsvfs_t *zfsvfs); + +extern void getnewvnode_reserve(int num); +extern void getnewvnode_drop_reserve(void); +extern int zfs_vfsops_init(void); +extern int zfs_vfsops_fini(void); +extern int zfs_znode_asyncgetvnode(znode_t *zp, zfsvfs_t *zfsvfs); +extern void zfs_znode_asyncput(znode_t *zp); +extern int zfs_znode_asyncwait(znode_t *zp); + +/* zfs_vnops_osx_lib calls */ +extern int zfs_ioflags(int ap_ioflag); +extern int zfs_getattr_znode_unlocked(struct vnode *vp, vattr_t *vap); +extern int ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)); +extern void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); +extern 
int zpl_obtain_xattr(struct znode *, const char *name, mode_t mode, + cred_t *cr, struct vnode **vpp, int flag); + +extern void commonattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp, + const char *name, ino64_t objnum, enum vtype vtype, + boolean_t user64); +extern void dirattrpack(attrinfo_t *aip, znode_t *zp); +extern void fileattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp); +extern void nameattrpack(attrinfo_t *aip, const char *name, int namelen); +extern int getpackedsize(struct attrlist *alp, boolean_t user64); +extern void getfinderinfo(znode_t *zp, cred_t *cr, finderinfo_t *fip); +extern uint32_t getuseraccess(znode_t *zp, vfs_context_t ctx); +extern void finderinfo_update(uint8_t *finderinfo, znode_t *zp); +extern int zpl_xattr_set_sa(struct vnode *vp, const char *name, + const void *value, size_t size, int flags, cred_t *cr); +extern int zpl_xattr_get_sa(struct vnode *vp, const char *name, void *value, + size_t size); +extern void zfs_zrele_async(znode_t *zp); + +/* + * OSX ACL Helper funcions + * + * OSX uses 'guids' for the 'who' part of ACLs, and uses a 'well known' + * binary sequence to signify the special rules of "owner", "group" and + * "everybody". We translate between this "well-known" guid and ZFS' + * flags ACE_OWNER, ACE_GROUP and ACE_EVERYBODY. + * + */ +#define KAUTH_WKG_NOT 0 /* not a well-known GUID */ +#define KAUTH_WKG_OWNER 1 +#define KAUTH_WKG_GROUP 2 +#define KAUTH_WKG_NOBODY 3 +#define KAUTH_WKG_EVERYBODY 4 + +extern int kauth_wellknown_guid(guid_t *guid); +extern void aces_from_acl(ace_t *aces, int *nentries, struct kauth_acl *k_acl, + int *seen_type); +extern void nfsacl_set_wellknown(int wkg, guid_t *guid); +extern int zfs_addacl_trivial(znode_t *zp, ace_t *aces, int *nentries, + int seen_type); + +extern struct vnodeopv_desc zfs_dvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_fvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_symvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_xdvnodeop_opv_desc; +extern struct vnodeopv_desc zfs_evnodeop_opv_desc; +extern struct vnodeopv_desc zfs_fifonodeop_opv_desc; +extern struct vnodeopv_desc zfs_ctldir_opv_desc; +extern int (**zfs_ctldirops)(void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VNOPS_H */ diff --git a/include/os/macos/zfs/sys/zfs_znode_impl.h b/include/os/macos/zfs/sys/zfs_znode_impl.h new file mode 100644 index 0000000000..fa53ac8d6e --- /dev/null +++ b/include/os/macos/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _MACOS_ZFS_SYS_ZNODE_IMPL_H +#define _MACOS_ZFS_SYS_ZNODE_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_UIMMUTABLE 0x0000001000000000ull // OSX +#define ZFS_UAPPENDONLY 0x0000004000000000ull // OSX + +// #define ZFS_IMMUTABLE (ZFS_UIMMUTABLE | ZFS_SIMMUTABLE) +// #define ZFS_APPENDONLY (ZFS_UAPPENDONLY | ZFS_SAPPENDONLY) + +#define ZFS_TRACKED 0x0010000000000000ull +#define ZFS_COMPRESSED 0x0020000000000000ull + +#define ZFS_SIMMUTABLE 0x0040000000000000ull +#define ZFS_SAPPENDONLY 0x0080000000000000ull + +#define SA_ZPL_ADDTIME(z) z->z_attr_table[ZPL_ADDTIME] +#define SA_ZPL_DOCUMENTID(z) z->z_attr_table[ZPL_DOCUMENTID] + +#define ZGET_FLAG_UNLINKED (1<<0) /* Also lookup unlinked */ +#define ZGET_FLAG_ASYNC (1<<3) /* taskq the vnode_create call */ + +extern int zfs_zget_ext(zfsvfs_t *zfsvfs, uint64_t obj_num, + struct znode **zpp, int flags); + + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#define ZNODE_OS_FIELDS \ + struct zfsvfs *z_zfsvfs; \ + struct vnode *z_vnode; \ + uint64_t z_uid; \ + uint64_t z_gid; \ + uint64_t z_gen; \ + uint64_t z_atime[2]; \ + uint64_t z_links; \ + uint32_t z_vid; \ + uint32_t z_document_id; \ + uint64_t z_finder_parentid; \ + boolean_t z_finder_hardlink; \ + uint64_t z_write_gencount; \ + char z_name_cache[MAXPATHLEN]; \ + boolean_t z_skip_truncate_undo_decmpfs; \ + taskq_ent_t z_attach_taskq; \ + kcondvar_t z_attach_cv; \ + kmutex_t z_attach_lock; \ + hrtime_t z_snap_mount_time; \ + krwlock_t z_map_lock; + +#define ZFS_LINK_MAX UINT64_MAX + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern minor_t zfsdev_minor_alloc(void); + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define ZTOI(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)vnode_fsnode((VP))) +#define ITOZ(VP) ((znode_t *)vnode_fsnode((VP))) + +#define VTOM(VP) ((mount_t *)vnode_mount((VP))) + +/* These are not used so far, VN_HOLD returncode must be checked. */ +#define zhold(zp) VN_HOLD(ZTOV(zp)) +#define zrele(zp) VN_RELE(ZTOV(zp)) + +#define ZTOZSB(zp) ((zp)->z_zfsvfs) +#define ITOZSB(vp) ((zfsvfs_t *)vfs_fsprivate(vnode_mount(vp))) +#define ZTOTYPE(zp) (vnode_vtype(ZTOV(zp))) +#define ZTOGID(zp) ((zp)->z_gid) +#define ZTOUID(zp) ((zp)->z_uid) +#define ZTONLNK(zp) ((zp)->z_links) +#define Z_ISBLK(type) ((type) == VBLK) +#define Z_ISCHR(type) ((type) == VCHR) +#define Z_ISLNK(type) ((type) == VLNK) + +/* Called on entry to each ZFS inode and vfs operation. 
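+ * Illustrative flow for a vnode operation: ZFS_ENTER(zfsvfs) takes the
+ * teardown lock and returns EIO if the filesystem is already unmounted,
+ * ZFS_VERIFY_ZP(zp) then checks that the znode is still valid, and
+ * ZFS_EXIT(zfsvfs) must be called before the operation returns.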
*/ +#define ZFS_ENTER_IFERROR(zfsvfs) \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) + +#define ZFS_ENTER_ERROR(zfsvfs, error) \ + do { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (error); \ + } \ + } while (0) + +#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) +#define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) + +/* Must be called before exiting the operation. */ +#define ZFS_EXIT(zfsvfs) \ + do { \ + rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ + } while (0) +#define ZPL_EXIT(zfsvfs) ZFS_EXIT(zfsvfs) + +/* Verifies the znode is valid. */ +#define ZFS_VERIFY_ZP_ERROR(zp, error) \ + do { \ + if ((zp)->z_sa_hdl == NULL) { \ + ZFS_EXIT(ZTOZSB(zp)); \ + return (error); \ + } \ + } while (0) + +#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) +#define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_MTX_SZ 64 +#define ZFS_OBJ_MTX_MAX (1024 * 1024) +#define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) + +extern unsigned int zfs_object_mutex_size; + +/* Encode ZFS stored time values from a struct timespec */ +#define ZFS_TIME_ENCODE(tp, stmp) \ + { \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ + } + +/* Decode ZFS stored time values to a struct timespec */ +#define ZFS_TIME_DECODE(tp, stmp) \ + { \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !vfs_isrdonly(zfsvfs->z_vfs)) \ + zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE); + +extern void zfs_tstamp_update_setup_ext(struct znode *, + uint_t, uint64_t [2], uint64_t [2], boolean_t); +extern void zfs_tstamp_update_setup(struct znode *, + uint_t, uint64_t [2], uint64_t [2]); +extern void zfs_znode_free(struct znode *); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, + char *buf); +extern uint32_t zfs_getbsdflags(struct znode *zp); +extern void zfs_setattr_generate_id(struct znode *, uint64_t, char *name); + +extern int zfs_setattr_set_documentid(struct znode *zp, + boolean_t update_flags); + +/* Legacy macOS uses fnv_32a hash for hostid. */ +#define FNV1_32A_INIT ((uint32_t)0x811c9dc5) +uint32_t fnv_32a_str(const char *str, uint32_t hval); + +void zfs_setbsdflags(struct znode *, uint32_t bsdflags); +uint32_t zfs_getbsdflags(struct znode *zp); + +#ifdef __cplusplus +} +#endif + +#endif /* _MACOS_SYS_FS_ZFS_ZNODE_H */ diff --git a/include/os/macos/zfs/sys/zpl.h b/include/os/macos/zfs/sys/zpl.h new file mode 100644 index 0000000000..5d391c6a96 --- /dev/null +++ b/include/os/macos/zfs/sys/zpl.h @@ -0,0 +1,27 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SYS_ZPL_H +#define _SYS_ZPL_H + + + + +#endif // _SYS_ZPL_H diff --git a/include/os/macos/zfs/sys/zvolIO.h b/include/os/macos/zfs/sys/zvolIO.h new file mode 100644 index 0000000000..927840aa7b --- /dev/null +++ b/include/os/macos/zfs/sys/zvolIO.h @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2016 Jorgen Lundman + */ + +#ifndef ZVOLIO_H_INCLUDED +#define ZVOLIO_H_INCLUDED + +/* Linux polutes 'current' */ +#undef current + +#ifdef __cplusplus +#include + +extern "C" { +#endif /* __cplusplus */ + +#include +#include + +struct iomem { + IOMemoryDescriptor *buf; +}; + +uint64_t zvolIO_kit_read(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len); +uint64_t zvolIO_kit_write(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len); + +#ifdef __cplusplus +} /* extern "C" */ + +class net_lundman_zfs_zvol : public IOService +{ + OSDeclareDefaultStructors(net_lundman_zfs_zvol) + +private: + +public: + virtual bool init(OSDictionary* dictionary = NULL); + virtual void free(void); + virtual IOService* probe(IOService* provider, SInt32* score); + virtual bool start(IOService* provider); + virtual void stop(IOService* provider); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *arg); + virtual bool handleIsOpen(const IOService *client) const; + virtual void handleClose(IOService *client, + IOOptionBits options); + virtual bool isOpen(const IOService *forClient = 0) const; + +private: + OSSet *_openClients; +}; + +#include + +class net_lundman_zfs_zvol_device : public IOBlockStorageDevice +{ + OSDeclareDefaultStructors(net_lundman_zfs_zvol_device) + +private: + // IOService *m_provider; + zvol_state_t *zv; + +public: + virtual bool init(zvol_state_t *c_zv, + OSDictionary* properties = 0); + + virtual bool attach(IOService* provider); + virtual void detach(IOService* provider); + virtual IOReturn doEjectMedia(void); + virtual IOReturn doFormatMedia(UInt64 byteCapacity); + virtual UInt32 doGetFormatCapacities(UInt64 * capacities, + UInt32 capacitiesMaxCount) const; + + virtual IOReturn doLockUnlockMedia(bool doLock); + virtual IOReturn doSynchronizeCache(void); + virtual char *getVendorString(void); + virtual char *getProductString(void); + virtual char *getRevisionString(void); + virtual char *getAdditionalDeviceInfoString(void); + virtual IOReturn reportBlockSize(UInt64 *blockSize); + virtual IOReturn reportEjectability(bool *isEjectable); + virtual IOReturn reportLockability(bool 
*isLockable); + virtual IOReturn reportMaxValidBlock(UInt64 *maxBlock); + virtual IOReturn reportMediaState(bool *mediaPresent, + bool *changedState); + + virtual IOReturn reportPollRequirements(bool *pollRequired, + bool *pollIsExpensive); + + virtual IOReturn reportRemovability(bool *isRemovable); + virtual IOReturn reportWriteProtection(bool *isWriteProtected); + virtual IOReturn getWriteCacheState(bool *enabled); + virtual IOReturn setWriteCacheState(bool enabled); + virtual IOReturn doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion); + + virtual IOReturn doDiscard(UInt64 block, UInt64 nblks); + virtual IOReturn doUnmap(IOBlockStorageDeviceExtent *extents, + UInt32 extentsCount, UInt32 options); + + virtual bool handleOpen(IOService *client, + IOOptionBits options, void *access); + + virtual void handleClose(IOService *client, + IOOptionBits options); + + virtual int getBSDName(void); + virtual int renameDevice(void); + virtual int offlineDevice(void); + virtual int onlineDevice(void); + virtual int refreshDevice(void); + + virtual void clearState(void); +}; +#endif /* __cplusplus */ + +#endif /* ZVOLIO_H_INCLUDED */ diff --git a/include/os/macos/zfs/sys/zvol_os.h b/include/os/macos/zfs/sys/zvol_os.h new file mode 100644 index 0000000000..a5e5d86aa8 --- /dev/null +++ b/include/os/macos/zfs/sys/zvol_os.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _SYS_ZVOL_OS_h +#define _SYS_ZVOL_OS_h + +#ifdef __cplusplus +extern "C" { +#endif + +/* struct wrapper for IOKit class */ +typedef struct zvol_iokit zvol_iokit_t; +typedef struct zvol_state zvol_state_t; +struct iomem; + +struct zvol_state_os { + dev_t zvo_dev; /* device id */ + + zvol_iokit_t *zvo_iokitdev; /* IOKit device */ + uint64_t zvo_openflags; /* Remember flags used at open */ + char zvo_bsdname[MAXPATHLEN]; /* /dev/diskX */ +}; + +extern int zvol_os_ioctl(dev_t, unsigned long, caddr_t, + int isblk, cred_t *, int *rvalp); +extern int zvol_os_open_zv(zvol_state_t *, int, int, struct proc *p); +extern int zvol_os_open(dev_t dev, int flag, int otyp, struct proc *p); +extern int zvol_os_close_zv(zvol_state_t *, int, int, struct proc *p); +extern int zvol_os_close(dev_t dev, int flag, int otyp, struct proc *p); +extern int zvol_os_read(dev_t dev, struct uio *uio, int p); +extern int zvol_os_write(dev_t dev, struct uio *uio, int p); + +extern int zvol_os_read_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem); +extern int zvol_os_write_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem); +extern int zvol_os_unmap(zvol_state_t *zv, uint64_t off, uint64_t bytes); + +extern void zvol_os_strategy(struct buf *bp); +extern int zvol_os_get_volume_blocksize(dev_t dev); + +extern void zvol_os_lock_zv(zvol_state_t *zv); +extern void zvol_os_unlock_zv(zvol_state_t *zv); + +extern void *zvolRemoveDevice(zvol_iokit_t *iokitdev); +extern int zvolRemoveDeviceTerminate(void *iokitdev); +extern int zvolCreateNewDevice(zvol_state_t *zv); +extern int zvolRegisterDevice(zvol_state_t *zv); + +extern int zvolRenameDevice(zvol_state_t *zv); +extern int zvolSetVolsize(zvol_state_t *zv); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 7ed2470268..7a80733882 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -59,7 +59,7 @@ struct abd { union { struct abd_scatter { uint_t abd_offset; -#if defined(_KERNEL) && ( defined(__FreeBSD__) || defined(__APPLE__) ) +#if defined(_KERNEL) && (defined(__FreeBSD__) || defined(__APPLE__)) uint_t abd_chunk_size; void *abd_chunks[]; #else @@ -134,9 +134,9 @@ void abd_iter_unmap(struct abd_iter *); #if defined(__FreeBSD__) #define abd_enter_critical(flags) critical_enter() #define abd_exit_critical(flags) critical_exit() -#elif defined (__APPLE__) -#define abd_enter_critical(flags) (flags) = ml_set_interrupts_enabled(FALSE) -#define abd_exit_critical(flags) ml_set_interrupts_enabled((flags)) +#elif defined(__APPLE__) +#define abd_enter_critical(flags) (flags) = ml_set_interrupts_enabled(FALSE) +#define abd_exit_critical(flags) ml_set_interrupts_enabled((flags)) #else #define abd_enter_critical(flags) local_irq_save(flags) #define abd_exit_critical(flags) local_irq_restore(flags) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index a528340f9f..85d0f3b95e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1238,7 +1238,7 @@ typedef enum zfs_ioc { /* * Core features - 81/128 numbers reserved. 
*/ -#if defined (__FreeBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__APPLE__) ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), diff --git a/include/sys/mntent.h b/include/sys/mntent.h index cb463ce292..be62c53976 100644 --- a/include/sys/mntent.h +++ b/include/sys/mntent.h @@ -84,8 +84,8 @@ #define MNTOPT_NOSETUID "nosetuid" /* Set uid not allowed */ #define MNTOPT_BROWSE "browse" /* browsable autofs mount */ #define MNTOPT_NOBROWSE "nobrowse" /* non-browsable autofs mount */ -#define MNTOPT_OWNERS "owners" /* VFS will not ignore ownership information on filesystem objects */ -#define MNTOPT_NOOWNERS "noowners" /* VFS will ignore ownership information on filesystem objects */ +#define MNTOPT_OWNERS "owners" /* use ownership */ +#define MNTOPT_NOOWNERS "noowners" /* ignore ownership */ #else #error "unknown OS" #endif diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index 1255a176d3..a41f0c0fe8 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -239,7 +239,7 @@ extern "C" { #define DEV_INSTANCE "instance" #define DEV_PROP_PREFIX "prop-" -#if defined (__linux__) || defined (__APPLE__) +#if defined(__linux__) || defined(__APPLE__) #define DEV_IDENTIFIER "devid" #define DEV_PATH "path" #define DEV_IS_PART "is_slice" diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h index dfac8ce4b6..b1517b9d3f 100644 --- a/include/sys/zfs_sa.h +++ b/include/sys/zfs_sa.h @@ -76,10 +76,11 @@ typedef enum zpl_attr { ZPL_DXATTR, ZPL_PROJID, - /* Apple defines a ADDEDTIME, which is the time the entry was placed in - * the containing directory. Ie, CRTIME and updated when moved into - * a different directory. This can be retrieved with getxattr "FinderInfo" - * or the getattrlist() syscall. + /* + * Apple defines a ADDEDTIME, which is the time the entry was placed + * in the containing directory. Ie, CRTIME and updated when moved + * into a different directory. This can be retrieved with getxattr + * "FinderInfo" or the getattrlist() syscall. 
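As an aside on the comment above: the added time can be fetched from userland with getattrlist() and the standard ATTR_CMN_ADDEDTIME attribute. A minimal sketch, assuming macOS userland; the helper name and reply-buffer layout are illustrative, not part of this change:

/*
 * Hedged sketch: read a file's "date added" via getattrlist(), one of the
 * retrieval paths mentioned in the comment above.
 */
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/attr.h>

struct addedtime_reply {
	uint32_t	length;		/* filled in by getattrlist() */
	struct timespec	added;		/* ATTR_CMN_ADDEDTIME payload */
} __attribute__((packed));

static int
get_added_time(const char *path, struct timespec *out)
{
	struct attrlist al;
	struct addedtime_reply reply;

	memset(&al, 0, sizeof (al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.commonattr = ATTR_CMN_ADDEDTIME;

	if (getattrlist(path, &al, &reply, sizeof (reply), 0) != 0)
		return (-1);
	*out = reply.added;
	return (0);
}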
*/ ZPL_ADDTIME, ZPL_DOCUMENTID, diff --git a/lib/libefi/rdwr_efi_macos.c b/lib/libefi/rdwr_efi_macos.c index 9a1a64ebe4..c2ef128a8f 100644 --- a/lib/libefi/rdwr_efi_macos.c +++ b/lib/libefi/rdwr_efi_macos.c @@ -248,8 +248,8 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) dki_info->dki_partition = 0; } strlcpy(dki_info->dki_dname, - &pathbuf[5], - sizeof(dki_info->dki_dname)); + &pathbuf[5], + sizeof (dki_info->dki_dname)); } /* @@ -1663,7 +1663,7 @@ isDeviceMatchForKeyAndSubstr(char *device, CFStringRef key, CFStringRef substr, if ((error = setupDADiskSession(&ds, device)) == 0) { CFDictionaryRef descDict = NULL; - if((descDict = DADiskCopyDescription(ds.disk)) != NULL) { + if ((descDict = DADiskCopyDescription(ds.disk)) != NULL) { *isMatch = CFDictionaryValueIfPresentMatchesSubstring(descDict, key, substr); @@ -1709,5 +1709,5 @@ osx_device_isvirtual(char *device) isCoreStorageLV, isVirtualInterface); - return (isCoreStorageLV || isVirtualInterface); + return (isCoreStorageLV /* || isVirtualInterface*/); } diff --git a/lib/libspl/include/os/Makefile.am b/lib/libspl/include/os/Makefile.am index 7b362e02ad..22495a05b7 100644 --- a/lib/libspl/include/os/Makefile.am +++ b/lib/libspl/include/os/Makefile.am @@ -5,3 +5,7 @@ endif if BUILD_LINUX SUBDIRS = linux endif + +if BUILD_MACOS +SUBDIRS = macos +endif diff --git a/lib/libspl/include/os/macos/Makefile.am b/lib/libspl/include/os/macos/Makefile.am new file mode 100644 index 0000000000..1d3c559bed --- /dev/null +++ b/lib/libspl/include/os/macos/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = mach rpc sys diff --git a/lib/libspl/include/os/macos/dirent.h b/lib/libspl/include/os/macos/dirent.h new file mode 100644 index 0000000000..b7ffe3d89c --- /dev/null +++ b/lib/libspl/include/os/macos/dirent.h @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _LIBSPL_DIRENT_H +#define _LIBSPL_DIRENT_H + +#include_next + + +/* Handle Linux use of 64 names */ + +#define readdir64 readdir +#define dirent64 dirent + +#endif diff --git a/lib/libspl/include/os/macos/ia32/Makefile.am b/lib/libspl/include/os/macos/ia32/Makefile.am new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/libspl/include/os/macos/ia32/sys/Makefile.am b/lib/libspl/include/os/macos/ia32/sys/Makefile.am new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/libspl/include/os/macos/ia32/sys/asm_linkage.h b/lib/libspl/include/os/macos/ia32/sys/asm_linkage.h new file mode 100644 index 0000000000..0009705ad6 --- /dev/null +++ b/lib/libspl/include/os/macos/ia32/sys/asm_linkage.h @@ -0,0 +1,297 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_ASM_LINKAGE_H +#define _IA32_SYS_ASM_LINKAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +/* + * D16 and A16 are used to insert instructions prefixes; the + * macros help the assembler code be slightly more portable. + */ +#if !defined(__GNUC_AS__) +/* + * /usr/ccs/bin/as prefixes are parsed as separate instructions + */ +#define D16 data16; +#define A16 addr16; + +/* + * (There are some weird constructs in constant expressions) + */ +#define _CONST(const) [const] +#define _BITNOT(const) -1!_CONST(const) +#define _MUL(a, b) _CONST(a \* b) + +#else +/* + * Why not use the 'data16' and 'addr16' prefixes .. well, the + * assembler doesn't quite believe in real mode, and thus argues with + * us about what we're trying to do. + */ +#define D16 .byte 0x66; +#define A16 .byte 0x67; + +#define _CONST(const) (const) +#define _BITNOT(const) ~_CONST(const) +#define _MUL(a, b) _CONST(a * b) + +#endif + +/* + * C pointers are different sizes between i386 and amd64. + * These constants can be used to compute offsets into pointer arrays. + */ +#if defined(__amd64) +#define CLONGSHIFT 3 +#define CLONGSIZE 8 +#define CLONGMASK 7 +#elif defined(__i386) +#define CLONGSHIFT 2 +#define CLONGSIZE 4 +#define CLONGMASK 3 +#endif + +/* + * Since we know we're either ILP32 or LP64 .. 
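With the dirent.h wrapper above mapping the Linux-only readdir64()/dirent64 names onto the native macOS calls, shared code written against the 64-bit names compiles unchanged. A minimal sketch, assuming the libspl include path is in effect; the function is illustrative:

/*
 * Hedged sketch: Linux-style directory walk that builds on macOS thanks to
 * the readdir64/dirent64 shims above.
 */
#include <stdio.h>
#include <dirent.h>

static void
list_dir(const char *path)
{
	DIR *dirp = opendir(path);
	struct dirent64 *dp;

	if (dirp == NULL)
		return;
	while ((dp = readdir64(dirp)) != NULL)
		printf("%s\n", dp->d_name);
	(void) closedir(dirp);
}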
+ */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 4, 0x90 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 +#define XMM_ALIGN_LOG 4, 0x90 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. These are similar to the ANSI-C + * #pragma weak name = _name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ + .weak sym; \ +/* CSTYLED */ \ +sym = _/**/sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ +/* CSTYLED */ \ +x:; \ +y: + + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl _##x; \ + .globl x; \ +_##x:; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. 
+ */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/os/macos/mach/Makefile.am b/lib/libspl/include/os/macos/mach/Makefile.am new file mode 100644 index 0000000000..89b0459882 --- /dev/null +++ b/lib/libspl/include/os/macos/mach/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/macos/mach/boolean.h diff --git a/lib/libspl/include/os/macos/mach/boolean.h b/lib/libspl/include/os/macos/mach/boolean.h new file mode 100644 index 0000000000..47c93a3151 --- /dev/null +++ b/lib/libspl/include/os/macos/mach/boolean.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* Deal with XNU's own boolean_t version */ + +#define boolean_t xnu_boolean_t +#include_next +#undef boolean_t diff --git a/lib/libspl/include/os/macos/mntent.h b/lib/libspl/include/os/macos/mntent.h new file mode 100644 index 0000000000..8183dda00b --- /dev/null +++ b/lib/libspl/include/os/macos/mntent.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + */ + +#ifndef _SYS_MNTENT_H +#define _SYS_MNTENT_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#define MNTTAB "/etc/mnttab" +#define VFSTAB "/etc/vfstab" +#define MNTMAXSTR 128 + +#define MNTTYPE_ZFS "zfs" /* ZFS file system */ +#define MNTTYPE_UFS "ufs" /* Unix file system */ +#define MNTTYPE_NFS "nfs" /* NFS file system */ +#define MNTTYPE_NFS3 "nfs3" /* NFS Version 3 file system */ +#define MNTTYPE_NFS4 "nfs4" /* NFS Version 4 file system */ +#define MNTTYPE_CACHEFS "cachefs" /* Cache File System */ +#define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ +#define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ +#define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ +#define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ +#define MNTTYPE_SWAP "swap" /* Swap file system */ +#define MNTTYPE_TMPFS "tmpfs" /* Tmp volatile file system */ +#define MNTTYPE_AUTOFS "autofs" /* Automounter ``file'' system */ +#define MNTTYPE_MNTFS "mntfs" /* In-kernel mnttab */ +#define MNTTYPE_DEV "dev" /* /dev file system */ +#define MNTTYPE_CTFS "ctfs" /* Contract file system */ +#define MNTTYPE_OBJFS "objfs" /* Kernel object file system */ +#define MNTTYPE_SHAREFS "sharefs" /* Kernel sharetab file system */ + + +#define MNTOPT_RO "ro" /* Read only */ +#define MNTOPT_RW "rw" /* Read/write */ +#define MNTOPT_RQ "rq" /* Read/write with quotas */ +#define MNTOPT_QUOTA "quota" /* Check quotas */ +#define MNTOPT_NOQUOTA "noquota" /* Don't check quotas */ +#define MNTOPT_ONERROR "onerror" /* action to taken on error */ +#define MNTOPT_SOFT "soft" /* Soft mount */ +#define MNTOPT_SEMISOFT "semisoft" /* partial soft, uncommited interface */ +#define MNTOPT_HARD "hard" /* Hard mount */ +#define MNTOPT_SUID "suid" /* Both setuid and devices allowed */ +#define MNTOPT_NOSUID "nosuid" /* Neither setuid nor devices allowed */ +#define MNTOPT_DEVICES "devices" /* Device-special allowed */ +#define MNTOPT_NODEVICES "nodevices" /* Device-special disallowed */ +#define MNTOPT_SETUID "setuid" /* Set uid allowed */ +#define MNTOPT_NOSETUID "nosetuid" /* Set uid not allowed */ +#define MNTOPT_GRPID "grpid" /* SysV-compatible gid on create */ +#define MNTOPT_REMOUNT "remount" /* Change mount options */ +#define MNTOPT_NOSUB "nosub" /* Disallow mounts on subdirs */ +#define MNTOPT_MULTI "multi" /* Do multi-component lookup */ +#define MNTOPT_INTR "intr" /* Allow NFS ops to be interrupted */ +#define MNTOPT_NOINTR "nointr" /* Don't allow interrupted ops */ +#define MNTOPT_PORT "port" /* NFS server IP port number */ +#define MNTOPT_SECURE "secure" /* Secure (AUTH_DES) mounting */ +#define MNTOPT_RSIZE "rsize" /* Max NFS read size (bytes) */ +#define MNTOPT_WSIZE "wsize" /* Max NFS write size (bytes) */ +#define MNTOPT_TIMEO "timeo" /* NFS timeout (1/10 sec) */ +#define MNTOPT_RETRANS "retrans" /* Max retransmissions (soft mnts) */ +#define MNTOPT_ACTIMEO "actimeo" /* Attr cache timeout (sec) */ +#define MNTOPT_ACREGMIN "acregmin" /* Min attr cache timeout (files) */ +#define MNTOPT_ACREGMAX "acregmax" /* Max attr cache timeout (files) */ +#define MNTOPT_ACDIRMIN "acdirmin" /* Min attr cache timeout (dirs) */ +#define MNTOPT_ACDIRMAX "acdirmax" /* Max attr cache timeout (dirs) */ +#define MNTOPT_NOAC "noac" /* Don't cache attributes at all */ +#define MNTOPT_NOCTO "nocto" /* No close-to-open consistency */ +#define MNTOPT_BG "bg" /* Do mount retries 
in background */ +#define MNTOPT_FG "fg" /* Do mount retries in foreground */ +#define MNTOPT_RETRY "retry" /* Number of mount retries */ +#define MNTOPT_DEV "dev" /* Device id of mounted fs */ +#define MNTOPT_POSIX "posix" /* Get static pathconf for mount */ +#define MNTOPT_MAP "map" /* Automount map */ +#define MNTOPT_DIRECT "direct" /* Automount direct map mount */ +#define MNTOPT_INDIRECT "indirect" /* Automount indirect map mount */ +#define MNTOPT_LLOCK "llock" /* Local locking (no lock manager) */ +#define MNTOPT_IGNORE "ignore" /* Ignore this entry */ +#define MNTOPT_VERS "vers" /* protocol version number indicator */ +#define MNTOPT_PROTO "proto" /* protocol network_id indicator */ +#define MNTOPT_SEC "sec" /* Security flavor indicator */ +#define MNTOPT_SYNCDIR "syncdir" /* Synchronous local directory ops */ +#define MNTOPT_NOSETSEC "nosec" /* Do no allow setting sec attrs */ +#define MNTOPT_NOPRINT "noprint" /* Do not print messages */ +#define MNTOPT_LARGEFILES "largefiles" /* allow large files */ +#define MNTOPT_NOLARGEFILES "nolargefiles" /* don't allow large files */ +#define MNTOPT_FORCEDIRECTIO "forcedirectio" /* Force DirectIO on all files */ +#define MNTOPT_NOFORCEDIRECTIO "noforcedirectio" /* No Force DirectIO */ +#define MNTOPT_DISABLEDIRECTIO "disabledirectio" /* Disable DirectIO ioctls */ +#define MNTOPT_PUBLIC "public" /* Use NFS public file handlee */ +#define MNTOPT_LOGGING "logging" /* enable logging */ +#define MNTOPT_NOLOGGING "nologging" /* disable logging */ +#define MNTOPT_ATIME "atime" /* update atime for files */ +#define MNTOPT_NOATIME "noatime" /* do not update atime for files */ +#define MNTOPT_GLOBAL "global" /* Cluster-wide global mount */ +#define MNTOPT_NOGLOBAL "noglobal" /* Mount local to single node */ +#define MNTOPT_DFRATIME "dfratime" /* Deferred access time updates */ +#define MNTOPT_NODFRATIME "nodfratime" /* No Deferred access time updates */ +#define MNTOPT_NBMAND "nbmand" /* allow non-blocking mandatory locks */ +#define MNTOPT_NONBMAND "nonbmand" /* deny non-blocking mandatory locks */ +#define MNTOPT_XATTR "xattr" /* enable extended attributes */ +#define MNTOPT_NOXATTR "noxattr" /* disable extended attributes */ +#define MNTOPT_EXEC "exec" /* enable executables */ +#define MNTOPT_NOEXEC "noexec" /* disable executables */ +#define MNTOPT_RESTRICT "restrict" /* restricted autofs mount */ +#define MNTOPT_BROWSE "browse" /* browsable autofs mount */ +#define MNTOPT_NOBROWSE "nobrowse" /* non-browsable autofs mount */ +/* VFS will not ignore ownership information on filesystem objects */ +#define MNTOPT_OWNERS "owners" +/* VFS will ignore ownership information on filesystem objects */ +#define MNTOPT_NOOWNERS "noowners" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MNTENT_H */ diff --git a/lib/libspl/include/os/macos/poll.h b/lib/libspl/include/os/macos/poll.h new file mode 100644 index 0000000000..2bb5203d00 --- /dev/null +++ b/lib/libspl/include/os/macos/poll.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_POLL_H +#define _LIBSPL_POLL_H + +#include_next + +#ifndef O_DIRECT +#define O_DIRECT 0 +#endif + +#endif diff --git a/lib/libspl/include/os/macos/rpc/Makefile.am b/lib/libspl/include/os/macos/rpc/Makefile.am new file mode 100644 index 0000000000..645ec772f9 --- /dev/null +++ b/lib/libspl/include/os/macos/rpc/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/macos/rpc/xdr.h diff --git a/lib/libspl/include/os/macos/rpc/xdr.h b/lib/libspl/include/os/macos/rpc/xdr.h new file mode 100644 index 0000000000..9fc13fefaf --- /dev/null +++ b/lib/libspl/include/os/macos/rpc/xdr.h @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef LIBSPL_MACOS_RPC_XDR_H +#define LIBSPL_MACOS_RPC_XDR_H + +#include +#include_next + +#endif /* LIBSPL_MACOS_RPC_XDR_H */ diff --git a/lib/libspl/include/os/macos/stdlib.h b/lib/libspl/include/os/macos/stdlib.h new file mode 100644 index 0000000000..30a9d29f17 --- /dev/null +++ b/lib/libspl/include/os/macos/stdlib.h @@ -0,0 +1,28 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_STDLIB_H +#define _LIBSPL_STDLIB_H + +#include_next +#include + +#endif diff --git a/lib/libspl/include/os/macos/sys/Makefile.am b/lib/libspl/include/os/macos/sys/Makefile.am new file mode 100644 index 0000000000..fb063b875c --- /dev/null +++ b/lib/libspl/include/os/macos/sys/Makefile.am @@ -0,0 +1,17 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/macos/mach/boolean.h \ + $(top_srcdir)/lib/libspl/include/os/macos/rpc/xdr.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/byteorder.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/errno.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/file.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/kernel_types.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/mnttab.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/mount.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/param.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/stat.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/sysmacros.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/uio.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/vfs.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/xattr.h \ + $(top_srcdir)/lib/libspl/include/os/macos/sys/zfs_context_os.h diff --git a/lib/libspl/include/os/macos/sys/byteorder.h b/lib/libspl/include/os/macos/sys/byteorder.h new file mode 100644 index 0000000000..dd578e4493 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/byteorder.h @@ -0,0 +1,279 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +#include +#include <_types.h> + +/* + * Define the order of 32-bit words in 64-bit words. + */ +#define _QUAD_HIGHWORD 1 +#define _QUAD_LOWWORD 0 + +/* + * Definitions for byte order, according to byte significance from low + * address to high. 
+ */ +#undef _LITTLE_ENDIAN +/* LSB first: i386, vax */ +#define _LITTLE_ENDIAN 1234 +/* LSB first in word, MSW first in long */ +#define _PDP_ENDIAN 3412 + +#define _BYTE_ORDER _LITTLE_ENDIAN + +/* + * Deprecated variants that don't have enough underscores to be useful in more + * strict namespaces. + */ +#if __BSD_VISIBLE +#define LITTLE_ENDIAN _LITTLE_ENDIAN +#define PDP_ENDIAN _PDP_ENDIAN +#define BYTE_ORDER _BYTE_ORDER +#endif + +#define __bswap16_gen(x) (__uint16_t)((x) << 8 | (x) >> 8) +#define __bswap32_gen(x) \ + (((__uint32_t)__bswap16((x) & 0xffff) << 16) | __bswap16((x) >> 16)) +#define __bswap64_gen(x) \ + (((__uint64_t)__bswap32((x) & 0xffffffff) << 32) | __bswap32((x) >> 32)) + +#ifdef __GNUCLIKE_BUILTIN_CONSTANT_P +#define __bswap16(x) \ + ((__uint16_t)(__builtin_constant_p(x) ? \ + __bswap16_gen((__uint16_t)(x)) : __bswap16_var(x))) +#define __bswap32(x) \ + (__builtin_constant_p(x) ? \ + __bswap32_gen((__uint32_t)(x)) : __bswap32_var(x)) +#define __bswap64(x) \ + (__builtin_constant_p(x) ? \ + __bswap64_gen((__uint64_t)(x)) : __bswap64_var(x)) +#else +/* XXX these are broken for use in static initializers. */ +#define __bswap16(x) __bswap16_var(x) +#define __bswap32(x) __bswap32_var(x) +#define __bswap64(x) __bswap64_var(x) +#endif + +/* These are defined as functions to avoid multiple evaluation of x. */ + +static __inline __uint16_t +__bswap16_var(__uint16_t _x) +{ + + return (__bswap16_gen(_x)); +} + +static __inline __uint32_t +__bswap32_var(__uint32_t _x) +{ + +#ifdef __GNUCLIKE_ASM + __asm("bswap %0" : "+r" (_x)); + return (_x); +#else + return (__bswap32_gen(_x)); +#endif +} +#define __htonl(x) __bswap32(x) +#define __htons(x) __bswap16(x) +#define __ntohl(x) __bswap32(x) +#define __ntohs(x) __bswap16(x) + +#include +#include + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ + +#if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) +/* big-endian */ +#if defined(_BIG_ENDIAN) && (defined(__amd64__) || defined(__amd64)) +#error "incompatible ENDIAN / ARCH combination" +#endif +#define ntohl(x) (x) +#define ntohs(x) (x) +#define htonl(x) (x) +#define htons(x) (x) + +#elif !defined(ntohl) /* little-endian */ + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef uint16_t in_port_t; +#endif + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef uint32_t in_addr_t; +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) +extern uint32_t htonl(uint32_t); +extern uint16_t htons(uint16_t); +extern uint32_t ntohl(uint32_t); +extern uint16_t ntohs(uint16_t); +#else +extern in_addr_t htonl(in_addr_t); +extern in_port_t htons(in_port_t); +extern in_addr_t ntohl(in_addr_t); +extern in_port_t ntohs(in_port_t); +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */ +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _BIG_ENDIAN +#define BE_8(x) 
BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +/* + * Macros to read unaligned values from a specific byte order to + * native byte order + */ + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#define BE_IN64(xa) \ + (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) + +#define LE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define LE_IN16(xa) \ + (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) + +#define LE_IN32(xa) \ + (((uint32_t)LE_IN16((uint8_t *)(xa) + 2) << 16) | LE_IN16(xa)) + +#define LE_IN64(xa) \ + (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) + +/* + * Macros to write unaligned values from native byte order to a specific byte + * order. + */ + +#define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define BE_OUT16(xa, yv) \ + BE_OUT8((uint8_t *)(xa) + 1, yv); \ + BE_OUT8((uint8_t *)(xa), (yv) >> 8); + +#define BE_OUT32(xa, yv) \ + BE_OUT16((uint8_t *)(xa) + 2, yv); \ + BE_OUT16((uint8_t *)(xa), (yv) >> 16); + +#define BE_OUT64(xa, yv) \ + BE_OUT32((uint8_t *)(xa) + 4, yv); \ + BE_OUT32((uint8_t *)(xa), (yv) >> 32); + +#define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define LE_OUT16(xa, yv) \ + LE_OUT8((uint8_t *)(xa), yv); \ + LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); + +#define LE_OUT32(xa, yv) \ + LE_OUT16((uint8_t *)(xa), yv); \ + LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); + +#define LE_OUT64(xa, yv) \ + LE_OUT32((uint8_t *)(xa), yv); \ + LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/os/macos/sys/errno.h b/lib/libspl/include/os/macos/sys/errno.h new file mode 100644 index 0000000000..af4846ebb9 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/errno.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
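A minimal illustration of the unaligned big-endian helpers defined above, assuming the libspl include path; the example value was checked by hand:

/*
 * Hedged sketch: read and write a 32-bit big-endian length prefix with the
 * BE_IN32/BE_OUT32 macros above.
 */
#include <stdint.h>
#include <sys/byteorder.h>

static uint32_t
read_be32(const uint8_t *buf)
{
	/* {0x00, 0x00, 0x01, 0x00} yields 256 */
	return (BE_IN32(buf));
}

static void
write_be32(uint8_t *buf, uint32_t value)
{
	BE_OUT32(buf, value);	/* stores the most significant byte first */
}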
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include_next + +#define EBADE EBADMACHO +#define ECKSUM EBADE +#define EFRAGS EIDRM +#define EREMOTEIO ENOLINK +#define ENOTACTIVE ENOPOLICY +#define ECHRNG EMULTIHOP + +#define ERESTART (-1) /* restart syscall */ diff --git a/lib/libspl/include/os/macos/sys/fcntl.h b/lib/libspl/include/os/macos/sys/fcntl.h new file mode 100644 index 0000000000..f0b03be99b --- /dev/null +++ b/lib/libspl/include/os/macos/sys/fcntl.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _LIBSPL_SYS_FCNTL_H +#define _LIBSPL_SYS_FCNTL_H + +#include_next + +#define O_LARGEFILE 0 +#define O_RSYNC 0 + +#ifndef O_DIRECT +#define O_DIRECT 0 +#endif + +#endif diff --git a/lib/libspl/include/os/macos/sys/file.h b/lib/libspl/include/os/macos/sys/file.h new file mode 100644 index 0000000000..94a33cbb33 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/file.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LIBSPL_SYS_FILE_H +#define _LIBSPL_SYS_FILE_H + +#include_next + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FOFFMAX 0 +#define FSYNC O_SYNC +#define FDSYNC O_DSYNC +#define FRSYNC O_RSYNC +#define FEXCL O_EXCL + +#define IO_DIRECT 0 + +#define FNODSYNC 0x10000 /* fsync pseudo flag */ +#define FNOFOLLOW 0x20000 /* don't follow symlinks */ +#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ + +#endif diff --git a/lib/libspl/include/os/macos/sys/kernel_types.h b/lib/libspl/include/os/macos/sys/kernel_types.h new file mode 100644 index 0000000000..5796351a20 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/kernel_types.h @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef LIBSPL_SYS_KERNEL_TYPES_H +#define LIBSPL_SYS_KERNEL_TYPES_H + +/* + * Unfortunately, XNU defines uio_t, proc_t and vnode_t differently to + * ZFS, so we need to hack around it. + */ + +#undef vnode_t +#undef uio_t +#define proc_t kernel_proc_t +#include_next +#define vnode_t struct vnode +#define uio_t struct uio +#undef proc_t + + +/* Other missing Linux types */ +typedef off_t loff_t; + +#endif diff --git a/lib/libspl/include/os/macos/sys/mnttab.h b/lib/libspl/include/os/macos/sys/mnttab.h new file mode 100644 index 0000000000..9ba5d08b21 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/mnttab.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +/* Copyright 2006 Ricardo Correia */ + +#ifndef _SYS_MNTTAB_H +#define _SYS_MNTTAB_H + +#include +#include +#include +#include + +#ifdef MNTTAB +#undef MNTTAB +#endif /* MNTTAB */ + +#include +#include +#define MNTTAB _PATH_DEVNULL +#define MS_NOMNTTAB 0x0 +#define MS_RDONLY 0x1 +#define umount2(p, f) unmount(p, f) +#define MNT_LINE_MAX 4096 + +#define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ +#define MNT_TOOMANY 2 /* too many fields in line */ +#define MNT_TOOFEW 3 /* too few fields in line */ + +struct mnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; + uint_t mnt_major; + uint_t mnt_minor; + uint32_t mnt_fssubtype; +}; + +#define extmnttab mnttab + +struct stat64; +struct statfs; + +extern DIR *fdopendir(int fd); +extern int openat64(int, const char *, int, ...); + +extern int getmntany(FILE *fd, struct mnttab *mgetp, struct mnttab *mrefp); +extern int getmntent(FILE *fp, struct mnttab *mp); +extern char *hasmntopt(struct mnttab *mnt, char *opt); +extern int getextmntent(const char *path, struct extmnttab *entry, + struct stat64 *statbuf); + +extern void statfs2mnttab(struct statfs *sfs, struct mnttab *mp); + +#ifndef AT_SYMLINK_NOFOLLOW +#define AT_SYMLINK_NOFOLLOW 0x100 +#endif + +extern int fstatat64(int, const char *, struct stat *, int); + +#endif diff --git a/lib/libspl/include/os/macos/sys/mount.h b/lib/libspl/include/os/macos/sys/mount.h new file mode 100644 index 0000000000..970d8affcc --- /dev/null +++ b/lib/libspl/include/os/macos/sys/mount.h @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _LIBSPL_SYS_MOUNT_H +#define _LIBSPL_SYS_MOUNT_H + +#undef _SYS_MOUNT_H_ + +#include +#include +#include +#include +#include + +/* Unfortunately, XNU has a different meaning for "vnode_t". */ +#undef vnode_t +#include_next +#define vnode_t struct vnode + +/* + * Some old glibc headers don't define BLKGETSIZE64 + * and we don't want to require the kernel headers + */ +#if !defined(BLKGETSIZE64) +#define BLKGETSIZE64 _IOR(0x12, 114, size_t) +#endif + +/* + * Some old glibc headers don't correctly define MS_DIRSYNC and + * instead use the enum name S_WRITE. When using these older + * headers define MS_DIRSYNC to be S_WRITE. + */ +#if !defined(MS_DIRSYNC) +#define MS_DIRSYNC S_WRITE +#endif + +/* + * Some old glibc headers don't correctly define MS_POSIXACL and + * instead leave it undefined. When using these older headers define + * MS_POSIXACL to the reserved value of (1<<16). 
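The mnttab emulation declared above keeps the Solaris calling convention, so portable callers can iterate mounts the usual way. A minimal sketch, assuming the shimmed getmntent() returns 0 per entry and non-zero at the end, and that MNTTAB (here /dev/null) is only a token handle; the function is illustrative:

/*
 * Hedged sketch: list every ZFS mount carrying the "noowners" option using
 * the getmntent()/hasmntopt() shims declared above.
 */
#include <stdio.h>
#include <string.h>
#include <mntent.h>
#include <sys/mnttab.h>

static void
list_noowners_zfs(void)
{
	FILE *fp = fopen(MNTTAB, "r");
	struct mnttab entry;

	if (fp == NULL)
		return;
	while (getmntent(fp, &entry) == 0) {
		if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) == 0 &&
		    hasmntopt(&entry, MNTOPT_NOOWNERS) != NULL)
			printf("%s\n", entry.mnt_mountp);
	}
	(void) fclose(fp);
}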
+ */ +#if !defined(MS_POSIXACL) +#define MS_POSIXACL (1<<16) +#endif + +#define MS_NOSUID MNT_NOSUID +#define MS_NOEXEC MNT_NOEXEC +#define MS_NODEV 0 +#define S_WRITE 0 +#define MS_BIND 0 +#define MS_REMOUNT 0 +#define MS_SYNCHRONOUS MNT_SYNCHRONOUS + +#define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) +#define MS_OWNER (MS_NOSUID|MS_NODEV) +#define MS_GROUP (MS_NOSUID|MS_NODEV) +#define MS_COMMENT 0 + +/* + * Older glibc headers did not define all the available + * umount2(2) flags. Both MNT_FORCE and MNT_DETACH are supported in the + * kernel back to 2.4.11 so we define them correctly if they are missing. + */ +#ifdef MNT_FORCE +#define MS_FORCE MNT_FORCE +#else +#define MS_FORCE 0x00000001 +#endif /* MNT_FORCE */ + +#ifdef MNT_DETACH +#define MS_DETACH MNT_DETACH +#else +#define MS_DETACH 0x00000002 +#endif /* MNT_DETACH */ + +/* + * Overlay mount is default in Linux, but for solaris/zfs + * compatibility, MS_OVERLAY is defined to explicitly have the user + * provide a flag (-O) to mount over a non empty directory. + */ +#define MS_OVERLAY 0x00000004 + +/* + * MS_CRYPT indicates that encryption keys should be loaded if they are not + * already available. This is not defined in glibc, but it is never seen by + * the kernel so it will not cause any problems. + */ +#define MS_CRYPT 0x00000008 + +#endif /* _LIBSPL_SYS_MOUNT_H */ diff --git a/lib/libspl/include/os/macos/sys/param.h b/lib/libspl/include/os/macos/sys/param.h new file mode 100644 index 0000000000..a9815f10b5 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/param.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_PARAM_H +#define _LIBSPL_SYS_PARAM_H + +#include_next +#include + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, + * with smaller units (fragments) only in the last direct block. + * MAXBSIZE primarily determines the size of buffers in the buffer + * pool. It may be made larger without any effect on existing + * file systems; however making it smaller may make some file + * systems unmountable. + * + * Note that the blocked devices are assumed to have DEV_BSIZE + * "sectors" and that fragments must be some multiple of this size. 
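Given the flag mapping above (MS_FORCE to MNT_FORCE) and the umount2() alias from the mnttab.h shim, a forced unmount can be spelled the same way as on the other platforms. A minimal sketch, assuming the libspl headers above are on the include path; the mountpoint is illustrative:

/*
 * Hedged sketch: forced unmount through the shims above; umount2() expands
 * to unmount(2) and MS_FORCE maps to MNT_FORCE on macOS.
 */
#include <sys/mnttab.h>
#include <sys/mount.h>

static int
force_unmount(const char *mountpoint)
{
	return (umount2(mountpoint, MS_FORCE));
}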
+ */ +#define MAXNAMELEN 256 + +#define UID_NOBODY 60001 /* user ID no body */ +#define GID_NOBODY UID_NOBODY +#define UID_NOACCESS 60002 /* user ID no access */ + +#define MAXUID UINT32_MAX /* max user id */ +#define MAXPROJID MAXUID /* max project id */ + +#ifndef PAGESIZE +#define PAGESIZE (sysconf(_SC_PAGESIZE)) +#endif /* PAGESIZE */ + +extern int execvpe(const char *name, char * const argv[], char * const envp[]); + +struct zfs_handle; + +#endif diff --git a/lib/libspl/include/os/macos/sys/stat.h b/lib/libspl/include/os/macos/sys/stat.h new file mode 100644 index 0000000000..1c7858194d --- /dev/null +++ b/lib/libspl/include/os/macos/sys/stat.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBSPL_SYS_STAT_H +#define _LIBSPL_SYS_STAT_H + +#include_next + +#include +#include /* for BLKGETSIZE64 */ + +#define MAXOFFSET_T OFF_MAX + +#ifndef _KERNEL +#include +#endif + +static inline int +fstat_blk(int fd, struct stat *st) +{ + if (fstat(fd, st) == -1) + return (-1); + + /* In OS X we need to use ioctl to get the size of a block dev */ + if (st->st_mode & (S_IFBLK | S_IFCHR)) { + uint32_t blksize; + uint64_t blkcnt; + + if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) < 0) { + return (-1); + } + if (ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) < 0) { + return (-1); + } + + st->st_size = (off_t)((uint64_t)blksize * blkcnt); + } + + return (0); +} + + +/* + * Deal with Linux use of 64 for everything. + * OsX has moved past it, dropped all 32 versions, and + * standard form is 64 bit. + */ + +#define stat64 stat +#define lstat64 lstat +#define fstat64 fstat +#define fstat64_blk fstat_blk +#define statfs64 statfs + +#endif /* _LIBSPL_SYS_STAT_H */ diff --git a/lib/libspl/include/os/macos/sys/sysmacros.h b/lib/libspl/include/os/macos/sys/sysmacros.h new file mode 100644 index 0000000000..7480eb85a5 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/sysmacros.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
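The fstat_blk() helper earlier in this hunk falls back to DKIOCGETBLOCKSIZE/DKIOCGETBLOCKCOUNT so that st_size is meaningful for block devices. A minimal usage sketch through the fstat64_blk alias, assuming the libspl include path; the device path passed in would be a placeholder such as /dev/diskN:

/*
 * Hedged sketch: size of a block device via the fstat64_blk alias above.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

static long long
device_size(const char *dev)
{
	struct stat st;
	int fd = open(dev, O_RDONLY);

	if (fd == -1)
		return (-1);
	if (fstat64_blk(fd, &st) != 0) {
		(void) close(fd);
		return (-1);
	}
	(void) close(fd);
	return ((long long)st.st_size);
}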
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_SYSMACROS_H +#define _LIBSPL_SYS_SYSMACROS_H + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define makedevice(maj, min) makedev(maj, min) +#define _sysconf(a) sysconf(a) + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + + +/* avoid any possibility of clashing with version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +#endif /* _LIBSPL_SYS_SYSMACROS_H */ diff --git a/lib/libspl/include/os/macos/sys/uio.h b/lib/libspl/include/os/macos/sys/uio.h new file mode 100644 index 0000000000..f646b3e5d5 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/uio.h @@ -0,0 +1,175 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
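The P2* helpers above only behave for power-of-two alignments; a few worked values, checked by hand, make the semantics concrete:

/*
 * Hedged illustration of the power-of-two alignment macros above.
 */
#include <assert.h>
#include <stdint.h>
#include <sys/sysmacros.h>

static void
p2_examples(void)
{
	assert(ISP2(4096));
	assert(P2ALIGN(5000, 4096) == 4096);	/* round down to boundary */
	assert(P2ROUNDUP(5000, 4096) == 8192);	/* round up to boundary */
	assert(P2PHASE(5000, 4096) == 904);	/* offset within the block */
	assert(IS_P2ALIGNED(8192, 4096));
}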
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _LIBSPL_SYS_UIO_H +#define _LIBSPL_SYS_UIO_H + +#include +#include_next + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* + * I/O parameter information. A uio structure describes the I/O which + * is to be performed by an operation. Typically the data movement will + * be performed by a routine such as uiomove(), which updates the uio + * structure to reflect what was done. + */ + +typedef struct iovec iovec_t; + + +/* + * I/O direction. + */ +// typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t; + +/* + * Segment flag values. + */ +typedef enum uio_seg { UIO_USERSPACE, UIO_SYSSPACE, UIO_USERISPACE } uio_seg_t; + + +struct uio { + struct iovec *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + off_t uio_offset; /* file offset */ + uio_seg_t uio_segflg; /* address space (kernel or user) */ + off_t uio_limit; /* u-limit (maximum byte offset) */ + ssize_t uio_resid; /* residual count */ + enum uio_rw uio_rw; + int uio_max_iovs; /* max iovecs this uio_t can hold */ + uint32_t uio_index; /* Current index */ +}; + + +uio_t *uio_create(int iovcount, off_t offset, int spacetype, int iodirection); +void uio_free(uio_t *uio); +int uio_addiov(uio_t *uio, user_addr_t baseaddr, user_size_t length); +int uio_isuserspace(uio_t *uio); +int uio_getiov(uio_t *uio, int index, user_addr_t *baseaddr, + user_size_t *length); +int uio_iovcnt(uio_t *uio); +off_t uio_offset(uio_t *uio); +void uio_update(uio_t *uio, user_size_t count); +uint64_t uio_resid(uio_t *uio); +user_addr_t uio_curriovbase(uio_t *uio); +user_size_t uio_curriovlen(uio_t *uio); +void uio_setoffset(uio_t *uio, off_t a_offset); +uio_t *uio_duplicate(uio_t *uio); +int uio_rw(uio_t *a_uio); +void uio_setrw(uio_t *a_uio, int a_value); + +int uiomove(void *, uint32_t, enum uio_rw, struct uio *); +int spllib_uiomove(const uint8_t *, uint32_t, struct uio *); +void uioskip(struct uio *, uint32_t); +int uiodup(struct uio *, struct uio *, iovec_t *, int); + +// xuio struct is not used in this platform, but we define it +// to allow compilation and easier patching +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t *xu_uio; + enum xuio_type xu_type; + union { + struct { + 
uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +static inline int uiocopy(const unsigned char *p, uint32_t n, + enum uio_rw rw, struct uio *uio, uint64_t *cbytes) +{ + int result; + struct uio *nuio = uio_duplicate(uio); + unsigned long long x = uio_resid(uio); + if (!nuio) + return (ENOMEM); + uio_setrw(nuio, rw); + result = spllib_uiomove(p, n, nuio); + *cbytes = (x - uio_resid(nuio)); + uio_free(nuio); + return (result); +} + +// Apple's uiomove puts the uio_rw in uio_create +#define uiomove(A, B, C, D) spllib_uiomove((A), (B), (D)) +#define uioskip(A, B) uio_update((A), (B)) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UIO_H */ diff --git a/lib/libspl/include/os/macos/sys/vfs.h b/lib/libspl/include/os/macos/sys/vfs.h new file mode 100644 index 0000000000..a2ffcc08d8 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/vfs.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef ZFS_SYS_VFS_H_ +#define ZFS_SYS_VFS_H_ + +#endif /* !ZFS_SYS_VFS_H_ */ diff --git a/lib/libspl/include/os/macos/sys/xattr.h b/lib/libspl/include/os/macos/sys/xattr.h new file mode 100644 index 0000000000..045f681b1e --- /dev/null +++ b/lib/libspl/include/os/macos/sys/xattr.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
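[Illustrative note, not part of the patch.] Because Apple's uio carries the transfer direction inside the uio itself (set at uio_create() time), the uiomove()/uioskip() wrappers above drop or remap arguments rather than change behaviour. A hedged sketch of the intended call pattern, assuming the declarations in this header are in scope (the function name is hypothetical):

/*
 * Sketch only: uiocopy() works on a duplicate, so the caller's uio is only
 * advanced by the explicit uioskip() below (which maps to uio_update()).
 */
static int
copy_then_consume(const unsigned char *buf, uint32_t len,
    enum uio_rw rw, struct uio *uio)
{
	uint64_t copied = 0;
	int err;

	err = uiocopy(buf, len, rw, uio, &copied);
	if (err == 0)
		uioskip(uio, (uint32_t)copied);
	return (err);
}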
+ */ + +#ifndef _LIBSPL_SYS_XATTR_H +#define _LIBSPL_SYS_XATTR_H + +#include_next + +/* macOS has one more argument */ +#define setxattr(A, B, C, D, E) setxattr(A, B, C, D, E, 0) +#define getxattr(A, B, C, D, E) getxattr(A, B, C, D, E, 0) + +#endif diff --git a/lib/libspl/include/os/macos/sys/zfs_context_os.h b/lib/libspl/include/os/macos/sys/zfs_context_os.h new file mode 100644 index 0000000000..3324a1cf25 --- /dev/null +++ b/lib/libspl/include/os/macos/sys/zfs_context_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef ZFS_CONTEXT_OS_H_ +#define ZFS_CONTEXT_OS_H_ + +#include + +#define ZFS_EXPORTS_PATH "/etc/exports" +#define MNTTYPE_ZFS_SUBTYPE ('Z'<<24|'F'<<16|'S'<<8) + +struct spa_iokit; +typedef struct spa_iokit spa_iokit_t; + +typedef off_t loff_t; + +struct zfs_handle; + +extern void zfs_rollback_os(struct zfs_handle *zhp); +extern void libzfs_macos_wrapfd(int *srcfd, boolean_t send); + +#endif diff --git a/lib/libspl/include/os/macos/time.h b/lib/libspl/include/os/macos/time.h new file mode 100644 index 0000000000..1f7068fcbf --- /dev/null +++ b/lib/libspl/include/os/macos/time.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_TIME_H +#define _LIBSPL_TIME_H + +#include_next + +/* Linux also has a timer_create() API we need to emulate. 
*/ + +/* + * OsX version can probably be implemented by using: + * dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, queue); + * dispatch_source_set_event_handler(timer1, ^{vector1(timer1);}); + * dispatch_source_set_cancel_handler(timer1 + * dispatch_time_t start = dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC); + * dispatch_source_set_timer(timer1, start, NSEC_PER_SEC / 5, 0); + */ + +typedef void *timer_t; + +struct itimerspec { + struct timespec it_interval; /* timer period */ + struct timespec it_value; /* timer expiration */ +}; + +static inline int +timer_create(clockid_t clockid, struct sigevent *sevp, + timer_t *timerid) +{ + return (0); +} + +static inline int +timer_settime(timer_t id, int flags, + const struct itimerspec *its, struct itimerspec *remainvalue) +{ + return (0); +} + +static inline int +timer_delete(timer_t id) +{ + return (0); +} + +#endif diff --git a/lib/libspl/include/os/macos/unistd.h b/lib/libspl/include/os/macos/unistd.h new file mode 100644 index 0000000000..4a2814b9ec --- /dev/null +++ b/lib/libspl/include/os/macos/unistd.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#ifndef _LIBSPL_UNISTD_H +#define _LIBSPL_UNISTD_H + +#include_next +#include + +/* Handle Linux use of 64 names */ + +#define open64 open +#define pread64 pread +#define pwrite64 pwrite +#define ftruncate64 ftruncate +#define lseek64 lseek + + +static inline int +fdatasync(int fd) +{ + if (fcntl(fd, F_FULLFSYNC) == -1) + return (-1); + return (0); +} + +#endif diff --git a/lib/libspl/os/macos/getexecname.c b/lib/libspl/os/macos/getexecname.c new file mode 100644 index 0000000000..7e37958869 --- /dev/null +++ b/lib/libspl/os/macos/getexecname.c @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
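[Illustrative note, not part of the patch.] macOS has no native fdatasync(2); the shim above issues fcntl(F_FULLFSYNC), which asks the device to flush its cache to stable storage and is therefore stronger (and slower) than Linux fdatasync. A minimal hedged sketch of the shim in use (the path and function name are made up):

/* Sketch only; relies on the fdatasync() shim defined above. */
#include <fcntl.h>
#include <unistd.h>

static int
write_durably(const char *path, const void *buf, size_t len)
{
	int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return (-1);
	if (write(fd, buf, len) != (ssize_t)len ||
	    fdatasync(fd) != 0) {		/* fcntl(fd, F_FULLFSYNC) underneath */
		(void) close(fd);
		return (-1);
	}
	return (close(fd));
}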
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +const char * +getexecname(void) +{ + return (getprogname()); +} diff --git a/lib/libspl/os/macos/gethostid.c b/lib/libspl/os/macos/gethostid.c new file mode 100644 index 0000000000..bedea579ed --- /dev/null +++ b/lib/libspl/os/macos/gethostid.c @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, Jorgen Lundman + */ + +#include +#include +#include + +unsigned long +get_system_hostid(void) +{ + size_t len; + uint32_t myhostid = 0; + len = sizeof (myhostid); + sysctlbyname("kern.hostid", &myhostid, &len, NULL, 0); + return (myhostid); +} diff --git a/lib/libspl/os/macos/getmntany.c b/lib/libspl/os/macos/getmntany.c new file mode 100644 index 0000000000..f3fec9654e --- /dev/null +++ b/lib/libspl/os/macos/getmntany.c @@ -0,0 +1,462 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Ricardo Correia. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#include +#include +#include +#include /* for isspace() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIFF(xx) ((mrefp->xx != NULL) && \ + (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) + +static struct statfs *gsfs = NULL; +static int allfs = 0; +/* + * We will also query the extended filesystem capabilities API, to lookup + * other mount options, for example, XATTR. We can not use the MNTNOUSERXATTR + * option due to VFS rejecting with EACCESS. 
+ */ + +#include +typedef struct attrlist attrlist_t; + +struct attrBufS { + u_int32_t length; + vol_capabilities_set_t caps; +} __attribute__((aligned(4), packed)); + + +DIR * +fdopendir(int fd) +{ + char fullpath[MAXPATHLEN]; + + if (fcntl(fd, F_GETPATH, fullpath) < 0) { + perror("fcntl"); + return (NULL); + } + if (close(fd) < 0) { + return (NULL); + } + + return (opendir(fullpath)); +} + +static int +chdir_block_begin(int newroot_fd) +{ + int cwdfd, error; + + cwdfd = open(".", O_RDONLY | O_DIRECTORY); + if (cwdfd == -1) + return (-1); + + if (fchdir(newroot_fd) == -1) { + error = errno; + (void) close(cwdfd); + errno = error; + return (-1); + } + return (cwdfd); +} + +static void +chdir_block_end(int cwdfd) +{ + int error = errno; + (void) fchdir(cwdfd); + (void) close(cwdfd); + errno = error; +} + +int +openat64(int dirfd, const char *path, int flags, ...) +{ + int cwdfd, filefd; + + if ((cwdfd = chdir_block_begin(dirfd)) == -1) + return (-1); + + if ((flags & O_CREAT) != 0) { + va_list ap; + int mode; + + va_start(ap, flags); + mode = va_arg(ap, int); + va_end(ap); + + filefd = open(path, flags, mode); + } else + filefd = open(path, flags); + + chdir_block_end(cwdfd); + return (filefd); +} + +int +fstatat64(int dirfd, const char *path, struct stat *statbuf, int flag) +{ + int cwdfd, error; + + if ((cwdfd = chdir_block_begin(dirfd)) == -1) + return (-1); + + if (flag == AT_SYMLINK_NOFOLLOW) + error = lstat(path, statbuf); + else + error = stat(path, statbuf); + + chdir_block_end(cwdfd); + return (error); +} + + +static char * +mntopt(char **p) +{ + char *cp = *p; + char *retstr; + + while (*cp && isspace(*cp)) + cp++; + + retstr = cp; + while (*cp && *cp != ',') + cp++; + + if (*cp) { + *cp = '\0'; + cp++; + } + + *p = cp; + return (retstr); +} + +char * +hasmntopt(struct mnttab *mnt, char *opt) +{ + char tmpopts[256]; + char *f, *opts = tmpopts; + + if (mnt->mnt_mntopts == NULL) + return (NULL); + (void) strlcpy(opts, mnt->mnt_mntopts, 256); + f = mntopt(&opts); + for (; *f; f = mntopt(&opts)) { + if (strncmp(opt, f, strlen(opt)) == 0) + return (f - tmpopts + mnt->mnt_mntopts); + } + return (NULL); +} + +static void +optadd(char *mntopts, size_t size, const char *opt) +{ + + if (mntopts[0] != '\0') + strlcat(mntopts, ",", size); + strlcat(mntopts, opt, size); +} + + +#include +#include +#include +#include +#include + + +char * +MYCFStringCopyUTF8String(CFStringRef aString) +{ + if (aString == NULL) + return (NULL); + + CFIndex length = CFStringGetLength(aString); + CFIndex maxSize = + CFStringGetMaximumSizeForEncoding(length, + kCFStringEncodingUTF8); + char *buffer = (char *)malloc(maxSize); + if (CFStringGetCString(aString, buffer, maxSize, + kCFStringEncodingUTF8)) { + return (buffer); + } + return (NULL); +} + +/* + * Given "/dev/disk6" connect to IOkit and fetch the dataset + * name "BOOM/lower", and use it instead. + */ +void +expand_disk_to_zfs(char *devname, int len) +{ + char *result = NULL; + CFMutableDictionaryRef matchingDict; + io_service_t service; + CFStringRef cfstr; + char *device; + + if (strncmp(devname, "/dev/disk", 9) != 0) + return; + + device = &devname[5]; + + matchingDict = IOBSDNameMatching(kIOMasterPortDefault, 0, device); + if (NULL == matchingDict) + return; + + /* + * Fetch the object with the matching BSD node name. + * Note that there should only be one match, so + * IOServiceGetMatchingService is used instead of + * IOServiceGetMatchingServices to simplify the code. 
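[Illustrative note, not part of the patch.] The openat64()/fstatat64() emulation above works by briefly fchdir()ing into the directory referred to by dirfd, doing the plain path-relative call, then restoring the previous working directory; the cwd change is process-wide, so it is not safe against concurrent threads. A hedged sketch of the intended call pattern (paths are made up):

/* Sketch only: resolve a child path via the shims above. */
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

extern int openat64(int, const char *, int, ...);
extern int fstatat64(int, const char *, struct stat *, int);

static int
open_relative_example(void)
{
	struct stat st;
	int dirfd, fd;

	dirfd = open("/Volumes/tank", O_RDONLY | O_DIRECTORY);
	if (dirfd < 0)
		return (-1);

	/* Both shims briefly fchdir() into dirfd. */
	if (fstatat64(dirfd, "some/file", &st, 0) != 0 ||
	    (fd = openat64(dirfd, "some/file", O_RDONLY)) < 0)
		fd = -1;

	(void) close(dirfd);
	return (fd);
}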
+ */ + service = IOServiceGetMatchingService(kIOMasterPortDefault, + matchingDict); + + if (IO_OBJECT_NULL == service) { + return; + } + + cfstr = IORegistryEntryCreateCFProperty(service, + CFSTR("ZFS Dataset"), kCFAllocatorDefault, 0); + if (cfstr) { + result = MYCFStringCopyUTF8String(cfstr); + CFRelease(cfstr); + } + + IOObjectRelease(service); + + if (result) { + strlcpy(devname, result, len); + free(result); + } +} + +void +statfs2mnttab(struct statfs *sfs, struct mnttab *mp) +{ + static char mntopts[MNTMAXSTR]; + long flags; + + mntopts[0] = '\0'; + + flags = sfs->f_flags; +#define OPTADD(opt) optadd(mntopts, sizeof (mntopts), (opt)) + if (flags & MNT_RDONLY) + OPTADD(MNTOPT_RO); + else + OPTADD(MNTOPT_RW); + if (flags & MNT_NOSUID) +#ifdef __FreeBSD__ + OPTADD(MNTOPT_NOSUID); +#elif defined(__APPLE__) + OPTADD(MNTOPT_NOSETUID); +#endif + else + OPTADD(MNTOPT_SETUID); + if (flags & MNT_UPDATE) + OPTADD(MNTOPT_REMOUNT); + if (flags & MNT_NOATIME) + OPTADD(MNTOPT_NOATIME); + else + OPTADD(MNTOPT_ATIME); + { + struct attrBufS attrBuf; + attrlist_t attrList; + + memset(&attrList, 0, sizeof (attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.volattr = ATTR_VOL_INFO|ATTR_VOL_CAPABILITIES; + + if (getattrlist(sfs->f_mntonname, &attrList, &attrBuf, + sizeof (attrBuf), 0) == 0) { + + if (attrBuf.caps[VOL_CAPABILITIES_INTERFACES] & + VOL_CAP_INT_EXTENDED_ATTR) { + OPTADD(MNTOPT_XATTR); + } else { + OPTADD(MNTOPT_NOXATTR); + } // If EXTENDED + } // if getattrlist + } + if (flags & MNT_NOEXEC) + OPTADD(MNTOPT_NOEXEC); + else + OPTADD(MNTOPT_EXEC); + if (flags & MNT_NODEV) + OPTADD(MNTOPT_NODEVICES); + else + OPTADD(MNTOPT_DEVICES); + if (flags & MNT_DONTBROWSE) + OPTADD(MNTOPT_NOBROWSE); + else + OPTADD(MNTOPT_BROWSE); + if (flags & MNT_IGNORE_OWNERSHIP) + OPTADD(MNTOPT_NOOWNERS); + else + OPTADD(MNTOPT_OWNERS); + +#undef OPTADD + + // If a disk is /dev/diskX, lets see if it has "zfs_dataset_name" + // set, and if so, use it instead, for mount matching. 
+ expand_disk_to_zfs(sfs->f_mntfromname, sizeof (sfs->f_mntfromname)); + + mp->mnt_special = sfs->f_mntfromname; + mp->mnt_mountp = sfs->f_mntonname; + mp->mnt_fstype = sfs->f_fstypename; + mp->mnt_mntopts = mntopts; + mp->mnt_fssubtype = sfs->f_fssubtype; + +} + +static int +statfs_init(void) +{ + struct statfs *sfs; + int error; + + if (gsfs != NULL) { + free(gsfs); + gsfs = NULL; + } + allfs = getfsstat(NULL, 0, MNT_NOWAIT); + if (allfs == -1) + goto fail; + gsfs = malloc(sizeof (gsfs[0]) * allfs * 2); + if (gsfs == NULL) + goto fail; + allfs = getfsstat(gsfs, (long)(sizeof (gsfs[0]) * allfs * 2), + MNT_NOWAIT); + if (allfs == -1) + goto fail; + sfs = realloc(gsfs, allfs * sizeof (gsfs[0])); + if (sfs != NULL) + gsfs = sfs; + return (0); +fail: + error = errno; + if (gsfs != NULL) + free(gsfs); + gsfs = NULL; + allfs = 0; + return (error); +} + +int +getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) +{ + int i, error; + + error = statfs_init(); + if (error != 0) + return (error); + + for (i = 0; i < allfs; i++) { + statfs2mnttab(&gsfs[i], mgetp); + if (mrefp->mnt_special != NULL && mgetp->mnt_special != NULL && + strcmp(mrefp->mnt_special, mgetp->mnt_special) != 0) { + continue; + } + if (mrefp->mnt_mountp != NULL && mgetp->mnt_mountp != NULL && + strcmp(mrefp->mnt_mountp, mgetp->mnt_mountp) != 0) { + continue; + } + if (mrefp->mnt_fstype != NULL && mgetp->mnt_fstype != NULL && + strcmp(mrefp->mnt_fstype, mgetp->mnt_fstype) != 0) { + continue; + } + return (0); + } + return (-1); +} + +int +getmntent(FILE *fp, struct mnttab *mp) +{ + static int index = -1; + int error = 0; + + if (index < 0) { + error = statfs_init(); + } + + if (error != 0) + return (error); + + index++; + + // If we have finished "reading" the mnttab, reset it to + // start from the beginning, and return EOF. + if (index >= allfs) { + index = -1; + return (-1); + } + + statfs2mnttab(&gsfs[index], mp); + return (0); +} + +int +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) +{ + struct statfs sfs; + + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); + return (-1); + } + + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } + + if (statfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + return (-1); + } + statfs2mnttab(&sfs, (struct mnttab *)entry); + return (0); +} diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 8cbbae7445..873d6b9127 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -2718,15 +2718,19 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, #ifdef __APPLE__ /* - * On OSX by default we mount pools under /Volumes unless - * the dataset property mountpoint specifies otherwise. - * In addition to this, there is an undocumented environment - * variable __ZFS_MAIN_MOUNTPOINT_DIR, used mainly by the - * testing environment, as it expects "/" by default. + * On OSX by default we mount pools under /Volumes + * unless the dataset property mountpoint specifies + * otherwise. + * In addition to this, there is an undocumented + * environment variable __ZFS_MAIN_MOUNTPOINT_DIR, + * used mainly by the testing environment, as it + * expects "/" by default. 
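[Illustrative note, not part of the patch.] Concretely: with the built-in default of "/Volumes/", a dataset tank/data ends up at /Volumes/tank/data, while the test suite sets __ZFS_MAIN_MOUNTPOINT_DIR=/ so the same dataset resolves to /tank/data. A hedged sketch of just the fallback logic in isolation:

/* Sketch only: mirrors the environment-variable fallback above. */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *root = getenv("__ZFS_MAIN_MOUNTPOINT_DIR");

	if (root == NULL)
		root = "/Volumes/";

	/* e.g. "/Volumes/" + "tank/data" -> "/Volumes/tank/data" */
	(void) printf("%s%s\n", root, "tank/data");
	return (0);
}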
*/ char *default_mountpoint; - default_mountpoint = getenv("__ZFS_MAIN_MOUNTPOINT_DIR"); - if (!default_mountpoint) default_mountpoint = "/Volumes/"; + default_mountpoint = + getenv("__ZFS_MAIN_MOUNTPOINT_DIR"); + if (!default_mountpoint) + default_mountpoint = "/Volumes/"; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", @@ -2734,8 +2738,8 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, source == NULL || - source[0] == '\0' ? default_mountpoint : "/", - relpath); + source[0] == '\0' ? default_mountpoint : + "/", relpath); #else diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index f1524c384f..b0b61521a4 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1088,6 +1088,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) } if (!sdd->dryrun) { + +#if defined(__APPLE__) + /* Can't do IO on pipes, possibly wrap fd in domain socket */ + libzfs_macos_wrapfd(&sdd->outfd, B_TRUE); +#endif + /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. @@ -2473,6 +2479,11 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, if (flags->dryrun) return (0); +#if defined(__APPLE__) + /* Can't do IO on pipes, possibly wrap fd in domain socket */ + libzfs_macos_wrapfd(&fd, B_TRUE); +#endif + /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. @@ -2563,6 +2574,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, return (zfs_standard_error(hdl, errno, errbuf)); } } + return (err != 0); } @@ -4629,6 +4641,11 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } +#if defined(__APPLE__) + /* Can't do IO on pipes, possibly wrap fd in domain socket */ + libzfs_macos_wrapfd(&infd, B_FALSE); +#endif + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, diff --git a/lib/libzfs/os/macos/libzfs_mount_os.c b/lib/libzfs/os/macos/libzfs_mount_os.c new file mode 100644 index 0000000000..f5401a3b1f --- /dev/null +++ b/lib/libzfs/os/macos/libzfs_mount_os.c @@ -0,0 +1,713 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright 2017 RackTop Systems. 
+ * Copyright (c) 2018 Datto Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" +#include +#include + + +/* + * The default OpenZFS icon. Compare against known values to see if it needs + * updating. Allowing users to set own. + * No file: copy icon + * correct size: do nothing + * other size: user custom icon, do nothing + */ + +/* icon name on root of a mount */ +#define MOUNT_POINT_CUSTOM_ICON ".VolumeIcon.icns" + +/* source icon name from inside zfs.kext bundle */ +#define CUSTOM_ICON_PATH \ + KERNEL_MODPREFIX "/zfs.kext/Contents/Resources/VolumeIcon.icns" + +#include + + +/* + * On OSX we can set the icon to an Open ZFS specific one, just to be extra + * shiny + */ +static void +zfs_mount_seticon(const char *mountpoint) +{ + /* For a root file system, add a volume icon. */ + ssize_t attrsize; + uint16_t finderinfo[16]; + struct stat sbuf; + char *path = NULL; + FILE *dstfp = NULL, *srcfp = NULL; + unsigned char buf[1024]; + unsigned int red; + + if (asprintf(&path, "%s/%s", mountpoint, MOUNT_POINT_CUSTOM_ICON) == -1) + return; + + /* If we can stat it, and it has a size, leave it be. */ + if ((stat(path, &sbuf) == 0 && sbuf.st_size > 0)) + goto out; + + /* Looks like we should copy the icon over */ + + /* check if we can read in the default ZFS icon */ + srcfp = fopen(CUSTOM_ICON_PATH, "r"); + + /* No source icon */ + if (!srcfp) + goto out; + + /* Open the output icon for writing */ + dstfp = fopen(path, "w"); + if (!dstfp) + goto out; + + /* Copy icon */ + while ((red = fread(buf, 1, sizeof (buf), srcfp)) > 0) + (void) fwrite(buf, 1, red, dstfp); + + /* We have copied it, set icon */ + attrsize = getxattr(mountpoint, XATTR_FINDERINFO_NAME, &finderinfo, + sizeof (finderinfo), 0); + if (attrsize != sizeof (finderinfo)) + (void) memset(&finderinfo, 0, sizeof (finderinfo)); + if ((finderinfo[4] & BE_16(0x0400)) == 0) { + finderinfo[4] |= BE_16(0x0400); + (void) setxattr(mountpoint, XATTR_FINDERINFO_NAME, &finderinfo, + sizeof (finderinfo), 0); + } + + /* Now tell Finder to update */ +#if 0 + int fd = -1; + strlcpy(template, mountpoint, sizeof (template)); + strlcat(template, "/tempXXXXXX", sizeof (template)); + if ((fd = mkstemp(template)) != -1) { + unlink(template); // Just delete it right away + close(fd); + } +#endif + +out: + if (dstfp != NULL) + fclose(dstfp); + if (srcfp != NULL) + fclose(srcfp); + if (path != NULL) + free(path); +} + +/* + * zfs_init_libshare(zhandle, service) + * + * Initialize the libshare API if it hasn't already been initialized. + * In all cases it returns 0 if it succeeded and an error if not. The + * service value is which part(s) of the API to initialize and is a + * direct map to the libshare sa_init(service) interface. + */ +int +zfs_init_libshare(libzfs_handle_t *zhandle, int service) +{ + int ret = SA_OK; + + if (ret == SA_OK && zhandle->libzfs_shareflags & ZFSSHARE_MISS) { + /* + * We had a cache miss. Most likely it is a new ZFS + * dataset that was just created. We want to make sure + * so check timestamps to see if a different process + * has updated any of the configuration. If there was + * some non-ZFS change, we need to re-initialize the + * internal cache. 
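[Illustrative note, not part of the patch.] The 0x0400 bit written into the FinderInfo blob is Finder's kHasCustomIcon flag; without it the copied .VolumeIcon.icns is ignored. A hedged stand-alone sketch of only the flag-setting step, assuming the same 32-byte FinderInfo layout the code above uses (the function name is made up, and Apple's 6-argument xattr calls are used directly):

/* Sketch only: set the "has custom icon" Finder flag on a mountpoint. */
#include <stdint.h>
#include <string.h>
#include <sys/xattr.h>
#include <libkern/OSByteOrder.h>

static int
set_custom_icon_flag(const char *mountpoint)
{
	uint16_t finderinfo[16];	/* 32 bytes, same layout as above */

	if (getxattr(mountpoint, XATTR_FINDERINFO_NAME, finderinfo,
	    sizeof (finderinfo), 0, 0) != sizeof (finderinfo))
		(void) memset(finderinfo, 0, sizeof (finderinfo));

	/* finderinfo[4] holds the Finder flags; 0x0400 is kHasCustomIcon */
	finderinfo[4] |= OSSwapHostToBigInt16(0x0400);

	return (setxattr(mountpoint, XATTR_FINDERINFO_NAME, finderinfo,
	    sizeof (finderinfo), 0, 0));
}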
+ */ + zhandle->libzfs_shareflags &= ~ZFSSHARE_MISS; + if (sa_needs_refresh(zhandle->libzfs_sharehdl)) { + zfs_uninit_libshare(zhandle); + zhandle->libzfs_sharehdl = sa_init(service); + } + } + + if (ret == SA_OK && zhandle && zhandle->libzfs_sharehdl == NULL) + zhandle->libzfs_sharehdl = sa_init(service); + + if (ret == SA_OK && zhandle->libzfs_sharehdl == NULL) + ret = SA_NO_MEMORY; + return (ret); +} + + +/* + * Share the given filesystem according to the options in the specified + * protocol specific properties (sharenfs, sharesmb). We rely + * on "libshare" to do the dirty work for us. + */ +int +zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char sourcestr[ZFS_MAXPROPLEN]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + sa_share_t share; + zfs_share_proto_t *curr_proto; + zprop_source_t sourcetype; + int err, ret; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, 0)) + return (0); + + for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { + /* + * Return success if there are no share options. + */ + if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, + shareopts, sizeof (shareopts), &sourcetype, sourcestr, + ZFS_MAXPROPLEN, B_FALSE) != 0 || + strcmp(shareopts, "off") == 0) + continue; + + ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API); + if (ret != SA_OK) { + (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), + zfs_get_name(zhp), sa_errorstr(ret)); + return (-1); + } + + /* + * If the 'zoned' property is set, then zfs_is_mountable() + * will have already bailed out if we are in the global zone. + * But local zones cannot be NFS servers, so we ignore it for + * local zones as well. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) + continue; + + share = sa_find_share(hdl->libzfs_sharehdl, mountpoint); + if (share == NULL) { + /* + * This may be a new file system that was just + * created so isn't in the internal cache + * (second time through). Rather than + * reloading the entire configuration, we can + * assume ZFS has done the checking and it is + * safe to add this to the internal + * configuration. + */ + if (sa_zfs_process_share(hdl->libzfs_sharehdl, + NULL, NULL, mountpoint, + proto_table[*curr_proto].p_name, sourcetype, + shareopts, sourcestr, zhp->zfs_name) != SA_OK) { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + hdl->libzfs_shareflags |= ZFSSHARE_MISS; + share = sa_find_share(hdl->libzfs_sharehdl, + mountpoint); + } + if (share != NULL) { + err = sa_enable_share(share, + proto_table[*curr_proto].p_name); + if (err != SA_OK) { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + } else { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + + } + return (0); +} + +/* + * Unshare a filesystem by mountpoint. + */ +int +unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, + zfs_share_proto_t proto) +{ + sa_share_t share; + int err; + char *mntpt; + + /* + * Mountpoint could get trashed if libshare calls getmntany + * which it does during API initialization, so strdup the + * value. 
+ */ + mntpt = zfs_strdup(hdl, mountpoint); + + /* make sure libshare initialized */ + if ((err = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) { + free(mntpt); /* don't need the copy anymore */ + return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), + name, sa_errorstr(err))); + } + + share = sa_find_share(hdl->libzfs_sharehdl, mntpt); + free(mntpt); /* don't need the copy anymore */ + + if (share != NULL) { + err = sa_disable_share(share, proto_table[proto].p_name); + if (err != SA_OK) { + return (zfs_error_fmt(hdl, + proto_table[proto].p_unshare_err, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), + name, sa_errorstr(err))); + } + } else { + return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"), + name)); + } + return (0); +} + +/* + * Search the sharetab for the given mountpoint and protocol, returning + * a zfs_share_type_t value. + */ +zfs_share_type_t +is_shared_impl(libzfs_handle_t *hdl, const char *mountpoint, + zfs_share_proto_t proto) +{ + char buf[MAXPATHLEN], *tab; + char *ptr; + + if (hdl->libzfs_sharetab == NULL) + return (SHARED_NOT_SHARED); + + /* Reopen ZFS_SHARETAB to prevent reading stale data from open file */ + if (freopen(ZFS_SHARETAB, "r", hdl->libzfs_sharetab) == NULL) + return (SHARED_NOT_SHARED); + + (void) fseek(hdl->libzfs_sharetab, 0, SEEK_SET); + + while (fgets(buf, sizeof (buf), hdl->libzfs_sharetab) != NULL) { + + /* the mountpoint is the first entry on each line */ + if ((tab = strchr(buf, '\t')) == NULL) + continue; + + *tab = '\0'; + if (strcmp(buf, mountpoint) == 0) { + /* + * the protocol field is the third field + * skip over second field + */ + ptr = ++tab; + if ((tab = strchr(ptr, '\t')) == NULL) + continue; + ptr = ++tab; + if ((tab = strchr(ptr, '\t')) == NULL) + continue; + *tab = '\0'; + if (strcmp(ptr, + proto_table[proto].p_name) == 0) { + switch (proto) { + case PROTO_NFS: + return (SHARED_NFS); + case PROTO_SMB: + return (SHARED_SMB); + default: + return (0); + } + } + } + } + + return (SHARED_NOT_SHARED); +} + +/* + * if (zmount(zhp, zfs_get_name(zhp), mountpoint, MS_OPTIONSTR | flags, + * MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) { + */ +int +do_mount(zfs_handle_t *zhp, const char *dir, char *optptr, int mflag) +{ + int rv; + const char *spec = zfs_get_name(zhp); + const char *fstype = MNTTYPE_ZFS; + struct zfs_mount_args mnt_args; + char *rpath = NULL; + zfs_cmd_t zc = { "\0" }; + int devdisk = ZFS_DEVDISK_POOLONLY; + int ispool = 0; // the pool dataset, that is + int optlen = 0; + + assert(spec != NULL); + assert(dir != NULL); + assert(fstype != NULL); + assert(mflag >= 0); + assert(strcmp(fstype, MNTTYPE_ZFS) == 0); + assert(dataptr == NULL); + assert(datalen == 0); + assert(optptr != NULL); + assert(optlen > 0); + + if (optptr != NULL) + optlen = strlen(optptr); + + /* + * Figure out if we want this mount as a /dev/diskX mount, if so + * ask kernel to create one for us, then use it to mount. 
+ */ + + // Use dataset name by default + mnt_args.fspec = spec; + + /* + * Lookup the dataset property devdisk, and depending on its + * setting, we need to create a /dev/diskX for the mount + */ + if (zhp) { + + /* If we are in zfs-tests, no devdisks */ + if (getenv("__ZFS_MAIN_MOUNTPOINT_DIR") != NULL) + devdisk = ZFS_DEVDISK_OFF; + else + devdisk = zfs_prop_get_int(zhp, ZFS_PROP_DEVDISK); + + if (zhp && zhp->zpool_hdl && + strcmp(zpool_get_name(zhp->zpool_hdl), + zfs_get_name(zhp)) == 0) + ispool = 1; + + if ((devdisk == ZFS_DEVDISK_ON) || + ((devdisk == ZFS_DEVDISK_POOLONLY) && + ispool)) { + (void) strlcpy(zc.zc_name, zhp->zfs_name, + sizeof (zc.zc_name)); + zc.zc_value[0] = 0; + + rv = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_PROXY_DATASET, + &zc); + +#ifdef DEBUG + if (rv) + fprintf(stderr, + "proxy dataset returns %d '%s'\n", + rv, zc.zc_value); +#endif + + // Mount using /dev/diskX, use temporary buffer to + // give it full name + if (rv == 0) { + snprintf(zc.zc_name, sizeof (zc.zc_name), + "/dev/%s", zc.zc_value); + mnt_args.fspec = zc.zc_name; + } + } + } + + mnt_args.mflag = mflag; + mnt_args.optptr = optptr; + mnt_args.optlen = optlen; + mnt_args.struct_size = sizeof (mnt_args); + + /* + * There is a bug in XNU where /var/tmp is resolved as + * "private/var/tmp" without the leading "/", and both mount(2) and + * diskutil mount avoid this by calling realpath() first. So we will + * do the same. + */ + rpath = realpath(dir, NULL); + + dprintf("%s calling mount with fstype %s, %s %s, fspec %s, mflag %d," + " optptr %s, optlen %d, devdisk %d, ispool %d\n", + __func__, fstype, (rpath ? "rpath" : "dir"), + (rpath ? rpath : dir), mnt_args.fspec, mflag, optptr, optlen, + devdisk, ispool); + rv = mount(fstype, rpath ? rpath : dir, 0, &mnt_args); + + if (rpath) free(rpath); + + /* Check if we need to create/update icon */ + if (rv == 0) + zfs_mount_seticon(dir); + + return (rv); +} + +int +do_unmount_impl(const char *mntpt, int flags) +{ + char force_opt[] = "force"; + char *argv[7] = { + "/usr/sbin/diskutil", + "unmount", + NULL, NULL, NULL, NULL }; + int rc, count = 2; + + if (flags & MS_FORCE) { + argv[count] = force_opt; + count++; + } + + argv[count] = (char *)mntpt; + rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); + + /* + * There is a bug, where we can not unmount, with the error + * already unmounted, even though it wasn't. But it is easy + * to work around by calling 'umount'. Until a real fix is done... + * re-test this: 202004/lundman + */ + if (rc != 0) { + char *argv[7] = { + "/sbin/umount", + NULL, NULL, NULL, NULL }; + int rc, count = 1; + + fprintf(stderr, "Fallback umount called\r\n"); + if (flags & MS_FORCE) { + argv[count] = "-f"; + count++; + } + argv[count] = (char *)mntpt; + rc = libzfs_run_process(argv[0], argv, + STDOUT_VERBOSE|STDERR_VERBOSE); + } + + return (rc ? EINVAL : 0); +} + + +void unmount_snapshots(libzfs_handle_t *hdl, const char *mntpt, int flags); + +int +do_unmount(libzfs_handle_t *hdl, const char *mntpt, int flags) +{ + /* + * On OSX, the kernel can not unmount all snapshots for us, as XNU + * rejects the unmount before it reaches ZFS. But we can easily handle + * unmounting snapshots from userland. + */ + unmount_snapshots(hdl, mntpt, flags); + + return (do_unmount_impl(mntpt, flags)); +} + +/* + * Given "/Volumes/BOOM" look for any lower mounts with ".zfs/snapshot/" + * in them - issue unmount. 
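[Illustrative note, not part of the patch.] The realpath() step above matters because /var is a symlink into /private and XNU otherwise records the mount path inconsistently; canonicalizing first, as mount(2) and diskutil do, sidesteps that. A tiny hedged illustration of the normalization:

/* Sketch only: shows why the mount target is canonicalized first. */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char *rpath = realpath("/var/tmp", NULL);

	/* On macOS this prints "/private/var/tmp". */
	if (rpath != NULL) {
		(void) printf("%s\n", rpath);
		free(rpath);
	}
	return (0);
}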
+ */ +void +unmount_snapshots(libzfs_handle_t *hdl, const char *mntpt, int flags) +{ + struct mnttab entry; + int len = strlen(mntpt); + + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + /* Starts with our mountpoint ? */ + if (strncmp(mntpt, entry.mnt_mountp, len) == 0) { + /* The next part is "/.zfs/snapshot/" ? */ + if (strncmp("/.zfs/snapshot/", &entry.mnt_mountp[len], + 15) == 0) { + /* Unmount it */ + do_unmount_impl(entry.mnt_mountp, MS_FORCE); + } + } + } +} + +int +zfs_mount_delegation_check(void) +{ + return ((geteuid() != 0) ? EACCES : 0); +} + +static char * +zfs_snapshot_mountpoint(zfs_handle_t *zhp) +{ + char *dataset_name, *snapshot_mountpoint, *parent_mountpoint; + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_handle_t *parent; + char *r; + + dataset_name = zfs_strdup(hdl, zhp->zfs_name); + if (dataset_name == NULL) { + (void) fprintf(stderr, gettext("not enough memory")); + return (NULL); + } + + r = strrchr(dataset_name, '@'); + + if (r == NULL) { + (void) fprintf(stderr, gettext("snapshot '%s' " + "has no '@'\n"), zhp->zfs_name); + free(dataset_name); + return (NULL); + } + + r[0] = 0; + + /* Open the dataset */ + if ((parent = zfs_open(hdl, dataset_name, + ZFS_TYPE_FILESYSTEM)) == NULL) { + (void) fprintf(stderr, + gettext("unable to open parent dataset '%s'\n"), + dataset_name); + free(dataset_name); + return (NULL); + } + + if (!zfs_is_mounted(parent, &parent_mountpoint)) { + (void) fprintf(stderr, + gettext("parent dataset '%s' must be mounted\n"), + dataset_name); + free(dataset_name); + zfs_close(parent); + return (NULL); + } + + zfs_close(parent); + + snapshot_mountpoint = + zfs_asprintf(hdl, "%s/.zfs/snapshot/%s/", + parent_mountpoint, &r[1]); + + free(dataset_name); + free(parent_mountpoint); + + return (snapshot_mountpoint); +} + +/* + * Mount a snapshot; called from "zfs mount dataset@snapshot". + * Given "dataset@snapshot" construct mountpoint path of the + * style "/mountpoint/dataset/.zfs/snapshot/$name/". Ensure + * parent "dataset" is mounted, then issue mount for snapshot. 
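[Illustrative note, not part of the patch.] For example, with tank/data mounted at /Volumes/tank/data, zfs_snapshot_mountpoint() for tank/data@backup yields /Volumes/tank/data/.zfs/snapshot/backup/. A hedged sketch of the same string construction in isolation (names are made up):

/* Sketch only: builds a snapshot mountpoint the same way as above. */
#include <stdio.h>

static void
snapshot_path(char *out, size_t outlen, const char *parent_mnt,
    const char *snapname)
{
	/* parent mountpoint + "/.zfs/snapshot/" + the part after '@' */
	(void) snprintf(out, outlen, "%s/.zfs/snapshot/%s/",
	    parent_mnt, snapname);
}

int
main(void)
{
	char path[1024];

	snapshot_path(path, sizeof (path), "/Volumes/tank/data", "backup");
	(void) printf("%s\n", path);	/* /Volumes/tank/data/.zfs/snapshot/backup/ */
	return (0);
}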
+ */ +int +zfs_snapshot_mount(zfs_handle_t *zhp, const char *options, + int flags) +{ + int ret = 0; + char *mountpoint; + + /* + * The automounting will kick in, and zed mounts it - so + * we temporarily disable it + */ + uint64_t automount = 0; + uint64_t saved_automount = 0; + size_t len = sizeof (automount); + size_t slen = sizeof (saved_automount); + + /* Remember what the user has it set to */ + sysctlbyname("kstat.zfs.darwin.tunable.zfs_auto_snapshot", + &saved_automount, &slen, NULL, 0); + + /* Disable automounting */ + sysctlbyname("kstat.zfs.darwin.tunable.zfs_auto_snapshot", + NULL, NULL, &automount, len); + + if (zfs_is_mounted(zhp, NULL)) { + return (EBUSY); + } + + mountpoint = zfs_snapshot_mountpoint(zhp); + if (mountpoint == NULL) + return (EINVAL); + + ret = zfs_mount_at(zhp, options, MS_RDONLY | flags, + mountpoint); + + /* If zed is running, it can mount it before us */ + if (ret == -1 && errno == EINVAL) + ret = 0; + + if (ret == 0) { + (void) fprintf(stderr, + gettext("ZFS: snapshot mountpoint '%s'\n"), + mountpoint); + } + + free(mountpoint); + + /* Restore automount setting */ + sysctlbyname("kstat.zfs.darwin.tunable.zfs_auto_snapshot", + NULL, NULL, &saved_automount, len); + + return (ret); +} + +int +zfs_snapshot_unmount(zfs_handle_t *zhp, int flags) +{ + int ret = 0; + char *mountpoint; + + if (!zfs_is_mounted(zhp, NULL)) { + return (ENOENT); + } + + mountpoint = zfs_snapshot_mountpoint(zhp); + if (mountpoint == NULL) + return (EINVAL); + + ret = zfs_unmount(zhp, mountpoint, flags); + + free(mountpoint); + + return (ret); +} diff --git a/lib/libzfs/os/macos/libzfs_pool_os.c b/lib/libzfs/os/macos/libzfs_pool_os.c new file mode 100644 index 0000000000..f7685e2f01 --- /dev/null +++ b/lib/libzfs/os/macos/libzfs_pool_os.c @@ -0,0 +1,345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2018, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +/* + * If the device has being dynamically expanded then we need to relabel + * the disk to use the new unallocated space. 
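[Illustrative note, not part of the patch.] Toggling the kstat.zfs.darwin.tunable.zfs_auto_snapshot tunable around the snapshot mount is plain sysctlbyname(3) usage: pass an old-value buffer to read, a new-value buffer to write. A hedged generic sketch with error checking added for illustration (the helper name is made up):

/* Sketch only: read a sysctl tunable, then overwrite it. */
#include <stdint.h>
#include <sys/sysctl.h>

static int
swap_tunable(const char *name, uint64_t newval, uint64_t *oldval)
{
	size_t len = sizeof (*oldval);

	/* read the current value so it can be restored later */
	if (sysctlbyname(name, oldval, &len, NULL, 0) != 0)
		return (-1);

	/* write the new value */
	return (sysctlbyname(name, NULL, NULL, &newval, sizeof (newval)));
}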
+ */ +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + int fd, error; + + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, msg)); + } + + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + * + * Also, we don't call efi_rescan() - that would just return EBUSY. + * The module will do it for us in vdev_disk_open(). + */ + error = efi_use_whole_disk(fd); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); +// (void) ioctl(fd, BLKFLSBUF); + + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), path); + return (zfs_error(hdl, EZFS_NOCAP, msg)); + } + return (0); +} + +/* + * Read the EFI label from the config, if a label does not exist then + * pass back the error to the caller. If the caller has passed a non-NULL + * diskaddr argument then we set it to the starting address of the EFI + * partition. + */ +static int +read_efi_label(nvlist_t *config, diskaddr_t *sb) +{ + char *path; + int fd; + char diskname[MAXPATHLEN]; + int err = -1; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) + return (err); + + (void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT, + strrchr(path, '/')); + if ((fd = open(diskname, O_RDONLY|O_DIRECT)) >= 0) { + struct dk_gpt *vtoc; + + if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { + if (sb != NULL) + *sb = vtoc->efi_parts[0].p_start; + efi_free(vtoc); + } + (void) close(fd); + } + return (err); +} + +/* + * determine where a partition starts on a disk in the current + * configuration + */ +static diskaddr_t +find_start_block(nvlist_t *config) +{ + nvlist_t **child; + uint_t c, children; + diskaddr_t sb = MAXOFFSET_T; + uint64_t wholedisk; + + if (nvlist_lookup_nvlist_array(config, + ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) != 0 || !wholedisk) { + return (MAXOFFSET_T); + } + if (read_efi_label(config, &sb) < 0) + sb = MAXOFFSET_T; + return (sb); + } + + for (c = 0; c < children; c++) { + sb = find_start_block(child[c]); + if (sb != MAXOFFSET_T) { + return (sb); + } + } + return (MAXOFFSET_T); +} + +static int +zpool_label_disk_check(char *path) +{ + struct dk_gpt *vtoc; + int fd, err; + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (errno); + + if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { + (void) close(fd); + return (err); + } + + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + return (EIDRM); + } + + efi_free(vtoc); + (void) close(fd); + return (0); +} + +/* + * Generate a unique partition name for the ZFS member. Partitions must + * have unique names to ensure udev will be able to create symlinks under + * /dev/disk/by-partlabel/ for all pool members. The partition names are + * of the form -. 
+ */ +static void +zpool_label_name(char *label_name, int label_size) +{ + uint64_t id = 0; + int fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd >= 0) { + if (read(fd, &id, sizeof (id)) != sizeof (id)) + id = 0; + + close(fd); + } + + if (id == 0) + id = (((uint64_t)rand()) << 32) | (uint64_t)rand(); + + snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id); +} + +/* + * Label an individual disk. The name provided is the short name, + * stripped of any leading /dev path. + */ +int +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) +{ + char path[MAXPATHLEN]; + struct dk_gpt *vtoc; + int rval, fd; + size_t resv = EFI_MIN_RESV_SIZE; + uint64_t slice_size; + diskaddr_t start_block; + char errbuf[1024]; + + /* prepare an error message just in case */ + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); + + if (zhp) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + + if (zhp->zpool_start_block == 0) + start_block = find_start_block(nvroot); + else + start_block = zhp->zpool_start_block; + zhp->zpool_start_block = start_block; + } else { + /* new pool */ + start_block = NEW_START_BLOCK; + } + + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + + if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { + /* + * This shouldn't happen. We've long since verified that this + * is a valid device. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); + } + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { + /* + * The only way this can fail is if we run out of memory, or we + * were unable to read the disk's capacity + */ + if (errno == ENOMEM) + (void) no_memory(hdl); + + (void) close(fd); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': unable to read disk capacity"), path); + + return (zfs_error(hdl, EZFS_NOCAP, errbuf)); + } + + slice_size = vtoc->efi_last_u_lba + 1; + slice_size -= EFI_MIN_RESV_SIZE; + if (start_block == MAXOFFSET_T) + start_block = NEW_START_BLOCK; + slice_size -= start_block; + slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); + + vtoc->efi_parts[0].p_start = start_block; + vtoc->efi_parts[0].p_size = slice_size; + + /* + * Why we use V_USR: V_BACKUP confuses users, and is considered + * disposable by some EFI utilities (since EFI doesn't have a backup + * slice). V_UNASSIGNED is supposed to be used only for zero size + * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, + * etc. were all pretty specific. V_USR is as close to reality as we + * can get, in the absence of V_OTHER. + */ + vtoc->efi_parts[0].p_tag = V_USR; + zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN); + + vtoc->efi_parts[8].p_start = slice_size + start_block; + vtoc->efi_parts[8].p_size = resv; + vtoc->efi_parts[8].p_tag = V_RESERVED; + + rval = efi_write(fd, vtoc); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); +// (void) ioctl(fd, BLKFLSBUF); + + if (rval == 0) + rval = efi_rescan(fd); + + /* + * Some block drivers (like pcata) may not support EFI GPT labels. + * Print out a helpful error message directing the user to manually + * label the disk and give a specific slice. 
+ */ + if (rval != 0) { + (void) close(fd); + efi_free(vtoc); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using " + "parted(8) and then provide a specific slice: %d"), rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + (void) close(fd); + efi_free(vtoc); + + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + (void) zfs_append_partition(path, MAXPATHLEN); + + /* Wait to udev to signal use the device has settled. */ + rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " + "detect device partitions on '%s': %d"), path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + /* We can't be to paranoid. Read the label back and verify it. */ + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + rval = zpool_label_disk_check(path); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written " + "EFI label on '%s' is damaged. Ensure\nthis device " + "is not in use, and is functioning properly: %d"), + path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + return (0); +} diff --git a/lib/libzfs/os/macos/libzfs_util_os.c b/lib/libzfs/os/macos/libzfs_util_os.c new file mode 100644 index 0000000000..cd732786ad --- /dev/null +++ b/lib/libzfs/os/macos/libzfs_util_os.c @@ -0,0 +1,575 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libzfs_impl.h" +#include "zfs_prop.h" +#include +#include + +#define ZDIFF_SHARESDIR "/.zfs/shares/" + + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + return (zfs_ioctl_fd(hdl->libzfs_fd, request, zc)); +} + +const char * +libzfs_error_init(int error) +{ + switch (error) { + case ENXIO: + return (dgettext(TEXT_DOMAIN, "The ZFS modules are not " + "loaded.\nTry running '/sbin/kextload zfs.kext' as root " + "to load them.")); + case ENOENT: + return (dgettext(TEXT_DOMAIN, "/dev/zfs and /proc/self/mounts " + "are required.\nTry running 'udevadm trigger' and 'mount " + "-t proc proc /proc' as root.")); + case ENOEXEC: + return (dgettext(TEXT_DOMAIN, "The ZFS modules cannot be " + "auto-loaded.\nTry running '/sbin/kextload zfs.kext' as " + "root to manually load them.")); + case EACCES: + return (dgettext(TEXT_DOMAIN, "Permission denied the " + "ZFS utilities must be run as root.")); + default: + return (dgettext(TEXT_DOMAIN, "Failed to initialize the " + "libzfs library.")); + } +} + +static int +libzfs_module_loaded(const char *module) +{ + const char path_prefix[] = "/dev/"; + char path[256]; + + memcpy(path, path_prefix, sizeof (path_prefix) - 1); + strcpy(path + sizeof (path_prefix) - 1, module); + + return (access(path, F_OK) == 0); +} + +/* + * Verify the required ZFS_DEV device is available and optionally attempt + * to load the ZFS modules. Under normal circumstances the modules + * should already have been loaded by some external mechanism. + * + * Environment variables: + * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. + * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV + */ +static int +libzfs_load_module_impl(const char *module) +{ + char *argv[4] = {"/sbin/kextload", (char *)module, (char *)0}; + char *load_str, *timeout_str; + long timeout = 10; /* seconds */ + long busy_timeout = 10; /* milliseconds */ + int load = 0, fd; + hrtime_t start; + + /* Optionally request module loading */ + if (!libzfs_module_loaded(module)) { + load_str = getenv("ZFS_MODULE_LOADING"); + if (load_str) { + if (!strncasecmp(load_str, "YES", strlen("YES")) || + !strncasecmp(load_str, "ON", strlen("ON"))) + load = 1; + else + load = 0; + } + + if (load) { + if (libzfs_run_process("/sbin/kextload", argv, 0)) + return (ENOEXEC); + } + + if (!libzfs_module_loaded(module)) + return (ENXIO); + } + + /* + * Device creation by udev is asynchronous and waiting may be + * required. Busy wait for 10ms and then fall back to polling every + * 10ms for the allowed timeout (default 10s, max 10m). This is + * done to optimize for the common case where the device is + * immediately available and to avoid penalizing the possible + * case where udev is slow or unable to create the device. 
+ */ + timeout_str = getenv("ZFS_MODULE_TIMEOUT"); + if (timeout_str) { + timeout = strtol(timeout_str, NULL, 0); + timeout = MAX(MIN(timeout, (10 * 60)), 0); /* 0 <= N <= 600 */ + } + + start = gethrtime(); + do { + fd = open(ZFS_DEV, O_RDWR); + if (fd >= 0) { + (void) close(fd); + return (0); + } else if (errno != ENOENT) { + return (errno); + } else if (NSEC2MSEC(gethrtime() - start) < busy_timeout) { + sched_yield(); + } else { + usleep(10 * MILLISEC); + } + } while (NSEC2MSEC(gethrtime() - start) < (timeout * MILLISEC)); + + return (ENOENT); +} + +int +libzfs_load_module(void) +{ + return (libzfs_load_module_impl(ZFS_DRIVER)); +} + +int +find_shares_object(differ_info_t *di) +{ + char fullpath[MAXPATHLEN]; + struct stat64 sb = { 0 }; + + (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); + (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); + + if (stat64(fullpath, &sb) != 0) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); + return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); + } + + di->shares = (uint64_t)sb.st_ino; + return (0); +} + +/* + * Fill given version buffer with zfs kernel version read from ZFS_SYSFS_DIR + * Returns 0 on success, and -1 on error (with errno set) + */ +int +zfs_version_kernel(char *version, int len) +{ + size_t rlen = len; + + if (sysctlbyname("zfs.kext_version", + version, &rlen, NULL, 0) == -1) + return (-1); + + return (0); +} + +static int +execvPe(const char *name, const char *path, char * const *argv, + char * const *envp) +{ + const char **memp; + size_t cnt, lp, ln; + int eacces, save_errno; + char *cur, buf[MAXPATHLEN]; + const char *p, *bp; + struct stat sb; + + eacces = 0; + + /* If it's an absolute or relative path name, it's easy. */ + if (strchr(name, '/')) { + bp = name; + cur = NULL; + goto retry; + } + bp = buf; + + /* If it's an empty path name, fail in the usual POSIX way. */ + if (*name == '\0') { + errno = ENOENT; + return (-1); + } + + cur = alloca(strlen(path) + 1); + if (cur == NULL) { + errno = ENOMEM; + return (-1); + } + strcpy(cur, path); + while ((p = strsep(&cur, ":")) != NULL) { + /* + * It's a SHELL path -- double, leading and trailing colons + * mean the current directory. + */ + if (*p == '\0') { + p = "."; + lp = 1; + } else + lp = strlen(p); + ln = strlen(name); + + /* + * If the path is too long complain. This is a possible + * security issue; given a way to make the path too long + * the user may execute the wrong program. + */ + if (lp + ln + 2 > sizeof (buf)) { + (void) write(STDERR_FILENO, "execvP: ", 8); + (void) write(STDERR_FILENO, p, lp); + (void) write(STDERR_FILENO, ": path too long\n", + 16); + continue; + } + bcopy(p, buf, lp); + buf[lp] = '/'; + bcopy(name, buf + lp + 1, ln); + buf[lp + ln + 1] = '\0'; + +retry: + (void) execve(bp, argv, envp); + switch (errno) { + case E2BIG: + goto done; + case ELOOP: + case ENAMETOOLONG: + case ENOENT: + break; + case ENOEXEC: + for (cnt = 0; argv[cnt]; ++cnt) + ; + memp = alloca((cnt + 2) * sizeof (char *)); + if (memp == NULL) { + goto done; + } + memp[0] = "sh"; + memp[1] = bp; + bcopy(argv + 1, memp + 2, cnt * sizeof (char *)); + execve(_PATH_BSHELL, __DECONST(char **, memp), + envp); + goto done; + case ENOMEM: + goto done; + case ENOTDIR: + break; + case ETXTBSY: + /* + * We used to retry here, but sh(1) doesn't. + */ + goto done; + default: + /* + * EACCES may be for an inaccessible directory or + * a non-executable file. Call stat() to decide + * which. 
This also handles ambiguities for EFAULT + * and EIO, and undocumented errors like ESTALE. + * We hope that the race for a stat() is unimportant. + */ + save_errno = errno; + if (stat(bp, &sb) != 0) + break; + if (save_errno == EACCES) { + eacces = 1; + continue; + } + errno = save_errno; + goto done; + } + } + if (eacces) + errno = EACCES; + else + errno = ENOENT; +done: + return (-1); +} + +int +execvpe(const char *name, char * const argv[], char * const envp[]) +{ + const char *path; + + /* Get the path we're searching. */ + if ((path = getenv("PATH")) == NULL) + path = _PATH_DEFPATH; + + return (execvPe(name, path, argv, envp)); +} + +#include +#include +#include + +extern void libzfs_refresh_finder(char *); + +/* + * To tell Finder to refresh is relatively easy from Obj-C, but as this + * would be the only function to use Obj-C (and only .m), the following code: + * void libzfs_refresh_finder(char *mountpoint) + * { + * [[NSWorkspace sharedWorkspace] noteFileSystemChanged:[NSString + * stringWithUTF8String:mountpoint]]; + * } + * Has been converted to C to keep autoconf simpler. If in future we have + * more Obj-C source files, then we should re-address this. + */ +void +libzfs_refresh_finder(char *path) +{ + Class NSWorkspace = objc_getClass("NSWorkspace"); + Class NSString = objc_getClass("NSString"); + SEL stringWithUTF8String = sel_registerName("stringWithUTF8String:"); + SEL sharedWorkspace = sel_registerName("sharedWorkspace"); + SEL noteFileSystemChanged = sel_registerName("noteFileSystemChanged:"); + id ns_path = ((id(*)(Class, SEL, char *))objc_msgSend)(NSString, + stringWithUTF8String, path); + id workspace = ((id(*)(Class, SEL))objc_msgSend)(NSWorkspace, + sharedWorkspace); + ((id(*)(id, SEL, id))objc_msgSend)(workspace, noteFileSystemChanged, + ns_path); +} + +void +zfs_rollback_os(zfs_handle_t *zhp) +{ + char sourceloc[ZFS_MAX_DATASET_NAME_LEN]; + char mountpoint[ZFS_MAXPROPLEN]; + zprop_source_t sourcetype; + + if (zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type, + B_FALSE)) { + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + mountpoint, sizeof (mountpoint), + &sourcetype, sourceloc, sizeof (sourceloc), B_FALSE) == 0) + libzfs_refresh_finder(mountpoint); + } +} + +struct pipe2file { + int from; + int to; +}; +typedef struct pipe2file pipe2file_t; + +#include + +static void * +pipe_io_relay(void *arg) +{ + pipe2file_t *p2f = (pipe2file_t *)arg; + int readfd, writefd; + unsigned char *buffer; + unsigned char space[1024]; + int size = 1024 * 1024; + int red, sent; + uint64_t total = 0; + + readfd = p2f->from; + writefd = p2f->to; + free(p2f); + p2f = NULL; + + buffer = malloc(size); + if (buffer == NULL) { + buffer = space; + size = sizeof (space); + } + + fprintf(stderr, "%s: thread up: read(%d) write(%d)\r\n", __func__, + readfd, writefd); + + for (;;) { + + red = read(readfd, buffer, size); + // fprintf(stderr, "%s: read(%d): %d (errno %d)\r\n", __func__, + // readfd, red, errno); + if (red == 0) + break; + if (red < 0 && errno != EWOULDBLOCK) + break; + sent = write(writefd, buffer, red); + // fprintf(stderr, "%s: write(%d): %d (errno %d)\r\n", __func__, + // writefd, sent, errno); + if (sent < 0) + break; + total += red; + } + + /* + * It seems unlikely that this code is ever reached, as the process + * calls exit() when done, and this thread is terminated. 
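+ * When it is reached, the copy loop above has stopped because read()
+ * returned 0 (the writer side was closed), read() failed with something
+ * other than EWOULDBLOCK, or write() failed.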
+ */ + + fprintf(stderr, "loop exit\r\n"); + + close(readfd); + close(writefd); + + if (buffer != space) + free(buffer); + + fprintf(stderr, "%s: thread done: %llu bytes\r\n", __func__, total); + return (NULL); +} + +/* + * XNU only lets us do IO on vnodes, not pipes, so create a Unix + * Domain socket, open it to get a vnode for the kernel, and spawn + * thread to relay IO. + */ +void +libzfs_macos_wrapfd(int *srcfd, boolean_t send) +{ + char template[100]; + int readfd = -1; + int writefd = -1; + int error; + struct stat sb; + pipe2file_t *p2f = NULL; + + fprintf(stderr, "%s: checking if we need pipe wrap\r\n", __func__); + + // Check if it is a pipe + error = fstat(*srcfd, &sb); + + if (error != 0) + return; + + if (!S_ISFIFO(sb.st_mode)) + return; + + p2f = (pipe2file_t *)malloc(sizeof (pipe2file_t)); + if (p2f == NULL) + return; + + fprintf(stderr, "%s: is pipe: work on fd %d\r\n", __func__, *srcfd); + + snprintf(template, sizeof (template), "/tmp/.zfs.pipe.XXXXXX"); + + mktemp(template); + + mkfifo(template, 0600); + + readfd = open(template, O_RDONLY | O_NONBLOCK); + + fprintf(stderr, "%s: readfd %d (%d)\r\n", __func__, readfd, error); + + writefd = open(template, O_WRONLY | O_NONBLOCK); + + fprintf(stderr, "%s: writefd %d (%d)\r\n", __func__, writefd, error); + + // set it to delete + unlink(template); + + // Check delayed so unlink() is always called. + if (readfd < 0) + goto out; + if (writefd < 0) + goto out; + + /* Open needs NONBLOCK, so switch back to BLOCK */ + int flags; + flags = fcntl(readfd, F_GETFL); + flags &= ~O_NONBLOCK; + fcntl(readfd, F_SETFL, flags); + flags = fcntl(writefd, F_GETFL); + flags &= ~O_NONBLOCK; + fcntl(writefd, F_SETFL, flags); + + // create IO thread + + // Send, kernel was to be given *srcfd - to write to. + // Instead we give it writefd. + // thread then uses read(readfd) -> write(*srcfd) + if (send) { + p2f->from = readfd; + p2f->to = *srcfd; + } else { + p2f->from = *srcfd; + p2f->to = writefd; + } + fprintf(stderr, "%s: spawning thread\r\n", __func__); + + // pthread kills all threads on exit, and pipe_io_relay may not + // have fully completed. + error = fork(); + if (error == 0) { + + // Close the fd we don't need + if (send) + close(writefd); + else + close(readfd); + + setsid(); + pipe_io_relay(p2f); + _exit(0); + } + + if (error < 0) + goto out; + + // Return open(file) fd to kernel only after all error cases + if (send) { + *srcfd = writefd; + close(readfd); + } else { + *srcfd = readfd; + close(writefd); + } + return; + +out: + if (p2f != NULL) + free(p2f); + + if (readfd >= 0) + close(readfd); + + if (writefd >= 0) + close(writefd); +} + +void +libzfs_set_pipe_max(int infd) +{ + /* macOS automatically resizes */ +} diff --git a/lib/libzutil/os/macos/zutil_compat.c b/lib/libzutil/os/macos/zutil_compat.c new file mode 100644 index 0000000000..b46263ac27 --- /dev/null +++ b/lib/libzutil/os/macos/zutil_compat.c @@ -0,0 +1,94 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include +#include +#include +#include + +static int +zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) +{ + int ret; + void *zc_c; + unsigned long ncmd; + zfs_iocparm_t zp; + + switch (cflag) { + case ZFS_CMD_COMPAT_NONE: + ncmd = _IOWR('Z', request, zfs_iocparm_t); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof (zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_ZOF; + zp.zfs_ioc_error = 0; + + ret = ioctl(fd, ncmd, &zp); + + /* + * If ioctl worked, get actual rc from kernel, which goes + * into errno, and return -1 if not-zero. + */ + if (ret == 0) { + errno = zp.zfs_ioc_error; + if (zp.zfs_ioc_error != 0) + ret = -1; + } + return (ret); + + default: + abort(); + return (EINVAL); + } + + /* Pass-through ioctl, rarely used if at all */ + + ret = ioctl(fd, ncmd, zc_c); + ASSERT0(ret); + + zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); + free(zc_c); + + return (ret); +} + +/* + * This is the macOS version of ioctl(). Because the XNU kernel + * handles copyin() and copyout(), we must return success from the + * ioctl() handler (or it will not copyout() for userland), + * and instead embed the error return value in the zc structure. + */ +int +zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +{ + size_t oldsize; + int ret, cflag = ZFS_CMD_COMPAT_NONE; + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, request, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} diff --git a/lib/libzutil/os/macos/zutil_device_path_os.c b/lib/libzutil/os/macos/zutil_device_path_os.c new file mode 100644 index 0000000000..dec0281be9 --- /dev/null +++ b/lib/libzutil/os/macos/zutil_device_path_os.c @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * We don't strip/append partitions on FreeBSD. + */ + +/* + * Note: The caller must free the returned string. + */ +char * +zfs_strip_partition(char *dev) +{ + return (strdup(dev)); +} + +int +zfs_append_partition(char *path, size_t max_len) +{ + return (strnlen(path, max_len)); +} + +/* + * Strip the path from a device name. + * On FreeBSD we only want to remove "/dev/" from the beginning of + * paths if present. 
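+ * The same applies on macOS: e.g. "/dev/disk2s1" is returned as
+ * "disk2s1", while a name without the "/dev/" prefix is returned
+ * unchanged.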
+ */ +char * +zfs_strip_path(char *path) +{ + if (strncmp(path, _PATH_DEV, sizeof (_PATH_DEV) - 1) == 0) + return (path + sizeof (_PATH_DEV) - 1); + else + return (path); +} + +char * +zfs_get_underlying_path(const char *dev_name) +{ + + if (dev_name == NULL) + return (NULL); + + return (realpath(dev_name, NULL)); +} + +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + struct dk_gpt *label; + int fd; + + if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) + return (B_FALSE); + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { + (void) close(fd); + return (B_FALSE); + } + + efi_free(label); + (void) close(fd); + + return (B_TRUE); +} + +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when libudev determines it has been initialized, all of + * the device links have been verified to exist, and it has been allowed to + * settle. At this point the device the device can be accessed reliably. + * Depending on the complexity of the udev rules this process could take + * several seconds. + */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} + +/* ARGSUSED */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} + +/* + * Return B_TRUE if device is a device mapper or multipath device. + * Return B_FALSE if not. + */ +boolean_t +zfs_dev_is_dm(const char *dev_name) +{ + return (B_FALSE); +} diff --git a/lib/libzutil/os/macos/zutil_import_os.c b/lib/libzutil/os/macos/zutil_import_os.c new file mode 100644 index 0000000000..251096301c --- /dev/null +++ b/lib/libzutil/os/macos/zutil_import_os.c @@ -0,0 +1,483 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * Pool import support functions. + * + * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since + * these commands are expected to run in the global zone, we can assume + * that the devices are all readable when called. 
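+ *
+ * On macOS the directories scanned by default live under
+ * /private/var/run/disk (see zpool_default_import_path below); the
+ * search can also be narrowed by hand, for example with
+ * "zpool import -d /private/var/run/disk/by-id".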
+ * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zutil_import.h" + +#ifdef HAVE_LIBUDEV +#include +#include +#endif + +#define DEFAULT_IMPORT_PATH_SIZE 9 +#define DEV_BYID_PATH "/dev/disk/by-id/" + +static boolean_t +is_watchdog_dev(char *dev) +{ + /* For 'watchdog' dev */ + if (strcmp(dev, "watchdog") == 0) + return (B_TRUE); + + /* For 'watchdog */ + if (strstr(dev, "watchdog") == dev && isdigit(dev[8])) + return (B_TRUE); + + return (B_FALSE); +} + +int +zfs_dev_flush(int fd) +{ +// return (ioctl(fd, BLKFLSBUF)); + return (0); +} + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + libpc_handle_t *hdl = rn->rn_hdl; + struct stat64 statbuf; + nvlist_t *config; + char *bname, *dupname; + uint64_t vdev_guid = 0; + int error; + int num_labels = 0; + int fd; + + /* + * Skip devices with well known prefixes there can be side effects + * when opening devices which need to be avoided. + * + * hpet - High Precision Event Timer + * watchdog - Watchdog must be closed in a special way. + */ + dupname = zutil_strdup(hdl, rn->rn_name); + bname = basename(dupname); + error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname)); + free(dupname); + if (error) + return; + + /* + * Ignore failed stats. We only want regular files and block devices. + */ + if (stat64(rn->rn_name, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) + return; + + fd = open(rn->rn_name, O_RDONLY); + if ((fd < 0) && (errno == EINVAL)) + fd = open(rn->rn_name, O_RDONLY); + if ((fd < 0) && (errno == EACCES)) + hdl->lpc_open_access_error = B_TRUE; + if (fd < 0) + return; + + /* + * This file is too small to hold a zpool + */ + if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } + + error = zpool_read_label(fd, &config, &num_labels); + if (error != 0) { + (void) close(fd); + return; + } + + if (num_labels == 0) { + (void) close(fd); + nvlist_free(config); + return; + } + + /* + * Check that the vdev is for the expected guid. Additional entries + * are speculatively added based on the paths stored in the labels. + * Entries with valid paths but incorrect guids must be removed. + */ + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { + (void) close(fd); + nvlist_free(config); + return; + } + + (void) close(fd); + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* + * Add additional entries for paths described by this label. 
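+ * The devlink wait below honors ZPOOL_IMPORT_UDEV_TIMEOUT_MS (a value
+ * in milliseconds); an illustrative invocation such as
+ *	ZPOOL_IMPORT_UDEV_TIMEOUT_MS=5000 zpool import
+ * allows up to five seconds for the device links to settle.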
+ */ + if (rn->rn_labelpaths) { + char *path = NULL; + char *devid = NULL; + char *env = NULL; + rdsk_node_t *slice; + avl_index_t where; + int timeout; + int error; + + if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) + return; + + env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); + if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || + timeout < 0) { + timeout = DISK_LABEL_WAIT; + } + + /* + * Allow devlinks to stabilize so all paths are available. + */ + zpool_label_disk_wait(rn->rn_name, timeout); + + if (path != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_1; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + + if (devid != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + error = asprintf(&slice->rn_name, "%s%s", + DEV_BYID_PATH, devid); + if (error == -1) { + free(slice); + return; + } + + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_2; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + } +} + +static char * +zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { + "/private/var/run/disk/by-id", + "/private/var/run/disk/by-path", +#ifndef __UNSAFE_DEFAULT_IMPORT_PATH__ + "/private/var/run/disk/by-serial" +#else + "/private/var/run/disk/by-serial", + "/dev" /* UNSAFE device names will change */ +#endif /* !__UNSAFE_DEFAULT_IMPORT_PATH__ */ +}; + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = DEFAULT_IMPORT_PATH_SIZE; + return ((const char * const *)zpool_default_import_path); +} + +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + int i, dirs; + struct dirent *dp; + char path[MAXPATHLEN]; + char *end, **dir; + size_t pathleft; + avl_index_t where; + rdsk_node_t *slice; + + dir = zpool_default_import_path; + dirs = DEFAULT_IMPORT_PATH_SIZE; + + /* + * Go through and read the label configuration information from every + * possible device, organizing the information according to pool GUID + * and toplevel GUID. 
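+ *
+ * The directories walked here come from zpool_default_import_path
+ * above; a search path that does not exist is silently skipped, while
+ * one that cannot be opened ends the scan with an error.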
+ */ + for (i = 0; i < dirs; i++) { + char rdsk[MAXPATHLEN]; + int dfd; + DIR *dirp; + + /* use realpath to normalize the path */ + if (realpath(dir[i], path) == 0) { + + /* it is safe to skip missing search paths */ + if (errno == ENOENT) + continue; + + return (EPERM); + } + end = &path[strlen(path)]; + *end++ = '/'; + *end = 0; + pathleft = &path[sizeof (path)] - end; + + (void) strlcpy(rdsk, path, sizeof (rdsk)); + + if ((dfd = open(rdsk, O_RDONLY)) < 0 || + (dirp = fdopendir(dfd)) == NULL) { + if (dfd >= 0) + (void) close(dfd); + return (ENOENT); + } + + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); + + while ((dp = readdir(dirp)) != NULL) { + const char *name = dp->d_name; + if (name[0] == '.' && + (name[1] == 0 || (name[1] == '.' && name[2] == 0))) + continue; + + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_FALSE; + slice->rn_order = IMPORT_ORDER_DEFAULT; + + pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + + (void) closedir(dirp); + } + + return (0); +} + +/* + * Linux persistent device strings for vdev labels + * + * based on libudev for consistency with libudev disk add/remove events + */ + +typedef struct vdev_dev_strs { + char vds_devid[128]; + char vds_devphys[128]; +} vdev_dev_strs_t; + +/* ARGSUSED */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +/* ARGSUSED */ +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +/* + * Encode the persistent devices strings + * used for the vdev disk label + */ +static int +encode_device_strings(const char *path, vdev_dev_strs_t *ds, + boolean_t wholedisk) +{ + return (ENOENT); +} + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * single device node example: + * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' + * + * multipath device node example: + * devid: 'dm-uuid-mpath-35000c5006304de3f' + * + * We also store the enclosure sysfs path for turning on enclosure LEDs + * (if applicable): + * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + vdev_dev_strs_t vds; + char *env, *type, *path; + uint64_t wholedisk = 0; + char *upath, *spath; + + /* + * For the benefit of legacy ZFS implementations, allow + * for opting out of devid strings in the vdev label. + * + * example use: + * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer + * + * explanation: + * Older ZFS on Linux implementations had issues when attempting to + * display pool config VDEV names if a "devid" NVP value is present + * in the pool's config. 
+ * + * For example, a pool that originated on illumos platform would + * have a devid value in the config and "zpool status" would fail + * when listing the config. + * + * A pool can be stripped of any "devid" values on import or + * prevented from adding them on zpool create|add by setting + * ZFS_VDEV_DEVID_OPT_OUT. + */ + env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + return; + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || + strcmp(type, VDEV_TYPE_DISK) != 0) { + return; + } + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + + /* + * Update device string values in the config nvlist. + */ + if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); + if (vds.vds_devphys[0] != '\0') { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vds.vds_devphys); + } + + /* Add enclosure sysfs path (if disk is in an enclosure). */ + upath = zfs_get_underlying_path(path); + spath = zfs_get_enclosure_sysfs_path(upath); + if (spath) + nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + spath); + else + nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + + free(upath); + free(spath); + } else { + /* Clear out any stale entries. */ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + } +} diff --git a/module/icp/asm-x86_64/os/macos/aes/aes_aesni.S b/module/icp/asm-x86_64/os/macos/aes/aes_aesni.S new file mode 100644 index 0000000000..9be7f63151 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/aes/aes_aesni.S @@ -0,0 +1,855 @@ +/* + * ==================================================================== + * Written by Intel Corporation for the OpenSSL project to add support + * for Intel AES-NI instructions. Rights for redistribution and usage + * in source and binary forms are granted according to the OpenSSL + * license. + * + * Author: Huang Ying + * Vinodh Gopal + * Kahraman Akdemir + * + * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) + * instructions that are going to be introduced in the next generation + * of Intel processor, as of 2009. These instructions enable fast and + * secure data encryption and decryption, using the Advanced Encryption + * Standard (AES), defined by FIPS Publication number 197. The + * architecture introduces six instructions that offer full hardware + * support for AES. Four of them support high performance data + * encryption and decryption, and the other two instructions support + * the AES key expansion procedure. + * ==================================================================== + */ + +/* + * ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as files aes-intel.S and eng_aesni_asm.pl, in + * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by + * Huang Ying of Intel to the openssl-dev mailing list under the subject + * of "Add support to Intel AES-NI instruction set for x86_64 platform". + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Renamed functions, reordered parameters, and changed return value + * to match OpenSolaris: + * + * OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return values for above are non-zero on error, 0 on success. 
+ * + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * typedef struct aes_key_st { + * unsigned int rd_key[4 *(AES_MAXNR + 1)]; + * int rounds; + * unsigned int pad[3]; + * } AES_KEY; + * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules + * (ks32) instead of 64-bit (ks64). + * Number of rounds (aka round count) is at offset 240 of AES_KEY. + * + * OpenSolaris OS interface (#ifdefs removed for readability): + * int rijndael_key_setup_dec_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * int rijndael_key_setup_enc_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * Return values for above are 0 on error, number of rounds on success. + * + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; + * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; + * + * typedef union { + * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; + * } aes_ks_t; + * typedef struct aes_key { + * aes_ks_t encr_ks, decr_ks; + * long double align128; + * int flags, nr, type; + * } aes_key_t; + * + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. + * + * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. + * ==================================================================== + * Mac OS X modifications + * 1. Removed CR0.TS / STTS / CLTS since the XNU kernel can apparently use floating point + * registers without this. + * + * ==================================================================== + */ + +#if defined(lint) || defined(__lint) + +#include + +/* ARGSUSED */ +void +aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} +/* ARGSUSED */ +int +rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} +/* ARGSUSED */ +int +rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} + + +#else /* lint */ + +#define _ASM +#include + +#if defined(_KERNEL) && !defined(__APPLE__) + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. + * This also occurs with the STTS macro, but we dont care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. + */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + +#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $(XMM_SIZE * 2), %rsp; \ + movaps %xmm0, 16(%rsp); \ + movaps %xmm1, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + /* + * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack, + * otherwise set CR0_TS. 
+ */ +#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm1; \ + movaps 16(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $(XMM_SIZE * 7), %rsp; \ + movaps %xmm0, 96(%rsp); \ + movaps %xmm1, 80(%rsp); \ + movaps %xmm2, 64(%rsp); \ + movaps %xmm3, 48(%rsp); \ + movaps %xmm4, 32(%rsp); \ + movaps %xmm5, 16(%rsp); \ + movaps %xmm6, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack, + * otherwise set CR0_TS. + */ +#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm6; \ + movaps 16(%rsp), %xmm5; \ + movaps 32(%rsp), %xmm4; \ + movaps 48(%rsp), %xmm3; \ + movaps 64(%rsp), %xmm2; \ + movaps 80(%rsp), %xmm1; \ + movaps 96(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) +#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) +#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) +#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) +#endif /* _KERNEL */ + + +/* + * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), + * _key_expansion_256a(), _key_expansion_256b() + * + * Helper functions called by rijndael_key_setup_inc_intel(). + * Also used indirectly by rijndael_key_setup_dec_intel(). + * + * Input: + * %xmm0 User-provided cipher key + * %xmm1 Round constant + * Output: + * (%rcx) AES key + */ + +.align 4, 0x90 +_key_expansion_128: +_key_expansion_256a: + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + movups %xmm0, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_128) + SET_SIZE(_key_expansion_256a) + +.align 4, 0x90 +_key_expansion_192a: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movups %xmm2, %xmm5 + movups %xmm2, %xmm6 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movups %xmm0, %xmm1 + shufps $0b01000100, %xmm0, %xmm6 + movups %xmm6, (%rcx) + shufps $0b01001110, %xmm2, %xmm1 + movups %xmm1, 0x10(%rcx) + add $0x20, %rcx + ret + SET_SIZE(_key_expansion_192a) + +.align 4, 0x90 +_key_expansion_192b: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movups %xmm2, %xmm5 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movups %xmm0, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_192b) + +.align 4, 0x90 +_key_expansion_256b: + pshufd $0b10101010, %xmm1, %xmm1 + shufps $0b00010000, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + shufps $0b10001100, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + pxor %xmm1, %xmm2 + movups %xmm2, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_256b) + + +/* + * rijndael_key_setup_enc_intel() + * Expand the cipher key into the encryption key schedule. 
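+ *
+ * A hypothetical OpenSolaris-style caller, sketched only to show the
+ * argument order and return convention documented below:
+ *
+ *	uint32_t ks[(14 + 1) * 4];
+ *	int nr = rijndael_key_setup_enc_intel(ks, cipherKey, 256);
+ *
+ * nr is 14 here on success, or 0 on error.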
+ * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. + * + * Original Intel OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. + */ + +#ifdef OPENSSL_INTERFACE +#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key +#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key + +#define USERCIPHERKEY rdi /* P1, 64 bits */ +#define KEYSIZE32 esi /* P2, 32 bits */ +#define KEYSIZE64 rsi /* P2, 64 bits */ +#define AESKEY rdx /* P3, 64 bits */ + +#else /* OpenSolaris Interface */ +#define AESKEY rdi /* P1, 64 bits */ +#define USERCIPHERKEY rsi /* P2, 64 bits */ +#define KEYSIZE32 edx /* P3, 32 bits */ +#define KEYSIZE64 rdx /* P3, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define ROUNDS32 KEYSIZE32 /* temp */ +#define ROUNDS64 KEYSIZE64 /* temp */ +#define ENDAESKEY USERCIPHERKEY /* temp */ + +ENTRY_NP(rijndael_key_setup_enc_intel) +rijndael_key_setup_enc_intel_local: + CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10) + + // NULL pointer sanity check + test %USERCIPHERKEY, %USERCIPHERKEY + jz .Lenc_key_invalid_param + test %AESKEY, %AESKEY + jz .Lenc_key_invalid_param + + movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) + movups %xmm0, (%AESKEY) + lea 0x10(%AESKEY), %rcx // key addr + pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x + + cmp $256, %KEYSIZE32 + jnz .Lenc_key192 + + // AES 256: 14 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $14, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 +#endif /* OPENSSL_INTERFACE */ + + movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) + movups %xmm2, (%rcx) + add $0x10, %rcx + + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x1, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x2, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x4, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x8, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x10, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x20, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* Open Solaris Interface */ + mov $14, %rax // return # rounds = 14 +#endif + ret + +.align 4 +.Lenc_key192: + cmp $192, %KEYSIZE32 + jnz .Lenc_key128 + + // AES 192: 12 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $12, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // 
key.rounds = 12 +#endif /* OPENSSL_INTERFACE */ + + movq 0x10(%USERCIPHERKEY), %xmm2 // other user key + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $12, %rax // return # rounds = 12 +#endif + ret + +.align 4 +.Lenc_key128: + cmp $128, %KEYSIZE32 + jnz .Lenc_key_invalid_key_bits + + // AES 128: 10 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $10, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 +#endif /* OPENSSL_INTERFACE */ + + aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $10, %rax // return # rounds = 10 +#endif + ret + +.Lenc_key_invalid_param: +#ifdef OPENSSL_INTERFACE + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) + mov $-1, %rax // user key or AES key pointer is NULL + ret +#else + /* FALLTHROUGH */ +#endif /* OPENSSL_INTERFACE */ + +.Lenc_key_invalid_key_bits: + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + mov $-2, %rax // keysize is invalid +#else /* Open Solaris Interface */ + xor %rax, %rax // a key pointer is NULL or invalid keysize +#endif /* OPENSSL_INTERFACE */ + + ret + SET_SIZE(rijndael_key_setup_enc_intel) + + +/* + * rijndael_key_setup_dec_intel() + * Expand the cipher key into the decryption key schedule. + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. 
+ * P1->P2, P2->P3, P3->P1 + * + * Original Intel OpenSSL interface: + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. + */ +ENTRY_NP(rijndael_key_setup_dec_intel) + // Generate round keys used for encryption + call rijndael_key_setup_enc_intel_local + test %rax, %rax +#ifdef OPENSSL_INTERFACE + jnz .Ldec_key_exit // Failed if returned non-0 +#else /* OpenSolaris Interface */ + jz .Ldec_key_exit // Failed if returned 0 +#endif /* OPENSSL_INTERFACE */ + + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + /* + * Convert round keys used for encryption + * to a form usable for decryption + */ +#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ + mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) + // (already set for OpenSSL) +#endif + + lea 0x10(%AESKEY), %rcx // key addr + shl $4, %ROUNDS32 + add %AESKEY, %ROUNDS64 + mov %ROUNDS64, %ENDAESKEY + +.align 4 +.Ldec_key_reorder_loop: + movups (%AESKEY), %xmm0 + movups (%ROUNDS64), %xmm1 + movups %xmm0, (%ROUNDS64) + movups %xmm1, (%AESKEY) + lea 0x10(%AESKEY), %AESKEY + lea -0x10(%ROUNDS64), %ROUNDS64 + cmp %AESKEY, %ROUNDS64 + ja .Ldec_key_reorder_loop + +.align 4 +.Ldec_key_inv_loop: + movups (%rcx), %xmm0 + // Convert an encryption round key to a form usable for decryption + // with the "AES Inverse Mix Columns" instruction + aesimc %xmm0, %xmm1 + movups %xmm1, (%rcx) + lea 0x10(%rcx), %rcx + cmp %ENDAESKEY, %rcx + jnz .Ldec_key_inv_loop + + SET_TS_OR_POP_XMM0_XMM1(%r10) + +.Ldec_key_exit: + // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error + // OpenSSL: rax = 0 for OK, or non-zero for error + ret + SET_SIZE(rijndael_key_setup_dec_intel) + + +/* + * aes_encrypt_intel() + * Encrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
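+ *
+ * A hypothetical call using the OpenSolaris parameter order documented
+ * below, where ks and nr come from rijndael_key_setup_enc_intel():
+ *
+ *	aes_encrypt_intel(ks, nr, pt, ct);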
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]) + * + * Original Intel OpenSSL Interface: + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key) + */ + +#ifdef OPENSSL_INTERFACE +#define aes_encrypt_intel intel_AES_encrypt +#define aes_decrypt_intel intel_AES_decrypt + +#define INP rdi /* P1, 64 bits */ +#define OUTP rsi /* P2, 64 bits */ +#define KEYP rdx /* P3, 64 bits */ + +/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ +#define NROUNDS32 ecx /* temporary, 32 bits */ +#define NROUNDS cl /* temporary, 8 bits */ + +#else /* OpenSolaris Interface */ +#define KEYP rdi /* P1, 64 bits */ +#define NROUNDS esi /* P2, 32 bits */ +#define INP rdx /* P3, 64 bits */ +#define OUTP rcx /* P4, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define STATE xmm0 /* temporary, 128 bits */ +#define KEY xmm1 /* temporary, 128 bits */ + +ENTRY_NP(aes_encrypt_intel) + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + movups (%INP), %STATE // input + movups (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Lenc128 + lea 0x20(%KEYP), %KEYP + je .Lenc192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movups -0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x50(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc192: + // AES 192 and 256 + movups -0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x30(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc128: + // AES 128, 192, and 256 + movups -0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movups (%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x30(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x50(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x70(%KEYP), %KEY + aesenclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + SET_TS_OR_POP_XMM0_XMM1(%r10) + ret + SET_SIZE(aes_encrypt_intel) + + +/* + * aes_decrypt_intel() + * Decrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original Intel OpenSSL Interface: + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + */ +ENTRY_NP(aes_decrypt_intel) + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + movups (%INP), %STATE // input + movups (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Ldec128 + lea 0x20(%KEYP), %KEYP + je .Ldec192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movups -0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x50(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec192: + // AES 192 and 256 + movups -0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x30(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec128: + // AES 128, 192, and 256 + movups -0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movups (%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x30(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x50(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x70(%KEYP), %KEY + aesdeclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + SET_TS_OR_POP_XMM0_XMM1(%r10) + ret + SET_SIZE(aes_decrypt_intel) + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/os/macos/aes/aes_amd64.S b/module/icp/asm-x86_64/os/macos/aes/aes_amd64.S new file mode 100644 index 0000000000..cdd9a861be --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/aes/aes_amd64.S @@ -0,0 +1,900 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue 20/12/2007 + * + * I am grateful to Dag Arne Osvik for many discussions of the techniques that + * can be used to optimise AES assembler code on AMD64/EM64T architectures. + * Some of the techniques used in this implementation are the result of + * suggestions made by him for which I am most grateful. + * + * An AES implementation for AMD64 processors using the YASM assembler. This + * implementation provides only encryption, decryption and hence requires key + * scheduling support in C. 
It uses 8k bytes of tables but its encryption and + * decryption performance is very close to that obtained using large tables. + * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, + * which are as follows: + * ms windows gnu/linux/opensolaris os + * + * in_blk rcx rdi + * out_blk rdx rsi + * context (cx) r8 rdx + * + * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 + * registers rdi - on both + * + * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 + * registers - rdi on both + * + * The convention used here is that for gnu/linux/opensolaris os. + * + * This code provides the standard AES block size (128 bits, 16 bytes) and the + * three standard AES key sizes (128, 192 and 256 bits). It has the same call + * interface as my C implementation. It uses the Microsoft C AMD64 calling + * conventions in which the three parameters are placed in rcx, rdx and r8 + * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. + * + * OpenSolaris Note: + * Modified to use GNU/Linux/Solaris calling conventions. + * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. + * + * AES_RETURN aes_encrypt(const unsigned char in_blk[], + * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt(const unsigned char in_blk[], + * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_encrypt_key(const unsigned char key[], + * const aes_encrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt_key(const unsigned char key[], + * const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_encrypt_key(const unsigned char key[], + * unsigned int len, const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt_key(const unsigned char key[], + * unsigned int len, const aes_decrypt_ctx cx[1])/ + * + * where is 128, 102 or 256. In the last two calls the length can be in + * either bits or bytes. + * + * Comment in/out the following lines to obtain the desired subroutines. These + * selections MUST match those in the C header file aesopt.h + */ +#define AES_REV_DKS /* define if key decryption schedule is reversed */ + +#define LAST_ROUND_TABLES /* define for the faster version using extra tables */ + +/* + * The encryption key schedule has the following in memory layout where N is the + * number of rounds (10, 12 or 14): + * + * lo: | input key (round 0) | / each round is four 32-bit words + * | encryption round 1 | + * | encryption round 2 | + * .... + * | encryption round N-1 | + * hi: | encryption round N | + * + * The decryption key schedule is normally set up so that it has the same + * layout as above by actually reversing the order of the encryption key + * schedule in memory (this happens when AES_REV_DKS is set): + * + * lo: | decryption round 0 | = | encryption round N | + * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] + * .... .... 
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] + * hi: | decryption round N | = | input key (round 0) | + * + * with rounds except the first and last modified using inv_mix_column() + * But if AES_REV_DKS is NOT set the order of keys is left as it is for + * encryption so that it has to be accessed in reverse when used for + * decryption (although the inverse mix column modifications are done) + * + * lo: | decryption round 0 | = | input key (round 0) | + * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] + * .... .... + * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] + * hi: | decryption round N | = | encryption round N | + * + * This layout is faster when the assembler key scheduling provided here + * is used. + * + * End of user defines + */ + +/* + * --------------------------------------------------------------------------- + * OpenSolaris OS modifications + * + * This source originates from Brian Gladman file aes_amd64.asm + * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip + * with these changes: + * + * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and + * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, + * AES_128, AES_192, AES_256, AES_VAR ifdefs. + * + * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define + * + * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef + * + * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax + * (operands reversed, literals prefixed with "$", registers prefixed with "%", + * and "[register+offset]", addressing changed to "offset(register)", + * parenthesis in constant expressions "()" changed to square brackets "[]", + * "." removed from local (numeric) labels, and other changes. + * Examples: + * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax + * mov rax,(4*20h) mov $[4*0x20],%rax + * mov rax,[ebx+20h] mov 0x20(%ebx),%rax + * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax + * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax + * + * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 6. Renamed functions and reordered parameters to match OpenSolaris: + * Original Gladman interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, + * and a union type, inf., containing inf.l, a uint32_t and + * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is + * used and contains the key schedule length * 16 where key schedule length is + * 10, 12, or 14 bytes. + * + * OpenSolaris OS interface: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ + * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. 
+ */ + +#if defined(lint) || defined(__lint) + +#include +/* ARGSUSED */ +void +aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} + + +#else + +#define _ASM +#include + +#define KS_LENGTH 60 + +#define raxd eax +#define rdxd edx +#define rcxd ecx +#define rbxd ebx +#define rsid esi +#define rdid edi + +#define raxb al +#define rdxb dl +#define rcxb cl +#define rbxb bl +#define rsib sil +#define rdib dil + +// finite field multiplies by {02}, {04} and {08} + +#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) + +// finite field multiplies required in table generation + +#define f3(x) ((f2(x)) ^ (x)) +#define f9(x) ((f8(x)) ^ (x)) +#define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) +#define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) +#define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) + +// macros for expanding S-box data + +#define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) +#define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) +#define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 + +#define enc_vals(x) \ + .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ + .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ + .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ + .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ + .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ + .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ + .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ + .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ + .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ + .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ + .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ + .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ + .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ + .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ + .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ + .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ + .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ + .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ + .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ + .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ + .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ + .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ + .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ + .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ + .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ + .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ + .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ + .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ + .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ + .byte 
x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ + .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ + .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) + +#define dec_vals(x) \ + .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ + .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ + .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ + .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ + .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ + .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ + .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ + .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ + .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ + .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ + .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ + .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ + .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ + .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ + .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ + .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ + .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ + .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ + .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ + .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ + .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ + .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ + .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ + .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ + .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ + .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ + .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ + .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ + .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ + .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ + .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ + .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) + +#define tptr %rbp /* table pointer */ +#define kptr %r8 /* key schedule pointer */ +#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ +#define fk_ref(x, y) -16*x+fofs+4*y(kptr) + +#ifdef AES_REV_DKS +#define rofs 128 +#define ik_ref(x, y) -16*x+rofs+4*y(kptr) + +#else +#define rofs -128 +#define ik_ref(x, y) 16*x+rofs+4*y(kptr) +#endif /* AES_REV_DKS */ + +#define tab_0(x) (tptr,x,8) +#define tab_1(x) 3(tptr,x,8) +#define tab_2(x) 2(tptr,x,8) +#define tab_3(x) 1(tptr,x,8) +#define tab_f(x) 1(tptr,x,8) +#define tab_i(x) 7(tptr,x,8) + +#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor 
tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + add $2048, tptr; \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1 + +#else + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p1 + +#endif /* LAST_ROUND_TABLES */ + +#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + 
mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + add $2048, tptr; \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3 + +#else + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %eax; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ebx; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; 
\ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p3 + +#endif /* LAST_ROUND_TABLES */ + +/* + * OpenSolaris OS: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ + .align 6, 0x90 +enc_tab: + enc_vals(u8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + enc_vals(w8) +#endif + + + ENTRY_NP(aes_encrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // OpenSolaris OS interface + sub $(4*8), %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea enc_tab(%rip), tptr + sub $fofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + + xor fofs(kptr), %eax + xor fofs+4(kptr), %ebx + xor fofs+8(kptr), %ecx + xor fofs+12(kptr), %edx + + lea (kptr,%rsi), kptr + // Jump based on byte key length * 16: + cmp $(10*16), %esi + je 3f + cmp $(12*16), %esi + je 2f + cmp $(14*16), %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal forward rounds +1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) + fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $(4*8), %rsp + ret + + SET_SIZE(aes_encrypt_amd64) + +/* + * OpenSolaris OS: + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ + .align 6, 0x90 +dec_tab: + dec_vals(v8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + dec_vals(w8) +#endif + + + ENTRY_NP(aes_decrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // 
OpenSolaris OS interface + sub $(4*8), %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea dec_tab(%rip), tptr + sub $rofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + +#ifdef AES_REV_DKS + mov kptr, %rdi + lea (kptr,%rsi), kptr +#else + lea (kptr,%rsi), %rdi +#endif + + xor rofs(%rdi), %eax + xor rofs+4(%rdi), %ebx + xor rofs+8(%rdi), %ecx + xor rofs+12(%rdi), %edx + + // Jump based on byte key length * 16: + cmp $(10*16), %esi + je 3f + cmp $(12*16), %esi + je 2f + cmp $(14*16), %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal inverse rounds +1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) + il_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $(4*8), %rsp + ret + + SET_SIZE(aes_decrypt_amd64) +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S new file mode 100644 index 0000000000..20f4d14c78 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009 Intel Corporation + * All Rights Reserved. + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains an accelerated + * Galois Field Multiplication implementation. + * + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, + * carry-less multiplication. 
More information about PCLMULQDQ can be + * found at: + * http://software.intel.com/en-us/articles/ + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as file galois_hash_asm.c from + * Intel Corporation dated September 21, 2009. + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function + * definition for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Removed code to perform hashing. This is already done with C macro + * GHASH in gcm.c. For better performance, this removed code should be + * reintegrated in the future to replace the C GHASH macro. + * + * 5. Added code to byte swap 16-byte input and output. + * + * 6. Folded in comments from the original C source with embedded assembly + * (SB_w_shift_xor.c) + * + * 7. Renamed function and reordered parameters to match OpenSolaris: + * Intel interface: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * OpenSolaris OS interface: + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) + +#include + +/* ARGSUSED */ +void +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { +} + +#else /* lint */ + +#define _ASM +#include + +#if defined(_KERNEL) && !defined(__APPLE__) + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. + * This also occurs with the STTS macro, but we dont care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. + */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $(XMM_SIZE * 11), %rsp; \ + movaps %xmm0, 160(%rsp); \ + movaps %xmm1, 144(%rsp); \ + movaps %xmm2, 128(%rsp); \ + movaps %xmm3, 112(%rsp); \ + movaps %xmm4, 96(%rsp); \ + movaps %xmm5, 80(%rsp); \ + movaps %xmm6, 64(%rsp); \ + movaps %xmm7, 48(%rsp); \ + movaps %xmm8, 32(%rsp); \ + movaps %xmm9, 16(%rsp); \ + movaps %xmm10, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack, + * otherwise set CR0_TS. 
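+	 *
+	 * Usage sketch (illustrative only, with a hypothetical function name):
+	 * the two macros are intended to bracket any code that clobbers %xmm
+	 * registers, as gcm_mul_pclmulqdq below does:
+	 *
+	 *	ENTRY_NP(some_xmm_user)
+	 *	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
+	 *	// ... work that uses %xmm0 - %xmm10 ...
+	 *	SET_TS_OR_POP_XMM_REGISTERS(%r10)
+	 *	ret
+	 *	SET_SIZE(some_xmm_user)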
+ */ +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm10; \ + movaps 16(%rsp), %xmm9; \ + movaps 32(%rsp), %xmm8; \ + movaps 48(%rsp), %xmm7; \ + movaps 64(%rsp), %xmm6; \ + movaps 80(%rsp), %xmm5; \ + movaps 96(%rsp), %xmm4; \ + movaps 112(%rsp), %xmm3; \ + movaps 128(%rsp), %xmm2; \ + movaps 144(%rsp), %xmm1; \ + movaps 160(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) +#endif /* _KERNEL */ + +/* + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction + */ + +// static uint8_t byte_swap16_mask[] = { +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; +.text +.align XMM_ALIGN_LOG +.Lbyte_swap16_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + + + +/* + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on P1 and P2 and place the result in P3. + * + * Byte swap the input and the output. + * + * Note: x_in, y, and res all point to a block of 20-byte numbers + * (an array of two 64-bit integers). + * + * Note2: For kernel code, caller is responsible for ensuring + * kpreempt_disable() has been called. This is because %xmm registers are + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, + * respectively, if TS is set on entry. Otherwise, if TS is not set, + * save and restore %xmm registers on the stack. + * + * Note3: Original Intel definition: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * + * Note4: Register/parameter mapping: + * Intel: + * Parameter 1: %rcx (copied to %xmm0) hk or x_in + * Parameter 2: %rdx (copied to %xmm1) s or y + * Parameter 3: %rdi (result) d or res + * OpenSolaris: + * Parameter 1: %rdi (copied to %xmm0) x_in + * Parameter 2: %rsi (copied to %xmm1) y + * Parameter 3: %rdx (result) res + */ + +ENTRY_NP(gcm_mul_pclmulqdq) + CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10) + + // + // Copy Parameters + // + movdqu (%rdi), %xmm0 // P1 + movdqu (%rsi), %xmm1 // P2 + + // + // Byte swap 16-byte input + // + lea .Lbyte_swap16_mask(%rip), %rax + movups (%rax), %xmm10 + pshufb %xmm10, %xmm0 + pshufb %xmm10, %xmm1 + + + // + // Multiply with the hash key + // + movdqu %xmm0, %xmm3 + pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 + + movdqu %xmm0, %xmm4 + pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 + + movdqu %xmm0, %xmm5 + pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 + movdqu %xmm0, %xmm6 + pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 + + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 + + movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 + psrldq $8, %xmm4 // shift by xmm4 64 bits to the right + pslldq $8, %xmm5 // shift by xmm5 64 bits to the left + pxor %xmm5, %xmm3 + pxor %xmm4, %xmm6 // Register pair holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. 
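+	// The 256-bit carry-less product now sits in xmm6:xmm3.  GHASH keeps
+	// field elements bit-reflected, so the product of two reflected
+	// operands lands one bit short of the reflected result; the 1-bit
+	// left shift across xmm6:xmm3 below restores the alignment.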
+ movdqu %xmm3, %xmm7 + movdqu %xmm6, %xmm8 + pslld $1, %xmm3 + pslld $1, %xmm6 + psrld $31, %xmm7 + psrld $31, %xmm8 + movdqu %xmm7, %xmm9 + pslldq $4, %xmm8 + pslldq $4, %xmm7 + psrldq $12, %xmm9 + por %xmm7, %xmm3 + por %xmm8, %xmm6 + por %xmm9, %xmm6 + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + movdqu %xmm3, %xmm7 + movdqu %xmm3, %xmm8 + movdqu %xmm3, %xmm9 + pslld $31, %xmm7 // packed right shift shifting << 31 + pslld $30, %xmm8 // packed right shift shifting << 30 + pslld $25, %xmm9 // packed right shift shifting << 25 + pxor %xmm8, %xmm7 // xor the shifted versions + pxor %xmm9, %xmm7 + movdqu %xmm7, %xmm8 + pslldq $12, %xmm7 + psrldq $4, %xmm8 + pxor %xmm7, %xmm3 // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. + movdqu %xmm3, %xmm2 + movdqu %xmm3, %xmm4 // packed left shifting >> 1 + movdqu %xmm3, %xmm5 + psrld $1, %xmm2 + psrld $2, %xmm4 // packed left shifting >> 2 + psrld $7, %xmm5 // packed left shifting >> 7 + pxor %xmm4, %xmm2 // xor the shifted versions + pxor %xmm5, %xmm2 + pxor %xmm8, %xmm2 + pxor %xmm2, %xmm3 + pxor %xmm3, %xmm6 // the result is in xmm6 + + // + // Byte swap 16-byte result + // + pshufb %xmm10, %xmm6 // %xmm10 has the swap mask + + // + // Store the result + // + movdqu %xmm6, (%rdx) // P3 + + + // + // Cleanup and Return + // + SET_TS_OR_POP_XMM_REGISTERS(%r10) + ret + SET_SIZE(gcm_mul_pclmulqdq) + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S new file mode 100644 index 0000000000..cb923784a7 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S @@ -0,0 +1,1353 @@ +/* + * !/usr/bin/env perl + * + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. The module is, however, dual licensed under OpenSSL and + * CRYPTOGAMS licenses depending on where you obtain it. For further + * details see http://www.openssl.org/~appro/cryptogams/. + * ==================================================================== + * + * sha1_block procedure for x86_64. + * + * It was brought to my attention that on EM64T compiler-generated code + * was far behind 32-bit assembler implementation. This is unlike on + * Opteron where compiler-generated code was only 15% behind 32-bit + * assembler, which originally made it hard to motivate the effort. + * There was suggestion to mechanically translate 32-bit code, but I + * dismissed it, reasoning that x86_64 offers enough register bank + * capacity to fully utilize SHA-1 parallelism. Therefore this fresh + * implementation:-) However! While 64-bit code does performs better + * on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, + * x86_64 does offer larger *addressable* bank, but out-of-order core + * reaches for even more registers through dynamic aliasing, and EM64T + * core must have managed to run-time optimize even 32-bit code just as + * good as 64-bit one. Performance improvement is summarized in the + * following table: + * + * gcc 3.4 32-bit asm cycles/byte + * Opteron +45% +20% 6.8 + * Xeon P4 +65% +0% 9.9 + * Core2 +60% +10% 7.0 + * + * + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. 
+ * + * This source originates from OpenSSL file sha1-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). + * + */ + +/* + * This file was generated by a perl script (sha1-x86_64.pl). The comments from + * the original file have been pasted above. + */ + +#if defined(lint) || defined(__lint) +#include +#include + + +/* ARGSUSED */ +void +sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks) +{ +} + +#else +#define _ASM +#include +ENTRY_NP(sha1_block_data_order) + push %rbx + push %rbp + push %r12 + mov %rsp,%rax + mov %rdi,%r8 # reassigned argument + sub $72,%rsp + mov %rsi,%r9 # reassigned argument + and $-64,%rsp + mov %rdx,%r10 # reassigned argument + mov %rax,64(%rsp) + + mov 0(%r8),%edx + mov 4(%r8),%esi + mov 8(%r8),%edi + mov 12(%r8),%ebp + mov 16(%r8),%r11d +.align 4 +.Lloop: + mov 0(%r9),%eax + bswap %eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 4(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,4(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 8(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,8(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 12(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,12(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 16(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,16(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 20(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,20(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 24(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,24(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 28(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,28(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 32(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,32(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 36(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,36(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 40(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,40(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + 
lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 44(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,44(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 48(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,48(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 52(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,52(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 56(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,56(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 60(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,60(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%esi + xor 32(%rsp),%eax + and %r11d,%ebx + add %esi,%edi + xor 52(%rsp),%eax + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edx + xor 36(%rsp),%eax + and %ebp,%ebx + add %edx,%esi + xor 56(%rsp),%eax + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea 0x5a827999(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + and %edi,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + and %esi,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea 0x5a827999(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + and %edx,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + 
rol $5,%r12d + xor 0(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,52(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,56(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,60(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 0(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 32(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 52(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,0(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 4(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 36(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 56(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,4(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 8(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 40(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 60(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,8(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 12(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 44(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 0(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,12(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 16(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 48(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 4(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 20(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 
28(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 52(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 8(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 24(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 32(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 56(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 12(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 28(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 36(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 60(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 16(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 32(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 40(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 0(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 20(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,32(%rsp) + lea -0x70e44324(%eax,%edx),%esi + mov 36(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 44(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 4(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 24(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,36(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 40(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 48(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 8(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 28(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,40(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 44(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 52(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 12(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 32(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,44(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 48(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 56(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 16(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 36(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,48(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 52(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 60(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 20(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 40(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,52(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 56(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 0(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 24(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 44(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,56(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 60(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 4(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 28(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 48(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,60(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 0(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 8(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 32(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 52(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi 
+ mov %eax,0(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 4(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 12(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 36(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 56(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,4(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 8(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 16(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 40(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 60(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,8(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 12(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 20(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 44(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 0(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,12(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 16(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 24(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 48(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 4(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,16(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 20(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 28(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 52(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 8(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,20(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 24(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 32(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 56(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 12(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,24(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 28(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 36(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 60(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 16(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,28(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 32(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 40(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 0(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 20(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,32(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 36(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 44(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 4(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 24(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,36(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 40(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 48(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 8(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 28(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,40(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 44(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 52(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 12(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 32(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov 
%eax,44(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 48(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 56(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 16(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 36(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,48(%rsp) + add %ebx,%edx + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 52(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 60(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 40(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,52(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 56(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 0(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 44(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,56(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 60(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 4(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 28(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 48(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,60(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 32(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 52(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 36(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 56(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 0(%rsp),%eax + xor 
%r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + lea -0x359d3e2a(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + lea -0x359d3e2a(%eax,%ebp),%r11d + mov %esi,%ebx + mov %r12d,%ebp + xor %edx,%ebx + rol $5,%ebp + xor %edi,%ebx + add %ebp,%r11d + rol $30,%edx + add %ebx,%r11d + // Update and save state information in SHA-1 context + add 0(%r8),%r11d + add 4(%r8),%r12d + add 8(%r8),%edx + add 12(%r8),%esi + add 16(%r8),%edi + mov %r11d,0(%r8) + mov %r12d,4(%r8) + mov %edx,8(%r8) + mov %esi,12(%r8) + mov %edi,16(%r8) + + xchg %r11d,%edx # mov %r11d,%edx + xchg %r12d,%esi # mov %r12d,%esi + xchg %r11d,%edi # mov %edx,%edi + xchg %r12d,%ebp # mov %esi,%ebp + # mov %edi,%r11d + lea 64(%r9),%r9 + sub $1,%r10 + jnz .Lloop + mov 64(%rsp),%rsp + pop %r12 + pop %rbp + pop %rbx + ret +SET_SIZE(sha1_block_data_order) + +.data +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " + +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S b/module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S new file mode 100644 index 0000000000..0b0f3444fa --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/sha2/sha256_impl.S @@ -0,0 +1,2058 @@ + +/* + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. 
On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. All I had to do is to get one flavor right, + * the other one passed the test right away:-) + * + * sha256_block runs in ~1005 cycles on Opteron, which gives you + * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock + * frequency in GHz. sha512_block runs in ~1275 cycles, which results + * in 128*1000/1275=100MBps per GHz. Is there room for improvement? + * Well, if you compare it to IA-64 implementation, which maintains + * X[16] in register bank[!], tends to 4 instructions per CPU clock + * cycle and runs in 1003 cycles, 1275 is very good result for 3-way + * issue Opteron pipeline and X[16] maintained in memory. So that *if* + * there is a way to improve it, *then* the only way would be to try to + * offload X[16] updates to SSE unit, but that would require "deeper" + * loop unroll, which in turn would naturally cause size blow-up, not + * to mention increased complexity! And once again, only *if* it's + * actually possible to noticeably improve overall ILP, instruction + * level parallelism, on a given CPU implementation in this case. + * + * Special note on Intel EM64T. While Opteron CPU exhibits perfect + * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], + * [currently available] EM64T CPUs apparently are far from it. On the + * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit + * sha256_block:-( This is presumably because 64-bit shifts/rotates + * apparently are not atomic instructions, but implemented in microcode. + */ + +/* + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha512-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). Replaced the .picmeup macro with assembler code. + * + * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", + * at the beginning of SHA2_CTX (the next field is 8-byte aligned). + */ + +/* + * This file was generated by a perl script (sha512-x86_64.pl) that were + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above. 
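+ *
+ * Illustrative sketch of the layout assumed by change 4 above (the member
+ * names below are an assumption, not copied from the OpenSolaris headers):
+ *
+ *	typedef struct {
+ *		uint32_t algotype;	// OpenSolaris-specific leading field
+ *		// plus 4 bytes of padding so the next member is 8-byte aligned
+ *		// ... hash state and remaining members start at byte offset 8
+ *	} SHA2_CTX;
+ *
+ * which is why the code below does "add $8,%rdi" before loading the state
+ * words from the context.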
+ */ + +#if defined(lint) || defined(__lint) +#include +#include + +/* ARGSUSED */ +void +SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include + +ENTRY_NP(SHA256TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*4+4*8,%rsp + lea (%rsi,%rdx,4),%rdx # inp+num*16*4 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*4+3*8(%rsp) # save copy of %rsp + + //.picmeup %rbp + // The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + // the address of the "next" instruction into the target register + // (%rbp). This generates these 2 instructions: + lea .Llea(%rip),%rbp + //nop // .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K256-.(%rbp),%rbp + + mov 4*0(%rdi),%eax + mov 4*1(%rdi),%ebx + mov 4*2(%rdi),%ecx + mov 4*3(%rdi),%edx + mov 4*4(%rdi),%r8d + mov 4*5(%rdi),%r9d + mov 4*6(%rdi),%r10d + mov 4*7(%rdi),%r11d + jmp .Lloop + +.align 4, 0x90 +.Lloop: + xor %rdi,%rdi + mov 4*0(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*1(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*2(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # 
a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*3(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*4(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*5(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*6(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # 
Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*7(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 4*8(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*9(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*10(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*11(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + 
mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*12(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*13(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*14(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*15(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d 
# Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 4, 0x90 +.Lrounds_16_xx: + mov 4(%rsp),%r13d + mov 56(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 36(%rsp),%r12d + + add 0(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 8(%rsp),%r13d + mov 60(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 40(%rsp),%r12d + + add 4(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 12(%rsp),%r13d + mov 0(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 44(%rsp),%r12d + + add 8(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov 
%r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 16(%rsp),%r13d + mov 4(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 48(%rsp),%r12d + + add 12(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 20(%rsp),%r13d + mov 8(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 52(%rsp),%r12d + + add 16(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 24(%rsp),%r13d + mov 12(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 56(%rsp),%r12d + + add 20(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + 
xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 28(%rsp),%r13d + mov 16(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 60(%rsp),%r12d + + add 24(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 32(%rsp),%r13d + mov 20(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 0(%rsp),%r12d + + add 28(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 36(%rsp),%r13d + mov 24(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 4(%rsp),%r12d + + add 32(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # 
Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 40(%rsp),%r13d + mov 28(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 8(%rsp),%r12d + + add 36(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 44(%rsp),%r13d + mov 32(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 12(%rsp),%r12d + + add 40(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 48(%rsp),%r13d + mov 36(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 16(%rsp),%r12d + + add 44(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) 
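(Reading aid, not part of the patch: every unrolled block in this file instantiates the same SHA-256 round; only the register assignment rotates from round to round. Below is a minimal C sketch of one round and of the .Lrounds_16_xx schedule step, assuming the standard FIPS 180-4 definitions that the per-instruction comments (Sigma1(e), Ch(e,f,g)=((f^g)&e)^g, Maj(a,b,c)=((a|c)&b)|(a&c), "T1+=K[round]") already spell out. The names rotr32, sha256_round, sha256_schedule, W and Kt are illustrative only and do not appear in this file.)

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return ((x >> n) | (x << (32 - n)));
    }

    /* The boolean/rotate pieces named in the per-instruction comments. */
    #define Ch(e, f, g)   ((((f) ^ (g)) & (e)) ^ (g))
    #define Maj(a, b, c)  ((((a) | (c)) & (b)) | ((a) & (c)))
    #define Sigma0(a)     (rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22))
    #define Sigma1(e)     (rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25))
    #define sigma0(x)     (rotr32(x, 7) ^ rotr32(x, 18) ^ ((x) >> 3))
    #define sigma1(x)     (rotr32(x, 17) ^ rotr32(x, 19) ^ ((x) >> 10))

    /*
     * One round.  The assembly keeps a..h in registers and renames them
     * between rounds instead of shifting an array, but it computes
     * exactly these two sums ("T1+=..." and "h=Sigma0(a)" / "h+=Maj").
     */
    static void
    sha256_round(uint32_t s[8], uint32_t Kt, uint32_t Wt)
    {
        uint32_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + Kt + Wt;
        uint32_t T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);

        s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
        s[4] = s[3] + T1;                 /* "d+=T1" */
        s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
        s[0] = T1 + T2;                   /* "h+=T1", "h+=Maj(a,b,c)" */
    }

    /* .Lrounds_16_xx: schedule expansion over the 16-word window on the stack. */
    static uint32_t
    sha256_schedule(const uint32_t W[16], int i)
    {
        return (sigma1(W[(i + 14) & 0xf]) + W[(i + 9) & 0xf] +
            sigma0(W[(i + 1) & 0xf]) + W[i & 0xf]);
    }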
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 52(%rsp),%r13d + mov 40(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 20(%rsp),%r12d + + add 48(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 56(%rsp),%r13d + mov 44(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 24(%rsp),%r12d + + add 52(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 60(%rsp),%r13d + mov 48(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 28(%rsp),%r12d + + add 56(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # 
Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 0(%rsp),%r13d + mov 52(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 32(%rsp),%r12d + + add 60(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + cmp $64,%rdi + jb .Lrounds_16_xx + + mov 16*4+0*8(%rsp),%rdi + lea 16*4(%rsi),%rsi + + add 4*0(%rdi),%eax + add 4*1(%rdi),%ebx + add 4*2(%rdi),%ecx + add 4*3(%rdi),%edx + add 4*4(%rdi),%r8d + add 4*5(%rdi),%r9d + add 4*6(%rdi),%r10d + add 4*7(%rdi),%r11d + + cmp 16*4+2*8(%rsp),%rsi + + mov %eax,4*0(%rdi) + mov %ebx,4*1(%rdi) + mov %ecx,4*2(%rdi) + mov %edx,4*3(%rdi) + mov %r8d,4*4(%rdi) + mov %r9d,4*5(%rdi) + mov %r10d,4*6(%rdi) + mov %r11d,4*7(%rdi) + jb .Lloop + + mov 16*4+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA256TransformBlocks) + +.align 6, 0x90 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +#endif /* !lint && !__lint */ diff --git a/module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S b/module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S new file mode 100644 index 0000000000..1b51f9d5b4 --- /dev/null +++ b/module/icp/asm-x86_64/os/macos/sha2/sha512_impl.S @@ -0,0 +1,2082 @@ +/* + * 
==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. All I had to do is to get one flavor right, + * the other one passed the test right away:-) + * + * sha256_block runs in ~1005 cycles on Opteron, which gives you + * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock + * frequency in GHz. sha512_block runs in ~1275 cycles, which results + * in 128*1000/1275=100MBps per GHz. Is there room for improvement? + * Well, if you compare it to IA-64 implementation, which maintains + * X[16] in register bank[!], tends to 4 instructions per CPU clock + * cycle and runs in 1003 cycles, 1275 is very good result for 3-way + * issue Opteron pipeline and X[16] maintained in memory. So that *if* + * there is a way to improve it, *then* the only way would be to try to + * offload X[16] updates to SSE unit, but that would require "deeper" + * loop unroll, which in turn would naturally cause size blow-up, not + * to mention increased complexity! And once again, only *if* it's + * actually possible to noticeably improve overall ILP, instruction + * level parallelism, on a given CPU implementation in this case. + * + * Special note on Intel EM64T. While Opteron CPU exhibits perfect + * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], + * [currently available] EM64T CPUs apparently are far from it. On the + * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit + * sha256_block:-( This is presumably because 64-bit shifts/rotates + * apparently are not atomic instructions, but implemented in microcode. + */ + +/* + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha512-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). Replaced the .picmeup macro with assembler code. + * + * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", + * at the beginning of SHA2_CTX (the next field is 8-byte aligned). + */ + +/* + * This file was generated by a perl script (sha512-x86_64.pl) that were + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above. 
+ */ + + +#if defined(lint) || defined(__lint) +#include +#include + +/* ARGSUSED */ +void +SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include + +ENTRY_NP(SHA512TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*8+4*8,%rsp + lea (%rsi,%rdx,8),%rdx # inp+num*16*8 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*8+3*8(%rsp) # save copy of %rsp + + //.picmeup %rbp + // The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + // the address of the "next" instruction into the target register + // (%rbp). This generates these 2 instructions: + lea .Llea(%rip),%rbp + //nop // .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K512-.(%rbp),%rbp + + mov 8*0(%rdi),%rax + mov 8*1(%rdi),%rbx + mov 8*2(%rdi),%rcx + mov 8*3(%rdi),%rdx + mov 8*4(%rdi),%r8 + mov 8*5(%rdi),%r9 + mov 8*6(%rdi),%r10 + mov 8*7(%rdi),%r11 + jmp .Lloop + +.align 4, 0x90 +.Lloop: + xor %rdi,%rdi + mov 8*0(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*1(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*2(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + 
or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*3(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*4(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*5(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*6(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*7(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor 
%r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 8*8(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*9(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*10(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*11(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and 
%r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*12(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*13(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*14(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*15(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 4, 0x90 +.Lrounds_16_xx: + mov 8(%rsp),%r13 + mov 112(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + 
+ xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 72(%rsp),%r12 + + add 0(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 16(%rsp),%r13 + mov 120(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 80(%rsp),%r12 + + add 8(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 24(%rsp),%r13 + mov 0(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 88(%rsp),%r12 + + add 16(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 32(%rsp),%r13 + mov 8(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 
96(%rsp),%r12 + + add 24(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 40(%rsp),%r13 + mov 16(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 104(%rsp),%r12 + + add 32(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 48(%rsp),%r13 + mov 24(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 112(%rsp),%r12 + + add 40(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 56(%rsp),%r13 + mov 32(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 120(%rsp),%r12 + + add 48(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + 
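(Reading aid, not part of the patch: this sha512 flavor follows the same round and schedule structure as the SHA-256 sketch earlier; the words are 64-bit, the schedule window is sixteen 8-byte slots at 0(%rsp)..120(%rsp), the table is K512, and SHA-512 runs 80 rounds rather than 64. Only the rotate/shift counts change; a minimal C sketch of just those changed pieces follows, with rotr64 and the macro names again being illustrative only.)

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
        return ((x >> n) | (x << (64 - n)));
    }

    /* Rotate/shift counts visible in the ror/shr instructions of this file. */
    #define Sigma0_512(a)  (rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39))
    #define Sigma1_512(e)  (rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41))
    #define sigma0_512(x)  (rotr64(x, 1) ^ rotr64(x, 8) ^ ((x) >> 7))
    #define sigma1_512(x)  (rotr64(x, 19) ^ rotr64(x, 61) ^ ((x) >> 6))
    /*
     * Ch and Maj keep the same boolean forms as in the SHA-256 sketch;
     * the round itself is unchanged apart from the 64-bit width.
     */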
xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 64(%rsp),%r13 + mov 40(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 0(%rsp),%r12 + + add 56(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 72(%rsp),%r13 + mov 48(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 8(%rsp),%r12 + + add 64(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 80(%rsp),%r13 + mov 56(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 16(%rsp),%r12 + + add 72(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 
+ ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 88(%rsp),%r13 + mov 64(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 24(%rsp),%r12 + + add 80(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 96(%rsp),%r13 + mov 72(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 32(%rsp),%r12 + + add 88(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 104(%rsp),%r13 + mov 80(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 40(%rsp),%r12 + + add 96(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b 
+ add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 112(%rsp),%r13 + mov 88(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 48(%rsp),%r12 + + add 104(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 120(%rsp),%r13 + mov 96(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 56(%rsp),%r12 + + add 112(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 0(%rsp),%r13 + mov 104(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 64(%rsp),%r12 + + add 120(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + cmp $80,%rdi + jb .Lrounds_16_xx + + mov 16*8+0*8(%rsp),%rdi + lea 
16*8(%rsi),%rsi + + add 8*0(%rdi),%rax + add 8*1(%rdi),%rbx + add 8*2(%rdi),%rcx + add 8*3(%rdi),%rdx + add 8*4(%rdi),%r8 + add 8*5(%rdi),%r9 + add 8*6(%rdi),%r10 + add 8*7(%rdi),%r11 + + cmp 16*8+2*8(%rsp),%rsi + + mov %rax,8*0(%rdi) + mov %rbx,8*1(%rdi) + mov %rcx,8*2(%rdi) + mov %rdx,8*3(%rdi) + mov %r8,8*4(%rdi) + mov %r9,8*5(%rdi) + mov %r10,8*6(%rdi) + mov %r11,8*7(%rdi) + jb .Lloop + + mov 16*8+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA512TransformBlocks) + +.align 6, 0x90 +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +#endif /* !lint && !__lint */ diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 503d4e48b4..b6de853b1b 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -1199,7 +1199,8 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); -int EMPTY_TASKQ(taskq_t *tq) +int +EMPTY_TASKQ(taskq_t *tq) { #ifdef _KERNEL return (tq->tq_lowest_id == tq->tq_next_id); diff --git a/module/os/macos/.gitignore b/module/os/macos/.gitignore new file mode 100644 index 0000000000..14140f47af --- /dev/null +++ b/module/os/macos/.gitignore @@ -0,0 +1 @@ +*.in diff --git a/module/os/macos/Makefile.am b/module/os/macos/Makefile.am new file mode 100644 index 0000000000..5a729e5ad2 --- /dev/null +++ b/module/os/macos/Makefile.am @@ -0,0 +1,6 @@ +# Makefile used only by macOS. Should define no dependencies for +# other platforms. 
+ +if BUILD_MACOS +SUBDIRS=kernel spl zfs +endif diff --git a/module/os/macos/README.md b/module/os/macos/README.md new file mode 100644 index 0000000000..45c21cf4a0 --- /dev/null +++ b/module/os/macos/README.md @@ -0,0 +1,8 @@ + +OpenZFS on OS X, the [macOS](https://openzfsonosx.org) port of [Open ZFS](https://openzfs.org) + +Please use the [OpenZFSOnOsX](https://github.com/openzfsonosx/openzfs) +repository for support, troubleshooting, and GitHub issues. + +For more information on compiling, please visit the +[wiki](https://openzfsonosx.org/wiki/Install#Initial_installation_from_source) diff --git a/module/os/macos/kernel/.gitignore b/module/os/macos/kernel/.gitignore new file mode 100644 index 0000000000..f0d4d14070 --- /dev/null +++ b/module/os/macos/kernel/.gitignore @@ -0,0 +1,5 @@ +allsymbols +kernelexports +kernelexports_32 +kernelexports_64 +kextsymboltool diff --git a/module/os/macos/kernel/Info.plist b/module/os/macos/kernel/Info.plist new file mode 100644 index 0000000000..5dd4c6b416 --- /dev/null +++ b/module/os/macos/kernel/Info.plist @@ -0,0 +1,34 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleExecutable + KernelExports + CFBundleGetInfoString + Mach Kernel Pseudoextension, Apple Computer Inc, 12.5.0 + CFBundleIdentifier + net.lundman.kernel.dependencies + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + Mach Kernel Pseudoextension + CFBundlePackageType + KEXT + CFBundleShortVersionString + 12.5.0 + CFBundleSignature + ???? + CFBundleVersion + 12.5.0 + OSBundleCompatibleVersion + 8.0.0d0 + OSKernelResource + + OSBundleAllowUserLoad + + OSBundleRequired + Root + + diff --git a/module/os/macos/kernel/Makefile.am b/module/os/macos/kernel/Makefile.am new file mode 100644 index 0000000000..499bc3ef3a --- /dev/null +++ b/module/os/macos/kernel/Makefile.am @@ -0,0 +1,25 @@ + +AUTOMAKE_OPTIONS = subdir-objects + +noinst_PROGRAMS = kextsymboltool + +kextsymboltool_SOURCES = \ + kextsymboltool.c + +kextsymboltool_CPPFLAGS = +kextsymboltool_CFLAGS = +kextsymboltool_LDFLAGS = -lstdc++ + +kernelexports: zfs.exports | kextsymboltool + ./kextsymboltool -arch x86_64 -import allsymbols -export zfs.exports -output kernelexports_64 + ./kextsymboltool -arch i386 -import allsymbols -export zfs.exports -output kernelexports_32 + lipo -create kernelexports_32 kernelexports_64 -output kernelexports + +clean: + rm -f kernelexports kernelexports_32 kernelexports_64 allsymbols + rm -f kextsymboltool.o kextsymboltool + +allsymbols: + $(NM) -gj $(MACH_KERNEL) > allsymbols + +all:kextsymboltool allsymbols kernelexports diff --git a/module/os/macos/kernel/README.txt b/module/os/macos/kernel/README.txt new file mode 100644 index 0000000000..104f668bbe --- /dev/null +++ b/module/os/macos/kernel/README.txt @@ -0,0 +1,10 @@ + +Not all symbols are exported by default in OS X, and we have to do +a little magic to get around that. + +This uses Apple's open-source kextsymboltool.c utility, and a dump of all the +symbols in the kernel, to produce a link helper kext. + +We most likely need to improve it to handle kernel versions +a little more flexibly. + diff --git a/module/os/macos/kernel/kextsymboltool.c b/module/os/macos/kernel/kextsymboltool.c new file mode 100644 index 0000000000..19ffb2f306 --- /dev/null +++ b/module/os/macos/kernel/kextsymboltool.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#pragma mark Typedefs, Enums, Constants +/********************************************************************* +* Typedefs, Enums, Constants +*********************************************************************/ +typedef enum { + kErrorNone = 0, + kError, + kErrorFileAccess, + kErrorDiskFull, + kErrorDuplicate +} ToolError; + +#pragma mark Function Protos +/********************************************************************* +* Function Protos +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize); + +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length); + +extern char* __cxa_demangle (const char* mangled_name, + char* buf, + size_t* n, + int* status); + +#pragma mark Functions +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length) +{ + ToolError err; + + if (length != (size_t)write(fd, data, length)) + err = kErrorDiskFull; + else + err = kErrorNone; + + if (kErrorNone != err) + perror("couldn't write output"); + + return( err ); +} + +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) +{ + ToolError err = kErrorFileAccess; + int fd; + struct stat stat_buf; + + *objAddr = 0; + *objSize = 0; + + do + { + if((fd = open(path, O_RDONLY)) == -1) + continue; + + if(fstat(fd, &stat_buf) == -1) + continue; + + if (0 == (stat_buf.st_mode & S_IFREG)) + continue; + + /* Don't try to map an empty file, it fails now due to conformance + * stuff (PR 4611502). 
+ */ + if (0 == stat_buf.st_size) { + err = kErrorNone; + continue; + } + + *objSize = stat_buf.st_size; + + *objAddr = (vm_offset_t)mmap(NULL /* address */, *objSize, + PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE /* flags */, + fd, 0 /* offset */); + + if ((void *)*objAddr == MAP_FAILED) { + *objAddr = 0; + *objSize = 0; + continue; + } + + err = kErrorNone; + + } while( false ); + + if (-1 != fd) + { + close(fd); + } + if (kErrorNone != err) + { + fprintf(stderr, "couldn't read %s: %s\n", path, strerror(errno)); + } + + return( err ); +} + + +enum { kExported = 0x00000001, kObsolete = 0x00000002 }; + +struct symbol { + char * name; + unsigned int name_len; + char * indirect; + unsigned int indirect_len; + unsigned int flags; + struct symbol * list; + unsigned int list_count; +}; + +static bool issymchar( char c ) +{ + return ((c > ' ') && (c <= '~') && (c != ':') && (c != '#')); +} + +static bool iswhitespace( char c ) +{ + return ((c == ' ') || (c == '\t')); +} + +/* + * Function for qsort for comparing symbol list names. + */ +static int +qsort_cmp(const void * _left, const void * _right) +{ + struct symbol * left = (struct symbol *) _left; + struct symbol * right = (struct symbol *) _right; + + return (strcmp(left->name, right->name)); +} + +/* + * Function for bsearch for finding a symbol name. + */ + +static int +bsearch_cmp( const void * _key, const void * _cmp) +{ + char * key = (char *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strcmp(key, cmp->name)); +} + +struct bsearch_key +{ + char * name; + unsigned int name_len; +}; + +static int +bsearch_cmp_prefix( const void * _key, const void * _cmp) +{ + struct bsearch_key * key = (struct bsearch_key *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strncmp(key->name, cmp->name, key->name_len)); +} + +static uint32_t +count_symbols(char * file, vm_size_t file_size) +{ + uint32_t nsyms = 0; + char * scan; + char * eol; + char * next; + + for (scan = file; true; scan = next) { + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. */ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + nsyms++; + } + + return nsyms; +} + +static uint32_t +store_symbols(char * file, vm_size_t file_size, struct symbol * symbols, uint32_t idx, uint32_t max_symbols) +{ + char * scan; + char * line; + char * eol; + char * next; + + uint32_t strtabsize; + + strtabsize = 0; + + for (scan = file, line = file; true; scan = next, line = next) { + + char * name = NULL; + char * name_term = NULL; + unsigned int name_len = 0; + char * indirect = NULL; + char * indirect_term = NULL; + unsigned int indirect_len = 0; + char * option = NULL; + char * option_term = NULL; + unsigned int option_len = 0; + char optionstr[256]; + boolean_t obsolete = 0; + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + *eol = '\0'; + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. 
*/ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + + name = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + name_term = scan; + + /* Stored length must include the terminating nul char. + */ + name_len = name_term - name + 1; + + /* Now look for an indirect. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (*scan == ':') { + scan++; + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (issymchar(*scan)) { + indirect = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + indirect_term = scan; + + /* Stored length must include the terminating nul char. + */ + indirect_len = indirect_term - indirect + 1; + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } else if (*scan != '\0' && *scan != '-') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } + + /* Look for options. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + + if (*scan == '-') { + scan++; + + if (isalpha(*scan)) { + option = scan; + + /* Find the end of the option. + */ + while ((*scan != '\0') && isalpha(*scan)) { + scan++; + } + + /* Note char past end of option. + */ + option_term = scan; + option_len = option_term - option; + + if (option_len >= sizeof(optionstr)) { + fprintf(stderr, "option too long in symbol line: %s\n", line); + exit(1); + } + memcpy(optionstr, option, option_len); + optionstr[option_len] = '\0'; + + /* Find the option. + */ + if (!strncmp(optionstr, "obsolete", option_len)) { + obsolete = TRUE; + } + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + + } + + } + + if(idx >= max_symbols) { + fprintf(stderr, "symbol[%d/%d] overflow: %s\n", idx, max_symbols, line); + exit(1); + } + + *name_term = '\0'; + if (indirect_term) { + *indirect_term = '\0'; + } + + symbols[idx].name = name; + symbols[idx].name_len = name_len; + symbols[idx].indirect = indirect; + symbols[idx].indirect_len = indirect_len; + symbols[idx].flags = (obsolete) ? 
kObsolete : 0; + + strtabsize += symbols[idx].name_len + symbols[idx].indirect_len; + idx++; + } + + return strtabsize; +} + +/********************************************************************* +*********************************************************************/ +int main(int argc, char * argv[]) +{ + ToolError err; + int i, fd; + const char * output_name = NULL; + uint32_t zero = 0, num_files = 0; + uint32_t filenum; + uint32_t strx, strtabsize, strtabpad; + struct symbol * import_symbols; + struct symbol * export_symbols; + uint32_t num_import_syms, num_export_syms; + uint32_t result_count, num_removed_syms; + uint32_t import_idx, export_idx; + const NXArchInfo * host_arch; + const NXArchInfo * target_arch; + boolean_t require_imports = true; + boolean_t diff = false; + + + struct file { + vm_offset_t mapped; + vm_size_t mapped_size; + uint32_t nsyms; + boolean_t import; + const char * path; + }; + struct file files[64]; + + host_arch = NXGetLocalArchInfo(); + target_arch = host_arch; + + for( i = 1; i < argc; i += 2) + { + boolean_t import; + + if (!strcmp("-sect", argv[i])) + { + require_imports = false; + i--; + continue; + } + if (!strcmp("-diff", argv[i])) + { + require_imports = false; + diff = true; + i--; + continue; + } + + if (i == (argc - 1)) + { + fprintf(stderr, "bad arguments: %s\n", argv[i]); + exit(1); + } + + if (!strcmp("-arch", argv[i])) + { + target_arch = NXGetArchInfoFromName(argv[i + 1]); + if (!target_arch) + { + fprintf(stderr, "unknown architecture name: %s\n", argv[i+1]); + exit(1); + } + continue; + } + if (!strcmp("-output", argv[i])) + { + output_name = argv[i+1]; + continue; + } + + if (!strcmp("-import", argv[i])) + import = true; + else if (!strcmp("-export", argv[i])) + import = false; + else + { + fprintf(stderr, "unknown option: %s\n", argv[i]); + exit(1); + } + + err = readFile(argv[i+1], &files[num_files].mapped, &files[num_files].mapped_size); + if (kErrorNone != err) + exit(1); + + if (files[num_files].mapped && files[num_files].mapped_size) + { + files[num_files].import = import; + files[num_files].path = argv[i+1]; + num_files++; + } + } + + if (!output_name) + { + fprintf(stderr, "no output file\n"); + exit(1); + } + + num_import_syms = 0; + num_export_syms = 0; + for (filenum = 0; filenum < num_files; filenum++) + { + files[filenum].nsyms = count_symbols((char *) files[filenum].mapped, files[filenum].mapped_size); + if (files[filenum].import) + num_import_syms += files[filenum].nsyms; + else + num_export_syms += files[filenum].nsyms; + } + if (!num_export_syms) + { + fprintf(stderr, "no export names\n"); + exit(1); + } + + import_symbols = calloc(num_import_syms, sizeof(struct symbol)); + export_symbols = calloc(num_export_syms, sizeof(struct symbol)); + + import_idx = 0; + export_idx = 0; + + for (filenum = 0; filenum < num_files; filenum++) + { + if (files[filenum].import) + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + import_symbols, import_idx, num_import_syms); + import_idx += files[filenum].nsyms; + } + else + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + export_symbols, export_idx, num_export_syms); + export_idx += files[filenum].nsyms; + } + if (false && !files[filenum].nsyms) + { + fprintf(stderr, "warning: file %s contains no names\n", files[filenum].path); + } + } + + + qsort(import_symbols, num_import_syms, sizeof(struct symbol), &qsort_cmp); + qsort(export_symbols, num_export_syms, sizeof(struct symbol), &qsort_cmp); + + result_count = 0; + num_removed_syms 
= 0; + strtabsize = 4; + if (num_import_syms) + { + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + struct symbol * result; + char * name; + size_t len; + boolean_t wild; + + name = export_symbols[export_idx].indirect; + len = export_symbols[export_idx].indirect_len; + if (!name) + { + name = export_symbols[export_idx].name; + len = export_symbols[export_idx].name_len; + } + wild = ((len > 2) && ('*' == name[len-=2])); + if (wild) + { + struct bsearch_key key; + key.name = name; + key.name_len = len; + result = bsearch(&key, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp_prefix); + + if (result) + { + struct symbol * first; + struct symbol * last; + + strtabsize += (result->name_len + result->indirect_len); + + first = result; + while (--first >= &import_symbols[0]) + { + if (bsearch_cmp_prefix(&key, first)) + break; + strtabsize += (first->name_len + first->indirect_len); + } + first++; + + last = result; + while (++last < (&import_symbols[0] + num_import_syms)) + { + if (bsearch_cmp_prefix(&key, last)) + break; + strtabsize += (last->name_len + last->indirect_len); + } + result_count += last - first; + result = first; + export_symbols[export_idx].list = first; + export_symbols[export_idx].list_count = last - first; + export_symbols[export_idx].flags |= kExported; + } + } + else + result = bsearch(name, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp); + + if (!result && require_imports) + { + int status; + char * demangled_result = + __cxa_demangle(export_symbols[export_idx].name + 1, NULL, NULL, &status); + fprintf(stderr, "exported name not in import list: %s\n", + demangled_result ? demangled_result : export_symbols[export_idx].name); +// fprintf(stderr, " : %s\n", export_symbols[export_idx].name); + if (demangled_result) { + free(demangled_result); + } + num_removed_syms++; + } + if (diff) + { + if (!result) + result = &export_symbols[export_idx]; + else + result = NULL; + } + if (result && !wild) + { + export_symbols[export_idx].flags |= kExported; + strtabsize += (export_symbols[export_idx].name_len + export_symbols[export_idx].indirect_len); + result_count++; + export_symbols[export_idx].list = &export_symbols[export_idx]; + export_symbols[export_idx].list_count = 1; + } + } + } + strtabpad = (strtabsize + 3) & ~3; + + if (require_imports && num_removed_syms) + { + err = kError; + goto finish; + } + + fd = open(output_name, O_WRONLY|O_CREAT|O_TRUNC, 0755); + if (-1 == fd) + { + perror("couldn't write output"); + err = kErrorFileAccess; + goto finish; + } + + struct symtab_command symcmd; + struct uuid_command uuidcmd; + + symcmd.cmd = LC_SYMTAB; + symcmd.cmdsize = sizeof(symcmd); + symcmd.symoff = sizeof(symcmd) + sizeof(uuidcmd); + symcmd.nsyms = result_count; + symcmd.strsize = strtabpad; + + uuidcmd.cmd = LC_UUID; + uuidcmd.cmdsize = sizeof(uuidcmd); + uuid_generate(uuidcmd.uuid); + + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct mach_header_64 hdr; + hdr.magic = MH_MAGIC_64; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist_64) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header_64(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + else + { + struct mach_header hdr; + hdr.magic = 
MH_MAGIC; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = (target_arch->cputype == CPU_TYPE_I386) ? MH_OBJECT : MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + + if (kErrorNone != err) + goto finish; + + if (target_arch->byteorder != host_arch->byteorder) { + swap_symtab_command(&symcmd, target_arch->byteorder); + swap_uuid_command(&uuidcmd, target_arch->byteorder); + } + err = writeFile(fd, &symcmd, sizeof(symcmd)); + if (kErrorNone != err) + goto finish; + err = writeFile(fd, &uuidcmd, sizeof(uuidcmd)); + if (kErrorNone != err) + goto finish; + + strx = 4; + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + if (!(kExported & export_symbols[export_idx].flags)) + continue; + + if (export_idx + && export_symbols[export_idx - 1].name + && !strcmp(export_symbols[export_idx - 1].name, export_symbols[export_idx].name)) + { + fprintf(stderr, "duplicate export: %s\n", export_symbols[export_idx - 1].name); + err = kErrorDuplicate; + goto finish; + } + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + + if (export_symbols[export_idx].list != &export_symbols[export_idx]) + { + printf("wild: %s, %s\n", export_symbols[export_idx].name, + export_symbols[export_idx].list[import_idx].name); + } + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct nlist_64 nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist_64(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + else + { + struct nlist nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + } + + if (kErrorNone != err) + goto finish; + } + + strx = sizeof(uint32_t); + err = writeFile(fd, &zero, strx); + if (kErrorNone != err) + goto finish; + + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].name, + export_symbols[export_idx].list[import_idx].name_len); + if (kErrorNone 
!= err) + goto finish; + if (export_symbols[export_idx].list[import_idx].indirect) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].indirect, + export_symbols[export_idx].list[import_idx].indirect_len); + if (kErrorNone != err) + goto finish; + } + } + } + + err = writeFile(fd, &zero, strtabpad - strtabsize); + if (kErrorNone != err) + goto finish; + + close(fd); + + +finish: + for (filenum = 0; filenum < num_files; filenum++) { + // unmap file + if (files[filenum].mapped_size) + { + munmap((caddr_t)files[filenum].mapped, files[filenum].mapped_size); + files[filenum].mapped = 0; + files[filenum].mapped_size = 0; + } + + } + + if (kErrorNone != err) + { + if (output_name) + unlink(output_name); + exit(1); + } + else + exit(0); + return(0); +} + diff --git a/module/os/macos/kernel/version.plist b/module/os/macos/kernel/version.plist new file mode 100644 index 0000000000..93dfa2a162 --- /dev/null +++ b/module/os/macos/kernel/version.plist @@ -0,0 +1,16 @@ + + + + + BuildVersion + 1 + CFBundleShortVersionString + 12.0.0 + CFBundleVersion + 12.0.0 + ProjectName + xnu + SourceVersion + 2050007009000000 + + diff --git a/module/os/macos/kernel/zfs.exports b/module/os/macos/kernel/zfs.exports new file mode 100644 index 0000000000..82752554d0 --- /dev/null +++ b/module/os/macos/kernel/zfs.exports @@ -0,0 +1,32 @@ +_cpu_number +_fp_lookup +_fd_rdwr +_hostname +_kernel_memory_allocate +_virtual_space_start +_virtual_space_end +_vm_page_free_wanted +_vm_page_free_count +_vm_page_free_min +_vm_page_speculative_count +_VFS_ROOT +_vm_pool_low +_fp_drop +_fp_drop_written +_fo_read +_fo_write +_system_inshutdown +_cache_purgevfs +_vfs_context_kernel +_build_path +_kvtophys +__mh_execute_header +_gLoadedKextSummaries +_VNOP_LOOKUP +_vnode_notify +_vfs_get_notify_attributes +_kauth_cred_getgroups +_rootvnode +_cpuid_info +_vnode_iocount +_kx_qsort diff --git a/module/os/macos/spl/Makefile.am b/module/os/macos/spl/Makefile.am new file mode 100644 index 0000000000..4bdcf4aaec --- /dev/null +++ b/module/os/macos/spl/Makefile.am @@ -0,0 +1,59 @@ + +# Anyone remember why we made this a library? +libspl_la_CPPFLAGS= \ + -Wall \ + -nostdinc \ + -mkernel \ + -fno-builtin-printf \ + -D_KERNEL \ + -DKERNEL \ + -DKERNEL_PRIVATE \ + -DDRIVER_PRIVATE \ + -DAPPLE \ + -DNeXT \ + -I$(top_srcdir)/include/os/macos/spl \ + -I$(top_srcdir)/include \ + -I@KERNEL_HEADERS@/Headers \ + -I@KERNEL_HEADERS@/PrivateHeaders + +libspl_la_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ + +libspl_la_LDFLAGS= \ + -Xlinker \ + -kext \ + -nostdlib \ + -lkmodc++ \ + -lkmod \ + -lcc_kext + +libspl_la_LIBS = -lnone + +# If we don't set this to nothing, it adds "-lz -liconv" +LIBS = + +noinst_LTLIBRARIES = libspl.la + +libspl_la_SOURCES = \ + spl-atomic.c \ + spl-condvar.c \ + spl-cred.c \ + spl-ddi.c \ + spl-err.c \ + spl-kmem.c \ + spl-kstat.c \ + spl-list.c \ + spl-mutex.c \ + spl-osx.c \ + spl-policy.c \ + spl-proc.c \ + spl-processor.c \ + spl-proc_list.c \ + spl-rwlock.c \ + spl-seg_kmem.c \ + spl-taskq.c \ + spl-thread.c \ + spl-time.c \ + spl-tsd.c \ + spl-vmem.c \ + spl-vnode.c \ + spl-xdr.c diff --git a/module/os/macos/spl/README.md b/module/os/macos/spl/README.md new file mode 100644 index 0000000000..cc8dbf288e --- /dev/null +++ b/module/os/macos/spl/README.md @@ -0,0 +1,14 @@ +The Solaris Porting Layer, SPL, is a macOS kernel module which provides a +compatibility layer used by the macOS port of Open ZFS. + +# Installation + +The latest version of the SPL is maintained as part of this repository. 
+Only when building ZFS version 1.9.4 or earlier must an external SPL release +be used. These releases can be found at: + + * Version 1.9.4: https://github.com/openzfsonosx/spl/tree/spl-1.9.4-release + +# Release + +The SPL is released under a CDDL license. diff --git a/module/os/macos/spl/spl-atomic.c b/module/os/macos/spl/spl-atomic.c new file mode 100644 index 0000000000..973f462fdf --- /dev/null +++ b/module/os/macos/spl/spl-atomic.c @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Solaris Porting Layer (SPL) Atomic Implementation. + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include + + +#include +#include +#include +#include + +void * +atomic_cas_ptr(volatile void *target, void *cmp, void *new) +{ +#ifdef __LP64__ + return (void *)__sync_val_compare_and_swap((uint64_t *)target, + (uint64_t)cmp, (uint64_t)new); +#else + return (void *)__sync_val_compare_and_swap((uint32_t *)target, cmp, + new); +#endif +} diff --git a/module/os/macos/spl/spl-condvar.c b/module/os/macos/spl/spl-condvar.c new file mode 100644 index 0000000000..e8618161b3 --- /dev/null +++ b/module/os/macos/spl/spl-condvar.c @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include + +/* + * cv_timedwait() is similar to cv_wait() except that it additionally expects + * a timeout value specified in ticks. When woken by cv_signal() or + * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is + * returned. + * + * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks + * interruptibly and can be woken by a signal (EINTR, ERESTART). When + * this occurs 0 is returned. + * + * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() + * and cv_timedwait_sig() which should be used when waiting for outstanding + * IO to complete. 
They are responsible for updating the iowait accounting + * when this is supported by the platform. + * + * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution + * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout + * to be specified as a hrtime_t allowing for timeouts of less than a tick. + * + * N.B. The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. + */ + +#ifdef SPL_DEBUG_MUTEX +void spl_wdlist_settime(void *mpleak, uint64_t value); +#endif + +void +spl_cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) +{ +} + +void +spl_cv_destroy(kcondvar_t *cvp) +{ +} + +void +spl_cv_signal(kcondvar_t *cvp) +{ + wakeup_one((caddr_t)cvp); +} + +void +spl_cv_broadcast(kcondvar_t *cvp) +{ + wakeup((caddr_t)cvp); +} + + +/* + * Block on the indicated condition variable and + * release the associated mutex while blocked. + */ +int +spl_cv_wait(kcondvar_t *cvp, kmutex_t *mp, int flags, const char *msg) +{ + int result; + + if (msg != NULL && msg[0] == '&') + ++msg; /* skip over '&' prefixes */ + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + mp->m_owner = NULL; + result = msleep(cvp, (lck_mtx_t *)&mp->m_lock, flags, msg, 0); + mp->m_owner = current_thread(); +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + /* + * 1 - condvar got cv_signal()/cv_broadcast() + * 0 - received signal (kill -signal) + */ + return (result == EINTR ? 0 : 1); +} + +/* + * Same as cv_wait except the thread will unblock at 'tim' + * (an absolute time) if it hasn't already unblocked. + * + * Returns 1 when woken by cv_signal()/cv_broadcast(), 0 when interrupted + * by a signal, and -1 on timeout (the time remaining is not returned; + * see the note at the top of this file). + */ +int +spl_cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flags, + const char *msg) +{ + struct timespec ts; + int result; + + if (msg != NULL && msg[0] == '&') + ++msg; /* skip over '&' prefixes */ + + clock_t timenow = zfs_lbolt(); + + /* Already expired? */ + if (timenow >= tim) + return (-1); + + tim -= timenow; + + ts.tv_sec = (tim / hz); + ts.tv_nsec = (tim % hz) * NSEC_PER_SEC / hz; + + /* Both sec and nsec zero is a blocking call in XNU. (Not poll) */ + if (ts.tv_sec == 0 && ts.tv_nsec == 0) + ts.tv_nsec = 1000; + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + + mp->m_owner = NULL; + result = msleep(cvp, (lck_mtx_t *)&mp->m_lock, flags, msg, &ts); + + /* msleep grabs the mutex, even if timeout/signal */ + mp->m_owner = current_thread(); + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + switch (result) { + + case EINTR: /* Signal */ + case ERESTART: + return (0); + + case EWOULDBLOCK: /* Timeout */ + return (-1); + } + + return (1); +} + + +/* + * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. + */ +int +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + struct timespec ts; + int result; + + if (res > 1) { + /* + * Align expiration to the specified resolution. + */ + if (flag & CALLOUT_FLAG_ROUNDUP) + tim += res - 1; + tim = (tim / res) * res; + } + + if ((flag & CALLOUT_FLAG_ABSOLUTE)) { + hrtime_t timenow = gethrtime(); + + /* Already expired?
*/ + if (timenow >= tim) + return (-1); + + tim -= timenow; + } + + ts.tv_sec = NSEC2SEC(tim); + ts.tv_nsec = tim - SEC2NSEC(ts.tv_sec); + + /* Both sec and nsec set to zero is a blocking call in XNU. */ + if (ts.tv_sec == 0 && ts.tv_nsec == 0) + ts.tv_nsec = 1000; + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + + mp->m_owner = NULL; + result = msleep(cvp, (lck_mtx_t *)&mp->m_lock, + flag, "cv_timedwait_hires", &ts); + mp->m_owner = current_thread(); +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + switch (result) { + + case EINTR: /* Signal */ + case ERESTART: + return (0); + + case EWOULDBLOCK: /* Timeout */ + return (-1); + } + + return (1); +} diff --git a/module/os/macos/spl/spl-cred.c b/module/os/macos/spl/spl-cred.c new file mode 100644 index 0000000000..e7d9da0360 --- /dev/null +++ b/module/os/macos/spl/spl-cred.c @@ -0,0 +1,166 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include + +/* Return the effective user id */ +uid_t +crgetuid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getuid((kauth_cred_t)cr)); +} + +/* Return the real user id */ +uid_t +crgetruid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getruid((kauth_cred_t)cr)); +} + +/* Return the saved user id */ +uid_t +crgetsuid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getsvuid((kauth_cred_t)cr)); +} + +/* Return the filesystem user id */ +uid_t +crgetfsuid(const cred_t *cr) +{ + if (!cr) + return (0); + return (-1); +} + +/* Return the effective group id */ +gid_t +crgetgid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getgid((kauth_cred_t)cr)); +} + +/* Return the real group id */ +gid_t +crgetrgid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getrgid((kauth_cred_t)cr)); +} + +/* Return the saved group id */ +gid_t +crgetsgid(const cred_t *cr) +{ + if (!cr) + return (0); + return (kauth_cred_getsvgid((kauth_cred_t)cr)); +} + +/* Return the filesystem group id */ +gid_t +crgetfsgid(const cred_t *cr) +{ + return (-1); +} + + +extern int kauth_cred_getgroups(kauth_cred_t _cred, gid_t *_groups, + int *_groupcount); +/* + * Unfortunately, to get the count of groups, we have to call XNU which + * memcpy's them over. No real clean way to get around that, but at least + * these calls are done sparingly. 
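+ *
+ * A minimal usage sketch (hypothetical caller, not part of this change)
+ * pairing the count with the allocated copy of the group list:
+ *
+ *	int n = crgetngroups(cr);
+ *	gid_t *gids = crgetgroups(cr);
+ *	if (gids != NULL) {
+ *		... inspect gids[0 .. n-1] ...
+ *		crgetgroupsfree(gids);
+ *	}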
+ */ +int +crgetngroups(const cred_t *cr) +{ + gid_t gids[NGROUPS]; + int count = NGROUPS; + int ret; + + ret = kauth_cred_getgroups((kauth_cred_t)cr, gids, &count); + + if (!ret) + return (count); + + return (0); +} + + +/* + * We always allocate NGROUPs here, since we don't know how many there will + * be until after the call. Unlike IllumOS, the ptr returned is allocated + * and must be returned by a call to crgetgroupsfree(). + */ +gid_t * +crgetgroups(const cred_t *cr) +{ + gid_t *gids; + int count = NGROUPS; + + gids = kmem_zalloc(sizeof (gid_t) * count, KM_SLEEP); + if (!gids) + return (NULL); + + kauth_cred_getgroups((kauth_cred_t)cr, gids, &count); + + return (gids); +} + +void +crgetgroupsfree(gid_t *gids) +{ + if (!gids) + return; + kmem_free(gids, sizeof (gid_t) * NGROUPS); +} + +/* + * Return true if "cr" belongs in group "gid". + */ +int +spl_cred_ismember_gid(cred_t *cr, gid_t gid) +{ + int ret = 0; // Is not member. + kauth_cred_ismember_gid((kauth_cred_t)cr, gid, &ret); + if (ret == 1) + return (TRUE); + return (FALSE); +} diff --git a/module/os/macos/spl/spl-ddi.c b/module/os/macos/spl/spl-ddi.c new file mode 100644 index 0000000000..0f74af6e9f --- /dev/null +++ b/module/os/macos/spl/spl-ddi.c @@ -0,0 +1,383 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include + + +/* + * Allocate a set of pointers to 'n_items' objects of size 'size' + * bytes. Each pointer is initialized to nil. + * + * The 'size' and 'n_items' values are stashed in the opaque + * handle returned to the caller. + * + * This implementation interprets 'set of pointers' to mean 'array + * of pointers' but note that nothing in the interface definition + * precludes an implementation that uses, for example, a linked list. + * However there should be a small efficiency gain from using an array + * at lookup time. + * + * NOTE As an optimization, we make our growable array allocations in + * powers of two (bytes), since that's how much kmem_alloc (currently) + * gives us anyway. It should save us some free/realloc's .. + * + * As a further optimization, we make the growable array start out + * with MIN_N_ITEMS in it. 
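+ *
+ * As a rough usage sketch (names such as my_state_t, statep and instance
+ * are hypothetical, not part of this change), the expected call sequence
+ * for this soft-state API is:
+ *
+ *	static void *statep;
+ *
+ *	ddi_soft_state_init(&statep, sizeof (my_state_t), 4);
+ *	if (ddi_soft_state_zalloc(statep, instance) == DDI_SUCCESS) {
+ *		my_state_t *sp = ddi_get_soft_state(statep, instance);
+ *		... use sp, then ...
+ *		ddi_soft_state_free(statep, instance);
+ *	}
+ *	ddi_soft_state_fini(&statep);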
+ */ + + +int +ddi_soft_state_init(void **state_p, size_t size, size_t n_items) +{ + struct i_ddi_soft_state *ss; + + if (state_p == NULL || *state_p != NULL || size == 0) + return (EINVAL); + + ss = kmem_zalloc(sizeof (*ss), KM_SLEEP); + mutex_init(&ss->lock, NULL, MUTEX_DRIVER, NULL); + ss->size = size; + + if (n_items < MIN_N_ITEMS) + ss->n_items = MIN_N_ITEMS; + else { + int bitlog; + + if ((bitlog = ddi_fls(n_items)) == ddi_ffs(n_items)) + bitlog--; + ss->n_items = 1 << bitlog; + } + + ASSERT(ss->n_items >= n_items); + + ss->array = kmem_zalloc(ss->n_items * sizeof (void *), KM_SLEEP); + + *state_p = ss; + + return (0); +} + + +/* + * Allocate a state structure of size 'size' to be associated + * with item 'item'. + * + * In this implementation, the array is extended to + * allow the requested offset, if needed. + */ +int +ddi_soft_state_zalloc(void *state, int item) +{ + struct i_ddi_soft_state *ss; + void **array; + void *new_element; + + if ((ss = state) == NULL || item < 0) + return (DDI_FAILURE); + + mutex_enter(&ss->lock); + if (ss->size == 0) { + mutex_exit(&ss->lock); + cmn_err(CE_WARN, "ddi_soft_state_zalloc: bad handle"); + return (DDI_FAILURE); + } + + array = ss->array; /* NULL if ss->n_items == 0 */ + ASSERT(ss->n_items != 0 && array != NULL); + + /* + * refuse to tread on an existing element + */ + if (item < ss->n_items && array[item] != NULL) { + mutex_exit(&ss->lock); + return (DDI_FAILURE); + } + + /* + * Allocate a new element to plug in + */ + new_element = kmem_zalloc(ss->size, KM_SLEEP); + + /* + * Check if the array is big enough, if not, grow it. + */ + if (item >= ss->n_items) { + void **new_array; + size_t new_n_items; + struct i_ddi_soft_state *dirty; + + /* + * Allocate a new array of the right length, copy + * all the old pointers to the new array, then + * if it exists at all, put the old array on the + * dirty list. + * + * Note that we can't kmem_free() the old array. + * + * Why -- well the 'get' operation is 'mutex-free', so we + * can't easily catch a suspended thread that is just about + * to dereference the array we just grew out of. So we + * cons up a header and put it on a list of 'dirty' + * pointer arrays. (Dirty in the sense that there may + * be suspended threads somewhere that are in the middle + * of referencing them). Fortunately, we -can- garbage + * collect it all at ddi_soft_state_fini time. + */ + new_n_items = ss->n_items; + while (new_n_items < (1 + item)) + new_n_items <<= 1; /* double array size .. */ + + ASSERT(new_n_items >= (1 + item)); /* sanity check! */ + + new_array = kmem_zalloc(new_n_items * sizeof (void *), + KM_SLEEP); + /* + * Copy the pointers into the new array + */ + bcopy(array, new_array, ss->n_items * sizeof (void *)); + + /* + * Save the old array on the dirty list + */ + dirty = kmem_zalloc(sizeof (*dirty), KM_SLEEP); + dirty->array = ss->array; + dirty->n_items = ss->n_items; + dirty->next = ss->next; + ss->next = dirty; + + ss->array = (array = new_array); + ss->n_items = new_n_items; + } + + ASSERT(array != NULL && item < ss->n_items && array[item] == NULL); + + array[item] = new_element; + + mutex_exit(&ss->lock); + return (DDI_SUCCESS); +} + + +/* + * Fetch a pointer to the allocated soft state structure. + * + * This is designed to be cheap. + * + * There's an argument that there should be more checking for + * nil pointers and out of bounds on the array.. but we do a lot + * of that in the alloc/free routines. + * + * An array has the convenience that we don't need to lock read-access + * to it c.f. 
a linked list. However our "expanding array" strategy + * means that we should hold a readers lock on the i_ddi_soft_state + * structure. + * + * However, from a performance viewpoint, we need to do it without + * any locks at all -- this also makes it a leaf routine. The algorithm + * is 'lock-free' because we only discard the pointer arrays at + * ddi_soft_state_fini() time. + */ +void * +ddi_get_soft_state(void *state, int item) +{ + struct i_ddi_soft_state *ss = state; + + ASSERT(ss != NULL && item >= 0); + + if (item < ss->n_items && ss->array != NULL) + return (ss->array[item]); + return (NULL); +} + +/* + * Free the state structure corresponding to 'item.' Freeing an + * element that has either gone or was never allocated is not + * considered an error. Note that we free the state structure, but + * we don't shrink our pointer array, or discard 'dirty' arrays, + * since even a few pointers don't really waste too much memory. + * + * Passing an item number that is out of bounds, or a null pointer will + * provoke an error message. + */ +void +ddi_soft_state_free(void *state, int item) +{ + struct i_ddi_soft_state *ss; + void **array; + void *element; + static char msg[] = "ddi_soft_state_free:"; + + if ((ss = state) == NULL) { + cmn_err(CE_WARN, "%s null handle", + msg); + return; + } + + element = NULL; + + mutex_enter(&ss->lock); + + if ((array = ss->array) == NULL || ss->size == 0) { + cmn_err(CE_WARN, "%s bad handle", + msg); + } else if (item < 0 || item >= ss->n_items) { + cmn_err(CE_WARN, "%s item %d not in range [0..%lu]", + msg, item, ss->n_items - 1); + } else if (array[item] != NULL) { + element = array[item]; + array[item] = NULL; + } + + mutex_exit(&ss->lock); + + if (element) + kmem_free(element, ss->size); +} + + +/* + * Free the entire set of pointers, and any + * soft state structures contained therein. + * + * Note that we don't grab the ss->lock mutex, even though + * we're inspecting the various fields of the data structure. + * + * There is an implicit assumption that this routine will + * never run concurrently with any of the above on this + * particular state structure i.e. by the time the driver + * calls this routine, there should be no other threads + * running in the driver. + */ +void +ddi_soft_state_fini(void **state_p) +{ + struct i_ddi_soft_state *ss, *dirty; + int item; + static char msg[] = "ddi_soft_state_fini:"; + + if (state_p == NULL || (ss = *state_p) == NULL) + return; + + if (ss->size == 0) { + cmn_err(CE_WARN, "%s bad handle", + msg); + return; + } + + if (ss->n_items > 0) { + for (item = 0; item < ss->n_items; item++) + ddi_soft_state_free(ss, item); + kmem_free(ss->array, ss->n_items * sizeof (void *)); + } + + /* + * Now delete any dirty arrays from previous 'grow' operations + */ + for (dirty = ss->next; dirty; dirty = ss->next) { + ss->next = dirty->next; + kmem_free(dirty->array, dirty->n_items * sizeof (void *)); + kmem_free(dirty, sizeof (*dirty)); + } + + mutex_destroy(&ss->lock); + kmem_free(ss, sizeof (*ss)); + + *state_p = NULL; +} + +int +ddi_create_minor_node(dev_info_t *dip, char *name, int spec_type, + minor_t minor_num, char *node_type, int flag) +{ + dev_t dev; + int error = 0; + char *r, *dup; + + dev = makedev(flag, minor_num); + dip->dev = dev; + + /* + * http://lists.apple.com/archives/darwin-kernel/2007/Nov/msg00038.html + * + * devfs_make_name() has an off-by-one error when using directories + * and it appears Apple does not want to fix it. 
+ * + * We then change "/" to "_" and create more Apple-like /dev names + * + */ + MALLOC(dup, char *, strlen(name)+1, M_TEMP, M_WAITOK); + if (dup == NULL) + return (ENOMEM); + bcopy(name, dup, strlen(name)); + dup[strlen(name)] = '\0'; + + for (r = dup; + (r = strchr(r, '/')); + *r = '_') + /* empty */; + + dip->devc = NULL; + dip->devb = NULL; + + if (spec_type == S_IFCHR) + dip->devc = devfs_make_node(dev, DEVFS_CHAR, + UID_ROOT, GID_OPERATOR, + 0600, "rdisk_%s", dup); + else + dip->devb = devfs_make_node(dev, DEVFS_BLOCK, + UID_ROOT, GID_OPERATOR, + 0600, "disk_%s", dup); + FREE(dup, M_TEMP); + + return (error); +} + +void +ddi_remove_minor_node(dev_info_t *dip, char *name) +{ + if (dip->devc) { + devfs_remove(dip->devc); + dip->devc = NULL; + } + if (dip->devb) { + devfs_remove(dip->devb); + dip->devb = NULL; + } +} + +int +strspn(const char *string, + register char *charset) +{ + register const char *p, *q; + + for (q = string; *q != '\0'; ++q) { + for (p = charset; *p != '\0' && *p != *q; ++p) + ; + if (*p == '\0') + break; + } + return (q-string); +} diff --git a/module/os/macos/spl/spl-debug.c b/module/os/macos/spl/spl-debug.c new file mode 100644 index 0000000000..28ec1612d4 --- /dev/null +++ b/module/os/macos/spl/spl-debug.c @@ -0,0 +1,10 @@ +#include + + + +/* Debug log support enabled */ +__attribute__((noinline)) int assfail(const char *str, const char *file, + unsigned int line) __attribute__((optnone)) +{ + return (1); /* Must return true for ASSERT macro */ +} diff --git a/module/os/macos/spl/spl-err.c b/module/os/macos/spl/spl-err.c new file mode 100644 index 0000000000..455bf2c8b9 --- /dev/null +++ b/module/os/macos/spl/spl-err.c @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + printf("%s", msg); + break; + case CE_NOTE: + printf("SPL: Notice: %s\n", msg); + break; + case CE_WARN: + printf("SPL: Warning: %s\n", msg); + break; + case CE_PANIC: + PANIC("%s", msg); + break; + } +} /* vcmn_err() */ + +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ + + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) 
+{ + char msg[MAXMSGLEN]; + va_list ap; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + printf("%s", msg); + panic("%s", msg); + + /* Unreachable */ + return (1); +} diff --git a/module/os/macos/spl/spl-kmem.c b/module/os/macos/spl/spl-kmem.c new file mode 100644 index 0000000000..98a89823d3 --- /dev/null +++ b/module/os/macos/spl/spl-kmem.c @@ -0,0 +1,6828 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * Copyright (C) 2014 Brendon Humphrey + * Copyright (C) 2017 Sean Doran + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// =============================================================== +// Options +// =============================================================== +// #define PRINT_CACHE_STATS 1 + +// Uncomment to turn on kmems' debug features. +#define dprintf if (0) printf + +// =============================================================== +// OS Interface +// =============================================================== + +// This variable is a count of the number of threads +// blocked waiting for memory pages to become free. +// We are using wake indications on this event as a +// indication of paging activity, and therefore as a +// proxy to the machine experiencing memory pressure. +// +// xnu vm variables + +// 0 by default smd +extern volatile unsigned int vm_page_free_wanted; + +// 3500 kern.vm_page_free_min, rarely changes +extern unsigned int vm_page_free_min; + +// will tend to vm_page_free_min smd +extern volatile unsigned int vm_page_free_count; + +#define SMALL_PRESSURE_INCURSION_PAGES (vm_page_free_min >> 5) + +static kcondvar_t spl_free_thread_cv; +static kmutex_t spl_free_thread_lock; +static boolean_t spl_free_thread_exit; +static volatile _Atomic int64_t spl_free; +int64_t spl_free_delta_ema; + +static volatile _Atomic int64_t spl_free_manual_pressure = 0; +static volatile _Atomic boolean_t spl_free_fast_pressure = FALSE; +static _Atomic bool spl_free_maybe_reap_flag = false; +static _Atomic uint64_t spl_free_last_pressure = 0; + +// Start and end address of kernel memory +extern vm_offset_t virtual_space_start; +extern vm_offset_t virtual_space_end; + +// Can be polled to determine if the VM is experiecing +// a shortage of free pages. +extern int vm_pool_low(void); + +// Which CPU are we executing on? 
+extern int cpu_number(void); + +// Invoke the kernel debugger +extern void Debugger(const char *message); + +// Read from /dev/random +void read_random(void *buffer, uint_t numbytes); + +// =============================================================== +// Non Illumos Variables +// =============================================================== + +// Flag to cause tasks and threads to terminate as +// the kmem module is preparing to unload. +static int shutting_down = 0; + +// Amount of RAM in machine +uint64_t physmem = 0; + +// Size in bytes of the memory allocated in seg_kmem +extern uint64_t segkmem_total_mem_allocated; + +// Number of active threads +extern uint64_t zfs_threads; +extern uint64_t zfs_active_mutex; +extern uint64_t zfs_active_rwlock; + +extern uint64_t total_memory; +extern uint64_t real_total_memory; + +#define MULT 1 + +static const char *KMEM_VA_PREFIX = "kmem_va"; +static const char *KMEM_MAGAZINE_PREFIX = "kmem_magazine_"; + +// =============================================================== +// Illumos Variables +// =============================================================== + +struct kmem_cache_kstat { + kstat_named_t kmc_buf_size; + kstat_named_t kmc_align; + kstat_named_t kmc_chunk_size; + kstat_named_t kmc_slab_size; + kstat_named_t kmc_alloc; + kstat_named_t kmc_alloc_fail; + kstat_named_t kmc_free; + kstat_named_t kmc_depot_alloc; + kstat_named_t kmc_depot_free; + kstat_named_t kmc_depot_contention; + kstat_named_t kmc_slab_alloc; + kstat_named_t kmc_slab_free; + kstat_named_t kmc_buf_constructed; + kstat_named_t kmc_buf_avail; + kstat_named_t kmc_buf_inuse; + kstat_named_t kmc_buf_total; + kstat_named_t kmc_buf_max; + kstat_named_t kmc_slab_create; + kstat_named_t kmc_slab_destroy; + kstat_named_t kmc_vmem_source; + kstat_named_t kmc_hash_size; + kstat_named_t kmc_hash_lookup_depth; + kstat_named_t kmc_hash_rescale; + kstat_named_t kmc_full_magazines; + kstat_named_t kmc_empty_magazines; + kstat_named_t kmc_magazine_size; + kstat_named_t kmc_reap; /* number of kmem_cache_reap() calls */ + kstat_named_t kmc_defrag; /* attempts to defrag all partial slabs */ + kstat_named_t kmc_scan; /* attempts to defrag one partial slab */ + kstat_named_t kmc_move_callbacks; /* sum of yes, no, later, dn, dk */ + kstat_named_t kmc_move_yes; + kstat_named_t kmc_move_no; + kstat_named_t kmc_move_later; + kstat_named_t kmc_move_dont_need; + kstat_named_t kmc_move_dont_know; /* obj unrecognized by client ... */ + kstat_named_t kmc_move_hunt_found; /* ... 
but found in mag layer */ + kstat_named_t kmc_move_slabs_freed; /* slabs freed by consolidator */ + kstat_named_t kmc_move_reclaimable; /* buffers, if consolidator ran */ + kstat_named_t kmc_no_vba_success; + kstat_named_t kmc_no_vba_fail; + kstat_named_t kmc_arc_no_grow_set; + kstat_named_t kmc_arc_no_grow; +} kmem_cache_kstat = { + { "buf_size", KSTAT_DATA_UINT64 }, + { "align", KSTAT_DATA_UINT64 }, + { "chunk_size", KSTAT_DATA_UINT64 }, + { "slab_size", KSTAT_DATA_UINT64 }, + { "alloc", KSTAT_DATA_UINT64 }, + { "alloc_fail", KSTAT_DATA_UINT64 }, + { "free", KSTAT_DATA_UINT64 }, + { "depot_alloc", KSTAT_DATA_UINT64 }, + { "depot_free", KSTAT_DATA_UINT64 }, + { "depot_contention", KSTAT_DATA_UINT64 }, + { "slab_alloc", KSTAT_DATA_UINT64 }, + { "slab_free", KSTAT_DATA_UINT64 }, + { "buf_constructed", KSTAT_DATA_UINT64 }, + { "buf_avail", KSTAT_DATA_UINT64 }, + { "buf_inuse", KSTAT_DATA_UINT64 }, + { "buf_total", KSTAT_DATA_UINT64 }, + { "buf_max", KSTAT_DATA_UINT64 }, + { "slab_create", KSTAT_DATA_UINT64 }, + { "slab_destroy", KSTAT_DATA_UINT64 }, + { "vmem_source", KSTAT_DATA_UINT64 }, + { "hash_size", KSTAT_DATA_UINT64 }, + { "hash_lookup_depth", KSTAT_DATA_UINT64 }, + { "hash_rescale", KSTAT_DATA_UINT64 }, + { "full_magazines", KSTAT_DATA_UINT64 }, + { "empty_magazines", KSTAT_DATA_UINT64 }, + { "magazine_size", KSTAT_DATA_UINT64 }, + { "reap", KSTAT_DATA_UINT64 }, + { "defrag", KSTAT_DATA_UINT64 }, + { "scan", KSTAT_DATA_UINT64 }, + { "move_callbacks", KSTAT_DATA_UINT64 }, + { "move_yes", KSTAT_DATA_UINT64 }, + { "move_no", KSTAT_DATA_UINT64 }, + { "move_later", KSTAT_DATA_UINT64 }, + { "move_dont_need", KSTAT_DATA_UINT64 }, + { "move_dont_know", KSTAT_DATA_UINT64 }, + { "move_hunt_found", KSTAT_DATA_UINT64 }, + { "move_slabs_freed", KSTAT_DATA_UINT64 }, + { "move_reclaimable", KSTAT_DATA_UINT64 }, + { "no_vba_success", KSTAT_DATA_UINT64 }, + { "no_vba_fail", KSTAT_DATA_UINT64 }, + { "arc_no_grow_set", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, +}; + +static kmutex_t kmem_cache_kstat_lock; + +/* + * The default set of caches to back kmem_alloc(). + * These sizes should be reevaluated periodically. + * + * We want allocations that are multiples of the coherency granularity + * (64 bytes) to be satisfied from a cache which is a multiple of 64 + * bytes, so that it will be 64-byte aligned. For all multiples of 64, + * the next 1 greater than or equal to it must be a + * multiple of 64. + * + * We split the table into two sections: size <= 4k and size > 4k. This + * saves a lot of space and cache footprint in our cache tables. 
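+ *
+ * For example (illustrative): with this table a 72-byte kmem_alloc()
+ * request is satisfied from the 80-byte (5 * 16) cache and a 100-byte
+ * request from the 112-byte (7 * 16) cache, i.e. the smallest listed
+ * size that fits.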
+ */ +static const int kmem_alloc_sizes[] = { + 1 * 8, + 2 * 8, + 3 * 8, + 4 * 8, 5 * 8, 6 * 8, 7 * 8, + 4 * 16, 5 * 16, 6 * 16, 7 * 16, + 4 * 32, 5 * 32, 6 * 32, 7 * 32, + 4 * 64, 5 * 64, 6 * 64, 7 * 64, + 4 * 128, 9*64, 5 * 128, 6 * 128, 13*64, 7 * 128, + P2ALIGN(8192 / 8, 64), + P2ALIGN(8192 / 7, 64), + P2ALIGN(8192 / 6, 64), + P2ALIGN(8192 / 5, 64), + P2ALIGN(8192 / 4, 64), + P2ALIGN(8192 / 3, 64), + P2ALIGN(8192 / 2, 64), +}; + +static const int kmem_big_alloc_sizes[] = { + 2 * 4096, 3 * 4096, + 2 * 8192, 3 * 8192, + 4 * 8192, 5 * 8192, 6 * 8192, 7 * 8192, + 8 * 8192, 9 * 8192, 10 * 8192, 11 * 8192, + 12 * 8192, 13 * 8192, 14 * 8192, 15 * 8192, + 16 * 8192 +}; + +#define KMEM_MAXBUF 4096 +#define KMEM_BIG_MAXBUF_32BIT 32768 +#define KMEM_BIG_MAXBUF 131072 + +#define KMEM_BIG_MULTIPLE 4096 /* big_alloc_sizes must be a multiple */ +#define KMEM_BIG_SHIFT 12 /* lg(KMEM_BIG_MULTIPLE) */ + +static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT]; +static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT]; + +#define KMEM_ALLOC_TABLE_MAX (KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) +static size_t kmem_big_alloc_table_max = 0; /* # of filled elements */ + +static kmem_magtype_t kmem_magtype[] = { + { 1, 8, 3200, 65536 }, + { 3, 16, 256, 32768 }, + { 7, 32, 64, 16384 }, + { 15, 64, 0, 8192 }, + { 31, 64, 0, 4096 }, + { 47, 64, 0, 2048 }, + { 63, 64, 0, 1024 }, + { 95, 64, 0, 512 }, + { 143, 64, 0, 0 }, +}; + +static uint32_t kmem_reaping; +static uint32_t kmem_reaping_idspace; + +/* + * kmem tunables + */ +static struct timespec kmem_reap_interval = {15, 0}; +int kmem_depot_contention = 3; /* max failed tryenters per real interval */ +pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */ +int kmem_panic = 1; /* whether to panic on error */ +int kmem_logging = 0; /* kmem_log_enter() override */ +uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */ +size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ +size_t kmem_content_log_size; /* content log size [2% of memory] */ +size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ +size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ +size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ +size_t kmem_lite_maxalign = 8192; /* maximum buffer alignment for KMF_LITE */ +int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ +size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ +size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ + +size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ + +/* + * Be aware that KMF_AUDIT does not release memory, and you will eventually + * grind to a halt. But it is useful to enable if you can trigger a memory + * fault, and wish to see the calling stack. 
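+ *
+ * A sketch of how these flags might be used while debugging (not a
+ * recommendation for production builds): set, for example,
+ *
+ *	int kmem_flags = KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS | KMF_AUDIT;
+ *
+ * in a DEBUG build, reproduce the fault, and then inspect
+ * kmem_panic_info and the transaction/content logs for the offending
+ * allocation's stack.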
+ */ +#ifdef DEBUG +// can be 0 or KMF_LITE +// or KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS +// with or without KMF_AUDIT +int kmem_flags = KMF_DEADBEEF | KMF_REDZONE | KMF_LITE; +#else +int kmem_flags = 0; +#endif +int kmem_ready; + +static kmem_cache_t *kmem_slab_cache; +static kmem_cache_t *kmem_bufctl_cache; +static kmem_cache_t *kmem_bufctl_audit_cache; + +static kmutex_t kmem_cache_lock; /* inter-cache linkage only */ +static list_t kmem_caches; +extern vmem_t *heap_arena; +static taskq_t *kmem_taskq; +static kmutex_t kmem_flags_lock; +static vmem_t *kmem_metadata_arena; +static vmem_t *kmem_msb_arena; /* arena for metadata caches */ +static vmem_t *kmem_cache_arena; +static vmem_t *kmem_hash_arena; +static vmem_t *kmem_log_arena; +static vmem_t *kmem_oversize_arena; +static vmem_t *kmem_va_arena; +static vmem_t *kmem_default_arena; +static vmem_t *kmem_firewall_arena; + +/* + * kmem slab consolidator thresholds (tunables) + */ +size_t kmem_frag_minslabs = 101; /* minimum total slabs */ +size_t kmem_frag_numer = 1; /* free buffers (numerator) */ +size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */ +/* + * Maximum number of slabs from which to move buffers during a single + * maintenance interval while the system is not low on memory. + */ +size_t kmem_reclaim_max_slabs = 4; // smd 1 +/* + * Number of slabs to scan backwards from the end of the partial slab list + * when searching for buffers to relocate. + */ +size_t kmem_reclaim_scan_range = 48; // smd 12 + +/* consolidator knobs */ +static boolean_t kmem_move_noreap; +static boolean_t kmem_move_blocked; +static boolean_t kmem_move_fulltilt; +static boolean_t kmem_move_any_partial; + +#ifdef DEBUG +/* + * kmem consolidator debug tunables: + * Ensure code coverage by occasionally running the consolidator even when the + * caches are not fragmented (they may never be). These intervals are mean time + * in cache maintenance intervals (kmem_cache_update). 
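+ *
+ * With the 15-second kmem_reap_interval defined above, the defaults
+ * below (kmem_mtb_move = 20, kmem_mtb_reap = 240) correspond to forcing
+ * one slab defrag roughly every 5 minutes and a full defrag pass
+ * roughly every hour, as noted next to each tunable.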
+ */ +uint32_t kmem_mtb_move = 20; /* defrag 1 slab (~5min) */ +uint32_t kmem_mtb_reap = 240; /* defrag all slabs (~1hrs) */ +uint32_t kmem_mtb_reap_count = 0; +#endif /* DEBUG */ + +static kmem_cache_t *kmem_defrag_cache; +static kmem_cache_t *kmem_move_cache; +static taskq_t *kmem_move_taskq; + +static void kmem_cache_scan(kmem_cache_t *); +static void kmem_cache_defrag(kmem_cache_t *); +static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *); + + +kmem_log_header_t *kmem_transaction_log; +kmem_log_header_t *kmem_content_log; +kmem_log_header_t *kmem_failure_log; +kmem_log_header_t *kmem_slab_log; + +static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ + +#define KMEM_BUFTAG_LITE_ENTER(bt, count, caller) \ +if ((count) > 0) { \ +pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \ +pc_t *_e; \ +/* memmove() the old entries down one notch */ \ +for (_e = &_s[(count) - 1]; _e > _s; _e--) \ +*_e = *(_e - 1); \ +*_s = (uintptr_t)(caller); \ +} + +#define KMERR_MODIFIED 0 /* buffer modified while on freelist */ +#define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */ +#define KMERR_DUPFREE 2 /* freed a buffer twice */ +#define KMERR_BADADDR 3 /* freed a bad (unallocated) address */ +#define KMERR_BADBUFTAG 4 /* buftag corrupted */ +#define KMERR_BADBUFCTL 5 /* bufctl corrupted */ +#define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */ +#define KMERR_BADSIZE 7 /* alloc size != free size */ +#define KMERR_BADBASE 8 /* buffer base address wrong */ + +struct { + hrtime_t kmp_timestamp; /* timestamp of panic */ + int kmp_error; /* type of kmem error */ + void *kmp_buffer; /* buffer that induced panic */ + void *kmp_realbuf; /* real start address for buffer */ + kmem_cache_t *kmp_cache; /* buffer's cache according to client */ + kmem_cache_t *kmp_realcache; /* actual cache containing buffer */ + kmem_slab_t *kmp_slab; /* slab accoring to kmem_findslab() */ + kmem_bufctl_t *kmp_bufctl; /* bufctl */ +} kmem_panic_info; + +extern uint64_t stat_osif_malloc_success; +extern uint64_t stat_osif_malloc_bytes; +extern uint64_t stat_osif_free; +extern uint64_t stat_osif_free_bytes; + +extern uint64_t spl_bucket_non_pow2_allocs; + +// stats for spl_root_allocator(); +extern uint64_t spl_root_allocator_calls; +extern uint64_t spl_root_allocator_large_bytes_asked; +extern uint64_t spl_root_allocator_small_bytes_asked; +extern uint64_t spl_root_allocator_minalloc_bytes_asked; +extern uint64_t spl_root_allocator_extra_pass; +extern uint64_t spl_root_allocator_recovered; +extern uint64_t spl_root_allocator_recovered_bytes; + +extern uint64_t spl_vmem_unconditional_allocs; +extern uint64_t spl_vmem_unconditional_alloc_bytes; +extern uint64_t spl_vmem_conditional_allocs; +extern uint64_t spl_vmem_conditional_alloc_bytes; +extern uint64_t spl_vmem_conditional_alloc_deny; +extern uint64_t spl_vmem_conditional_alloc_deny_bytes; + +extern uint64_t spl_xat_success; +extern uint64_t spl_xat_late_success; +extern uint64_t spl_xat_late_success_nosleep; +extern uint64_t spl_xat_pressured; +extern uint64_t spl_xat_bailed; +extern uint64_t spl_xat_bailed_contended; +extern uint64_t spl_xat_lastalloc; +extern uint64_t spl_xat_lastfree; +extern uint64_t spl_xat_forced; +extern uint64_t spl_xat_sleep; +extern uint64_t spl_xat_late_deny; +extern uint64_t spl_xat_no_waiters; +extern uint64_t spl_xft_wait; + +extern uint64_t spl_vba_parent_memory_appeared; +extern uint64_t spl_vba_parent_memory_blocked; +extern uint64_t spl_vba_hiprio_blocked; +extern uint64_t spl_vba_cv_timeout; +extern 
uint64_t spl_vba_loop_timeout; +extern uint64_t spl_vba_cv_timeout_blocked; +extern uint64_t spl_vba_loop_timeout_blocked; +extern uint64_t spl_vba_sleep; +extern uint64_t spl_vba_loop_entries; + +extern uint64_t spl_bucket_tunable_large_span; +extern uint64_t spl_bucket_tunable_small_span; +extern void spl_set_bucket_tunable_large_span(uint64_t); +extern void spl_set_bucket_tunable_small_span(uint64_t); + +extern _Atomic uint64_t spl_arc_no_grow_bits; +extern uint64_t spl_arc_no_grow_count; + +extern uint64_t spl_frag_max_walk; +extern uint64_t spl_frag_walked_out; +extern uint64_t spl_frag_walk_cnt; + +uint64_t spl_buckets_mem_free = 0; +uint64_t spl_arc_reclaim_avoided = 0; + +uint64_t kmem_free_to_slab_when_fragmented = 0; + +typedef struct spl_stats { + kstat_named_t spl_os_alloc; + kstat_named_t spl_active_threads; + kstat_named_t spl_active_mutex; + kstat_named_t spl_active_rwlock; + kstat_named_t spl_active_tsd; + kstat_named_t spl_free_wake_count; + kstat_named_t spl_spl_free; + kstat_named_t spl_spl_free_manual_pressure; + kstat_named_t spl_spl_free_fast_pressure; + kstat_named_t spl_spl_free_delta_ema; + kstat_named_t spl_spl_free_negative_count; + kstat_named_t spl_osif_malloc_success; + kstat_named_t spl_osif_malloc_bytes; + kstat_named_t spl_osif_free; + kstat_named_t spl_osif_free_bytes; + kstat_named_t spl_bucket_non_pow2_allocs; + + kstat_named_t spl_vmem_unconditional_allocs; + kstat_named_t spl_vmem_unconditional_alloc_bytes; + kstat_named_t spl_vmem_conditional_allocs; + kstat_named_t spl_vmem_conditional_alloc_bytes; + kstat_named_t spl_vmem_conditional_alloc_deny; + kstat_named_t spl_vmem_conditional_alloc_deny_bytes; + + kstat_named_t spl_xat_success; + kstat_named_t spl_xat_late_success; + kstat_named_t spl_xat_late_success_nosleep; + kstat_named_t spl_xat_pressured; + kstat_named_t spl_xat_bailed; + kstat_named_t spl_xat_bailed_contended; + kstat_named_t spl_xat_lastalloc; + kstat_named_t spl_xat_lastfree; + kstat_named_t spl_xat_forced; + kstat_named_t spl_xat_sleep; + kstat_named_t spl_xat_late_deny; + kstat_named_t spl_xat_no_waiters; + kstat_named_t spl_xft_wait; + + kstat_named_t spl_vba_parent_memory_appeared; + kstat_named_t spl_vba_parent_memory_blocked; + kstat_named_t spl_vba_hiprio_blocked; + kstat_named_t spl_vba_cv_timeout; + kstat_named_t spl_vba_loop_timeout; + kstat_named_t spl_vba_cv_timeout_blocked; + kstat_named_t spl_vba_loop_timeout_blocked; + kstat_named_t spl_vba_sleep; + kstat_named_t spl_vba_loop_entries; + + kstat_named_t spl_bucket_tunable_large_span; + kstat_named_t spl_bucket_tunable_small_span; + + kstat_named_t spl_buckets_mem_free; + kstat_named_t spl_arc_no_grow_bits; + kstat_named_t spl_arc_no_grow_count; + kstat_named_t spl_frag_max_walk; + kstat_named_t spl_frag_walked_out; + kstat_named_t spl_frag_walk_cnt; + kstat_named_t spl_arc_reclaim_avoided; + + kstat_named_t kmem_free_to_slab_when_fragmented; +} spl_stats_t; + +static spl_stats_t spl_stats = { + {"os_mem_alloc", KSTAT_DATA_UINT64}, + {"active_threads", KSTAT_DATA_UINT64}, + {"active_mutex", KSTAT_DATA_UINT64}, + {"active_rwlock", KSTAT_DATA_UINT64}, + {"active_tsd", KSTAT_DATA_UINT64}, + {"spl_free_wake_count", KSTAT_DATA_UINT64}, + {"spl_spl_free", KSTAT_DATA_INT64}, + {"spl_spl_free_manual_pressure", KSTAT_DATA_UINT64}, + {"spl_spl_free_fast_pressure", KSTAT_DATA_UINT64}, + {"spl_spl_free_delta_ema", KSTAT_DATA_UINT64}, + {"spl_spl_free_negative_count", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_success", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_bytes", KSTAT_DATA_UINT64}, 
+ {"spl_osif_free", KSTAT_DATA_UINT64}, + {"spl_osif_free_bytes", KSTAT_DATA_UINT64}, + {"spl_bucket_non_pow2_allocs", KSTAT_DATA_UINT64}, + + {"vmem_unconditional_allocs", KSTAT_DATA_UINT64}, + {"vmem_unconditional_alloc_bytes", KSTAT_DATA_UINT64}, + {"vmem_conditional_allocs", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_bytes", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_deny", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_deny_bytes", KSTAT_DATA_UINT64}, + + {"spl_xat_success", KSTAT_DATA_UINT64}, + {"spl_xat_late_success", KSTAT_DATA_UINT64}, + {"spl_xat_late_success_nosleep", KSTAT_DATA_UINT64}, + {"spl_xat_pressured", KSTAT_DATA_UINT64}, + {"spl_xat_bailed", KSTAT_DATA_UINT64}, + {"spl_xat_bailed_contended", KSTAT_DATA_UINT64}, + {"spl_xat_lastalloc", KSTAT_DATA_UINT64}, + {"spl_xat_lastfree", KSTAT_DATA_UINT64}, + {"spl_xat_forced", KSTAT_DATA_UINT64}, + {"spl_xat_sleep", KSTAT_DATA_UINT64}, + {"spl_xat_late_deny", KSTAT_DATA_UINT64}, + {"spl_xat_no_waiters", KSTAT_DATA_UINT64}, + {"spl_xft_wait", KSTAT_DATA_UINT64}, + + {"spl_vba_parent_memory_appeared", KSTAT_DATA_UINT64}, + {"spl_vba_parent_memory_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_hiprio_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_cv_timeout", KSTAT_DATA_UINT64}, + {"spl_vba_loop_timeout", KSTAT_DATA_UINT64}, + {"spl_vba_cv_timeout_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_loop_timeout_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_sleep", KSTAT_DATA_UINT64}, + {"spl_vba_loop_entries", KSTAT_DATA_UINT64}, + + {"spl_tunable_large_span", KSTAT_DATA_UINT64}, + {"spl_tunable_small_span", KSTAT_DATA_UINT64}, + + {"spl_buckets_mem_free", KSTAT_DATA_UINT64}, + {"spl_arc_no_grow_bits", KSTAT_DATA_UINT64}, + {"spl_arc_no_grow_count", KSTAT_DATA_UINT64}, + + {"spl_vmem_frag_max_walk", KSTAT_DATA_UINT64}, + {"spl_vmem_frag_walked_out", KSTAT_DATA_UINT64}, + {"spl_vmem_frag_walk_cnt", KSTAT_DATA_UINT64}, + {"spl_arc_reclaim_avoided", KSTAT_DATA_UINT64}, + + {"kmem_free_to_slab_when_fragmented", KSTAT_DATA_UINT64}, +}; + +static kstat_t *spl_ksp = 0; + +// Stub out caller() +caddr_t +caller() +{ + return ((caddr_t)(0)); +} + +void * +calloc(size_t n, size_t s) +{ + return (zfs_kmem_zalloc(n * s, KM_NOSLEEP)); +} + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ +(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +/* + * Get bytes from the /dev/random generator. Returns 0 + * on success. Returns EAGAIN if there is insufficient entropy. + */ +int +random_get_bytes(uint8_t *ptr, size_t len) +{ + read_random(ptr, len); + return (0); +} + +/* + * BGH - Missing from OSX? + * + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. 
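+ *
+ * For example, strident_canon() turns "zfs-stat.1" into "zfs_stat_1",
+ * and a leading digit is also replaced, so "2pool" becomes "_pool".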
+ */ +void +strident_canon(char *s, size_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +int +strident_valid(const char *id) +{ + int c = *id++; + + if (!IS_ALPHA(c) && c != '_') + return (0); + while ((c = *id++) != 0) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + return (0); + } + return (1); +} + +static void +copy_pattern(uint64_t pattern, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf = buf_arg; + + while (buf < bufend) + *buf++ = pattern; +} + +static void * +verify_pattern(uint64_t pattern, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) + if (*buf != pattern) + return (buf); + return (NULL); +} + +static void * +verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) { + if (*buf != old) { + copy_pattern(old, buf_arg, + (char *)buf - (char *)buf_arg); + return (buf); + } + *buf = new; + } + + return (NULL); +} + +static void +kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) +{ + kmem_cache_t *cp; + + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) + if (tq != NULL) + (void) taskq_dispatch(tq, (task_func_t *)func, cp, + tqflag); + else + func(cp); + mutex_exit(&kmem_cache_lock); +} + +static void +kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) +{ + kmem_cache_t *cp; + + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (!(cp->cache_cflags & KMC_IDENTIFIER)) + continue; + if (tq != NULL) + (void) taskq_dispatch(tq, (task_func_t *)func, cp, + tqflag); + else + func(cp); + } + mutex_exit(&kmem_cache_lock); +} + +/* + * Debugging support. Given a buffer address, find its slab. 
+ */ +static kmem_slab_t * +kmem_findslab(kmem_cache_t *cp, void *buf) +{ + kmem_slab_t *sp; + + mutex_enter(&cp->cache_lock); + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + mutex_exit(&cp->cache_lock); + + return (NULL); +} + +static void +kmem_error(int error, kmem_cache_t *cparg, void *bufarg) +{ + kmem_buftag_t *btp = NULL; + kmem_bufctl_t *bcp = NULL; + kmem_cache_t *cp = cparg; + kmem_slab_t *sp; + uint64_t *off; + void *buf = bufarg; + + kmem_logging = 0; /* stop logging when a bad thing happens */ + + kmem_panic_info.kmp_timestamp = gethrtime(); + + sp = kmem_findslab(cp, buf); + if (sp == NULL) { + for (cp = list_tail(&kmem_caches); cp != NULL; + cp = list_prev(&kmem_caches, cp)) { + if ((sp = kmem_findslab(cp, buf)) != NULL) + break; + } + } + + if (sp == NULL) { + cp = NULL; + error = KMERR_BADADDR; + } else { + if (cp != cparg) + error = KMERR_BADCACHE; + else + buf = (char *)bufarg - + ((uintptr_t)bufarg - + (uintptr_t)sp->slab_base) % cp->cache_chunksize; + if (buf != bufarg) + error = KMERR_BADBASE; + if (cp->cache_flags & KMF_BUFTAG) + btp = KMEM_BUFTAG(cp, buf); + if (cp->cache_flags & KMF_HASH) { + mutex_enter(&cp->cache_lock); + for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next) + if (bcp->bc_addr == buf) + break; + mutex_exit(&cp->cache_lock); + if (bcp == NULL && btp != NULL) + bcp = btp->bt_bufctl; + if (kmem_findslab(cp->cache_bufctl_cache, bcp) == + NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) || + bcp->bc_addr != buf) { + error = KMERR_BADBUFCTL; + bcp = NULL; + } + } + } + + kmem_panic_info.kmp_error = error; + kmem_panic_info.kmp_buffer = bufarg; + kmem_panic_info.kmp_realbuf = buf; + kmem_panic_info.kmp_cache = cparg; + kmem_panic_info.kmp_realcache = cp; + kmem_panic_info.kmp_slab = sp; + kmem_panic_info.kmp_bufctl = bcp; + + printf("SPL: kernel memory allocator: "); + + switch (error) { + + case KMERR_MODIFIED: + printf("buffer modified after being freed\n"); + off = verify_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify); + if (off == NULL) /* shouldn't happen */ + off = buf; + printf("SPL: modification occurred at offset 0x%lx " + "(0x%llx replaced by 0x%llx)\n", + (uintptr_t)off - (uintptr_t)buf, + (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off); + break; + + case KMERR_REDZONE: + printf("redzone violation: write past end of buffer\n"); + break; + + case KMERR_BADADDR: + printf("invalid free: buffer not in cache\n"); + break; + + case KMERR_DUPFREE: + printf("duplicate free: buffer freed twice\n"); + break; + + case KMERR_BADBUFTAG: + printf("boundary tag corrupted\n"); + printf("SPL: bcp ^ bxstat = %lx, should be %lx\n", + (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat, + KMEM_BUFTAG_FREE); + break; + + case KMERR_BADBUFCTL: + printf("bufctl corrupted\n"); + break; + + case KMERR_BADCACHE: + printf("buffer freed to wrong cache\n"); + printf("SPL: buffer was allocated from %s,\n", + cp->cache_name); + printf("SPL: caller attempting free to %s.\n", + cparg->cache_name); + break; + + case KMERR_BADSIZE: + printf("bad free: free size (%u) != alloc size (%u)\n", + KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), + KMEM_SIZE_DECODE(((uint32_t *)btp)[1])); + break; + + case KMERR_BADBASE: + printf("bad free: free 
address (%p) != alloc address" + " (%p)\n", bufarg, buf); + break; + } + + printf("SPL: buffer=%p bufctl=%p cache: %s\n", + bufarg, (void *)bcp, cparg->cache_name); + + if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) && + error != KMERR_BADBUFCTL) { + int d; + timestruc_t ts = {0, 0}; + kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp; + + hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts); + printf("SPL: previous transaction on buffer %p:\n", buf); + printf("SPL: thread=%p time=T-%ld.%09ld slab=%p cache: %s\n", + (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec, + (void *)sp, cp->cache_name); + for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) { + print_symbol(bcap->bc_stack[d]); + } + } + + if (kmem_panic > 0) { + extern void IODelay(unsigned microseconds); // lh_cpu[max_ncpus]; + int i; + + /* + * Make sure that lhp->lh_cpu[] is nicely aligned + * to prevent false sharing of cache lines. + */ + lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN); + lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0, + NULL, NULL, VM_SLEEP); + bzero(lhp, lhsize); + + mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL); + lhp->lh_nchunks = nchunks; + lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE); + lhp->lh_base = vmem_alloc(kmem_log_arena, + lhp->lh_chunksize * nchunks, VM_SLEEP); + lhp->lh_free = vmem_alloc(kmem_log_arena, + nchunks * sizeof (int), VM_SLEEP); + bzero(lhp->lh_base, lhp->lh_chunksize * nchunks); + + for (i = 0; i < max_ncpus; i++) { + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL); + clhp->clh_chunk = i; + } + + for (i = max_ncpus; i < nchunks; i++) + lhp->lh_free[i] = i; + + lhp->lh_head = max_ncpus; + lhp->lh_tail = 0; + + return (lhp); +} + + +static void +kmem_log_fini(kmem_log_header_t *lhp) +{ + int nchunks = 4 * max_ncpus; + size_t lhsize = (size_t)&((kmem_log_header_t *)0)->lh_cpu[max_ncpus]; + int i; + + + + for (i = 0; i < max_ncpus; i++) { + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + mutex_destroy(&clhp->clh_lock); + } + + vmem_free(kmem_log_arena, lhp->lh_free, nchunks * sizeof (int)); + + vmem_free(kmem_log_arena, lhp->lh_base, lhp->lh_chunksize * nchunks); + + mutex_destroy(&lhp->lh_lock); + + lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN); + vmem_xfree(kmem_log_arena, lhp, lhsize); +} + + +static void * +kmem_log_enter(kmem_log_header_t *lhp, void *data, size_t size) +{ + void *logspace; + + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[cpu_number()]; + + // if (lhp == NULL || kmem_logging == 0 || panicstr) + if (lhp == NULL || kmem_logging == 0) + return (NULL); + + mutex_enter(&clhp->clh_lock); + clhp->clh_hits++; + if (size > clhp->clh_avail) { + mutex_enter(&lhp->lh_lock); + lhp->lh_hits++; + lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk; + lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks; + clhp->clh_chunk = lhp->lh_free[lhp->lh_head]; + lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks; + clhp->clh_current = lhp->lh_base + + clhp->clh_chunk * lhp->lh_chunksize; + clhp->clh_avail = lhp->lh_chunksize; + if (size > lhp->lh_chunksize) + size = lhp->lh_chunksize; + mutex_exit(&lhp->lh_lock); + } + logspace = clhp->clh_current; + clhp->clh_current += size; + clhp->clh_avail -= size; + bcopy(data, logspace, size); + mutex_exit(&clhp->clh_lock); + return (logspace); +} + +#define KMEM_AUDIT(lp, cp, bcp) \ +{ \ +kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp); \ +_bcp->bc_timestamp = gethrtime(); \ +_bcp->bc_thread = spl_current_thread(); \ +_bcp->bc_depth = 
getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH); \ +_bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp)); \ +} + +static void +kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp, + kmem_slab_t *sp, void *addr) +{ + kmem_bufctl_audit_t bca; + + bzero(&bca, sizeof (kmem_bufctl_audit_t)); + bca.bc_addr = addr; + bca.bc_slab = sp; + KMEM_AUDIT(lp, cp, &bca); +} + +/* + * Create a new slab for cache cp. + */ +static kmem_slab_t * +kmem_slab_create(kmem_cache_t *cp, int kmflag) +{ + size_t slabsize = cp->cache_slabsize; + size_t chunksize = cp->cache_chunksize; + int cache_flags = cp->cache_flags; + size_t color, chunks; + char *buf, *slab; + kmem_slab_t *sp; + kmem_bufctl_t *bcp; + vmem_t *vmp = cp->cache_arena; + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + color = cp->cache_color + cp->cache_align; + if (color > cp->cache_maxcolor) + color = cp->cache_mincolor; + cp->cache_color = color; + + slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS); + + if (slab == NULL) + goto vmem_alloc_failure; + + ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0); + + /* + * Reverify what was already checked in kmem_cache_set_move(), since the + * consolidator depends (for correctness) on slabs being initialized + * with the 0xbaddcafe memory pattern (setting a low order bit usable by + * clients to distinguish uninitialized memory from known objects). + */ + ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH)); + if (!(cp->cache_cflags & KMC_NOTOUCH)) + copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize); + + if (cache_flags & KMF_HASH) { + if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL) + goto slab_alloc_failure; + chunks = (slabsize - color) / chunksize; + } else { + sp = KMEM_SLAB(cp, slab); + chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize; + } + + sp->slab_cache = cp; + sp->slab_head = NULL; + sp->slab_refcnt = 0; + sp->slab_base = buf = slab + color; + sp->slab_chunks = chunks; + sp->slab_stuck_offset = (uint32_t)-1; + sp->slab_later_count = 0; + sp->slab_flags = 0; + sp->slab_create_time = gethrtime(); + + ASSERT(chunks > 0); + while (chunks-- != 0) { + if (cache_flags & KMF_HASH) { + bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag); + if (bcp == NULL) + goto bufctl_alloc_failure; + if (cache_flags & KMF_AUDIT) { + kmem_bufctl_audit_t *bcap = + (kmem_bufctl_audit_t *)bcp; + bzero(bcap, sizeof (kmem_bufctl_audit_t)); + bcap->bc_cache = cp; + } + bcp->bc_addr = buf; + bcp->bc_slab = sp; + } else { + bcp = KMEM_BUFCTL(cp, buf); + } + if (cache_flags & KMF_BUFTAG) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + btp->bt_redzone = KMEM_REDZONE_PATTERN; + btp->bt_bufctl = bcp; + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + if (cache_flags & KMF_DEADBEEF) { + copy_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify); + } + } + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + buf += chunksize; + } + + kmem_log_event(kmem_slab_log, cp, sp, slab); + + return (sp); + +bufctl_alloc_failure: + + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + kmem_cache_free(cp->cache_bufctl_cache, bcp); + } + kmem_cache_free(kmem_slab_cache, sp); + +slab_alloc_failure: + + vmem_free(vmp, slab, slabsize); + +vmem_alloc_failure: + + if (0 == (kmflag & KM_NO_VBA)) { + kmem_log_event(kmem_failure_log, cp, NULL, NULL); + atomic_inc_64(&cp->cache_alloc_fail); + } + + return (NULL); +} + +/* + * Destroy a slab. 
+ */ +static void +kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp) +{ + vmem_t *vmp = cp->cache_arena; + void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_refcnt == 0); + + if (cp->cache_flags & KMF_HASH) { + kmem_bufctl_t *bcp; + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + kmem_cache_free(cp->cache_bufctl_cache, bcp); + } + kmem_cache_free(kmem_slab_cache, sp); + } + kpreempt(KPREEMPT_SYNC); + vmem_free(vmp, slab, cp->cache_slabsize); +} + +static void * +kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill) +{ + kmem_bufctl_t *bcp, **hash_bucket; + void *buf; + boolean_t new_slab = (sp->slab_refcnt == 0); + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + /* + * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we + * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the + * slab is newly created. + */ + ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) && + (sp == avl_first(&cp->cache_partial_slabs)))); + ASSERT(sp->slab_cache == cp); + + cp->cache_slab_alloc++; + cp->cache_bufslab--; + sp->slab_refcnt++; + + bcp = sp->slab_head; + sp->slab_head = bcp->bc_next; + + if (cp->cache_flags & KMF_HASH) { + /* + * Add buffer to allocated-address hash table. + */ + buf = bcp->bc_addr; + hash_bucket = KMEM_HASH(cp, buf); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + } else { + buf = KMEM_BUF(cp, bcp); + } + + ASSERT(KMEM_SLAB_MEMBER(sp, buf)); + + if (sp->slab_head == NULL) { + ASSERT(KMEM_SLAB_IS_ALL_USED(sp)); + if (new_slab) { + ASSERT(sp->slab_chunks == 1); + } else { + ASSERT(sp->slab_chunks > 1); /* the slab was partial */ + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; /* clear history */ + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + } + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + return (buf); + } + + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + /* + * Peek to see if the magazine layer is enabled before + * we prefill. We're not holding the cpu cache lock, + * so the peek could be wrong, but there's no harm in it. + */ + if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) && + (KMEM_CPU_CACHE(cp)->cc_magsize != 0)) { + kmem_slab_prefill(cp, sp); + return (buf); + } + + if (new_slab) { + avl_add(&cp->cache_partial_slabs, sp); + return (buf); + } + + /* + * The slab is now more allocated than it was, so the + * order remains unchanged. + */ + ASSERT(!avl_update(&cp->cache_partial_slabs, sp)); + return (buf); +} + +/* + * Allocate a raw (unconstructed) buffer from cp's slab layer. + */ +static void * +kmem_slab_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_slab_t *sp; + void *buf; + boolean_t test_destructor; + + mutex_enter(&cp->cache_lock); + test_destructor = (cp->cache_slab_alloc == 0); + sp = avl_first(&cp->cache_partial_slabs); + if (sp == NULL) { + ASSERT(cp->cache_bufslab == 0); + + /* + * The freelist is empty. Create a new slab. 
+ */ + mutex_exit(&cp->cache_lock); + if ((sp = kmem_slab_create(cp, kmflag)) == NULL) { + return (NULL); + } + mutex_enter(&cp->cache_lock); + cp->cache_slab_create++; + if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax) + cp->cache_bufmax = cp->cache_buftotal; + cp->cache_bufslab += sp->slab_chunks; + } + + buf = kmem_slab_alloc_impl(cp, sp, B_TRUE); + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); + mutex_exit(&cp->cache_lock); + + if (test_destructor && cp->cache_destructor != NULL) { + copy_pattern(KMEM_UNINITIALIZED_PATTERN, buf, + cp->cache_bufsize); + if (cp->cache_flags & KMF_DEADBEEF) { + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + } + } + + return (buf); +} + +static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *); + +/* + * Free a raw (unconstructed) buffer to cp's slab layer. + */ +static void +kmem_slab_free(kmem_cache_t *cp, void *buf) +{ + kmem_slab_t *sp; + kmem_bufctl_t *bcp, **prev_bcpp; + + ASSERT(buf != NULL); + + mutex_enter(&cp->cache_lock); + cp->cache_slab_free++; + + if (cp->cache_flags & KMF_HASH) { + /* + * Look up buffer in allocated-address hash table. + */ + prev_bcpp = KMEM_HASH(cp, buf); + while ((bcp = *prev_bcpp) != NULL) { + if (bcp->bc_addr == buf) { + *prev_bcpp = bcp->bc_next; + sp = bcp->bc_slab; + break; + } + cp->cache_lookup_depth++; + prev_bcpp = &bcp->bc_next; + } + } else { + bcp = KMEM_BUFCTL(cp, buf); + sp = KMEM_SLAB(cp, buf); + } + + if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + kmem_error(KMERR_BADADDR, cp, buf); + return; + } + + if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) { + /* + * If this is the buffer that prevented the consolidator from + * clearing the slab, we can reset the slab flags now that the + * buffer is freed. (It makes sense to do this in + * kmem_cache_free(), where the client gives up ownership of the + * buffer, but on the hot path the test is too expensive.) + */ + kmem_slab_move_yes(cp, sp, buf); + } + + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { + if (cp->cache_flags & KMF_CONTENTS) + ((kmem_bufctl_audit_t *)bcp)->bc_contents = + kmem_log_enter(kmem_content_log, buf, + cp->cache_contents); + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + + cp->cache_bufslab++; + ASSERT(sp->slab_refcnt >= 1); + + if (--sp->slab_refcnt == 0) { + /* + * There are no outstanding allocations from this slab, + * so we can reclaim the memory. + */ + if (sp->slab_chunks == 1) { + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + } else { + avl_remove(&cp->cache_partial_slabs, sp); + } + + cp->cache_buftotal -= sp->slab_chunks; + cp->cache_bufslab -= sp->slab_chunks; + /* + * Defer releasing the slab to the virtual memory subsystem + * while there is a pending move callback, since we guarantee + * that buffers passed to the move callback have only been + * touched by kmem or by the client itself. Since the memory + * patterns baddcafe (uninitialized) and deadbeef (freed) both + * set at least one of the two lowest order bits, the client can + * test those bits in the move callback to determine whether or + * not it knows about the buffer (assuming that the client also + * sets one of those low order bits whenever it frees a buffer). 
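+ *
+ * As an illustration: the uninitialized (0xbaddcafe...) and freed
+ * (0xdeadbeef...) patterns each have at least one of their two lowest
+ * bits set, so a client whose live objects begin with a pointer that is
+ * at least 4-byte aligned can test those bits in its move callback to
+ * recognize memory that is currently owned by kmem rather than by the
+ * client.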
+ */ + if (cp->cache_defrag == NULL || + (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) && + !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) { + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + } else { + list_t *deadlist = + &cp->cache_defrag->kmd_deadlist; + /* + * Slabs are inserted at both ends of the + * deadlist to distinguish between slabs + * freed while move callbacks are pending + * (list head) and a slab freed while the + * lock is dropped in kmem_move_buffers() + * (list tail) so that in both cases + * slab_destroy() is called from the + * right context. + */ + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + } else { + list_insert_head(deadlist, sp); + } + cp->cache_defrag->kmd_deadcount++; + mutex_exit(&cp->cache_lock); + } + return; + } + + if (bcp->bc_next == NULL) { + /* Transition the slab from completely allocated to partial. */ + ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1)); + ASSERT(sp->slab_chunks > 1); + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + avl_add(&cp->cache_partial_slabs, sp); + } else { + (void) avl_update_gt(&cp->cache_partial_slabs, sp); + } + + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); + mutex_exit(&cp->cache_lock); +} + +/* + * Return -1 if kmem_error, 1 if constructor fails, 0 if successful. + */ +static int +kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct, + caddr_t caller) +{ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl; + uint32_t mtbf; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) { + kmem_error(KMERR_BADBUFTAG, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC; + + if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) { + kmem_error(KMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + if (!construct && (cp->cache_flags & KMF_LITE)) { + if (*(uint64_t *)buf != KMEM_FREE_PATTERN) { + kmem_error(KMERR_MODIFIED, cp, buf); + return (-1); + } + if (cp->cache_constructor != NULL) + *(uint64_t *)buf = btp->bt_redzone; + else + *(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN; + } else { + construct = 1; + if (verify_and_copy_pattern(KMEM_FREE_PATTERN, + KMEM_UNINITIALIZED_PATTERN, buf, + cp->cache_verify)) { + kmem_error(KMERR_MODIFIED, cp, buf); + return (-1); + } + } + } + btp->bt_redzone = KMEM_REDZONE_PATTERN; + + if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 && + gethrtime() % mtbf == 0 && + (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) { + kmem_log_event(kmem_failure_log, cp, NULL, NULL); + if (!construct && cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + } else { + mtbf = 0; + } + + if (mtbf || (construct && cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) { + atomic_inc_64(&cp->cache_alloc_fail); + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + if (cp->cache_flags & KMF_DEADBEEF) + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + kmem_slab_free(cp, buf); + return (1); + } + + if (cp->cache_flags & KMF_AUDIT) { + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + if ((cp->cache_flags & KMF_LITE) && + !(cp->cache_cflags & KMC_KMEM_ALLOC)) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller); + 
} + + return (0); +} + +static int +kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller) +{ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl; + kmem_slab_t *sp; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) { + if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) { + kmem_error(KMERR_DUPFREE, cp, buf); + return (-1); + } + sp = kmem_findslab(cp, buf); + if (sp == NULL || sp->slab_cache != cp) + kmem_error(KMERR_BADADDR, cp, buf); + else + kmem_error(KMERR_REDZONE, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + + if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) { + kmem_error(KMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (btp->bt_redzone != KMEM_REDZONE_PATTERN) { + kmem_error(KMERR_REDZONE, cp, buf); + return (-1); + } + + if (cp->cache_flags & KMF_AUDIT) { + if (cp->cache_flags & KMF_CONTENTS) + bcp->bc_contents = kmem_log_enter(kmem_content_log, + buf, cp->cache_contents); + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + if ((cp->cache_flags & KMF_LITE) && + !(cp->cache_cflags & KMC_KMEM_ALLOC)) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + if (cp->cache_flags & KMF_LITE) + btp->bt_redzone = *(uint64_t *)buf; + else if (cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + } + + return (0); +} + +/* + * Free each object in magazine mp to cp's slab layer, and free mp itself. + */ +static void +kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds) +{ + int round; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + + for (round = 0; round < nrounds; round++) { + void *buf = mp->mag_round[round]; + + if (cp->cache_flags & KMF_DEADBEEF) { + if (verify_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify) != NULL) { + kmem_error(KMERR_MODIFIED, cp, buf); + continue; + } + if ((cp->cache_flags & KMF_LITE) && + cp->cache_destructor != NULL) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } + } else if (cp->cache_destructor != NULL) { + cp->cache_destructor(buf, cp->cache_private); + } + + kmem_slab_free(cp, buf); + kpreempt(KPREEMPT_SYNC); + } + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + kmem_cache_free(cp->cache_magtype->mt_cache, mp); +} + +/* + * Allocate a magazine from the depot. + */ +static kmem_magazine_t * +kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp) +{ + kmem_magazine_t *mp; + + /* + * If we can't get the depot lock without contention, + * update our contention count. We use the depot + * contention rate to determine whether we need to + * increase the magazine size for better scalability. + */ + if (!mutex_tryenter(&cp->cache_depot_lock)) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_depot_contention++; + } + + if ((mp = mlp->ml_list) != NULL) { + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + mlp->ml_list = mp->mag_next; + if (--mlp->ml_total < mlp->ml_min) + mlp->ml_min = mlp->ml_total; + mlp->ml_alloc++; + } + + mutex_exit(&cp->cache_depot_lock); + + return (mp); +} + +/* + * Free a magazine to the depot. 
+ */
+static void
+kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp)
+{
+ mutex_enter(&cp->cache_depot_lock);
+ ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
+ mp->mag_next = mlp->ml_list;
+ mlp->ml_list = mp;
+ mlp->ml_total++;
+ mutex_exit(&cp->cache_depot_lock);
+}
+
+/*
+ * Update the working set statistics for cp's depot.
+ */
+static void
+kmem_depot_ws_update(kmem_cache_t *cp)
+{
+ mutex_enter(&cp->cache_depot_lock);
+ cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
+ cp->cache_full.ml_min = cp->cache_full.ml_total;
+ cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
+ cp->cache_empty.ml_min = cp->cache_empty.ml_total;
+ mutex_exit(&cp->cache_depot_lock);
+}
+
+/*
+ * Set the working set statistics for cp's depot to zero. (Everything is
+ * eligible for reaping.)
+ */
+void
+kmem_depot_ws_zero(kmem_cache_t *cp)
+{
+ mutex_enter(&cp->cache_depot_lock);
+ cp->cache_full.ml_reaplimit = cp->cache_full.ml_total;
+ cp->cache_full.ml_min = cp->cache_full.ml_total;
+ cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_total;
+ cp->cache_empty.ml_min = cp->cache_empty.ml_total;
+ mutex_exit(&cp->cache_depot_lock);
+}
+
+/*
+ * The number of bytes to reap before we call kpreempt(). A small value
+ * (1MB) causes us to preempt reaping up to hundreds of times per second,
+ * while a very large value (1GB) causes this to have virtually no effect.
+ * The default here is 64MB.
+ */
+size_t kmem_reap_preempt_bytes = 64 * 1024 * 1024;
+
+
+/*
+ * Reap all magazines that have fallen out of the depot's working set.
+ */
+static void
+kmem_depot_ws_reap(kmem_cache_t *cp)
+{
+ size_t bytes = 0;
+ long reap;
+ kmem_magazine_t *mp;
+
+ ASSERT(!list_link_active(&cp->cache_link) ||
+ taskq_member(kmem_taskq, curthread));
+
+ reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
+ while (reap-- &&
+ (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) {
+ kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
+ bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
+ if (bytes > kmem_reap_preempt_bytes) {
+ kpreempt(KPREEMPT_SYNC);
+ bytes = 0;
+ }
+ }
+
+ reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
+ while (reap-- &&
+ (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL) {
+ kmem_magazine_destroy(cp, mp, 0);
+ bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
+ if (bytes > kmem_reap_preempt_bytes) {
+ kpreempt(KPREEMPT_SYNC);
+ bytes = 0;
+ }
+ }
+}
+
+static void
+kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds)
+{
+ ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
+ (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
+ ASSERT(ccp->cc_magsize > 0);
+
+ ccp->cc_ploaded = ccp->cc_loaded;
+ ccp->cc_prounds = ccp->cc_rounds;
+ ccp->cc_loaded = mp;
+ ccp->cc_rounds = rounds;
+}
+
+/*
+ * Intercept kmem alloc/free calls during crash dump in order to avoid
+ * changing kmem state while memory is being saved to the dump device.
+ * Otherwise, ::kmem_verify will report "corrupt buffers". Note that
+ * there are no locks because only one CPU calls kmem during a crash
+ * dump. To enable this feature, first create the associated vmem
+ * arena with VMC_DUMPSAFE.
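+ *
+ * A minimal sketch (arena and function names here are illustrative,
+ * not from this file): an arena created with something like
+ *
+ *	vmem_create("my_dumpsafe_arena", NULL, 0, PAGESIZE,
+ *	    segkmem_alloc, segkmem_free, heap_arena, 0,
+ *	    VM_SLEEP | VMC_DUMPSAFE);
+ *
+ * makes every kmem cache backed by it eligible for the pre-reserved
+ * dump heap set up by kmem_dump_init() below; kmem_dump_begin() checks
+ * cache_arena->vm_cflags for VMC_DUMPSAFE.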
+ */ +static void *kmem_dump_start; /* start of pre-reserved heap */ +static void *kmem_dump_end; /* end of heap area */ +static void *kmem_dump_curr; /* current free heap pointer */ +static size_t kmem_dump_size; /* size of heap area */ + +/* append to each buf created in the pre-reserved heap */ +typedef struct kmem_dumpctl { + void *kdc_next; /* cache dump free list linkage */ +} kmem_dumpctl_t; + +#define KMEM_DUMPCTL(cp, buf) \ +((kmem_dumpctl_t *)P2ROUNDUP((uintptr_t)(buf) + (cp)->cache_bufsize, \ +sizeof (void *))) + +/* Keep some simple stats. */ +#define KMEM_DUMP_LOGS (100) + +typedef struct kmem_dump_log { + kmem_cache_t *kdl_cache; + uint_t kdl_allocs; /* # of dump allocations */ + uint_t kdl_frees; /* # of dump frees */ + uint_t kdl_alloc_fails; /* # of allocation failures */ + uint_t kdl_free_nondump; /* # of non-dump frees */ + uint_t kdl_unsafe; /* cache was used, but unsafe */ +} kmem_dump_log_t; + +static kmem_dump_log_t *kmem_dump_log; +static int kmem_dump_log_idx; + +#define KDI_LOG(cp, stat) { \ +kmem_dump_log_t *kdl; \ +if ((kdl = (kmem_dump_log_t *)((cp)->cache_dumplog)) != NULL) { \ +kdl->stat++; \ +} else if (kmem_dump_log_idx < KMEM_DUMP_LOGS) { \ +kdl = &kmem_dump_log[kmem_dump_log_idx++]; \ +kdl->stat++; \ +kdl->kdl_cache = (cp); \ +(cp)->cache_dumplog = kdl; \ +} \ +} + +/* set non zero for full report */ +uint_t kmem_dump_verbose = 0; + +/* stats for overize heap */ +uint_t kmem_dump_oversize_allocs = 0; +uint_t kmem_dump_oversize_max = 0; + +static void +kmem_dumppr(char **pp, char *e, const char *format, ...) +{ + char *p = *pp; + + if (p < e) { + int n; + va_list ap; + + va_start(ap, format); + n = vsnprintf(p, e - p, format, ap); + va_end(ap); + *pp = p + n; + } +} + +/* + * Called when dumpadm(1M) configures dump parameters. + */ +void +kmem_dump_init(size_t size) +{ + if (kmem_dump_start != NULL) + zfs_kmem_free(kmem_dump_start, kmem_dump_size); + + if (kmem_dump_log == NULL) + kmem_dump_log = + (kmem_dump_log_t *)zfs_kmem_zalloc( + KMEM_DUMP_LOGS * sizeof (kmem_dump_log_t), KM_SLEEP); + + kmem_dump_start = zfs_kmem_alloc(size, KM_SLEEP); + + if (kmem_dump_start != NULL) { + kmem_dump_size = size; + kmem_dump_curr = kmem_dump_start; + kmem_dump_end = (void *)((char *)kmem_dump_start + size); + copy_pattern(KMEM_UNINITIALIZED_PATTERN, kmem_dump_start, size); + } else { + kmem_dump_size = 0; + kmem_dump_curr = NULL; + kmem_dump_end = NULL; + } +} + +/* + * Set flag for each kmem_cache_t if is safe to use alternate dump + * memory. Called just before panic crash dump starts. Set the flag + * for the calling CPU. 
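KMEM_DUMPCTL() above finds the per-buffer control header by rounding the address just past the buffer up to the next pointer boundary. A small self-contained sketch of that arithmetic (editorial, not part of the patch), re-deriving P2ROUNDUP() for illustration:

#include <stdint.h>
#include <stdio.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uintptr_t buf = 0x1000;		/* hypothetical buffer address */
	size_t bufsize = 37;		/* hypothetical cache_bufsize */

	/* round (buf + bufsize) up to the next pointer-sized boundary */
	uintptr_t ctl = P2ROUNDUP(buf + bufsize, sizeof (void *));
	printf("buffer ends at %#lx, dumpctl at %#lx\n",
	    (unsigned long)(buf + bufsize), (unsigned long)ctl);
	return (0);
}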
+ */ +void +kmem_dump_begin(void) +{ + if (kmem_dump_start != NULL) { + kmem_cache_t *cp; + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + if (cp->cache_arena->vm_cflags & VMC_DUMPSAFE) { + cp->cache_flags |= KMF_DUMPDIVERT; + ccp->cc_flags |= KMF_DUMPDIVERT; + ccp->cc_dump_rounds = ccp->cc_rounds; + ccp->cc_dump_prounds = ccp->cc_prounds; + ccp->cc_rounds = ccp->cc_prounds = -1; + } else { + cp->cache_flags |= KMF_DUMPUNSAFE; + ccp->cc_flags |= KMF_DUMPUNSAFE; + } + } + } +} + +/* + * finished dump intercept + * print any warnings on the console + * return verbose information to dumpsys() in the given buffer + */ +size_t +kmem_dump_finish(char *buf, size_t size) +{ + int kdi_idx; + int kdi_end = kmem_dump_log_idx; + int percent = 0; + int header = 0; + int warn = 0; + size_t used; + kmem_cache_t *cp; + kmem_dump_log_t *kdl; + char *e = buf + size; + char *p = buf; + + if (kmem_dump_size == 0 || kmem_dump_verbose == 0) + return (0); + + used = (char *)kmem_dump_curr - (char *)kmem_dump_start; + percent = (used * 100) / kmem_dump_size; + + kmem_dumppr(&p, e, "%% heap used,%d\n", percent); + kmem_dumppr(&p, e, "used bytes,%ld\n", used); + kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size); + kmem_dumppr(&p, e, "Oversize allocs,%d\n", + kmem_dump_oversize_allocs); + kmem_dumppr(&p, e, "Oversize max size,%ld\n", + kmem_dump_oversize_max); + + for (kdi_idx = 0; kdi_idx < kdi_end; kdi_idx++) { + kdl = &kmem_dump_log[kdi_idx]; + cp = kdl->kdl_cache; + if (cp == NULL) + break; + if (kdl->kdl_alloc_fails) + ++warn; + if (header == 0) { + kmem_dumppr(&p, e, + "Cache Name,Allocs,Frees,Alloc Fails," + "Nondump Frees,Unsafe Allocs/Frees\n"); + header = 1; + } + kmem_dumppr(&p, e, "%s,%d,%d,%d,%d,%d\n", + cp->cache_name, kdl->kdl_allocs, kdl->kdl_frees, + kdl->kdl_alloc_fails, kdl->kdl_free_nondump, + kdl->kdl_unsafe); + } + + /* return buffer size used */ + if (p < e) + bzero(p, e - p); + return (p - buf); +} + +/* + * Allocate a constructed object from alternate dump memory. + */ +void * +kmem_cache_alloc_dump(kmem_cache_t *cp, int kmflag) +{ + void *buf; + void *curr; + char *bufend; + + /* return a constructed object */ + if ((buf = cp->cache_dumpfreelist) != NULL) { + cp->cache_dumpfreelist = KMEM_DUMPCTL(cp, buf)->kdc_next; + KDI_LOG(cp, kdl_allocs); + return (buf); + } + + /* create a new constructed object */ + curr = kmem_dump_curr; + buf = (void *)P2ROUNDUP((uintptr_t)curr, cp->cache_align); + bufend = (char *)KMEM_DUMPCTL(cp, buf) + sizeof (kmem_dumpctl_t); + + /* hat layer objects cannot cross a page boundary */ + if (cp->cache_align < PAGESIZE) { + char *page = (char *)P2ROUNDUP((uintptr_t)buf, PAGESIZE); + if (bufend > page) { + bufend += page - (char *)buf; + buf = (void *)page; + } + } + + /* fall back to normal alloc if reserved area is used up */ + if (bufend > (char *)kmem_dump_end) { + kmem_dump_curr = kmem_dump_end; + KDI_LOG(cp, kdl_alloc_fails); + return (NULL); + } + + /* + * Must advance curr pointer before calling a constructor that + * may also allocate memory. 
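kmem_cache_alloc_dump() below is essentially a bump-pointer allocator over the pre-reserved dump region: align the cursor, check for exhaustion, advance, and let the caller fall back to the normal slab path when the region runs out. A minimal standalone sketch of that pattern (editorial; names and sizes are hypothetical, and align must be a power of two):

#include <stddef.h>
#include <stdint.h>

static char dump_heap[64 * 1024];
static char *dump_curr = dump_heap;
static char *dump_end = dump_heap + sizeof (dump_heap);

static void *
dump_alloc(size_t size, size_t align)
{
	/* round the cursor up to the requested (power-of-two) alignment */
	uintptr_t p = ((uintptr_t)dump_curr + align - 1) & ~(align - 1);

	if (p + size > (uintptr_t)dump_end)
		return (NULL);		/* caller falls back to the slab layer */
	dump_curr = (char *)(p + size);
	return ((void *)p);
}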
+ */ + kmem_dump_curr = bufend; + + /* run constructor */ + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) + != 0) { +#ifdef DEBUG + printf("name='%s' cache=0x%p: kmem cache constructor failed\n", + cp->cache_name, (void *)cp); +#endif + /* reset curr pointer iff no allocs were done */ + if (kmem_dump_curr == bufend) + kmem_dump_curr = curr; + + /* fall back to normal alloc if the constructor fails */ + KDI_LOG(cp, kdl_alloc_fails); + return (NULL); + } + + KDI_LOG(cp, kdl_allocs); + return (buf); +} + +/* + * Free a constructed object in alternate dump memory. + */ +int +kmem_cache_free_dump(kmem_cache_t *cp, void *buf) +{ + /* save constructed buffers for next time */ + if ((char *)buf >= (char *)kmem_dump_start && + (char *)buf < (char *)kmem_dump_end) { + KMEM_DUMPCTL(cp, buf)->kdc_next = cp->cache_dumpfreelist; + cp->cache_dumpfreelist = buf; + KDI_LOG(cp, kdl_frees); + return (0); + } + + /* count all non-dump buf frees */ + KDI_LOG(cp, kdl_free_nondump); + + /* just drop buffers that were allocated before dump started */ + if (kmem_dump_curr < kmem_dump_end) + return (0); + + /* fall back to normal free if reserved area is used up */ + return (1); +} + +/* + * Allocate a constructed object from cache cp. + */ +void * +kmem_cache_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + kmem_magazine_t *fmp; + void *buf; + mutex_enter(&ccp->cc_lock); + for (;;) { + /* + * If there's an object available in the current CPU's + * loaded magazine, just take it and return. + */ + if (ccp->cc_rounds > 0) { + buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds]; + ccp->cc_alloc++; + mutex_exit(&ccp->cc_lock); + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & + KMF_DUMPDIVERT)); + KDI_LOG(cp, kdl_unsafe); + } + if ((ccp->cc_flags & KMF_BUFTAG) && + kmem_cache_alloc_debug(cp, buf, kmflag, 0, + caller()) != 0) { + if (kmflag & KM_NOSLEEP) + return (NULL); + mutex_enter(&ccp->cc_lock); + continue; + } + } + return (buf); + } + + /* + * The loaded magazine is empty. If the previously loaded + * magazine was full, exchange them and try again. + */ + if (ccp->cc_prounds > 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + /* + * Return an alternate buffer at dump time to preserve + * the heap. + */ + if (ccp->cc_flags & (KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else { + if ((buf = kmem_cache_alloc_dump(cp, kmflag)) != + NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + break; /* fall back to slab layer */ + } + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) + break; + + /* + * Try to get a full magazine from the depot. + */ + fmp = kmem_depot_alloc(cp, &cp->cache_full); + if (fmp != NULL) { + if (ccp->cc_ploaded != NULL) + kmem_depot_free(cp, &cp->cache_empty, + ccp->cc_ploaded); + kmem_cpu_reload(ccp, fmp, ccp->cc_magsize); + continue; + } + + /* + * There are no full magazines in the depot, + * so fall through to the slab layer. + */ + break; + } + mutex_exit(&ccp->cc_lock); + + /* + * We couldn't allocate a constructed object from the magazine layer, + * so get a raw buffer from the slab layer and apply its constructor. 
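The allocation loop above tries, in order, the CPU's loaded magazine, the previously loaded magazine, and a full magazine fetched from the depot, before giving up and going to the slab layer. The following is a condensed editorial restatement of just that control flow, using the types and helpers defined in this file; it is a reading aid, not a drop-in replacement (the cc_lock handling and the debug/dump branches are omitted):

static void *
magazine_alloc_fastpath(kmem_cache_t *cp, kmem_cpu_cache_t *ccp)
{
	kmem_magazine_t *fmp;

	for (;;) {
		/* 1. take a round from the loaded magazine */
		if (ccp->cc_rounds > 0)
			return (ccp->cc_loaded->mag_round[--ccp->cc_rounds]);

		/* 2. loaded magazine is empty; swap in the previous one */
		if (ccp->cc_prounds > 0) {
			kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
			continue;
		}

		/* magazines disabled for this cache */
		if (ccp->cc_magsize == 0)
			return (NULL);

		/* 3. trade the previous (empty) magazine for a full one */
		if ((fmp = kmem_depot_alloc(cp, &cp->cache_full)) == NULL)
			return (NULL);	/* caller goes to the slab layer */
		if (ccp->cc_ploaded != NULL)
			kmem_depot_free(cp, &cp->cache_empty, ccp->cc_ploaded);
		kmem_cpu_reload(ccp, fmp, ccp->cc_magsize);
	}
}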
+ */ + buf = kmem_slab_alloc(cp, kmflag); + + if (buf == NULL) + return (NULL); + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. + */ + int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller()); + if (rc != 0) { + if (kmflag & KM_NOSLEEP) + return (NULL); + /* + * kmem_cache_alloc_debug() detected corruption + * but didn't panic (kmem_panic <= 0). We should not be + * here because the constructor failed (indicated by a + * return code of 1). Try again. + */ + ASSERT(rc == -1); + return (kmem_cache_alloc(cp, kmflag)); + } + return (buf); + } + + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) { + atomic_inc_64(&cp->cache_alloc_fail); + kmem_slab_free(cp, buf); + return (NULL); + } + + return (buf); +} + +/* + * The freed argument tells whether or not kmem_cache_free_debug() has already + * been called so that we can avoid the duplicate free error. For example, a + * buffer on a magazine has already been freed by the client but is still + * constructed. + */ +static void +kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed) +{ + if (!freed && (cp->cache_flags & KMF_BUFTAG)) + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + + /* + * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not, + * kmem_cache_free_debug() will have already applied the destructor. + */ + if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF && + cp->cache_destructor != NULL) { + if (cp->cache_flags & KMF_DEADBEEF) { /* KMF_LITE implied */ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } else { + cp->cache_destructor(buf, cp->cache_private); + } + } + + kmem_slab_free(cp, buf); +} + +/* + * Used when there's no room to free a buffer to the per-CPU cache. + * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the + * caller should try freeing to the per-CPU cache again. + * Note that we don't directly install the magazine in the cpu cache, + * since its state may have changed wildly while the lock was dropped. + */ +static int +kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp) +{ + kmem_magazine_t *emp; + kmem_magtype_t *mtp; + + ASSERT(MUTEX_HELD(&ccp->cc_lock)); + ASSERT(((uint_t)ccp->cc_rounds == ccp->cc_magsize || + ((uint_t)ccp->cc_rounds == -1)) && + ((uint_t)ccp->cc_prounds == ccp->cc_magsize || + ((uint_t)ccp->cc_prounds == -1))); + + emp = kmem_depot_alloc(cp, &cp->cache_empty); + if (emp != NULL) { + if (ccp->cc_ploaded != NULL) + kmem_depot_free(cp, &cp->cache_full, + ccp->cc_ploaded); + kmem_cpu_reload(ccp, emp, 0); + return (1); + } + /* + * There are no empty magazines in the depot, + * so try to allocate a new one. We must drop all locks + * across kmem_cache_alloc() because lower layers may + * attempt to allocate from this cache. + */ + mtp = cp->cache_magtype; + mutex_exit(&ccp->cc_lock); + emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP); + mutex_enter(&ccp->cc_lock); + + if (emp != NULL) { + /* + * We successfully allocated an empty magazine. + * However, we had to drop ccp->cc_lock to do it, + * so the cache's magazine size may have changed. + * If so, free the magazine and try again. 
+ */ + if (ccp->cc_magsize != mtp->mt_magsize) { + mutex_exit(&ccp->cc_lock); + kmem_cache_free(mtp->mt_cache, emp); + mutex_enter(&ccp->cc_lock); + return (1); + } + + /* + * We got a magazine of the right size. Add it to + * the depot and try the whole dance again. + */ + kmem_depot_free(cp, &cp->cache_empty, emp); + return (1); + } + + /* + * We couldn't allocate an empty magazine, + * so fall through to the slab layer. + */ + return (0); +} + +/* + * If the cache's parent arena is a leaf arena (i.e., it imports all its memory) + * then we can consider it fragmented if either there is 1 GiB free in the arena + * or one eighth of the arena is free. + * + * This is useful in kmem_cache_free{_debug} to determine whether to free to the + * slab layer if the loaded magazine is full. + */ +static inline boolean_t +kmem_cache_parent_arena_fragmented(kmem_cache_t *cp) +{ + const vmem_kstat_t *kp = &cp->cache_arena->vm_kstat; + const int64_t vk_import = kp->vk_mem_import.value.ui64; + const int64_t vk_inuse = kp->vk_mem_inuse.value.ui64; + const int64_t vk_total = kp->vk_mem_total.value.ui64; + + if (vk_import == vk_total && vk_inuse < vk_total) { + const int64_t vk_free = vk_total - vk_inuse; + const int64_t highthresh = 1024LL*1024LL*1024LL; + // we are fragmented if we have 1GiB free + if (vk_free >= highthresh) + return (B_TRUE); + // we are fragmented if at least 1/8 of the + // total arena space is free + if (vk_free > 0 && vk_total > 0) { + const int64_t eighth_total = vk_total / 8; + if (vk_free >= eighth_total) + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * Free a constructed object to cache cp. + */ +void +kmem_cache_free(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != spl_current_thread() || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + return; + } + if (ccp->cc_flags & KMF_BUFTAG) { + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + } + } + + mutex_enter(&ccp->cc_lock); + /* + * Any changes to this logic should be reflected in kmem_slab_prefill() + */ + for (;;) { + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and return. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf; + ccp->cc_free++; + mutex_exit(&ccp->cc_lock); + return; + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) { + break; + } + + /* + * The magazine layer is on, but the loaded magazine is now + * full (of allocatable constructed elements). + * + * If the cache's arena is badly fragmented, break out now; + * this frees to the slab layer. + * + * Note: this is not reflected in kmem_slab_prefill() which + * deals with a freshly allocated slab. + */ + + if (kmem_free_to_slab_when_fragmented == 1 && + kmem_cache_parent_arena_fragmented(cp)) + break; + + /* + * The loaded magazine is full. If the previously loaded + * magazine was empty, exchange them and try again. 
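kmem_cache_parent_arena_fragmented() above treats a leaf arena as fragmented once at least 1 GiB, or at least one eighth of the arena, is free. A standalone sketch evaluating those two thresholds on made-up figures (editorial, not part of the patch):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t vk_total = 6LL * 1024 * 1024 * 1024;	/* arena size: 6 GiB */
	int64_t vk_inuse = 5LL * 1024 * 1024 * 1024;	/* allocated: 5 GiB */
	int64_t vk_free = vk_total - vk_inuse;		/* free: 1 GiB */

	int fragmented =
	    (vk_free >= 1024LL * 1024 * 1024) ||	/* at least 1 GiB free */
	    (vk_free >= vk_total / 8);			/* at least 1/8 free */

	printf("free=%lld -> fragmented=%d\n", (long long)vk_free, fragmented);
	return (0);
}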
+ */ + if (ccp->cc_prounds == 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + if (!kmem_cpucache_magazine_alloc(ccp, cp)) { + /* + * We couldn't free our constructed object to the + * magazine layer, so apply its destructor and free it + * to the slab layer. + */ + break; + } + } + mutex_exit(&ccp->cc_lock); + kpreempt(KPREEMPT_SYNC); + kmem_slab_free_constructed(cp, buf, B_TRUE); +} + +/* + * Free a constructed object to cache cp. + * Do not free to the magazine layer. + * This is essentially just kmem_cache_free() without + * the for(;;) loop or the ccp critical section. + */ +void +kmem_cache_free_to_slab(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != spl_current_thread() || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + return; + } + if (ccp->cc_flags & KMF_BUFTAG) { + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + } + } + + /* omitted the for(;;) loop from kmem_cache_free */ + /* also do not take ccp mutex */ + + kmem_slab_free_constructed(cp, buf, B_TRUE); +} + +static void +kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + kmem_bufctl_t *next, *head; + size_t nbufs; + + /* + * Completely allocate the newly created slab and put the pre-allocated + * buffers in magazines. Any of the buffers that cannot be put in + * magazines must be returned to the slab. + */ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(cp->cache_constructor == NULL); + ASSERT(sp->slab_cache == cp); + ASSERT(sp->slab_refcnt == 1); + ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt); + ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL); + + head = sp->slab_head; + nbufs = (sp->slab_chunks - sp->slab_refcnt); + sp->slab_head = NULL; + sp->slab_refcnt += nbufs; + cp->cache_bufslab -= nbufs; + cp->cache_slab_alloc += nbufs; + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + mutex_exit(&cp->cache_lock); + mutex_enter(&ccp->cc_lock); + + while (head != NULL) { + void *buf = KMEM_BUF(cp, head); + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and + * continue. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = + buf; + ccp->cc_free++; + nbufs--; + head = head->bc_next; + continue; + } + + /* + * The loaded magazine is full. If the previously + * loaded magazine was empty, exchange them and try + * again. + */ + if (ccp->cc_prounds == 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, + ccp->cc_prounds); + continue; + } + + /* + * If the magazine layer is disabled, break out now. 
+ */ + + if (ccp->cc_magsize == 0) { + break; + } + + if (!kmem_cpucache_magazine_alloc(ccp, cp)) + break; + } + mutex_exit(&ccp->cc_lock); + if (nbufs != 0) { + ASSERT(head != NULL); + + /* + * If there was a failure, return remaining objects to + * the slab + */ + while (head != NULL) { + ASSERT(nbufs != 0); + next = head->bc_next; + head->bc_next = NULL; + kmem_slab_free(cp, KMEM_BUF(cp, head)); + head = next; + nbufs--; + } + } + ASSERT(head == NULL); + ASSERT(nbufs == 0); + mutex_enter(&cp->cache_lock); +} + +void * +zfs_kmem_zalloc(size_t size, int kmflag) +{ + size_t index; + void *buf; + + if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) { + kmem_cache_t *cp = kmem_alloc_table[index]; + buf = kmem_cache_alloc(cp, kmflag); + if (buf != NULL) { + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size); + + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, + kmem_lite_count, caller()); + } + } + bzero(buf, size); + } + } else { + buf = zfs_kmem_alloc(size, kmflag); + if (buf != NULL) + bzero(buf, size); + } + return (buf); +} + +void * +zfs_kmem_alloc(size_t size, int kmflag) +{ + size_t index; + kmem_cache_t *cp; + void *buf; + + if (size == 0) + return (KMEM_ZERO_SIZE_PTR); + + if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) { + cp = kmem_alloc_table[index]; + /* fall through to kmem_cache_alloc() */ + + } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) < + kmem_big_alloc_table_max) { + cp = kmem_big_alloc_table[index]; + /* fall through to kmem_cache_alloc() */ + + } else { + + buf = vmem_alloc(kmem_oversize_arena, size, + kmflag & KM_VMFLAGS); + if (buf == NULL) + kmem_log_event(kmem_failure_log, NULL, NULL, + (void *)size); + else if (KMEM_DUMP(kmem_slab_cache)) { + /* stats for dump intercept */ + kmem_dump_oversize_allocs++; + if (size > kmem_dump_oversize_max) + kmem_dump_oversize_max = size; + } + return (buf); + } + + buf = kmem_cache_alloc(cp, kmflag); + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size); + + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller()); + } + } + return (buf); +} + +void +zfs_kmem_free(void *buf, size_t size) +{ + size_t index; + kmem_cache_t *cp; + + if (size == 0 || buf == KMEM_ZERO_SIZE_PTR || buf == NULL) + return; + + if ((index = (size - 1) >> KMEM_ALIGN_SHIFT) < KMEM_ALLOC_TABLE_MAX) { + cp = kmem_alloc_table[index]; + /* fall through to kmem_cache_free() */ + + } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) < + kmem_big_alloc_table_max) { + cp = kmem_big_alloc_table[index]; + /* fall through to kmem_cache_free() */ + + } else { + vmem_free(kmem_oversize_arena, buf, size); + return; + } + + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + uint32_t *ip = (uint32_t *)btp; + if (ip[1] != KMEM_SIZE_ENCODE(size)) { + if (*(uint64_t *)buf == KMEM_FREE_PATTERN) { + kmem_error(KMERR_DUPFREE, cp, buf); + return; + } + if (KMEM_SIZE_VALID(ip[1])) { + ip[0] = KMEM_SIZE_ENCODE(size); + kmem_error(KMERR_BADSIZE, cp, buf); + } else { + kmem_error(KMERR_REDZONE, cp, buf); + } + return; + } + if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) { + kmem_error(KMERR_REDZONE, cp, buf); + return; + } + 
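zfs_kmem_alloc() and zfs_kmem_zalloc() above map a request size to a cache with a table lookup: index = (size - 1) >> KMEM_ALIGN_SHIFT. A standalone sketch of that mapping (editorial, not part of the patch), assuming a shift of 3, i.e. 8-byte granularity, purely for illustration:

#include <stdio.h>

#define	EX_ALIGN_SHIFT	3	/* hypothetical stand-in for KMEM_ALIGN_SHIFT */

int
main(void)
{
	size_t sizes[] = { 1, 8, 9, 16, 24, 64 };

	for (size_t i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
		size_t index = (sizes[i] - 1) >> EX_ALIGN_SHIFT;
		printf("size %zu -> kmem_alloc_table[%zu]\n", sizes[i], index);
	}
	return (0);
}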
btp->bt_redzone = KMEM_REDZONE_PATTERN; + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, + caller()); + } + } + kmem_cache_free(cp, buf); +} + +/* + * Try to allocate at least `size' bytes of memory without sleeping or + * panicking. Return actual allocated size in `asize'. If allocation failed, + * try final allocation with sleep or panic allowed. + */ +void * +kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag) +{ + void *p; + + *asize = P2ROUNDUP(size, KMEM_ALIGN); + do { + p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC); + if (p != NULL) + return (p); + *asize += KMEM_ALIGN; + } while (*asize <= PAGESIZE); + + *asize = P2ROUNDUP(size, KMEM_ALIGN); + return (zfs_kmem_alloc(*asize, kmflag)); +} + +/* + * Reclaim all unused memory from a cache. + */ +static void +kmem_cache_reap(kmem_cache_t *cp) +{ + ASSERT(taskq_member(kmem_taskq, curthread)); + + cp->cache_reap++; + + /* + * Ask the cache's owner to free some memory if possible. + * The idea is to handle things like the inode cache, which + * typically sits on a bunch of memory that it doesn't truly + * *need*. Reclaim policy is entirely up to the owner; this + * callback is just an advisory plea for help. + */ + if (cp->cache_reclaim != NULL) { + long delta; + + /* + * Reclaimed memory should be reapable (not included in the + * depot's working set). + */ + delta = cp->cache_full.ml_total; + cp->cache_reclaim(cp->cache_private); + delta = cp->cache_full.ml_total - delta; + if (delta > 0) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit += delta; + cp->cache_full.ml_min += delta; + mutex_exit(&cp->cache_depot_lock); + } + } + + kmem_depot_ws_reap(cp); + + if (cp->cache_defrag != NULL && !kmem_move_noreap) { + kmem_cache_defrag(cp); + } +} + +static void +kmem_reap_timeout(void *flag_arg) +{ + uint32_t *flag = (uint32_t *)flag_arg; + + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + *flag = 0; +} + +static void +kmem_reap_done(void *flag) +{ + (void) bsd_timeout(kmem_reap_timeout, flag, &kmem_reap_interval); +} + +static void +kmem_reap_start(void *flag) +{ + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + + if (flag == &kmem_reaping) { + kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP); + /* + * if we have segkp under heap, reap segkp cache. + */ + } + else + kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP); + + /* + * We use taskq_dispatch() to schedule a timeout to clear + * the flag so that kmem_reap() becomes self-throttling: + * we won't reap again until the current reap completes *and* + * at least kmem_reap_interval ticks have elapsed. + */ + if (!taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP)) + kmem_reap_done(flag); +} + +static void +kmem_reap_common(void *flag_arg) +{ + uint32_t *flag = (uint32_t *)flag_arg; + + + if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL || + atomic_cas_32(flag, 0, 1) != 0) + return; + + /* + * It may not be kosher to do memory allocation when a reap is called + * is called (for example, if vmem_populate() is in the call chain). + * So we start the reap going with a TQ_NOALLOC dispatch. If the + * dispatch fails, we reset the flag, and the next reap will try again. + */ + if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) + *flag = 0; +} + +/* + * Reclaim all unused memory from all caches. Called from the VM system + * when memory gets tight. 
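kmem_reap_common() above admits only one reap at a time by doing a compare-and-swap on a flag; the flag is not cleared until a timeout fires after the reap completes, which is what makes kmem_reap() self-throttling. A standalone analogue of that gate using C11 atomics (editorial; names are hypothetical):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint reaping;

static bool
reap_try_begin(void)
{
	unsigned expected = 0;

	/* exactly one caller wins until reap_end() clears the flag */
	return (atomic_compare_exchange_strong(&reaping, &expected, 1));
}

static void
reap_end(void)	/* called from a timeout after the reap interval elapses */
{
	atomic_store(&reaping, 0);
}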
+ */ +void +kmem_reap(void) +{ + kmem_reap_common(&kmem_reaping); +} + +/* + * Reclaim all unused memory from identifier arenas, called when a vmem + * arena not back by memory is exhausted. Since reaping memory-backed caches + * cannot help with identifier exhaustion, we avoid both a large amount of + * work and unwanted side-effects from reclaim callbacks. + */ +void +kmem_reap_idspace(void) +{ + kmem_reap_common(&kmem_reaping_idspace); +} + +/* + * Purge all magazines from a cache and set its magazine limit to zero. + * All calls are serialized by the kmem_taskq lock, except for the final + * call from kmem_cache_destroy(). + */ +static void +kmem_cache_magazine_purge(kmem_cache_t *cp) +{ + kmem_cpu_cache_t *ccp; + kmem_magazine_t *mp, *pmp; + int rounds, prounds, cpu_seqid; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + mp = ccp->cc_loaded; + pmp = ccp->cc_ploaded; + rounds = ccp->cc_rounds; + prounds = ccp->cc_prounds; + ccp->cc_loaded = NULL; + ccp->cc_ploaded = NULL; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + ccp->cc_magsize = 0; + mutex_exit(&ccp->cc_lock); + + if (mp) + kmem_magazine_destroy(cp, mp, rounds); + + if (pmp) + kmem_magazine_destroy(cp, pmp, prounds); + } + + kmem_depot_ws_zero(cp); + kmem_depot_ws_reap(cp); +} + +/* + * Enable per-cpu magazines on a cache. + */ +static void +kmem_cache_magazine_enable(kmem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & KMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_enter(&ccp->cc_lock); + ccp->cc_magsize = cp->cache_magtype->mt_magsize; + mutex_exit(&ccp->cc_lock); + } + +} + +static void +kmem_cache_magazine_disable(kmem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & KMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_enter(&ccp->cc_lock); + ccp->cc_magsize = 0; + mutex_exit(&ccp->cc_lock); + } + +} + +/* + * Allow our caller to determine if there are running reaps. + * + * This call is very conservative and may return B_TRUE even when + * reaping activity isn't active. If it returns B_FALSE, then reaping + * activity is definitely inactive. + */ +boolean_t +kmem_cache_reap_active(void) +{ + return (!taskq_empty(kmem_taskq)); +} + +/* + * Reap (almost) everything right now. + */ +void +kmem_cache_reap_now(kmem_cache_t *cp) +{ + ASSERT(list_link_active(&cp->cache_link)); + + kmem_depot_ws_zero(cp); + + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP); +} + +/* + * Recompute a cache's magazine size. The trade-off is that larger magazines + * provide a higher transfer rate with the depot, while smaller magazines + * reduce memory consumption. Magazine resizing is an expensive operation; + * it should not be done frequently. + * + * Changes to the magazine size are serialized by the kmem_taskq lock. + * + * Note: at present this only grows the magazine size. It might be useful + * to allow shrinkage too. 
+ */ +static void +kmem_cache_magazine_resize(kmem_cache_t *cp) +{ + kmem_magtype_t *mtp = cp->cache_magtype; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + if (cp->cache_chunksize < mtp->mt_maxbuf) { + kmem_cache_magazine_purge(cp); + mutex_enter(&cp->cache_depot_lock); + cp->cache_magtype = ++mtp; + cp->cache_depot_contention_prev = + cp->cache_depot_contention + INT_MAX; + mutex_exit(&cp->cache_depot_lock); + kmem_cache_magazine_enable(cp); + } +} + +/* + * Rescale a cache's hash table, so that the table size is roughly the + * cache size. We want the average lookup time to be extremely small. + */ +static void +kmem_hash_rescale(kmem_cache_t *cp) +{ + kmem_bufctl_t **old_table, **new_table, *bcp; + size_t old_size, new_size, h; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + new_size = MAX(KMEM_HASH_INITIAL, + 1 << (highbit(3 * cp->cache_buftotal + 4) - 2)); + old_size = cp->cache_hash_mask + 1; + + if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) + return; + + new_table = vmem_alloc(kmem_hash_arena, new_size * sizeof (void *), + VM_NOSLEEP); + if (new_table == NULL) + return; + bzero(new_table, new_size * sizeof (void *)); + + mutex_enter(&cp->cache_lock); + + old_size = cp->cache_hash_mask + 1; + old_table = cp->cache_hash_table; + + cp->cache_hash_mask = new_size - 1; + cp->cache_hash_table = new_table; + cp->cache_rescale++; + + for (h = 0; h < old_size; h++) { + bcp = old_table[h]; + while (bcp != NULL) { + void *addr = bcp->bc_addr; + kmem_bufctl_t *next_bcp = bcp->bc_next; + kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + bcp = next_bcp; + } + } + + mutex_exit(&cp->cache_lock); + + vmem_free(kmem_hash_arena, old_table, old_size * sizeof (void *)); +} + +/* + * Perform periodic maintenance on a cache: hash rescaling, depot working-set + * update, magazine resizing, and slab consolidation. + */ +static void +kmem_cache_update(kmem_cache_t *cp) +{ + int need_hash_rescale = 0; + int need_magazine_resize = 0; + + /* + * If the cache has become much larger or smaller than its hash table, + * fire off a request to rescale the hash table. + */ + mutex_enter(&cp->cache_lock); + + if ((cp->cache_flags & KMF_HASH) && + (cp->cache_buftotal > (cp->cache_hash_mask << 1) || + (cp->cache_buftotal < (cp->cache_hash_mask >> 1) && + cp->cache_hash_mask > KMEM_HASH_INITIAL))) + need_hash_rescale = 1; + + mutex_exit(&cp->cache_lock); + + /* + * Update the depot working set statistics. + */ + kmem_depot_ws_update(cp); + + /* + * If there's a lot of contention in the depot, + * increase the magazine size. 
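kmem_hash_rescale() above resizes the hash table to a power of two near three times the buffer count, and kmem_cache_update() only requests a rescale once the table is off by more than a factor of two. A standalone sketch of the size computation (editorial, not part of the patch), with a local stand-in for the kernel's highbit() and a hypothetical initial size of 64 buckets:

#include <stdio.h>

#define	EX_HASH_INITIAL	64	/* hypothetical initial bucket count */

static int
highbit_ex(unsigned long v)	/* 1-based index of the highest set bit */
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

static unsigned long
hash_target_size(unsigned long buftotal)
{
	unsigned long grown = 1UL << (highbit_ex(3 * buftotal + 4) - 2);

	return (grown > EX_HASH_INITIAL ? grown : EX_HASH_INITIAL);
}

int
main(void)
{
	printf("%lu buffers -> %lu buckets\n", 10000UL,
	    hash_target_size(10000UL));
	return (0);
}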
+ */ + mutex_enter(&cp->cache_depot_lock); + + if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf && + (int)(cp->cache_depot_contention - + cp->cache_depot_contention_prev) > kmem_depot_contention) + need_magazine_resize = 1; + + cp->cache_depot_contention_prev = cp->cache_depot_contention; + + mutex_exit(&cp->cache_depot_lock); + + if (need_hash_rescale) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_hash_rescale, cp, TQ_NOSLEEP); + + if (need_magazine_resize) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_magazine_resize, + cp, TQ_NOSLEEP); + + // smd : the following if is only true for the dnode cache + if (cp->cache_defrag != NULL) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_scan, cp, TQ_NOSLEEP); + +#ifdef DEBUG + else { + // for every other cache, duplicate some of the logic from + // kmem_cache_scan() below + // run reap occasionally even if there is plenty of memory + uint16_t debug_rand; + + (void) random_get_bytes((uint8_t *)&debug_rand, 2); + if (!kmem_move_noreap && + ((debug_rand % kmem_mtb_reap) == 0)) { + /* + * no mutex above, so no need to give it up as + * in kmem_cache_scan() + */ + } + } +#endif + +} + +static void kmem_update(void *); + +static void +kmem_update_timeout(void *dummy) +{ + (void) bsd_timeout(kmem_update, dummy, &kmem_reap_interval); +} + +static void +kmem_update(void *dummy) +{ + kmem_cache_applyall(kmem_cache_update, NULL, TQ_NOSLEEP); + + /* + * We use taskq_dispatch() to reschedule the timeout so that + * kmem_update() becomes self-throttling: it won't schedule + * new tasks until all previous tasks have completed. + */ + if (!taskq_dispatch(kmem_taskq, kmem_update_timeout, dummy, TQ_NOSLEEP)) + kmem_update_timeout(NULL); + +} + +static int +kmem_cache_kstat_update(kstat_t *ksp, int rw) +{ + struct kmem_cache_kstat *kmcp = &kmem_cache_kstat; + kmem_cache_t *cp = ksp->ks_private; + uint64_t cpu_buf_avail; + uint64_t buf_avail = 0; + int cpu_seqid; + long reap; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&cp->cache_lock); + + kmcp->kmc_alloc_fail.value.ui64 = cp->cache_alloc_fail; + kmcp->kmc_alloc.value.ui64 = cp->cache_slab_alloc; + kmcp->kmc_free.value.ui64 = cp->cache_slab_free; + kmcp->kmc_slab_alloc.value.ui64 = cp->cache_slab_alloc; + kmcp->kmc_slab_free.value.ui64 = cp->cache_slab_free; + kmcp->kmc_no_vba_success.value.ui64 = cp->no_vba_success; + kmcp->kmc_no_vba_fail.value.ui64 = cp->no_vba_fail; + kmcp->kmc_arc_no_grow_set.value.ui64 = cp->arc_no_grow_set; + kmcp->kmc_arc_no_grow.value.ui64 = cp->arc_no_grow; + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + + cpu_buf_avail = 0; + if (ccp->cc_rounds > 0) + cpu_buf_avail += ccp->cc_rounds; + if (ccp->cc_prounds > 0) + cpu_buf_avail += ccp->cc_prounds; + + kmcp->kmc_alloc.value.ui64 += ccp->cc_alloc; + kmcp->kmc_free.value.ui64 += ccp->cc_free; + buf_avail += cpu_buf_avail; + + mutex_exit(&ccp->cc_lock); + } + + mutex_enter(&cp->cache_depot_lock); + + kmcp->kmc_depot_alloc.value.ui64 = cp->cache_full.ml_alloc; + kmcp->kmc_depot_free.value.ui64 = cp->cache_empty.ml_alloc; + kmcp->kmc_depot_contention.value.ui64 = cp->cache_depot_contention; + kmcp->kmc_full_magazines.value.ui64 = cp->cache_full.ml_total; + kmcp->kmc_empty_magazines.value.ui64 = cp->cache_empty.ml_total; + kmcp->kmc_magazine_size.value.ui64 = + (cp->cache_flags & KMF_NOMAGAZINE) ? 
+ 0 : cp->cache_magtype->mt_magsize; + + kmcp->kmc_alloc.value.ui64 += cp->cache_full.ml_alloc; + kmcp->kmc_free.value.ui64 += cp->cache_empty.ml_alloc; + buf_avail += cp->cache_full.ml_total * cp->cache_magtype->mt_magsize; + + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + + mutex_exit(&cp->cache_depot_lock); + + kmcp->kmc_buf_size.value.ui64 = cp->cache_bufsize; + kmcp->kmc_align.value.ui64 = cp->cache_align; + kmcp->kmc_chunk_size.value.ui64 = cp->cache_chunksize; + kmcp->kmc_slab_size.value.ui64 = cp->cache_slabsize; + kmcp->kmc_buf_constructed.value.ui64 = buf_avail; + buf_avail += cp->cache_bufslab; + kmcp->kmc_buf_avail.value.ui64 = buf_avail; + kmcp->kmc_buf_inuse.value.ui64 = cp->cache_buftotal - buf_avail; + kmcp->kmc_buf_total.value.ui64 = cp->cache_buftotal; + kmcp->kmc_buf_max.value.ui64 = cp->cache_bufmax; + kmcp->kmc_slab_create.value.ui64 = cp->cache_slab_create; + kmcp->kmc_slab_destroy.value.ui64 = cp->cache_slab_destroy; + kmcp->kmc_hash_size.value.ui64 = (cp->cache_flags & KMF_HASH) ? + cp->cache_hash_mask + 1 : 0; + kmcp->kmc_hash_lookup_depth.value.ui64 = cp->cache_lookup_depth; + kmcp->kmc_hash_rescale.value.ui64 = cp->cache_rescale; + kmcp->kmc_vmem_source.value.ui64 = cp->cache_arena->vm_id; + kmcp->kmc_reap.value.ui64 = cp->cache_reap; + + if (cp->cache_defrag == NULL) { + kmcp->kmc_move_callbacks.value.ui64 = 0; + kmcp->kmc_move_yes.value.ui64 = 0; + kmcp->kmc_move_no.value.ui64 = 0; + kmcp->kmc_move_later.value.ui64 = 0; + kmcp->kmc_move_dont_need.value.ui64 = 0; + kmcp->kmc_move_dont_know.value.ui64 = 0; + kmcp->kmc_move_hunt_found.value.ui64 = 0; + kmcp->kmc_move_slabs_freed.value.ui64 = 0; + kmcp->kmc_defrag.value.ui64 = 0; + kmcp->kmc_scan.value.ui64 = 0; + kmcp->kmc_move_reclaimable.value.ui64 = 0; + } else { + int64_t reclaimable; + + kmem_defrag_t *kd = cp->cache_defrag; + kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks; + kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes; + kmcp->kmc_move_no.value.ui64 = kd->kmd_no; + kmcp->kmc_move_later.value.ui64 = kd->kmd_later; + kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; + kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; + kmcp->kmc_move_hunt_found.value.ui64 = 0; + kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed; + kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags; + kmcp->kmc_scan.value.ui64 = kd->kmd_scans; + + reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1); + reclaimable = MAX(reclaimable, 0); + reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + kmcp->kmc_move_reclaimable.value.ui64 = reclaimable; + } + + mutex_exit(&cp->cache_lock); + return (0); +} + +/* + * Return a named statistic about a particular cache. + * This shouldn't be called very often, so it's currently designed for + * simplicity (leverages existing kstat support) rather than efficiency. 
+ */
+uint64_t
+kmem_cache_stat(kmem_cache_t *cp, char *name)
+{
+	int i;
+	kstat_t *ksp = cp->cache_kstat;
+	kstat_named_t *knp = (kstat_named_t *)&kmem_cache_kstat;
+	uint64_t value = 0;
+
+	if (ksp != NULL) {
+		mutex_enter(&kmem_cache_kstat_lock);
+		(void) kmem_cache_kstat_update(ksp, KSTAT_READ);
+		for (i = 0; i < ksp->ks_ndata; i++) {
+			if (strcmp(knp[i].name, name) == 0) {
+				value = knp[i].value.ui64;
+				break;
+			}
+		}
+		mutex_exit(&kmem_cache_kstat_lock);
+	}
+	return (value);
+}
+
+// TRUE if we have more than a critical minimum of memory
+// used in arc_memory_throttle; if FALSE, we throttle
+static inline bool
+spl_minimal_physmem_p_logic()
+{
+	// do we have enough memory to avoid throttling?
+	if (vm_page_free_wanted > 0)
+		return (false);
+	if (vm_page_free_count < (vm_page_free_min + 512))
+		// 512 pages above 3500 (normal vm_page_free_min)
+		// 2MiB above 13 MiB
+		return (false);
+	return (true);
+}
+
+int32_t
+spl_minimal_physmem_p(void)
+{
+
+	// arc will throttle if we are paging, otherwise
+	// we want a small bit of pressure here so that we can compete
+	// a little with the xnu buffer cache
+
+	return (spl_free > -1024LL);
+}
+
+/*
+ * Return the maximum amount of memory that is (in theory) allocatable
+ * from the heap. This may be used as an estimate only since there
+ * is no guarantee this space will still be available when an allocation
+ * request is made, nor that the space may be allocated in one big request
+ * due to kernel heap fragmentation.
+ */
+size_t
+kmem_maxavail(void)
+{
+#ifndef APPLE
+	// spgcnt_t pmem = availrmem - tune.t_minarmem;
+	// spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE));
+	//
+	// return ((size_t)ptob(MAX(MIN(pmem, vmem), 0)));
+#endif
+	return (physmem * PAGE_SIZE);
+}
+
+/*
+ * Indicate whether memory-intensive kmem debugging is enabled.
+ */
+int
+kmem_debugging(void)
+{
+	return (kmem_flags & (KMF_AUDIT | KMF_REDZONE));
+}
+
+/* binning function, sorts finely at the two extremes */
+#define	KMEM_PARTIAL_SLAB_WEIGHT(sp, binshift)		\
+((((sp)->slab_refcnt <= (binshift)) ||		\
+(((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift)))	\
+? -(sp)->slab_refcnt					\
+: -((binshift) + ((sp)->slab_refcnt >> (binshift))))
+
+/*
+ * Minimizing the number of partial slabs on the freelist minimizes
+ * fragmentation (the ratio of unused buffers held by the slab layer). There are
+ * two ways to get a slab off of the freelist: 1) free all the buffers on the
+ * slab, and 2) allocate all the buffers on the slab. It follows that we want
+ * the most-used slabs at the front of the list where they have the best chance
+ * of being completely allocated, and the least-used slabs at a safe distance
+ * from the front to improve the odds that the few remaining buffers will all be
+ * freed before another allocation can tie up the slab. For that reason a slab
+ * with a higher slab_refcnt sorts less than a slab with a lower
+ * slab_refcnt.
+ *
+ * However, if a slab has at least one buffer that is deemed unfreeable, we
+ * would rather have that slab at the front of the list regardless of
+ * slab_refcnt, since even one unfreeable buffer makes the entire slab
+ * unfreeable. If the client returns KMEM_CBRC_NO in response to a cache_move()
+ * callback, the slab is marked unfreeable for as long as it remains on the
+ * freelist.
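KMEM_PARTIAL_SLAB_WEIGHT() above gives nearly-empty and nearly-full slabs fine-grained weights while lumping middling slabs into coarse bins; lower (more negative) weights sort toward the front of the partial-slab list, which is how mostly-allocated slabs end up first. A standalone sketch evaluating the same expression on a few example slabs (editorial; the struct and values are hypothetical):

#include <stdio.h>

struct exslab { int slab_refcnt; int slab_chunks; };

#define	EX_WEIGHT(sp, binshift)					\
	((((sp)->slab_refcnt <= (binshift)) ||			\
	(((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift)))	\
	? -(sp)->slab_refcnt					\
	: -((binshift) + ((sp)->slab_refcnt >> (binshift))))

int
main(void)
{
	struct exslab slabs[] = { { 1, 64 }, { 30, 64 }, { 40, 64 }, { 62, 64 } };
	int binshift = 3;

	/* the 62/64 slab gets weight -62 and sorts to the very front */
	for (int i = 0; i < 4; i++)
		printf("refcnt %2d/%d -> weight %d\n", slabs[i].slab_refcnt,
		    slabs[i].slab_chunks, EX_WEIGHT(&slabs[i], binshift));
	return (0);
}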
+ */ +static int +kmem_partial_slab_cmp(const void *pp0, const void *pp1) +{ + const kmem_cache_t *cp; + const kmem_slab_t *s0 = pp0; + const kmem_slab_t *s1 = pp1; + int w0, w1; + size_t binshift; + + ASSERT(KMEM_SLAB_IS_PARTIAL(s0)); + ASSERT(KMEM_SLAB_IS_PARTIAL(s1)); + ASSERT(s0->slab_cache == s1->slab_cache); + cp = s1->slab_cache; + ASSERT(MUTEX_HELD((struct kmutex *)&cp->cache_lock)); + binshift = cp->cache_partial_binshift; + + /* weight of first slab */ + w0 = KMEM_PARTIAL_SLAB_WEIGHT(s0, binshift); + if (s0->slab_flags & KMEM_SLAB_NOMOVE) { + w0 -= cp->cache_maxchunks; + } + + /* weight of second slab */ + w1 = KMEM_PARTIAL_SLAB_WEIGHT(s1, binshift); + if (s1->slab_flags & KMEM_SLAB_NOMOVE) { + w1 -= cp->cache_maxchunks; + } + + if (w0 < w1) + return (-1); + if (w0 > w1) + return (1); + + // compare slab age if available + hrtime_t c0 = s0->slab_create_time, c1 = s1->slab_create_time; + if (c0 != 0 && c1 != 0 && c0 != c1) { + // higher time is newer; newer sorts before older + if (c0 < c1) // c0 is older than c1 + return (1); // so c0 sorts after c1 + if (c0 > c1) + return (-1); + } + + /* compare pointer values */ + if ((uintptr_t)s0 < (uintptr_t)s1) + return (-1); + if ((uintptr_t)s0 > (uintptr_t)s1) + return (1); + + return (0); +} + +/* + * It must be valid to call the destructor (if any) on a newly created object. + * That is, the constructor (if any) must leave the object in a valid state for + * the destructor. + */ +kmem_cache_t * +kmem_cache_create( + char *name, /* descriptive name for this cache */ + size_t bufsize, /* size of the objects it manages */ + size_t align, /* required object alignment */ + int (*constructor)(void *, void *, int), /* object constructor */ + void (*destructor)(void *, void *), /* object destructor */ + void (*reclaim)(void *), /* memory reclaim callback */ + void *private, /* pass-thru arg for constr/destr/reclaim */ + vmem_t *vmp, /* vmem source for slab allocation */ + int cflags) /* cache creation flags */ +{ + int cpu_seqid; + size_t chunksize; + kmem_cache_t *cp; + kmem_magtype_t *mtp; + size_t csize = KMEM_CACHE_SIZE(max_ncpus); + +#ifdef DEBUG + /* + * Cache names should conform to the rules for valid C identifiers + */ + if (!strident_valid(name)) { + cmn_err(CE_CONT, + "kmem_cache_create: '%s' is an invalid cache name\n" + "cache names must conform to the rules for " + "C identifiers\n", name); + } +#endif /* DEBUG */ + + if (vmp == NULL) + vmp = kmem_default_arena; + + /* + * If this kmem cache has an identifier vmem arena as its source, mark + * it such to allow kmem_reap_idspace(). + */ + ASSERT(!(cflags & KMC_IDENTIFIER)); /* consumer should not set this */ + if (vmp->vm_cflags & VMC_IDENTIFIER) + cflags |= KMC_IDENTIFIER; + + /* + * Get a kmem_cache structure. We arrange that cp->cache_cpu[] + * is aligned on a KMEM_CPU_CACHE_SIZE boundary to prevent + * false sharing of per-CPU data. + */ + cp = vmem_xalloc(kmem_cache_arena, csize, + KMEM_CPU_CACHE_SIZE, + P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), + 0, NULL, NULL, VM_SLEEP); + bzero(cp, csize); + list_link_init(&cp->cache_link); + + if (align == 0) + align = KMEM_ALIGN; + + /* + * If we're not at least KMEM_ALIGN aligned, we can't use free + * memory to hold bufctl information (because we can't safely + * perform word loads and stores on it). 
+ */ + if (align < KMEM_ALIGN) + cflags |= KMC_NOTOUCH; + + if ((align & (align - 1)) != 0 || align > vmp->vm_quantum) + panic("kmem_cache_create: bad alignment %lu", align); + + mutex_enter(&kmem_flags_lock); + if (kmem_flags & KMF_RANDOMIZE) + kmem_flags = (((kmem_flags | ~KMF_RANDOM) + 1) & KMF_RANDOM) | + KMF_RANDOMIZE; + cp->cache_flags = (kmem_flags | cflags) & KMF_DEBUG; + mutex_exit(&kmem_flags_lock); + + /* + * Make sure all the various flags are reasonable. + */ + ASSERT(!(cflags & KMC_NOHASH) || !(cflags & KMC_NOTOUCH)); + + if (cp->cache_flags & KMF_LITE) { + if (bufsize >= kmem_lite_minsize && + align <= kmem_lite_maxalign && + P2PHASE(bufsize, kmem_lite_maxalign) != 0) { + cp->cache_flags |= KMF_BUFTAG; + cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL); + } else { + cp->cache_flags &= ~KMF_DEBUG; + } + } + + if (cp->cache_flags & KMF_DEADBEEF) + cp->cache_flags |= KMF_REDZONE; + + if ((cflags & KMC_QCACHE) && (cp->cache_flags & KMF_AUDIT)) + cp->cache_flags |= KMF_NOMAGAZINE; + + if (cflags & KMC_NODEBUG) + cp->cache_flags &= ~KMF_DEBUG; + + if (cflags & KMC_NOTOUCH) + cp->cache_flags &= ~KMF_TOUCH; + + if (cflags & KMC_PREFILL) + cp->cache_flags |= KMF_PREFILL; + + if (cflags & KMC_NOHASH) + cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL); + + if (cflags & KMC_NOMAGAZINE) + cp->cache_flags |= KMF_NOMAGAZINE; + + if ((cp->cache_flags & KMF_AUDIT) && !(cflags & KMC_NOTOUCH)) + cp->cache_flags |= KMF_REDZONE; + + if (!(cp->cache_flags & KMF_AUDIT)) + cp->cache_flags &= ~KMF_CONTENTS; + + if ((cp->cache_flags & KMF_BUFTAG) && bufsize >= kmem_minfirewall && + !(cp->cache_flags & KMF_LITE) && !(cflags & KMC_NOHASH)) + cp->cache_flags |= KMF_FIREWALL; + + if (vmp != kmem_default_arena || kmem_firewall_arena == NULL) + cp->cache_flags &= ~KMF_FIREWALL; + + if (cp->cache_flags & KMF_FIREWALL) { + cp->cache_flags &= ~KMF_BUFTAG; + cp->cache_flags |= KMF_NOMAGAZINE; + ASSERT(vmp == kmem_default_arena); + vmp = kmem_firewall_arena; + } + + /* + * Set cache properties. + */ + (void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN); + strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN + 1); + cp->cache_bufsize = bufsize; + cp->cache_align = align; + cp->cache_constructor = constructor; + cp->cache_destructor = destructor; + cp->cache_reclaim = reclaim; + cp->cache_private = private; + cp->cache_arena = vmp; + cp->cache_cflags = cflags; + + /* + * Determine the chunk size. + */ + chunksize = bufsize; + + if (align >= KMEM_ALIGN) { + chunksize = P2ROUNDUP(chunksize, KMEM_ALIGN); + cp->cache_bufctl = chunksize - KMEM_ALIGN; + } + + if (cp->cache_flags & KMF_BUFTAG) { + cp->cache_bufctl = chunksize; + cp->cache_buftag = chunksize; + if (cp->cache_flags & KMF_LITE) + chunksize += KMEM_BUFTAG_LITE_SIZE(kmem_lite_count); + else + chunksize += sizeof (kmem_buftag_t); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + cp->cache_verify = MIN(cp->cache_buftag, kmem_maxverify); + if (cp->cache_flags & KMF_LITE) + cp->cache_verify = sizeof (uint64_t); + } + + cp->cache_contents = MIN(cp->cache_bufctl, kmem_content_maxsave); + + cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align); + + /* + * Now that we know the chunk size, determine the optimal slab size. 
+ */ + + size_t vquantum = vmp->vm_quantum; + + if ((cflags & KMC_ARENA_SLAB) == KMC_ARENA_SLAB) { + VERIFY3U((vmp->vm_cflags & VMC_NO_QCACHE), ==, VMC_NO_QCACHE); + VERIFY3U(vmp->vm_min_import, >, 0); + VERIFY3U(vmp->vm_min_import, >=, (2 * vmp->vm_quantum)); + VERIFY(ISP2(vmp->vm_min_import)); + vquantum = vmp->vm_min_import >> 1; + } + + if (vmp == kmem_firewall_arena) { + cp->cache_slabsize = P2ROUNDUP(chunksize, vquantum); + cp->cache_mincolor = cp->cache_slabsize - chunksize; + cp->cache_maxcolor = cp->cache_mincolor; + cp->cache_flags |= KMF_HASH; + ASSERT(!(cp->cache_flags & KMF_BUFTAG)); + } else if ((cflags & KMC_NOHASH) || (!(cflags & KMC_NOTOUCH) && + !(cp->cache_flags & KMF_AUDIT) && + chunksize < vquantum / + KMEM_VOID_FRACTION)) { + cp->cache_slabsize = vquantum; + cp->cache_mincolor = 0; + cp->cache_maxcolor = + (cp->cache_slabsize - sizeof (kmem_slab_t)) % chunksize; + ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize); + ASSERT(!(cp->cache_flags & KMF_AUDIT)); + } else { + size_t chunks, bestfit, waste, slabsize; + size_t minwaste = LONG_MAX; + + for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) { + slabsize = P2ROUNDUP(chunksize * chunks, + vquantum); + chunks = slabsize / chunksize; + waste = (slabsize % chunksize) / chunks; + if (waste < minwaste) { + minwaste = waste; + bestfit = slabsize; + } + } + if (cflags & KMC_QCACHE) + bestfit = VMEM_QCACHE_SLABSIZE(vmp->vm_qcache_max); + cp->cache_slabsize = bestfit; + cp->cache_mincolor = 0; + cp->cache_maxcolor = bestfit % chunksize; + cp->cache_flags |= KMF_HASH; + } + + cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize); + cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1; + + /* + * Disallowing prefill when either the DEBUG or HASH flag is set or when + * there is a constructor avoids some tricky issues with debug setup + * that may be revisited later. We cannot allow prefill in a + * metadata cache because of potential recursion. + */ + if (vmp == kmem_msb_arena || + cp->cache_flags & (KMF_HASH | KMF_BUFTAG) || + cp->cache_constructor != NULL) + cp->cache_flags &= ~KMF_PREFILL; + + if (cp->cache_flags & KMF_HASH) { + ASSERT(!(cflags & KMC_NOHASH)); + cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ? + kmem_bufctl_audit_cache : kmem_bufctl_cache; + } + + if (cp->cache_maxcolor >= vquantum) + cp->cache_maxcolor = vquantum - 1; + + cp->cache_color = cp->cache_mincolor; + + /* + * Initialize the rest of the slab layer. + */ + mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&cp->cache_partial_slabs, kmem_partial_slab_cmp, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse partial slab AVL linkage for complete slab list linkage */ + list_create(&cp->cache_complete_slabs, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + + if (cp->cache_flags & KMF_HASH) { + cp->cache_hash_table = vmem_alloc(kmem_hash_arena, + KMEM_HASH_INITIAL * sizeof (void *), + VM_SLEEP); + bzero(cp->cache_hash_table, + KMEM_HASH_INITIAL * sizeof (void *)); + cp->cache_hash_mask = KMEM_HASH_INITIAL - 1; + cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1; + } + + /* + * Initialize the depot. + */ + mutex_init(&cp->cache_depot_lock, NULL, MUTEX_DEFAULT, NULL); + + for (mtp = kmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) + continue; + + cp->cache_magtype = mtp; + + /* + * Initialize the CPU layer. 
+ */ + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_init(&ccp->cc_lock, NULL, MUTEX_DEFAULT, NULL); // XNU + ccp->cc_flags = cp->cache_flags; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + } + + /* + * Create the cache's kstats. + */ + if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name, + "kmem_cache", KSTAT_TYPE_NAMED, + sizeof (kmem_cache_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + cp->cache_kstat->ks_data = &kmem_cache_kstat; + cp->cache_kstat->ks_update = kmem_cache_kstat_update; + cp->cache_kstat->ks_private = cp; + cp->cache_kstat->ks_lock = &kmem_cache_kstat_lock; + kstat_install(cp->cache_kstat); + } + + /* + * Add the cache to the global list. This makes it visible + * to kmem_update(), so the cache must be ready for business. + */ + mutex_enter(&kmem_cache_lock); + list_insert_tail(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + + if (kmem_ready) + kmem_cache_magazine_enable(cp); + + return (cp); +} + +static int +kmem_move_cmp(const void *buf, const void *p) +{ + const kmem_move_t *kmm = p; + uintptr_t v1 = (uintptr_t)buf; + uintptr_t v2 = (uintptr_t)kmm->kmm_from_buf; + return (v1 < v2 ? -1 : (v1 > v2 ? 1 : 0)); +} + +static void +kmem_reset_reclaim_threshold(kmem_defrag_t *kmd) +{ + kmd->kmd_reclaim_numer = 1; +} + +/* + * Initially, when choosing candidate slabs for buffers to move, we want to be + * very selective and take only slabs that are less than + * (1 / KMEM_VOID_FRACTION) allocated. If we have difficulty finding candidate + * slabs, then we raise the allocation ceiling incrementally. The reclaim + * threshold is reset to (1 / KMEM_VOID_FRACTION) as soon as the cache is no + * longer fragmented. + */ +static void +kmem_adjust_reclaim_threshold(kmem_defrag_t *kmd, int direction) +{ + if (direction > 0) { + /* make it easier to find a candidate slab */ + if (kmd->kmd_reclaim_numer < (KMEM_VOID_FRACTION - 1)) { + kmd->kmd_reclaim_numer++; + } + } else { + /* be more selective */ + if (kmd->kmd_reclaim_numer > 1) { + kmd->kmd_reclaim_numer--; + } + } +} + +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (cache->cache_buftotal); +} + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->cache_bufsize); +} + +void +kmem_cache_set_move(kmem_cache_t *cp, + kmem_cbrc_t (*move)(void *, void *, size_t, void *)) +{ + kmem_defrag_t *defrag; + + ASSERT(move != NULL); + /* + * The consolidator does not support NOTOUCH caches because kmem cannot + * initialize their slabs with the 0xbaddcafe memory pattern, which sets + * a low order bit usable by clients to distinguish uninitialized memory + * from known objects (see kmem_slab_create). + */ + ASSERT(!(cp->cache_cflags & KMC_NOTOUCH)); + ASSERT(!(cp->cache_cflags & KMC_IDENTIFIER)); + + /* + * We should not be holding anyone's cache lock when calling + * kmem_cache_alloc(), so allocate in all cases before acquiring the + * lock. 
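For orientation, this is what a typical client of the interface defined above looks like: create a cache for a fixed-size object with a constructor and destructor, then allocate and free through it. The example object and function names are hypothetical; only the kmem_cache_* calls mirror the API in this file, and the sketch assumes the SPL headers that declare them:

typedef struct examplenode {
	uint64_t	en_id;
	void		*en_data;
} examplenode_t;

static kmem_cache_t *examplenode_cache;

static int
examplenode_cons(void *buf, void *private, int kmflag)
{
	examplenode_t *en = buf;

	en->en_id = 0;
	en->en_data = NULL;
	return (0);
}

static void
examplenode_dest(void *buf, void *private)
{
	/* nothing to tear down in this sketch */
}

void
examplenode_init(void)
{
	examplenode_cache = kmem_cache_create("examplenode_cache",
	    sizeof (examplenode_t), 0, examplenode_cons, examplenode_dest,
	    NULL, NULL, NULL, 0);
}

void
examplenode_demo(void)
{
	examplenode_t *en = kmem_cache_alloc(examplenode_cache, KM_SLEEP);

	en->en_id = 42;
	kmem_cache_free(examplenode_cache, en);
}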
+ */ + defrag = kmem_cache_alloc(kmem_defrag_cache, KM_SLEEP); + + mutex_enter(&cp->cache_lock); + + if (KMEM_IS_MOVABLE(cp)) { + if (cp->cache_move == NULL) { + ASSERT(cp->cache_slab_alloc == 0); + + cp->cache_defrag = defrag; + defrag = NULL; /* nothing to free */ + bzero(cp->cache_defrag, sizeof (kmem_defrag_t)); + avl_create(&cp->cache_defrag->kmd_moves_pending, + kmem_move_cmp, sizeof (kmem_move_t), + offsetof(kmem_move_t, kmm_entry)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse the slab's AVL linkage for deadlist linkage */ + list_create(&cp->cache_defrag->kmd_deadlist, + sizeof (kmem_slab_t), + offsetof(kmem_slab_t, slab_link)); + kmem_reset_reclaim_threshold(cp->cache_defrag); + } + cp->cache_move = move; + } + + mutex_exit(&cp->cache_lock); + + if (defrag != NULL) { + kmem_cache_free(kmem_defrag_cache, defrag); /* unused */ + } +} + +void +kmem_qcache_destroy() +{ + kmem_cache_t *cp; + kmem_cache_t *cache_to_destroy = NULL; + + do { + cache_to_destroy = NULL; + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (cp->cache_cflags & KMC_QCACHE) { + cache_to_destroy = cp; + break; + } + } + mutex_exit(&kmem_cache_lock); + + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + } while (cache_to_destroy); +} + +void +kmem_cache_destroy(kmem_cache_t *cp) +{ + int cpu_seqid; + + /* + * Remove the cache from the global cache list so that no one else + * can schedule tasks on its behalf, wait for any pending tasks to + * complete, purge the cache, and then destroy it. + */ + mutex_enter(&kmem_cache_lock); + list_remove(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + + if (kmem_taskq != NULL) + taskq_wait(kmem_taskq); + + if (kmem_move_taskq != NULL && cp->cache_defrag != NULL) + taskq_wait(kmem_move_taskq); + + kmem_cache_magazine_purge(cp); + + mutex_enter(&cp->cache_lock); + + if (cp->cache_buftotal != 0) + cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty", + cp->cache_name, (void *)cp); + if (cp->cache_defrag != NULL) { + avl_destroy(&cp->cache_defrag->kmd_moves_pending); + list_destroy(&cp->cache_defrag->kmd_deadlist); + kmem_cache_free(kmem_defrag_cache, cp->cache_defrag); + cp->cache_defrag = NULL; + } + /* + * The cache is now dead. There should be no further activity. We + * enforce this by setting land mines in the constructor, destructor, + * reclaim, and move routines that induce a kernel text fault if + * invoked. 
+ */ + cp->cache_constructor = (int (*)(void *, void *, int))1; + cp->cache_destructor = (void (*)(void *, void *))2; + cp->cache_reclaim = (void (*)(void *))3; + cp->cache_move = (kmem_cbrc_t (*)(void *, void *, size_t, void *))4; + mutex_exit(&cp->cache_lock); + + kstat_delete(cp->cache_kstat); + + if (cp->cache_hash_table != NULL) + vmem_free(kmem_hash_arena, cp->cache_hash_table, + (cp->cache_hash_mask + 1) * sizeof (void *)); + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) + mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock); + + mutex_destroy(&cp->cache_depot_lock); + mutex_destroy(&cp->cache_lock); + + vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); +} + +static void +kmem_alloc_caches_create(const int *array, size_t count, + kmem_cache_t **alloc_table, size_t maxbuf, + uint_t shift) +{ + char name[KMEM_CACHE_NAMELEN + 1]; + size_t table_unit = (1 << shift); /* range of one alloc_table entry */ + size_t size = table_unit; + int i; + + for (i = 0; i < count; i++) { + size_t cache_size = array[i]; + size_t align = KMEM_ALIGN; + kmem_cache_t *cp; + + /* if the table has an entry for maxbuf, we're done */ + if (size > maxbuf) + break; + + /* cache size must be a multiple of the table unit */ + ASSERT(P2PHASE(cache_size, table_unit) == 0); + + /* + * If they allocate a multiple of the coherency granularity, + * they get a coherency-granularity-aligned address. + */ + if (IS_P2ALIGNED(cache_size, 64)) + align = 64; + if (IS_P2ALIGNED(cache_size, PAGESIZE)) + align = PAGESIZE; + (void) snprintf(name, sizeof (name), + "kmem_alloc_%lu", cache_size); + cp = kmem_cache_create(name, cache_size, align, + NULL, NULL, NULL, NULL, NULL, KMC_KMEM_ALLOC | KMF_HASH); + + while (size <= cache_size) { + alloc_table[(size - 1) >> shift] = cp; + size += table_unit; + } + } + + ASSERT(size > maxbuf); /* i.e. maxbuf <= max(cache_size) */ +} + +static void +kmem_alloc_caches_destroy() +{ + kmem_cache_t *cache_to_destroy = NULL; + kmem_cache_t *cp = NULL; + + do { + cache_to_destroy = NULL; + + // Locate the first cache that has the KMC_KMEM_ALLOC flag. + mutex_enter(&kmem_cache_lock); + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (cp->cache_cflags & KMC_KMEM_ALLOC) { + cache_to_destroy = cp; + break; + } + } + + mutex_exit(&kmem_cache_lock); + + // Destroy the cache + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + + } while (cache_to_destroy); +} + +static void +kmem_destroy_cache_by_name(const char *substr) +{ + kmem_cache_t *cache_to_destroy = NULL; + kmem_cache_t *cp = NULL; + + do { + cache_to_destroy = NULL; + + // Locate the first cache that has the KMC_KMEM_ALLOC flag. 
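The table filled by kmem_alloc_caches_create() above is what makes small-allocation lookups O(1): every size in (0, maxbuf] indexes to the smallest cache at least that large via (size - 1) >> shift. A small standalone model of the fill and lookup, assuming a shift of 3 (the 8-byte granularity a KMEM_ALIGN of 8 implies); table contents and names here are illustrative.

#include <stdio.h>
#include <stddef.h>

#define SHIFT	3			/* assumed 8-byte table unit */
#define UNIT	(1u << SHIFT)
#define MAXBUF	64

static const int cache_sizes[] = { 8, 16, 24, 32, 48, 64 };
static int alloc_table[MAXBUF >> SHIFT];	/* (size-1)>>SHIFT -> cache size */

int
main(void)
{
	size_t size = UNIT;

	/* fill: the same walk as kmem_alloc_caches_create() */
	for (size_t i = 0; i < sizeof (cache_sizes) / sizeof (int); i++) {
		while (size <= (size_t)cache_sizes[i]) {
			alloc_table[(size - 1) >> SHIFT] = cache_sizes[i];
			size += UNIT;
		}
	}

	/* lookup: a 20-byte request lands in the 24-byte cache */
	size_t req = 20;
	printf("kmem_alloc(%zu) -> %d-byte cache\n",
	    req, alloc_table[(req - 1) >> SHIFT]);
	return (0);
}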
+ mutex_enter(&kmem_cache_lock); + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (kmem_strstr(cp->cache_name, substr)) { + cache_to_destroy = cp; + break; + } + } + + mutex_exit(&kmem_cache_lock); + + // Destroy the cache + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + + } while (cache_to_destroy); +} + +static void +kmem_cache_init(int pass, int use_large_pages) +{ + int i; + size_t maxbuf; + kmem_magtype_t *mtp; + + for (i = 0; i < sizeof (kmem_magtype) / sizeof (*mtp); i++) { + char name[KMEM_CACHE_NAMELEN + 1]; + + mtp = &kmem_magtype[i]; + (void) snprintf(name, KMEM_CACHE_NAMELEN, "%s%d", + KMEM_MAGAZINE_PREFIX, + mtp->mt_magsize); + mtp->mt_cache = kmem_cache_create( + name, + (mtp->mt_magsize + 1) * sizeof (void *), + mtp->mt_align, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + } + + kmem_slab_cache = kmem_cache_create("kmem_slab_cache", + sizeof (kmem_slab_t), 0, NULL, NULL, + NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache", + sizeof (kmem_bufctl_t), 0, + NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + kmem_bufctl_audit_cache = kmem_cache_create("kmem_bufctl_audit_cache", + sizeof (kmem_bufctl_audit_t), + 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + if (pass == 2) { + kmem_va_arena = vmem_create(KMEM_VA_PREFIX, + NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, + 2 * PAGESIZE, VM_SLEEP); + + kmem_default_arena = vmem_create("kmem_default", + NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, kmem_va_arena, + 0, VMC_DUMPSAFE | VM_SLEEP); + + /* Figure out what our maximum cache size is */ + maxbuf = kmem_max_cached; + if (maxbuf <= KMEM_MAXBUF) { + maxbuf = 0; + kmem_max_cached = KMEM_MAXBUF; + } else { + size_t size = 0; + size_t max = + sizeof (kmem_big_alloc_sizes) / sizeof (int); + /* + * Round maxbuf up to an existing cache size. If maxbuf + * is larger than the largest cache, we truncate it to + * the largest cache's size. + */ + for (i = 0; i < max; i++) { + size = kmem_big_alloc_sizes[i]; + if (maxbuf <= size) + break; + } + kmem_max_cached = maxbuf = size; + } + + /* + * The big alloc table may not be completely overwritten, so + * we clear out any stale cache pointers from the first pass. + */ + bzero(kmem_big_alloc_table, sizeof (kmem_big_alloc_table)); + } else { + /* + * During the first pass, the kmem_alloc_* caches + * are treated as metadata. 
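To make the rounding of kmem_max_cached in kmem_cache_init() above concrete: the loop walks kmem_big_alloc_sizes and keeps the first entry at or above the requested maximum, falling back to the largest entry when the request is bigger than anything in the table. A worked standalone sketch with made-up table values:

#include <stdio.h>
#include <stddef.h>

/* hypothetical stand-in for kmem_big_alloc_sizes[] */
static const int big_sizes[] = { 65536, 98304, 131072, 262144 };

static size_t
round_to_cache(size_t maxbuf)
{
	size_t size = 0;

	for (size_t i = 0; i < sizeof (big_sizes) / sizeof (int); i++) {
		size = big_sizes[i];
		if (maxbuf <= size)
			break;
	}
	return (size);	/* the largest entry if maxbuf exceeds the table */
}

int
main(void)
{
	printf("%zu\n", round_to_cache(100 * 1024));	/* -> 131072 */
	printf("%zu\n", round_to_cache(512 * 1024));	/* -> 262144 (clamped) */
	return (0);
}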
+ */ + kmem_default_arena = kmem_msb_arena; + maxbuf = KMEM_BIG_MAXBUF_32BIT; + } + + /* + * Set up the default caches to back kmem_alloc() + */ + kmem_alloc_caches_create( + kmem_alloc_sizes, sizeof (kmem_alloc_sizes) / sizeof (int), + kmem_alloc_table, KMEM_MAXBUF, KMEM_ALIGN_SHIFT); + + kmem_alloc_caches_create( + kmem_big_alloc_sizes, sizeof (kmem_big_alloc_sizes) / sizeof (int), + kmem_big_alloc_table, maxbuf, KMEM_BIG_SHIFT); + + kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT; +} + +struct free_slab { + vmem_t *vmp; + size_t slabsize; + void *slab; + list_node_t next; +}; + +static list_t freelist; + + +void +kmem_cache_build_slablist(kmem_cache_t *cp) +{ + int cpu_seqid; + + vmem_t *vmp = cp->cache_arena; + kmem_slab_t *sp; + struct free_slab *fs; + + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + + MALLOC(fs, struct free_slab *, sizeof (struct free_slab), + M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = cp->cache_slabsize; + fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, + vmp->vm_quantum); + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); + } + + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { + + MALLOC(fs, struct free_slab *, sizeof (struct free_slab), + M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = cp->cache_slabsize; + fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, + vmp->vm_quantum); + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); + } + + + kstat_delete(cp->cache_kstat); + + if (cp->cache_hash_table != NULL) + vmem_free(kmem_hash_arena, cp->cache_hash_table, + (cp->cache_hash_mask + 1) * sizeof (void *)); + + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) + mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock); + + mutex_destroy(&cp->cache_depot_lock); + mutex_destroy(&cp->cache_lock); + + vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); +} + + +static void +kmem_cache_fini() +{ + kmem_cache_t *cp; + int i; + struct free_slab *fs; + + list_create(&freelist, sizeof (struct free_slab), + offsetof(struct free_slab, next)); + + mutex_enter(&kmem_cache_lock); + + while ((cp = list_head(&kmem_caches))) { + list_remove(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + kmem_cache_build_slablist(cp); + mutex_enter(&kmem_cache_lock); + } + + mutex_exit(&kmem_cache_lock); + + i = 0; + while ((fs = list_head(&freelist))) { + i++; + list_remove(&freelist, fs); + vmem_free(fs->vmp, fs->slab, fs->slabsize); + FREE(fs, M_TEMP); + + } + printf("SPL: Released %u slabs\n", i); + list_destroy(&freelist); +} + + +// this is intended to substitute for kmem_avail() in arc.c +int64_t +spl_free_wrapper(void) +{ + return (spl_free); +} + +// this is intended to substitute for kmem_avail() in arc.c +// when arc_reclaim_thread() calls spl_free_set_pressure(0); +int64_t +spl_free_manual_pressure_wrapper(void) +{ + return (spl_free_manual_pressure); +} + +uint64_t +spl_free_last_pressure_wrapper(void) +{ + return (spl_free_last_pressure); +} + +int64_t +spl_free_set_and_wait_pressure(int64_t new_p, boolean_t fast, + clock_t check_interval) +{ + + int64_t snapshot_pressure = 0; + + if (new_p <= 0) + return (0); + + spl_free_fast_pressure = fast; + + if (spl_free_manual_pressure >= 0) + spl_free_manual_pressure += new_p; + else + spl_free_manual_pressure = new_p; + + // wait for another thread to reset pressure + const uint64_t start = zfs_lbolt(); + const uint64_t end_by = start + (hz*60); + const 
uint64_t double_at = start + (hz/2); + const uint64_t double_again_at = start + hz; + bool doubled = false, doubled_again = false; + uint64_t now; + + spl_free_last_pressure = start; + + for (; spl_free_manual_pressure != 0; ) { + // has another thread set spl_free_manual_pressure? + if (spl_free_manual_pressure < new_p) + spl_free_manual_pressure = new_p; + snapshot_pressure = spl_free_manual_pressure; + mutex_enter(&spl_free_thread_lock); + cv_timedwait_hires(&spl_free_thread_cv, + &spl_free_thread_lock, check_interval, 0, 0); + mutex_exit(&spl_free_thread_lock); + now = zfs_lbolt(); + if (now > end_by) { + printf("%s: ERROR: timed out after one minute!\n", + __func__); + break; + } else if (now > double_again_at && !doubled_again) { + doubled_again = true; + new_p *= 2; + } else if (now > double_at) { + doubled = true; + new_p *= 2; + } + } + return (snapshot_pressure); +} + +// routinely called by arc_reclaim_thread() with new_p == 0 +void +spl_free_set_pressure(int64_t new_p) +{ + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + if (new_p == 0) { + spl_free_fast_pressure = FALSE; + // wake up both spl_free_thread() to recalculate spl_free + // and any spl_free_set_and_wait_pressure() threads + cv_broadcast(&spl_free_thread_cv); + } + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_pressure_both(int64_t new_p, boolean_t fast) +{ + spl_free_fast_pressure = fast; + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +void spl_free_maybe_reap(void); + +void +spl_free_set_emergency_pressure(int64_t new_p) +{ + spl_free_fast_pressure = TRUE; + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + spl_free_maybe_reap(); + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_emergency_pressure_additive(int64_t new_p) +{ + spl_free_fast_pressure = TRUE; + spl_free_manual_pressure += new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_pressure_additive(int64_t new_p) +{ + spl_free_manual_pressure += new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +boolean_t +spl_free_fast_pressure_wrapper() +{ + return (spl_free_fast_pressure); +} + +void +spl_free_set_fast_pressure(boolean_t state) +{ + spl_free_fast_pressure = state; + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_reap_caches(void) +{ + // note: this may take some time + static hrtime_t last_reap = 0; + const hrtime_t reap_after = SEC2NSEC(60); + const hrtime_t curtime = gethrtime(); + + if (curtime - last_reap < reap_after) + return; + + vmem_qcache_reap(zio_arena_parent); + kmem_reap(); + vmem_qcache_reap(kmem_va_arena); +} + +void +spl_free_maybe_reap(void) +{ + static _Atomic uint64_t last_reap = 0; + const uint64_t lockout_time = 60 * hz; + + uint64_t now = zfs_lbolt(); + if (now > last_reap + lockout_time) { + last_reap = now; + spl_free_maybe_reap_flag = true; + } +} + +boolean_t +spl_maybe_send_large_pressure(uint64_t now, uint64_t minutes, boolean_t full) +{ + static volatile _Atomic uint64_t spl_last_large_pressure = 0; + const uint64_t interval_ticks = minutes * 60ULL * (uint64_t)hz; + + if (spl_last_large_pressure + interval_ticks > now) + return (false); + + spl_last_large_pressure = now; + + const int64_t sixteenth_physmem = (int64_t)real_total_memory / 16LL; + const int64_t sixtyfourth_physmem = sixteenth_physmem / 4LL; + int64_t howmuch = sixteenth_physmem; + + if (full == false) + howmuch = 
sixtyfourth_physmem; + + + printf("SPL: %s: %lld bytes at time %llu\n", + __func__, howmuch, now); + + spl_free_set_emergency_pressure(howmuch); + + return (true); +} + +static void +spl_free_thread() +{ + callb_cpr_t cpr; + uint64_t last_update = zfs_lbolt(); + int64_t last_spl_free; + double ema_new = 0; + double ema_old = 0; + double alpha; + + CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); + + spl_free = (int64_t)PAGESIZE * + (int64_t)(vm_page_free_count - vm_page_free_min); + + mutex_enter(&spl_free_thread_lock); + + printf("SPL: beginning spl_free_thread() loop, spl_free == %lld\n", + spl_free); + + uint64_t recent_lowmem = 0; + uint64_t last_disequilibrium = 0; + + while (!spl_free_thread_exit) { + mutex_exit(&spl_free_thread_lock); + boolean_t lowmem = false; + boolean_t emergency_lowmem = false; + int64_t base; + int64_t new_spl_free = 0LL; + + spl_stats.spl_free_wake_count.value.ui64++; + + if (spl_free_maybe_reap_flag == true) { + spl_free_maybe_reap_flag = false; + spl_free_reap_caches(); + } + + uint64_t time_now = zfs_lbolt(); + uint64_t time_now_seconds = 0; + if (time_now > hz) + time_now_seconds = time_now / hz; + + last_spl_free = spl_free; + + new_spl_free = 0LL; + + /* + * if there is pressure that has not yet reached + * arc_reclaim_thread() then start with a negative + * new_spl_free + */ + if (spl_free_manual_pressure > 0) { + int64_t old_pressure = spl_free_manual_pressure; + new_spl_free -= old_pressure * 2LL; + lowmem = true; + if (spl_free_fast_pressure) { + emergency_lowmem = true; + new_spl_free -= old_pressure * 4LL; + } + } + + /* + * can we allocate at least a 64 MiB segment + * from spl_heap_arena? this probes the reserve + * and also the largest imported spans, which + * vmem_alloc can fragment if needed. + */ + boolean_t reserve_low = false; + extern vmem_t *spl_heap_arena; + const uint64_t sixtyfour = 64ULL*1024ULL*1024ULL; + const uint64_t rvallones = (sixtyfour << 1ULL) - 1ULL; + const uint64_t rvmask = ~rvallones; + uint64_t rvfreebits = spl_heap_arena->vm_freemap; + + if ((rvfreebits & rvmask) == 0) { + reserve_low = true; + } else { + new_spl_free += (int64_t)sixtyfour; + } + + // do we have lots of memory in the spl_heap_arena ? + + boolean_t early_lots_free = false; + const uint64_t onetwentyeight = 128ULL*1024ULL*1024ULL; + const uint64_t sixteen = 16ULL*1024ULL*1024ULL; + if (!reserve_low) { + early_lots_free = true; + } else if (vmem_size_semi_atomic(spl_heap_arena, + VMEM_FREE) > onetwentyeight) { + early_lots_free = true; + new_spl_free += (int64_t)sixteen; + } + + // do we have lots of memory in the bucket_arenas ? 
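The effect of the manual-pressure branch in spl_free_thread() above is easiest to see with numbers: a pending manual pressure P starts the pass at -2P, and at -6P when fast pressure is also set, so ARC sees a firmly negative spl_free until the pressure is consumed. A tiny worked model (names illustrative):

#include <stdio.h>
#include <stdint.h>

/* model of the opening bias applied by spl_free_thread() each pass */
static int64_t
pressure_bias(int64_t pending, int fast)
{
	int64_t new_spl_free = 0;

	if (pending > 0) {
		new_spl_free -= pending * 2;		/* always */
		if (fast)
			new_spl_free -= pending * 4;	/* emergency extra */
	}
	return (new_spl_free);
}

int
main(void)
{
	int64_t p = 16LL * 1024 * 1024;		/* 16 MiB of manual pressure */

	printf("%lld\n", (long long)pressure_bias(p, 0));	/* -32 MiB */
	printf("%lld\n", (long long)pressure_bias(p, 1));	/* -96 MiB */
	return (0);
}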
+ + extern int64_t vmem_buckets_size(int); // non-locking + int64_t buckets_free = vmem_buckets_size(VMEM_FREE); + if ((uint64_t)buckets_free != spl_buckets_mem_free) + spl_buckets_mem_free = (uint64_t)buckets_free; + + if (buckets_free >= 512LL*1024LL*1024LL) { + early_lots_free = true; + new_spl_free += (int64_t)sixteen; + } + if (buckets_free >= 1024LL*1024LL*1024LL) { + reserve_low = false; + new_spl_free += (int64_t)sixteen; + } + + /* + * if we have neither alloced or freed in + * several minutes, then we do not need to + * shrink back if there is a momentary transient + * memory spike (i.e., one that lasts less than a second) + */ + boolean_t memory_equilibrium = false; + const uint64_t five_minutes = 300ULL; + const uint64_t one_minute = 60ULL; + uint64_t last_xat_alloc_seconds = spl_xat_lastalloc; + uint64_t last_xat_free_seconds = spl_xat_lastfree; + + if (last_xat_alloc_seconds + five_minutes > time_now_seconds && + last_xat_free_seconds + five_minutes > time_now_seconds) { + if (last_disequilibrium + one_minute > + time_now_seconds) { + memory_equilibrium = true; + last_disequilibrium = 0; + } + } else { + last_disequilibrium = time_now_seconds; + } + + boolean_t just_alloced = false; + if (last_xat_alloc_seconds + 1 > time_now_seconds) + just_alloced = true; + + /* + * this is a sign of a period of time of low system + * memory, however XNU's generation of this variable + * is not very predictable, but generally it should be + * taken seriously when it's positive (it is often falsely 0) + */ + if ((vm_page_free_wanted > 0 && reserve_low && + !early_lots_free && !memory_equilibrium && + !just_alloced) || vm_page_free_wanted >= 1024) { + int64_t bminus = (int64_t)vm_page_free_wanted * + (int64_t)PAGESIZE * -16LL; + if (bminus > -16LL*1024LL*1024LL) + bminus = -16LL*1024LL*1024LL; + new_spl_free += bminus; + lowmem = true; + emergency_lowmem = true; + // atomic swaps to set these variables used in arc.c + int64_t previous_highest_pressure = 0; + int64_t new_p = -bminus; + previous_highest_pressure = spl_free_manual_pressure; + if (new_p > previous_highest_pressure || new_p <= 0) { + boolean_t fast = FALSE; + if (vm_page_free_wanted > vm_page_free_min / 8) + fast = TRUE; + spl_free_set_pressure_both(-16LL * new_spl_free, + fast); + } + last_disequilibrium = time_now_seconds; + } else if (vm_page_free_wanted > 0) { + int64_t bytes_wanted = (int64_t)vm_page_free_wanted * + (int64_t)PAGESIZE; + new_spl_free -= bytes_wanted; + if (reserve_low && !early_lots_free) { + lowmem = true; + if (recent_lowmem == 0) { + recent_lowmem = time_now; + } + if (!memory_equilibrium) { + last_disequilibrium = time_now_seconds; + } + } + } + + /* + * these variables are reliably maintained by XNU + * if vm_page_free_count > vm_page_free_min, then XNU + * is scanning pages and we may want to try to free some memory + */ + int64_t above_min_free_pages = (int64_t)vm_page_free_count - + (int64_t)vm_page_free_min; + int64_t above_min_free_bytes = (int64_t)PAGESIZE * + above_min_free_pages; + + /* + * vm_page_free_min normally 3500, page free target + * normally 4000 but not exported so we are not scanning + * if we are 500 pages above vm_page_free_min. 
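Worked numbers for the vm_page_free_wanted branch above, assuming 4 KiB pages: xnu asking for 1024 pages (4 MiB) becomes a 64 MiB charge against new_spl_free, while shortfalls under 256 pages are still charged the 16 MiB floor the clamp enforces. A standalone sketch of just that arithmetic:

#include <stdio.h>
#include <stdint.h>

#define PAGE	4096LL			/* assumed page size */
#define MIB	(1024LL * 1024LL)

/* model of the bminus computation in spl_free_thread() */
static int64_t
wanted_penalty(int64_t wanted_pages)
{
	int64_t bminus = wanted_pages * PAGE * -16LL;

	if (bminus > -16LL * MIB)	/* never charge less than 16 MiB */
		bminus = -16LL * MIB;
	return (bminus);
}

int
main(void)
{
	printf("%lld MiB\n", (long long)(wanted_penalty(100) / MIB));	/* -16 */
	printf("%lld MiB\n", (long long)(wanted_penalty(1024) / MIB));	/* -64 */
	return (0);
}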
even if + * we're scanning we may have plenty of space in the + * reserve arena, in which case we should not react too strongly + */ + + if (above_min_free_bytes < (int64_t)PAGESIZE * 500LL && + reserve_low && !early_lots_free && !memory_equilibrium) { + // trigger a reap below + lowmem = true; + } + + extern volatile unsigned int vm_page_speculative_count; + if ((above_min_free_bytes < 0LL && reserve_low && + !early_lots_free && !memory_equilibrium && !just_alloced) || + above_min_free_bytes <= -4LL*1024LL*1024LL) { + int64_t new_p = -1LL * above_min_free_bytes; + boolean_t fast = FALSE; + emergency_lowmem = true; + lowmem = true; + recent_lowmem = time_now; + last_disequilibrium = time_now_seconds; + int64_t spec_bytes = (int64_t)vm_page_speculative_count + * (int64_t)PAGESIZE; + if (vm_page_free_wanted > 0 || new_p > spec_bytes) { + // force a stronger reaction from ARC if we are + // also low on speculative pages (xnu prefetched + // file blocks with no clients yet) + fast = TRUE; + } + spl_free_set_pressure_both(new_p, fast); + } else if (above_min_free_bytes < 0LL && !early_lots_free) { + lowmem = true; + if (recent_lowmem == 0) + recent_lowmem = time_now; + if (!memory_equilibrium) + last_disequilibrium = time_now_seconds; + } + + new_spl_free += above_min_free_bytes; + + /* + * If we have already detected a memory shortage + * and we have not reaped in a while (a short while + * for emergency_lowmem), then do a kmem_reap() now. + * See http://comments.gmane.org/gmane.os.illumos.devel/22552 + * (notably Richard Elling's "A kernel module can call + * kmem_reap() whenever it wishes and some modules, + * like zfs, do so." If we reap, stop processing spl_free + * on this pass, to let the reaps (and arc, if pressure + * has been set above) do their job for a few milliseconds. + */ + if (emergency_lowmem || lowmem) { + static uint64_t last_reap = 0; + uint64_t now = time_now; + uint64_t elapsed = 60*hz; + if (emergency_lowmem) + elapsed = 15*hz; // min.freq. kmem_reap_interval + if (now - last_reap > elapsed) { + last_reap = now; + /* + * spl_free_reap_caches() calls functions + * that will acquire locks and can take a while + * so set spl_free to a small positive value + * to stop arc shrinking too much during this + * period when we expect to be freeing up + * arc-usable memory, but low enough that + * arc_no_grow likely will be set. + */ + const int64_t two_spamax = 32LL * 1024LL * + 1024LL; + if (spl_free < two_spamax) + spl_free = two_spamax; // atomic! + spl_free_reap_caches(); + // we do not have any lock now, so we can jump + // to just before the thread-suspending code + goto justwait; + } + } + + /* + * a number or exceptions to reverse the lowmem + * / emergency_lowmem states if we have recently reaped. + * we also take the strong reaction sting out of + * the set pressure by turning off spl_free_fast_pressure, + * since that automatically provokes an arc shrink + * and arc reap. 
+ */ + + if (!reserve_low || early_lots_free || memory_equilibrium || + just_alloced) { + lowmem = false; + emergency_lowmem = false; + spl_free_fast_pressure = FALSE; + } + + if (vm_page_speculative_count > 0) { + /* + * speculative memory can be squeezed a bit; it is + * file blocks that have been prefetched by xnu but + * are not (yet) in use by any consumer + */ + if (vm_page_speculative_count / 4 + vm_page_free_count > + vm_page_free_min) { + emergency_lowmem = false; + spl_free_fast_pressure = FALSE; + } + if (vm_page_speculative_count / 2 + vm_page_free_count > + vm_page_free_min) { + lowmem = false; + spl_free_fast_pressure = FALSE; + } + } + + /* + * Stay in a low memory condition for several seconds + * after we first detect that we are in it, giving the + * system (arc, xnu and userland) time to adapt + */ + if (!lowmem && recent_lowmem > 0) { + if (recent_lowmem + 4*hz < time_now) + lowmem = true; + else + recent_lowmem = 0; + } + + /* + * if we are in a lowmem "hangover", cure it with + * pressure, then wait for the pressure to take + * effect in arc.c code. triggered when we have had + * at least one lowmem in the previous few seconds + * -- possibly two (one that causes a reap, one + * that falls through to the 4 second hold above). + */ + if (recent_lowmem == time_now && early_lots_free && + reserve_low) { + /* + * we can't grab 64 MiB as a single segment, + * but otherwise have ample memory brought in from xnu, + * but recently we had lowmem... and still have lowmem. + * cure this condition with a dose of pressure. + */ + if (above_min_free_bytes < 0) { + int64_t old_p = spl_free_manual_pressure; + if (old_p <= -above_min_free_bytes) { + recent_lowmem = 0; + spl_free_manual_pressure = + -above_min_free_bytes; + goto justwait; + } + } + } + + base = new_spl_free; + + // adjust for available memory in spl_heap_arena + // cf arc_available_memory() + if (!emergency_lowmem) { + extern vmem_t *spl_default_arena; + int64_t heap_free = (int64_t)vmem_size_semi_atomic( + spl_heap_arena, VMEM_FREE); + // grabbed buckets_free up above; we are OK with + // change to it in the meanwhile, + // it'll get an update on the next run. + int64_t combined_free = heap_free + buckets_free; + + if (combined_free != 0) { + const int64_t mb = 1024*1024; + if (!lowmem && above_min_free_bytes > + (int64_t)PAGESIZE * 10000LL) { + if (above_min_free_bytes < 64LL * mb) + new_spl_free += combined_free / + 16; + else if (above_min_free_bytes < + 128LL * mb) + new_spl_free += combined_free / + 8; + else if (above_min_free_bytes < + 256LL * mb) + new_spl_free += combined_free / + 4; + else + new_spl_free += combined_free / + 2; + } else { + new_spl_free -= 16LL * mb; + } + } + + // memory footprint has gotten really big, + // decrease spl_free substantially + int64_t total_mem_used = (int64_t) + segkmem_total_mem_allocated; + if ((segkmem_total_mem_allocated * 100LL / + real_total_memory) > 70) { + new_spl_free -= total_mem_used / 64; + } else if ((segkmem_total_mem_allocated * 100LL / + real_total_memory) > 75) { + new_spl_free -= total_mem_used / 32; + lowmem = true; + } + } + + // Adjust in the face of a large ARC. + // We don't treat (zfs) metadata and non-metadata + // differently here, and leave policy with respect + // to the relative value of each up to arc.c. + // O3X arc.c does not (yet) take these arena sizes into + // account like Illumos's does. 
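The heap/bucket credit computed above scales with how comfortable xnu itself is: the same combined free space is credited at 1/16, 1/8, 1/4 or 1/2 depending on how far vm_page_free_count sits above vm_page_free_min, and turns into a flat 16 MiB charge when pages are tight. A worked standalone model with the thresholds copied from the code (4 KiB pages assumed, names illustrative):

#include <stdio.h>
#include <stdint.h>

#define MIB (1024LL * 1024LL)

/* model of the credit for free spl_heap_arena + bucket space */
static int64_t
heap_credit(int64_t combined_free, int64_t above_min_free_bytes, int lowmem)
{
	if (combined_free == 0)
		return (0);
	if (!lowmem && above_min_free_bytes > 4096LL * 10000LL) {
		if (above_min_free_bytes < 64LL * MIB)
			return (combined_free / 16);
		else if (above_min_free_bytes < 128LL * MIB)
			return (combined_free / 8);
		else if (above_min_free_bytes < 256LL * MIB)
			return (combined_free / 4);
		else
			return (combined_free / 2);
	}
	return (-16LL * MIB);	/* tight on pages: charge instead of credit */
}

int
main(void)
{
	int64_t free_space = 256LL * MIB;

	/* 100 MiB above the free-page floor: credit 1/8 of 256 MiB = 32 MiB */
	printf("%lld MiB\n",
	    (long long)(heap_credit(free_space, 100LL * MIB, 0) / MIB));
	return (0);
}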
+ uint64_t zio_size = vmem_size_semi_atomic(zio_arena_parent, + VMEM_ALLOC | VMEM_FREE); + // wrap this in a basic block for lexical scope SSA convenience + if (zio_size > 0) { + static uint64_t zio_last_too_big = 0; + static int64_t imposed_cap = 75; + const uint64_t seconds_of_lower_cap = 10*hz; + uint64_t now = time_now; + uint32_t zio_pct = (uint32_t)(zio_size * 100ULL / + real_total_memory); + // if not hungry for memory, shrink towards a + // 75% total memory cap on zfs_file_data + if (!lowmem && !emergency_lowmem && zio_pct > 75 && + (now > zio_last_too_big + seconds_of_lower_cap)) { + new_spl_free -= zio_size / 64; + zio_last_too_big = now; + imposed_cap = 75; + } else if (lowmem || emergency_lowmem) { + // shrink towards stricter caps if we are hungry + // for memory + const uint32_t lowmem_cap = 25; + const uint32_t emergency_lowmem_cap = 5; + // we don't want the lowest cap to be so low + // that we will not make any use of the fixed + // size reserve + if (lowmem && zio_pct > lowmem_cap) { + new_spl_free -= zio_size / 32; + zio_last_too_big = now; + imposed_cap = lowmem_cap; + } + if (emergency_lowmem && zio_pct > + emergency_lowmem_cap) { + new_spl_free -= zio_size / 8; + zio_last_too_big = now; + imposed_cap = emergency_lowmem_cap; + } + } + if (zio_last_too_big != now && + now < zio_last_too_big + seconds_of_lower_cap && + zio_pct > imposed_cap) { + new_spl_free -= zio_size / 64; + } + } + + // try to get 1/64 of spl_heap_arena freed up + if (emergency_lowmem && new_spl_free >= 0LL) { + extern vmem_t *spl_root_arena; + uint64_t root_size = vmem_size_semi_atomic( + spl_heap_arena, VMEM_ALLOC | VMEM_FREE); + uint64_t root_free = vmem_size_semi_atomic( + spl_heap_arena, VMEM_FREE); + int64_t difference = root_size - root_free; + int64_t target = root_size / 64; + if (difference < target) { + new_spl_free -= target; + } + // and we should definitely not be returning + // positive now + if (new_spl_free >= 0LL) + new_spl_free = -1024LL; + } + + double delta = (double)new_spl_free - (double)last_spl_free; + + boolean_t spl_free_is_negative = false; + + if (new_spl_free < 0LL) { + spl_stats.spl_spl_free_negative_count.value.ui64++; + spl_free_is_negative = true; + } + + // NOW set spl_free from calculated new_spl_free + spl_free = new_spl_free; + // the direct equivalent of : + // __c11_atomic_store(&spl_free, new_spl_free, + // __ATOMIC_SEQ_CST); + + /* + * Because we're already negative, arc is likely to have + * been signalled already. We can rely on the _maybe_ in + * spl-vmem.c:xnu_alloc_throttled() [XAT] to try to give + * arc a kick with greater probability. However, if we've + * gone negative several times, and have not tried a full + * kick in a long time, do so now; if the full kick is + * refused because there has been a kick too few minutes + * ago, try a gentler kick. We do this outside the lock, + * as spl_maybe_send_large_pressure may need to take a + * mutex, and we forbid further mutex entry when + * spl_free_lock is held. 
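For scale on the "kick" described in the comment above: spl_maybe_send_large_pressure(), defined earlier, asks for 1/16 of physical memory on a full kick and 1/64 on a gentle one, and rate-limits itself by the caller-supplied interval. A worked arithmetic sketch, assuming a 16 GiB machine:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* stand-in for real_total_memory; assumed 16 GiB */
	int64_t total = 16LL * 1024 * 1024 * 1024;
	int64_t full = total / 16;	/* sixteenth_physmem */
	int64_t gentle = full / 4;	/* sixtyfourth_physmem */

	printf("full kick:   %lld MiB\n", (long long)(full / (1024 * 1024)));
	printf("gentle kick: %lld MiB\n", (long long)(gentle / (1024 * 1024)));
	return (0);
}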
+ */ + + if (spl_free_is_negative) { + static volatile _Atomic uint32_t + negatives_since_last_kick = 0; + + if (negatives_since_last_kick++ > 8) { + if (spl_maybe_send_large_pressure(time_now, 360, + true) || + spl_maybe_send_large_pressure(time_now, 60, + false)) { + negatives_since_last_kick = 0; + } + } + } + + if (lowmem) + recent_lowmem = time_now; + + // maintain an exponential moving average for the ema kstat + if (last_update > hz) + alpha = 1.0; + else { + double td_tick = (double)(time_now - last_update); + alpha = td_tick / (double)(hz*50.0); // roughly 0.02 + } + + ema_new = (alpha * delta) + (1.0 - alpha)*ema_old; + spl_free_delta_ema = ema_new; + ema_old = ema_new; + + justwait: + mutex_enter(&spl_free_thread_lock); + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&spl_free_thread_cv, + &spl_free_thread_lock, MSEC2NSEC(10), 0, 0); + CALLB_CPR_SAFE_END(&cpr, &spl_free_thread_lock); + } + spl_free_thread_exit = FALSE; + printf("SPL: spl_free_thread_exit set to FALSE " \ + "and exiting: cv_broadcasting\n"); + spl_free_manual_pressure = 0; + cv_broadcast(&spl_free_thread_cv); + CALLB_CPR_EXIT(&cpr); + printf("SPL: %s thread_exit\n", __func__); + thread_exit(); +} + + +static int +spl_kstat_update(kstat_t *ksp, int rw) +{ + spl_stats_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + if (ks->spl_spl_free_manual_pressure.value.i64 != + spl_free_manual_pressure) { + spl_free_set_pressure( + ks->spl_spl_free_manual_pressure.value.i64 * 1024 * + 1024); + if (ks->spl_spl_free_manual_pressure.value.i64 > 0) { + spl_free_reap_caches(); + } + } + + if (ks->spl_spl_free_fast_pressure.value.i64 != + spl_free_fast_pressure) { + if (spl_free_wrapper() != 0) { + spl_free_set_fast_pressure(TRUE); + } + } + + if (ks->spl_bucket_tunable_large_span.value.ui64 != + spl_bucket_tunable_large_span) { + spl_set_bucket_tunable_large_span( + ks->spl_bucket_tunable_large_span.value.ui64); + } + + if (ks->spl_bucket_tunable_small_span.value.ui64 != + spl_bucket_tunable_small_span) { + spl_set_bucket_tunable_small_span( + ks->spl_bucket_tunable_small_span.value.ui64); + } + + if (ks->spl_frag_max_walk.value.ui64 != spl_frag_max_walk) { + spl_frag_max_walk = ks->spl_frag_max_walk.value.ui64; + } + + if (ks->kmem_free_to_slab_when_fragmented.value.ui64 != + kmem_free_to_slab_when_fragmented) { + kmem_free_to_slab_when_fragmented = + ks->kmem_free_to_slab_when_fragmented.value.ui64; + } + + } else { + ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; + ks->spl_active_threads.value.ui64 = zfs_threads; + ks->spl_active_mutex.value.ui64 = zfs_active_mutex; + ks->spl_active_rwlock.value.ui64 = zfs_active_rwlock; + ks->spl_active_tsd.value.ui64 = spl_tsd_size(); + ks->spl_spl_free.value.i64 = spl_free; + ks->spl_spl_free_manual_pressure.value.i64 = + spl_free_manual_pressure; + ks->spl_spl_free_fast_pressure.value.i64 = + spl_free_fast_pressure; + ks->spl_spl_free_delta_ema.value.i64 = spl_free_delta_ema; + ks->spl_osif_malloc_success.value.ui64 = + stat_osif_malloc_success; + ks->spl_osif_malloc_bytes.value.ui64 = stat_osif_malloc_bytes; + ks->spl_osif_free.value.ui64 = stat_osif_free; + ks->spl_osif_free_bytes.value.ui64 = stat_osif_free_bytes; + ks->spl_bucket_non_pow2_allocs.value.ui64 = + spl_bucket_non_pow2_allocs; + + ks->spl_vmem_unconditional_allocs.value.ui64 = + spl_vmem_unconditional_allocs; + ks->spl_vmem_unconditional_alloc_bytes.value.ui64 = + spl_vmem_unconditional_alloc_bytes; + ks->spl_vmem_conditional_allocs.value.ui64 = + spl_vmem_conditional_allocs; + 
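Stepping back to the moving-average bookkeeping at the end of spl_free_thread() above: the per-pass change in spl_free is folded into an exponential moving average, ema_new = alpha * delta + (1 - alpha) * ema_old, so the exported spl_free_delta_ema kstat tracks trend rather than per-pass noise. A standalone model using the "roughly 0.02" alpha mentioned in the code's comment (the exact alpha depends on hz and the pass interval):

#include <stdio.h>

int
main(void)
{
	const double alpha = 0.02;	/* assumed smoothing factor */
	double ema = 0.0;
	/* spl_free deltas over a few passes: one large spike, then quiet */
	double deltas[] = { 0, 0, 100.0, 0, 0, 0 };

	for (int i = 0; i < 6; i++) {
		ema = (alpha * deltas[i]) + (1.0 - alpha) * ema;
		printf("pass %d: ema = %.3f\n", i, ema);
	}
	/* the spike decays gradually instead of whipsawing the kstat */
	return (0);
}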
ks->spl_vmem_conditional_alloc_bytes.value.ui64 = + spl_vmem_conditional_alloc_bytes; + ks->spl_vmem_conditional_alloc_deny.value.ui64 = + spl_vmem_conditional_alloc_deny; + ks->spl_vmem_conditional_alloc_deny_bytes.value.ui64 = + spl_vmem_conditional_alloc_deny_bytes; + + ks->spl_xat_success.value.ui64 = spl_xat_success; + ks->spl_xat_late_success.value.ui64 = spl_xat_late_success; + ks->spl_xat_late_success_nosleep.value.ui64 = + spl_xat_late_success_nosleep; + ks->spl_xat_pressured.value.ui64 = spl_xat_pressured; + ks->spl_xat_bailed.value.ui64 = spl_xat_bailed; + ks->spl_xat_bailed_contended.value.ui64 = + spl_xat_bailed_contended; + ks->spl_xat_lastalloc.value.ui64 = spl_xat_lastalloc; + ks->spl_xat_lastfree.value.ui64 = spl_xat_lastfree; + ks->spl_xat_forced.value.ui64 = spl_xat_forced; + ks->spl_xat_sleep.value.ui64 = spl_xat_sleep; + ks->spl_xat_late_deny.value.ui64 = spl_xat_late_deny; + ks->spl_xat_no_waiters.value.ui64 = spl_xat_no_waiters; + ks->spl_xft_wait.value.ui64 = spl_xft_wait; + + ks->spl_vba_parent_memory_appeared.value.ui64 = + spl_vba_parent_memory_appeared; + ks->spl_vba_parent_memory_blocked.value.ui64 = + spl_vba_parent_memory_blocked; + ks->spl_vba_hiprio_blocked.value.ui64 = spl_vba_hiprio_blocked; + ks->spl_vba_cv_timeout.value.ui64 = spl_vba_cv_timeout; + ks->spl_vba_loop_timeout.value.ui64 = spl_vba_loop_timeout; + ks->spl_vba_cv_timeout_blocked.value.ui64 = + spl_vba_cv_timeout_blocked; + ks->spl_vba_loop_timeout_blocked.value.ui64 = + spl_vba_loop_timeout_blocked; + ks->spl_vba_sleep.value.ui64 = spl_vba_sleep; + ks->spl_vba_loop_entries.value.ui64 = spl_vba_loop_entries; + + ks->spl_bucket_tunable_large_span.value.ui64 = + spl_bucket_tunable_large_span; + ks->spl_bucket_tunable_small_span.value.ui64 = + spl_bucket_tunable_small_span; + + ks->spl_buckets_mem_free.value.ui64 = spl_buckets_mem_free; + ks->spl_arc_no_grow_bits.value.ui64 = spl_arc_no_grow_bits; + ks->spl_arc_no_grow_count.value.ui64 = spl_arc_no_grow_count; + + ks->spl_frag_max_walk.value.ui64 = spl_frag_max_walk; + ks->spl_frag_walked_out.value.ui64 = spl_frag_walked_out; + ks->spl_frag_walk_cnt.value.ui64 = spl_frag_walk_cnt; + + ks->spl_arc_reclaim_avoided.value.ui64 = + spl_arc_reclaim_avoided; + + ks->kmem_free_to_slab_when_fragmented.value.ui64 = + kmem_free_to_slab_when_fragmented; + } + + return (0); +} + +void +spl_kmem_init(uint64_t xtotal_memory) +{ + int old_kmem_flags = kmem_flags; + int use_large_pages = 0; + size_t maxverify, minfirewall; + + printf("SPL: KMEM starting. Total memory %llu\n", xtotal_memory); + + // Initialise the kstat lock + mutex_init(&kmem_cache_lock, "kmem_cache_lock", MUTEX_DEFAULT, NULL); + mutex_init(&kmem_flags_lock, "kmem_flags_lock", MUTEX_DEFAULT, NULL); + mutex_init(&kmem_cache_kstat_lock, "kmem_kstat_lock", MUTEX_DEFAULT, + NULL); + + spl_kstat_init(); + + + /* + * Small-memory systems (< 24 MB) can't handle kmem_flags overhead. + */ + if (physmem < btop(24 << 20) && !(old_kmem_flags & KMF_STICKY)) + kmem_flags = 0; + + /* + * Don't do firewalled allocations if the heap is less than 1TB + * (i.e. on a 32-bit kernel) + * The resulting VM_NEXTFIT allocations would create too much + * fragmentation in a small heap. 
+ */ + maxverify = minfirewall = PAGESIZE / 2; + + + /* LINTED */ + ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE); + + list_create(&kmem_caches, sizeof (kmem_cache_t), + offsetof(kmem_cache_t, cache_link)); + + kernelheap_init(); + + kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE, + VM_SLEEP | VMC_NO_QCACHE); + + kmem_msb_arena = vmem_create("kmem_msb", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, kmem_metadata_arena, 0, + VMC_DUMPSAFE | VM_SLEEP); + + kmem_cache_arena = vmem_create("kmem_cache", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + kmem_hash_arena = vmem_create("kmem_hash", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + kmem_log_arena = vmem_create("kmem_log", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + /* temporary oversize arena for mod_read_system_file */ + kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 0, VM_SLEEP); + + // statically declared above kmem_reap_interval = 15 * hz; + + /* + * Read /etc/system. This is a chicken-and-egg problem because + * kmem_flags may be set in /etc/system, but mod_read_system_file() + * needs to use the allocator. The simplest solution is to create + * all the standard kmem caches, read /etc/system, destroy all the + * caches we just created, and then create them all again in light + * of the (possibly) new kmem_flags and other kmem tunables. + */ + + if (old_kmem_flags & KMF_STICKY) + kmem_flags = old_kmem_flags; + + if (!(kmem_flags & KMF_AUDIT)) + vmem_seg_size = offsetof(vmem_seg_t, vs_thread); + + if (kmem_maxverify == 0) + kmem_maxverify = maxverify; + + if (kmem_minfirewall == 0) + kmem_minfirewall = minfirewall; + + /* + * give segkmem a chance to figure out if we are using large pages + * for the kernel heap + */ + // use_large_pages = segkmem_lpsetup(); + use_large_pages = 0; + + /* + * To protect against corruption, we keep the actual number of callers + * KMF_LITE records seperate from the tunable. We arbitrarily clamp + * to 16, since the overhead for small buffers quickly gets out of + * hand. + * + * The real limit would depend on the needs of the largest KMC_NOHASH + * cache. + */ + kmem_lite_count = MIN(MAX(0, kmem_lite_pcs), 16); + kmem_lite_pcs = kmem_lite_count; + + kmem_cache_init(2, use_large_pages); + + if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) { + if (kmem_transaction_log_size == 0) + kmem_transaction_log_size = MIN(kmem_maxavail() / 50ULL, + PAGESIZE<<4); + kmem_transaction_log = kmem_log_init(kmem_transaction_log_size); + } + + if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) { + if (kmem_content_log_size == 0) + kmem_content_log_size = MIN(kmem_maxavail() / 50ULL, + PAGESIZE<<4); + kmem_content_log = kmem_log_init(kmem_content_log_size); + } + + kmem_failure_log = kmem_log_init(kmem_failure_log_size); + + kmem_slab_log = kmem_log_init(kmem_slab_log_size); + + spl_tsd_init(); + spl_rwlock_init(); + spl_taskq_init(); + + /* + * Warn about invalid or dangerous values of kmem_flags. + * Always warn about unsupported values. + */ + if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | + KMF_CONTENTS | KMF_LITE)) != 0) || + ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE)) + cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. 
" + "See the Solaris Tunable Parameters Reference Manual.", + kmem_flags); + +#ifdef DEBUG + if ((kmem_flags & KMF_DEBUG) == 0) + cmn_err(CE_NOTE, "kmem debugging disabled."); +#else + /* + * For non-debug kernels, the only "normal" flags are 0, KMF_LITE, + * KMF_REDZONE, and KMF_CONTENTS (the last because it is only enabled + * if KMF_AUDIT is set). We should warn the user about the performance + * penalty of KMF_AUDIT or KMF_DEADBEEF if they are set and KMF_LITE + * isn't set (since that disables AUDIT). + */ + if (!(kmem_flags & KMF_LITE) && + (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0) + cmn_err(CE_WARN, "High-overhead kmem debugging features " + "enabled (kmem_flags = 0x%x). Performance degradation " + "and large memory overhead possible. See the Solaris " + "Tunable Parameters Reference Manual.", kmem_flags); +#endif /* not DEBUG */ + + segkmem_zio_init(); + + kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP); + + kmem_ready = 1; + + // Install spl kstats + spl_ksp = kstat_create("spl", 0, "spl_misc", "misc", KSTAT_TYPE_NAMED, + sizeof (spl_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + + if (spl_ksp != NULL) { + spl_ksp->ks_data = &spl_stats; + spl_ksp->ks_update = spl_kstat_update; + kstat_install(spl_ksp); + } +} + +void +spl_kmem_fini(void) +{ + + kmem_cache_applyall(kmem_cache_magazine_disable, NULL, TQ_SLEEP); + + kstat_delete(spl_ksp); + + kmem_log_fini(kmem_slab_log); + kmem_log_fini(kmem_failure_log); + + if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) { + if (kmem_content_log_size == 0) + kmem_content_log_size = kmem_maxavail() / 50; + kmem_log_fini(kmem_content_log); + } + + if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) { + if (kmem_transaction_log_size == 0) + kmem_transaction_log_size = kmem_maxavail() / 50; + kmem_log_fini(kmem_transaction_log); + } + + // Destroy all the "general allocation" caches + kmem_alloc_caches_destroy(); + + // Destroy the VA associated caches + kmem_destroy_cache_by_name(KMEM_VA_PREFIX); + + kmem_qcache_destroy(); + // Destroy metadata caches + kmem_cache_destroy(kmem_bufctl_cache); + kmem_cache_destroy(kmem_bufctl_audit_cache); + kmem_cache_destroy(kmem_slab_cache); // Dont think this one + + // Some caches cannot be destroyed as + // they mutually reference each other. + // So we explicitly pull them apart piece-by-piece. + kmem_cache_fini(); + + segkmem_zio_fini(); + + // Now destroy the vmem arenas used by kmem. + vmem_destroy(kmem_default_arena); + vmem_destroy(kmem_va_arena); + vmem_destroy(kmem_oversize_arena); + vmem_destroy(kmem_log_arena); + vmem_destroy(kmem_hash_arena); + vmem_destroy(kmem_cache_arena); + vmem_destroy(kmem_msb_arena); + vmem_destroy(kmem_metadata_arena); + + kernelheap_fini(); + + list_destroy(&kmem_caches); + + mutex_destroy(&kmem_cache_kstat_lock); + mutex_destroy(&kmem_flags_lock); + mutex_destroy(&kmem_cache_lock); +} + +static void +kmem_move_init(void) +{ + kmem_defrag_cache = kmem_cache_create("kmem_defrag_cache", + sizeof (kmem_defrag_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + kmem_move_cache = kmem_cache_create("kmem_move_cache", + sizeof (kmem_move_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + /* + * kmem guarantees that move callbacks are sequential and that even + * across multiple caches no two moves ever execute simultaneously. + * Move callbacks are processed on a separate taskq so that client code + * does not interfere with internal maintenance tasks. 
+ */ + kmem_move_taskq = taskq_create("kmem_move_taskq", 1, + minclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE); +} + +void +kmem_move_fini(void) +{ + + taskq_wait(kmem_move_taskq); + taskq_destroy(kmem_move_taskq); + kmem_move_taskq = 0; + + kmem_cache_destroy(kmem_move_cache); + kmem_cache_destroy(kmem_defrag_cache); + +} + +void +spl_kmem_thread_init(void) +{ + kmem_move_init(); + + // Initialize the spl_free locks + mutex_init(&spl_free_thread_lock, "spl_free_thead_lock", MUTEX_DEFAULT, + NULL); + + kmem_taskq = taskq_create("kmem_taskq", 1, minclsyspri, + 300, INT_MAX, TASKQ_PREPOPULATE); + + spl_free_thread_exit = FALSE; + (void) cv_init(&spl_free_thread_cv, NULL, CV_DEFAULT, NULL); + (void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92); +} + +void +spl_kmem_thread_fini(void) +{ + shutting_down = 1; + + mutex_enter(&spl_free_thread_lock); + spl_free_thread_exit = TRUE; + while (spl_free_thread_exit) { + cv_signal(&spl_free_thread_cv); + cv_wait(&spl_free_thread_cv, &spl_free_thread_lock); + } + mutex_exit(&spl_free_thread_lock); + cv_destroy(&spl_free_thread_cv); + mutex_destroy(&spl_free_thread_lock); + + bsd_untimeout(kmem_update, 0); + bsd_untimeout(kmem_reap_timeout, &kmem_reaping); + bsd_untimeout(kmem_reap_timeout, &kmem_reaping_idspace); + + taskq_wait(kmem_taskq); + + taskq_destroy(kmem_taskq); + kmem_taskq = 0; + + kmem_move_fini(); + +} + +void +spl_kmem_mp_init(void) +{ + kmem_update_timeout(NULL); +} + +/* + * Return the slab of the allocated buffer, or NULL if the buffer is not + * allocated. This function may be called with a known slab address to determine + * whether or not the buffer is allocated, or with a NULL slab address to obtain + * an allocated buffer's slab. + */ +static kmem_slab_t * +kmem_slab_allocated(kmem_cache_t *cp, kmem_slab_t *sp, void *buf) +{ + kmem_bufctl_t *bcp, *bufbcp; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(sp == NULL || KMEM_SLAB_MEMBER(sp, buf)); + + if (cp->cache_flags & KMF_HASH) { + for (bcp = *KMEM_HASH(cp, buf); + (bcp != NULL) && (bcp->bc_addr != buf); + bcp = bcp->bc_next) { + continue; + } + ASSERT(sp != NULL && bcp != NULL ? sp == bcp->bc_slab : 1); + return (bcp == NULL ? NULL : bcp->bc_slab); + } + + if (sp == NULL) { + sp = KMEM_SLAB(cp, buf); + } + bufbcp = KMEM_BUFCTL(cp, buf); + for (bcp = sp->slab_head; + (bcp != NULL) && (bcp != bufbcp); + bcp = bcp->bc_next) { + continue; + } + return (bcp == NULL ? sp : NULL); +} + +static boolean_t +kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) +{ + long refcnt = sp->slab_refcnt; + + ASSERT(cp->cache_defrag != NULL); + + /* + * For code coverage we want to be able to move an object within the + * same slab (the only partial slab) even if allocating the destination + * buffer resulted in a completely allocated slab. + */ + if (flags & KMM_DEBUG) { + return ((flags & KMM_DESPERATE) || + ((sp->slab_flags & KMEM_SLAB_NOMOVE) == 0)); + } + + /* If we're desperate, we don't care if the client said NO. */ + if (flags & KMM_DESPERATE) { + return (refcnt < sp->slab_chunks); /* any partial */ + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + return (B_FALSE); + } + + if ((refcnt == 1) || kmem_move_any_partial) { + return (refcnt < sp->slab_chunks); + } + + /* + * The reclaim threshold is adjusted at each kmem_cache_scan() so that + * slabs with a progressively higher percentage of used buffers can be + * reclaimed until the cache as a whole is no longer fragmented. 
+ * + * sp->slab_refcnt kmd_reclaim_numer + * --------------- < ------------------ + * sp->slab_chunks KMEM_VOID_FRACTION + */ + return ((refcnt * KMEM_VOID_FRACTION) < + (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); +} + +/* + * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), + * or when the buffer is freed. + */ +static void +kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) { + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + avl_add(&cp->cache_partial_slabs, sp); + } + } else { + sp->slab_later_count = 0; + sp->slab_stuck_offset = (uint32_t)-1; + } +} + +static void +kmem_slab_move_no(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; + sp->slab_flags |= KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, from_buf); + avl_add(&cp->cache_partial_slabs, sp); +} + +static void kmem_move_end(kmem_cache_t *, kmem_move_t *); + +/* + * The move callback takes two buffer addresses, the buffer to be moved, and a + * newly allocated and constructed buffer selected by kmem as the destination. + * It also takes the size of the buffer and an optional user argument specified + * at cache creation time. kmem guarantees that the buffer to be moved has not + * been unmapped by the virtual memory subsystem. Beyond that, it cannot + * guarantee the present whereabouts of the buffer to be moved, so it is up to + * the client to safely determine whether or not it is still using the buffer. + * The client must not free either of the buffers passed to the move callback, + * since kmem wants to free them directly to the slab layer. The client response + * tells kmem which of the two buffers to free: + * + * YES kmem frees the old buffer (the move was successful) + * NO kmem frees the new buffer, marks the slab of the old buffer + * non-reclaimable to avoid bothering the client again + * LATER kmem frees the new buffer, increments slab_later_count + * DONT_KNOW kmem frees the new buffer + * DONT_NEED kmem frees both the old buffer and the new buffer + * + * The pending callback argument now being processed contains both of the + * buffers (old and new) passed to the move callback function, the slab of the + * old buffer, and flags related to the move request, such as whether or not the + * system was desperate for memory. + * + * Slabs are not freed while there is a pending callback, but instead are kept + * on a deadlist, which is drained after the last callback completes. This means + * that slabs are safe to access until kmem_move_end(), no matter how many of + * their buffers have been freed. Once slab_refcnt reaches zero, it stays at + * zero for as long as the slab remains on the deadlist and until the slab is + * freed. 
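To make the move-callback contract above concrete, here is a rough sketch of what a client callback can look like. It is illustrative only: the client type, its magic value, the trylock-based validity test, and the local cbrc_t mirror are all hypothetical; real clients registered via kmem_cache_set_move() have their own rules for deciding whether the old buffer is still theirs, and must not free either buffer themselves.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* local mirror of the kmem_cbrc_t responses, for illustration only */
typedef enum {
	CBRC_YES, CBRC_NO, CBRC_LATER, CBRC_DONT_NEED, CBRC_DONT_KNOW
} cbrc_t;

/* hypothetical client object cached in a movable kmem cache */
typedef struct my_obj {
	int		mo_magic;	/* lets us recognize our own buffers */
	pthread_mutex_t	mo_lock;	/* only ever trylock'd in the callback */
	char		mo_data[48];
} my_obj_t;

#define	MY_MAGIC	0x6d6f766d

static cbrc_t
my_obj_move(void *old, void *new, size_t size, void *arg)
{
	my_obj_t *o = old, *n = new;

	(void) size;
	(void) arg;
	if (o->mo_magic != MY_MAGIC)
		return (CBRC_DONT_KNOW);	/* probably free; let kmem decide */
	if (pthread_mutex_trylock(&o->mo_lock) != 0)
		return (CBRC_LATER);		/* in use right now; ask again */
	memcpy(n->mo_data, o->mo_data, sizeof (n->mo_data));
	n->mo_magic = MY_MAGIC;
	pthread_mutex_init(&n->mo_lock, NULL);
	o->mo_magic = 0;			/* old copy is now dead */
	pthread_mutex_unlock(&o->mo_lock);
	return (CBRC_YES);			/* kmem frees the old buffer */
}

int
main(void)
{
	my_obj_t a = { .mo_magic = MY_MAGIC }, b = { 0 };

	pthread_mutex_init(&a.mo_lock, NULL);
	strcpy(a.mo_data, "payload");
	printf("move -> %d, new data \"%s\"\n",
	    my_obj_move(&a, &b, sizeof (a), NULL), b.mo_data);
	return (0);
}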
+ */ +static void +kmem_move_buffer(kmem_move_t *callback) +{ + kmem_cbrc_t response; + kmem_slab_t *sp = callback->kmm_from_slab; + kmem_cache_t *cp = sp->slab_cache; + boolean_t free_on_slab; + + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf)); + + /* + * The number of allocated buffers on the slab may have changed since we + * last checked the slab's reclaimability (when the pending move was + * enqueued), or the client may have responded NO when asked to move + * another buffer on the same slab. + */ + if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + /* + * Checking the slab layer is easy, so we might as well do that here + * in case we can avoid bothering the client. + */ + mutex_enter(&cp->cache_lock); + free_on_slab = (kmem_slab_allocated(cp, sp, + callback->kmm_from_buf) == NULL); + mutex_exit(&cp->cache_lock); + + if (free_on_slab) { + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. + */ + if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, + KM_NOSLEEP, 1, caller()) != 0) { + kmem_move_end(cp, callback); + return; + } + } else if (cp->cache_constructor != NULL && + cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, + KM_NOSLEEP) != 0) { + atomic_inc_64(&cp->cache_alloc_fail); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + cp->cache_defrag->kmd_callbacks++; + cp->cache_defrag->kmd_thread = spl_current_thread(); + cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; + cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf; + DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *, + callback); + + response = cp->cache_move(callback->kmm_from_buf, + callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private); + + DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *, + callback, kmem_cbrc_t, response); + cp->cache_defrag->kmd_thread = NULL; + cp->cache_defrag->kmd_from_buf = NULL; + cp->cache_defrag->kmd_to_buf = NULL; + + if (response == KMEM_CBRC_YES) { + cp->cache_defrag->kmd_yes++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); + /* slab safe to access until kmem_move_end() */ + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + kmem_move_end(cp, callback); + return; + } + + switch (response) { + case KMEM_CBRC_NO: + cp->cache_defrag->kmd_no++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_no(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_LATER: + cp->cache_defrag->kmd_later++; + mutex_enter(&cp->cache_lock); + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + mutex_exit(&cp->cache_lock); + break; + } + + if (++sp->slab_later_count >= KMEM_DISBELIEF) { + kmem_slab_move_no(cp, sp, + callback->kmm_from_buf); + } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, + callback->kmm_from_buf); + } + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_NEED: + cp->cache_defrag->kmd_dont_need++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, + B_FALSE); + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; 
+ mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_KNOW: + /* + * If we don't know if we can move this buffer or not, + * we'll just assume that we can't: if the buffer is + * in fact free, then it is sitting in one of the + * per-CPU magazines or in a full magazine in the depot + * layer. Either way, because defrag is induced in the + * same logic that reaps a cache, it's likely that full + * magazines will be returned to the system soon + * (thereby accomplishing what we're trying to + * accomplish here: return those magazines to their + * slabs). Given this, any work that we might do now to + * locate a buffer in a magazine is wasted (and + * expensive!) work; we bump a counter in this case and + * otherwise assume that we can't move it. + */ + cp->cache_defrag->kmd_dont_know++; + break; + default: + panic("'%s' (%p) unexpected move callback " + "response %d\n", cp->cache_name, (void *)cp, + response); + } + + kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE); + kmem_move_end(cp, callback); +} + +/* Return B_FALSE if there is insufficient memory for the move request. */ +static boolean_t +kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) +{ + void *to_buf; + avl_index_t index; + kmem_move_t *callback, *pending; + ulong_t n; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + + callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); + + if (callback == NULL) + return (B_FALSE); + + callback->kmm_from_slab = sp; + callback->kmm_from_buf = buf; + callback->kmm_flags = flags; + + mutex_enter(&cp->cache_lock); + + n = avl_numnodes(&cp->cache_partial_slabs); + if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) { + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); /* there is no need for the move request */ + } + + pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index); + if (pending != NULL) { + /* + * If the move is already pending and we're desperate now, + * update the move flags. + */ + if (flags & KMM_DESPERATE) { + pending->kmm_flags |= KMM_DESPERATE; + } + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); + } + + to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs), + B_FALSE); + callback->kmm_to_buf = to_buf; + avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index); + + mutex_exit(&cp->cache_lock); + + if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, + callback, TQ_NOSLEEP)) { + mutex_enter(&cp->cache_lock); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + mutex_exit(&cp->cache_lock); + kmem_slab_free(cp, to_buf); + kmem_cache_free(kmem_move_cache, callback); + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) +{ + avl_index_t index; + + ASSERT(cp->cache_defrag != NULL); + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + mutex_enter(&cp->cache_lock); + VERIFY(avl_find(&cp->cache_defrag->kmd_moves_pending, + callback->kmm_from_buf, &index) != NULL); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + if (avl_is_empty(&cp->cache_defrag->kmd_moves_pending)) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + kmem_slab_t *sp; + + /* + * The last pending move completed. 
Release all slabs + * from the front of the dead list except for any slab + * at the tail that needs to be released from the context + * of kmem_move_buffers(). kmem deferred unmapping the + * buffers on these slabs in order to guarantee that + * buffers passed to the move callback have been touched + * only by kmem or by the client itself. + */ + while ((sp = list_remove_head(deadlist)) != NULL) { + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + break; + } + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + mutex_enter(&cp->cache_lock); + } + } + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); +} + +/* + * Move buffers from least used slabs first by scanning backwards from the end + * of the partial slab list. Scan at most max_scan candidate slabs and move + * buffers from at most max_slabs slabs (0 for all partial slabs in both cases). + * If desperate to reclaim memory, move buffers from any partial slab, otherwise + * skip slabs with a ratio of allocated buffers at or above the current + * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the + * scan is aborted) so that the caller can adjust the reclaimability threshold + * depending on how many reclaimable slabs it finds. + * + * kmem_move_buffers() drops and reacquires cache_lock every time it issues a + * move request, since it is not valid for kmem_move_begin() to call + * kmem_cache_alloc() or taskq_dispatch() with cache_lock held. + */ +static int +kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, + int flags) +{ + kmem_slab_t *sp; + void *buf; + int i, j; /* slab index, buffer index */ + int s; /* reclaimable slabs */ + int b; /* allocated (movable) buffers on reclaimable slab */ + boolean_t success; + int refcnt; + int nomove; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(kmem_move_cache != NULL); + ASSERT(cp->cache_move != NULL && cp->cache_defrag != NULL); + ASSERT((flags & KMM_DEBUG) ? !avl_is_empty(&cp->cache_partial_slabs) : + avl_numnodes(&cp->cache_partial_slabs) > 1); + + if (kmem_move_blocked) { + return (0); + } + + if (kmem_move_fulltilt) { + flags |= KMM_DESPERATE; + } + + if (max_scan == 0 || (flags & KMM_DESPERATE)) { + /* + * Scan as many slabs as needed to find the desired number of + * candidate slabs. + */ + max_scan = (size_t)-1; + } + + if (max_slabs == 0 || (flags & KMM_DESPERATE)) { + /* Find as many candidate slabs as possible. */ + max_slabs = (size_t)-1; + } + + sp = avl_last(&cp->cache_partial_slabs); + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + for (i = 0, s = 0; (i < max_scan) && (s < max_slabs) && (sp != NULL) && + ((sp != avl_first(&cp->cache_partial_slabs)) || + (flags & KMM_DEBUG)); + sp = AVL_PREV(&cp->cache_partial_slabs, sp), i++) { + + if (!kmem_slab_is_reclaimable(cp, sp, flags)) { + continue; + } + s++; + + /* Look for allocated buffers to move. */ + for (j = 0, b = 0, buf = sp->slab_base; + (j < sp->slab_chunks) && (b < sp->slab_refcnt); + buf = (((char *)buf) + cp->cache_chunksize), j++) { + + if (kmem_slab_allocated(cp, sp, buf) == NULL) { + continue; + } + + b++; + + /* + * Prevent the slab from being destroyed while we drop + * cache_lock and while the pending move is not yet + * registered. 
Flag the pending move while + * kmd_moves_pending may still be empty, since we can't + * yet rely on a non-zero pending move count to prevent + * the slab from being destroyed. + */ + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + /* + * Recheck refcnt and nomove after reacquiring the lock, + * since these control the order of partial slabs, and + * we want to know if we can pick up the scan where we + * left off. + */ + refcnt = sp->slab_refcnt; + nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE); + mutex_exit(&cp->cache_lock); + + success = kmem_move_begin(cp, sp, buf, flags); + + /* + * Now, before the lock is reacquired, kmem could + * process all pending move requests and purge the + * deadlist, so that upon reacquiring the lock, sp has + * been remapped. Or, the client may free all the + * objects on the slab while the pending moves are still + * on the taskq. Therefore, the KMEM_SLAB_MOVE_PENDING + * flag causes the slab to be put at the end of the + * deadlist and prevents it from being destroyed, since + * we plan to destroy it here after reacquiring the + * lock. + */ + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + + if (sp->slab_refcnt == 0) { + list_t *deadlist = + &cp->cache_defrag->kmd_deadlist; + list_remove(deadlist, sp); + + if (!avl_is_empty( + &cp->cache_defrag->kmd_moves_pending)) { + /* + * A pending move makes it unsafe to + * destroy the slab, because even though + * the move is no longer needed, the + * context where that is determined + * requires the slab to exist. + * Fortunately, a pending move also + * means we don't need to destroy the + * slab here, since it will get + * destroyed along with any other slabs + * on the deadlist after the last + * pending move completes. + */ + list_insert_head(deadlist, sp); + return (-1); + } + + /* + * Destroy the slab now if it was completely + * freed while we dropped cache_lock and there + * are no pending moves. Since slab_refcnt + * cannot change once it reaches zero, no new + * pending moves from that slab are possible. + */ + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + mutex_enter(&cp->cache_lock); + /* + * Since we can't pick up the scan where we left + * off, abort the scan and say nothing about the + * number of reclaimable slabs. + */ + return (-1); + } + + if (!success) { + /* + * Abort the scan if there is not enough memory + * for the request and say nothing about the + * number of reclaimable slabs. + */ + return (-1); + } + + /* + * The slab's position changed while the lock was + * dropped, so we don't know where we are in the + * sequence any more. + */ + if (sp->slab_refcnt != refcnt) { + /* + * If this is a KMM_DEBUG move, the slab_refcnt + * may have changed because we allocated a + * destination buffer on the same slab. In that + * case, we're not interested in counting it. + */ + return (-1); + } + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) + return (-1); + + /* + * Generating a move request allocates a destination + * buffer from the slab layer, bumping the first partial + * slab if it is completely allocated. If the current + * slab becomes the first partial slab as a result, we + * can't continue to scan backwards. 
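+ * (The first partial slab is where kmem_move_begin() allocates the
+ * destination buffer, so once the backward scan reaches it there is
+ * nothing left to consolidate into.)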
+ * + * If this is a KMM_DEBUG move and we allocated the + * destination buffer from the last partial slab, then + * the buffer we're moving is on the same slab and our + * slab_refcnt has changed, causing us to return before + * reaching here if there are no partial slabs left. + */ + ASSERT(!avl_is_empty(&cp->cache_partial_slabs)); + if (sp == avl_first(&cp->cache_partial_slabs)) { + /* + * We're not interested in a second KMM_DEBUG + * move. + */ + goto end_scan; + } + } + } +end_scan: + + return (s); +} + +typedef struct kmem_move_notify_args { + kmem_cache_t *kmna_cache; + void *kmna_buf; +} kmem_move_notify_args_t; + +static void +kmem_cache_move_notify_task(void *arg) +{ + kmem_move_notify_args_t *args = arg; + kmem_cache_t *cp = args->kmna_cache; + void *buf = args->kmna_buf; + kmem_slab_t *sp; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(list_link_active(&cp->cache_link)); + + zfs_kmem_free(args, sizeof (kmem_move_notify_args_t)); + mutex_enter(&cp->cache_lock); + sp = kmem_slab_allocated(cp, NULL, buf); + + /* Ignore the notification if the buffer is no longer allocated. */ + if (sp == NULL) { + mutex_exit(&cp->cache_lock); + return; + } + + /* Ignore the notification if there's no reason to move the buffer. */ + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + /* + * So far the notification is not ignored. Ignore the + * notification if the slab is not marked by an earlier refusal + * to move a buffer. + */ + if (!(sp->slab_flags & KMEM_SLAB_NOMOVE) && + (sp->slab_later_count == 0)) { + mutex_exit(&cp->cache_lock); + return; + } + + kmem_slab_move_yes(cp, sp, buf); + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + mutex_exit(&cp->cache_lock); + /* see kmem_move_buffers() about dropping the lock */ + (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY); + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + if (sp->slab_refcnt == 0) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + list_remove(deadlist, sp); + + if (!avl_is_empty( + &cp->cache_defrag->kmd_moves_pending)) { + list_insert_head(deadlist, sp); + mutex_exit(&cp->cache_lock); + return; + } + + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + return; + } + } else { + kmem_slab_move_yes(cp, sp, buf); + } + mutex_exit(&cp->cache_lock); +} + +void +kmem_cache_move_notify(kmem_cache_t *cp, void *buf) +{ + kmem_move_notify_args_t *args; + + args = zfs_kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); + if (args != NULL) { + args->kmna_cache = cp; + args->kmna_buf = buf; + if (!taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_move_notify_task, args, + TQ_NOSLEEP)) + zfs_kmem_free(args, sizeof (kmem_move_notify_args_t)); + } +} + +static void +kmem_cache_defrag(kmem_cache_t *cp) +{ + size_t n; + + ASSERT(cp->cache_defrag != NULL); + + mutex_enter(&cp->cache_lock); + n = avl_numnodes(&cp->cache_partial_slabs); + if (n > 1) { + /* kmem_move_buffers() drops and reacquires cache_lock */ + cp->cache_defrag->kmd_defrags++; + (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); + } + mutex_exit(&cp->cache_lock); +} + +/* Is this cache above the fragmentation threshold? 
*/ +static boolean_t +kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree) +{ + /* + * nfree kmem_frag_numer + * ------------------ > --------------- + * cp->cache_buftotal kmem_frag_denom + */ + return ((nfree * kmem_frag_denom) > + (cp->cache_buftotal * kmem_frag_numer)); +} + +static boolean_t +kmem_cache_is_fragmented(kmem_cache_t *cp, boolean_t *doreap) +{ + boolean_t fragmented; + uint64_t nfree; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + *doreap = B_FALSE; + + if (kmem_move_fulltilt) { + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + return (B_TRUE); + } + } else { + if ((cp->cache_complete_slab_count + avl_numnodes( + &cp->cache_partial_slabs)) < kmem_frag_minslabs) { + return (B_FALSE); + } + } + + nfree = cp->cache_bufslab; + fragmented = ((avl_numnodes(&cp->cache_partial_slabs) > 1) && + kmem_cache_frag_threshold(cp, nfree)); + + /* + * Free buffers in the magazine layer appear allocated from the point of + * view of the slab layer. We want to know if the slab layer would + * appear fragmented if we included free buffers from magazines that + * have fallen out of the working set. + */ + if (!fragmented) { + long reap; + + mutex_enter(&cp->cache_depot_lock); + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + mutex_exit(&cp->cache_depot_lock); + + nfree += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + if (kmem_cache_frag_threshold(cp, nfree)) { + *doreap = B_TRUE; + } + } + + return (fragmented); +} + +/* Called periodically from kmem_taskq */ +static void +kmem_cache_scan(kmem_cache_t *cp) +{ + boolean_t reap = B_FALSE; + kmem_defrag_t *kmd; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + mutex_enter(&cp->cache_lock); + + kmd = cp->cache_defrag; + if (kmd->kmd_consolidate > 0) { + kmd->kmd_consolidate--; + mutex_exit(&cp->cache_lock); + kmem_cache_reap(cp); + return; + } + + if (kmem_cache_is_fragmented(cp, &reap)) { + size_t slabs_found; + + /* + * Consolidate reclaimable slabs from the end of the partial + * slab list (scan at most kmem_reclaim_scan_range slabs to find + * reclaimable slabs). Keep track of how many candidate slabs we + * looked for and how many we actually found so we can adjust + * the definition of a candidate slab if we're having trouble + * finding them. + * + * kmem_move_buffers() drops and reacquires cache_lock. + */ + kmd->kmd_scans++; + slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, + kmem_reclaim_max_slabs, 0); + kmd->kmd_slabs_sought += kmem_reclaim_max_slabs; + kmd->kmd_slabs_found += slabs_found; + + if (++kmd->kmd_tries >= kmem_reclaim_scan_range) { + kmd->kmd_tries = 0; + + /* + * If we had difficulty finding candidate slabs in + * previous scans, adjust the threshold so that + * candidates are easier to find. + */ + if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, -1); + } else if ((kmd->kmd_slabs_found * 2) < + kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, 1); + } + kmd->kmd_slabs_sought = 0; + kmd->kmd_slabs_found = 0; + } + } else { + kmem_reset_reclaim_threshold(cp->cache_defrag); +#ifdef DEBUG + if (!avl_is_empty(&cp->cache_partial_slabs)) { + /* + * In a debug kernel we want the consolidator to + * run occasionally even when there is plenty of + * memory. 
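+ * A 16-bit random draw below decides what happens: unless
+ * kmem_move_noreap is set, roughly one scan in kmem_mtb_reap takes
+ * the early-return reap path, and roughly one in kmem_mtb_move
+ * forces a single-slab KMM_DEBUG move.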
+ */ + uint16_t debug_rand; + + /* + * smd: note that this only gets called for the + * dnode cache because only the dnode cache has + * kmem_cache_set_move() applied to it + * brendon says move is voluntary and "tricky" + * the reason this is not called is because the source + * is kmem_cache_update(), that only calls this + * function (kmem_cache_scan()) + * if there is a move/defrag (same thing) associated + * with it so hoist some of this code up to to + * kmem_cache_update + */ + + (void) random_get_bytes((uint8_t *)&debug_rand, 2); + if (!kmem_move_noreap && + ((debug_rand % kmem_mtb_reap) == 0)) { + mutex_exit(&cp->cache_lock); + kmem_mtb_reap_count++; + return; + } else if ((debug_rand % kmem_mtb_move) == 0) { + kmd->kmd_scans++; + (void) kmem_move_buffers(cp, + kmem_reclaim_scan_range, 1, KMM_DEBUG); + } + } +#endif /* DEBUG */ + } + + mutex_exit(&cp->cache_lock); + +} + +// =============================================================== +// Status +// =============================================================== + + +size_t +kmem_size(void) +{ + return (total_memory); // smd +} + +// this is used in arc_reclaim_needed. if 1, reclaim is needed. +// returning 1 has the effect of throttling ARC, so be careful. +int +spl_vm_pool_low(void) +{ + bool m = spl_minimal_physmem_p_logic(); + + if (m) + return (0); + else + return (1); +} + +// =============================================================== +// String handling +// =============================================================== + +char * +kmem_strdup(const char *str) +{ + char *buf; + int len; + len = strlen(str) + 1; + buf = kmem_alloc(len, KM_SLEEP); + strlcpy(buf, str, len); + return (buf); +} + +void +kmem_strfree(char *str) +{ + zfs_kmem_free(str, strlen(str) + 1); +} + +char * +kvasprintf(const char *fmt, va_list ap) +{ + unsigned int len; + char *p; + va_list aq; + + va_copy(aq, ap); + len = vsnprintf(NULL, 0, fmt, aq); + va_end(aq); + p = zfs_kmem_alloc(len+1, KM_SLEEP); + if (!p) + return (NULL); + + vsnprintf(p, len+1, fmt, ap); + + return (p); +} + +char * +kmem_vasprintf(const char *fmt, va_list ap) +{ + va_list aq; + char *ptr; + + do { + va_copy(aq, ap); + ptr = kvasprintf(fmt, aq); + va_end(aq); + } while (ptr == NULL); + + return (ptr); +} + +char * +kmem_asprintf(const char *fmt, ...) 
+{ + va_list ap; + char *ptr; + + do { + va_start(ap, fmt); + ptr = kvasprintf(fmt, ap); + va_end(ap); + } while (ptr == NULL); + + return (ptr); +} + +char * +kmem_strstr(const char *in, const char *str) +{ + char c; + size_t len; + + c = *str++; + if (!c) + return ((char *)in); // Trivial empty string case + + len = strlen(str); + do { + char sc; + + do { + sc = *in++; + if (!sc) + return ((char *)0); + } while (sc != c); + } while (strncmp(in, str, len) != 0); + + return ((char *)(in - 1)); +} + + +// suppress timer and related logic for this kmem cache can live here +// three new per-kmem-cache stats: counters: non-vba-success non-vba-fail; +// flag: arc_no_grow +// from zfs/include/sys/spa.h + +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +typedef struct { + _Atomic(kmem_cache_t *)cp_metadata; + _Atomic(kmem_cache_t *)cp_filedata; + uint16_t pointed_to; + _Atomic int64_t suppress_count; + _Atomic uint64_t last_bumped; +} ksupp_t; + +typedef struct { + ksupp_t *ks_entry; +} iksupp_t; + +ksupp_t ksvec[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] = + { { NULL, NULL, false, 0, 0 } }; +iksupp_t iksvec[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] = + { { NULL } }; + +static bool spl_zio_no_grow_inited = false; + +/* + * Test that cp is in ks->cp_metadata or ks->cp_filedata; if so just return + * otherwise, choose the first (and possibly second) NULL + * and try to set it to cp. + * If successful, return. otherwise, sanity check that + * nobody has set ks->cp_metadata or ks->cp_filedata to cp already, and + * that ks->cp_metadata != ks->cp_filedata. + */ + +static void +ks_set_cp(ksupp_t *ks, kmem_cache_t *cp, const size_t cachenum) +{ + + ASSERT(cp != NULL); + ASSERT(ks != NULL); + + if (ks->cp_metadata == cp || ks->cp_filedata == cp) + return; + + const uint64_t b = cachenum; + + bool cp_is_metadata = false; + + vmem_t *vmp = cp->cache_arena; + + ASSERT(vmp == zio_metadata_arena || vmp == zio_arena); + + if (vmp == zio_metadata_arena) + cp_is_metadata = true; + + if (cp_is_metadata) { + for (uint32_t i = 0; ; i++) { + if (i >= 1000000) { + panic("SPL: %s: iterated out trying to set " + "ks->cp_metadata (%s)\n", __func__, + cp->cache_name); + } + kmem_cache_t *expected = NULL; + if (__c11_atomic_compare_exchange_strong( + &ks->cp_metadata, &expected, cp, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + printf("SPL: %s: set iskvec[%llu].ks->" + "cp_metadata (%s) OK\n", __func__, + b, cp->cache_name); + return; + } else if (ks->cp_metadata == cp) { + return; + } else if (ks->cp_metadata == NULL) { + continue; + } else { + panic("%s: CAS failed for iksvec[%llu]." 
+ "ks->cp_metadata: %s wanted %s set\n", + __func__, b, cp->cache_name, + ks->cp_metadata->cache_name); + } + } + } else { + for (int32_t j = 0; ; j++) { + if (j >= 1000000) { + panic("SPL: %s: iterated out trying to set " + "ks->cp_filedata (%s)\n", __func__, + cp->cache_name); + } + kmem_cache_t *expected = NULL; + if (__c11_atomic_compare_exchange_strong( + &ks->cp_filedata, &expected, cp, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + printf("SPL: %s: set iskvec[%llu].ks->" + "cp_filedata (%s) OK\n", __func__, + b, cp->cache_name); + return; + } else if (ks->cp_filedata == cp) { + return; + } else if (ks->cp_filedata == NULL) { + continue; + } else { + panic("%s: CAS failed for iksvec[%llu].ks->" + "cp_metadata: %s wanted %s set\n", + __func__, b, cp->cache_name, + ks->cp_filedata->cache_name); + } + } + } +} + +void +spl_zio_no_grow_init(void) +{ + // this is the logic from zio.c:zio_init() + + ASSERT(spl_zio_no_grow_inited == false); + + size_t c = 0; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c+1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + + while (!ISP2(p2)) + p2 &= p2 - 1; + + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (size <= 128 * 1024 && IS_P2ALIGNED(size, p2 >> 4)) { + align = MIN(p2 >> 4, PAGESIZE); + } else if (IS_P2ALIGNED(size, p2 >> 3)) { + align = MIN(p2 >> 3, PAGESIZE); + } + + if (align != 0) { + iksvec[c].ks_entry = &ksvec[c]; + iksvec[c].ks_entry->pointed_to++; + } + } + + while (--c != 0) { + ASSERT(iksvec[c].ks_entry != NULL); + ASSERT(iksvec[c].ks_entry->pointed_to > 0); + if (iksvec[c - 1].ks_entry == NULL) { + iksvec[c - 1].ks_entry = iksvec[c].ks_entry; + iksvec[c - 1].ks_entry->pointed_to++; + } + } + + spl_zio_no_grow_inited = true; + + printf("SPL: %s done.\n", __func__); +} + +static void +spl_zio_no_grow_clear() +{ + for (size_t c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + ksupp_t *ks = iksvec[c].ks_entry; + ks->cp_metadata = NULL; + ks->cp_filedata = NULL; + ks->pointed_to = false; + ks->suppress_count = 0; + ks->last_bumped = 0; + iksvec[c].ks_entry = NULL; + } +} + +void +spl_zio_no_grow_fini(void) +{ + // zio_fini() is at its end, so the kmem_caches are gone, + // consequently this is safe. 
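+ // Reset the inited flag, wipe every ksupp_t and iksvec entry, and
+ // rebuild the mapping so a later zio_init() starts from a clean
+ // suppression table.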
+ spl_zio_no_grow_inited = false; + spl_zio_no_grow_clear(); + spl_zio_no_grow_init(); +} + +static void +spl_zio_set_no_grow(const size_t size, kmem_cache_t *cp, const size_t cachenum) +{ + ASSERT(spl_zio_no_grow_inited == true); + ASSERT(iksvec[cachenum].ks_entry != NULL); + + ksupp_t *ks = iksvec[cachenum].ks_entry; + + // maybe update size->cp mapping vector + + ks_set_cp(ks, cp, cachenum); + + if (ks->cp_metadata != cp && ks->cp_filedata != cp) { + panic("ks_cp_set bad for %s", cp->cache_name); + } + + // suppress the bucket for two allocations (is _Atomic) + ks->suppress_count += 2; + ks->last_bumped = zfs_lbolt(); +} + +bool +spl_zio_is_suppressed(const size_t size, const uint64_t now, + const boolean_t buf_is_metadata, kmem_cache_t **zp) +{ + + ASSERT(spl_zio_no_grow_inited == true); + + const size_t cachenum = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(cachenum, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + ksupp_t *ks = iksvec[cachenum].ks_entry; + + if (ks == NULL) { + return (false); + } else if (ks->pointed_to < 1) { + ASSERT(ks->pointed_to > 0); // throw an assertion + printf("SPL: %s: ERROR: iksvec[%llu].ks_entry->pointed_to " + "== %u for size %llu\n", __func__, (uint64_t)cachenum, + ks->pointed_to, (uint64_t)size); + return (false); + } else if (ks->suppress_count == 0) { + return (false); + } else { + const uint64_t two_minutes = 120 * hz; + if (ks->last_bumped + two_minutes >= now) { + ks->suppress_count = 0; + ks->last_bumped = now; + return (false); + } else { + ks->suppress_count--; + } + if (buf_is_metadata) { + if (ks->cp_metadata == NULL) { + ks_set_cp(ks, zp[cachenum], cachenum); + if (ks->cp_metadata != NULL) { + atomic_inc_64( + &ks->cp_metadata->arc_no_grow); + } else { + printf("WARNING: %s: ks_set_cp->" + "metadata == NULL after " + "ks_set_cp !size = %lu\n", + __func__, size); + } + } else { + atomic_inc_64(&ks->cp_metadata->arc_no_grow); + } + } else { + if (ks->cp_filedata == NULL) { + ks_set_cp(ks, zp[cachenum], cachenum); + if (ks->cp_filedata != NULL) { + atomic_inc_64( + &ks->cp_filedata->arc_no_grow); + } else { + printf("WARNING: %s: " + "ks_set_cp->filedata == NULL " + "after ks_set_cp !" + "size = %lu\n", + __func__, size); + } + } else { + atomic_inc_64(&ks->cp_filedata->arc_no_grow); + } + + } + return (true); + } +} + + +/* + * spl_zio_kmem_cache_alloc(): try to get an allocation without descending + * to the bucket layer, and if that fails, set a flag for spl_arc_no_grow() + * then perform the allocation normally. + */ + +void * +spl_zio_kmem_cache_alloc(kmem_cache_t *cp, int kmflag, size_t size, + size_t cachenum) +{ + // called by: + // spl_zio_kmem_cache_alloc(zio_buf_cache[size], kmflag, size, cachenum) + // spl_zio_kmem_cache_alloc(zio_data_buf_cache[size], kmflag, size, + // cachenum) + // those are e.g. + // kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPAMINBLOCKSHIFT] + // and are indexed as size_t cachenum = (size - 1) >> SPA_MIN~BLOCKSHIFT + // VERIFY3U(cachenum, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + // try to get memory from no lower than the bucket_heap + void *m = kmem_cache_alloc(cp, kmflag | KM_NO_VBA | KM_NOSLEEP); + + if (m != NULL) { + atomic_inc_64(&cp->no_vba_success); + return (m); + } + + atomic_inc_64(&cp->no_vba_fail); + + // we will have to go below the bucket_heap to a bucket arena. 
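+ // (the KM_NO_VBA | KM_NOSLEEP attempt above stops at the bucket_heap,
+ // which is why it can fail even while a lower bucket arena could
+ // still satisfy the request)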
+ // if the bucket arena cannot obviously satisfy the allocation, + // and xnu is tight for memory, then we turn on the no_grow suppression + + extern vmem_t *spl_vmem_bucket_arena_by_size(size_t); + extern uint64_t vmem_xnu_useful_bytes_free(void); + extern int vmem_canalloc_atomic(vmem_t *, size_t); + + vmem_t *bvmp = spl_vmem_bucket_arena_by_size(size); + + if (! vmem_canalloc_atomic(bvmp, size) && + vmem_xnu_useful_bytes_free() < 16ULL*1024ULL*1024ULL) { + spl_zio_set_no_grow(size, cp, cachenum); + atomic_inc_64(&cp->arc_no_grow_set); + } + + // perform the allocation as requested + void *n = kmem_cache_alloc(cp, kmflag); + + return (n); +} + +/* + * return true if the reclaim thread should be awakened + * because we do not have enough memory on hand + */ +boolean_t +spl_arc_reclaim_needed(const size_t bytes, kmem_cache_t **zp) +{ + + /* + * fast path: + * if our argument is 0, then do the equivalent of + * if (arc_available_memory() < 0) return (B_TRUE); + * which is traditional arc.c appraoch + * so we can arc_reclaim_needed() -> spl_arc_reclaim_needed(0) + * if we desire. + */ + if (bytes == 0 && spl_free < 0) { + return (B_TRUE); + } + + // copy some code from zio_buf_alloc() + size_t c = (bytes - 1) >> SPA_MINBLOCKSHIFT; + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + // if there is free memory in the kmem cache slab layer + // then we do not have to reclaim + + if (zp[c]->cache_bufslab > 1) { + if (spl_free < 0) + atomic_inc_64(&spl_arc_reclaim_avoided); + return (B_FALSE); + } + + extern uint64_t vmem_xnu_useful_bytes_free(void); + const uint64_t min_threshold = 64ULL*1024ULL*1024ULL; + const uint64_t pm_pct = real_total_memory >> 8; + const uint64_t high_threshold = MAX(min_threshold, (uint64_t)pm_pct); + const uint64_t low_threshold = bytes; + + const uint64_t f = vmem_xnu_useful_bytes_free(); + + if (f <= low_threshold) { + return (B_TRUE); + } else if (f > high_threshold) { + if (spl_free < 0) + atomic_inc_64(&spl_arc_reclaim_avoided); + return (B_FALSE); + } + + if (spl_free < 0) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* small auxiliary function since we do not export struct kmem_cache to zfs */ +size_t +kmem_cache_bufsize(kmem_cache_t *cp) +{ + return (cp->cache_bufsize); +} + +/* + * check that we would not have KMERR_BADCACHE error in the event + * we did kmem_cache_free(cp, buf) in a DEBUG setting + * + * returns: NULL if the buf is not found in any cache + * cparg if the buf is found in cparg + * a pointer to the cache the buf is found in, if not cparg + */ + +kmem_cache_t * +kmem_cache_buf_in_cache(kmem_cache_t *cparg, void *bufarg) +{ + kmem_cache_t *cp = cparg; + kmem_slab_t *sp; + void *buf = bufarg; + + sp = kmem_findslab(cp, buf); + if (sp == NULL) { + for (cp = list_tail(&kmem_caches); cp != NULL; + cp = list_prev(&kmem_caches, cp)) { + if ((sp = kmem_findslab(cp, buf)) != NULL) + break; + } + } + + if (sp == NULL) { + printf("SPL: %s: KMERR_BADADDR orig cache = %s\n", + __func__, cparg->cache_name); + return (NULL); + } + + if (cp == NULL) { + printf("SPL: %s: ERROR cp == NULL; cparg == %s", + __func__, cparg->cache_name); + return (NULL); + } + + if (cp != cparg) { + printf("SPL: %s: KMERR_BADCACHE arg cache = %s but found " + "in %s instead\n", + __func__, cparg->cache_name, cp->cache_name); + return (cp); + } + + ASSERT(cp == cparg); + + return (cp); +} diff --git a/module/os/macos/spl/spl-kstat.c b/module/os/macos/spl/spl-kstat.c new file mode 100644 index 0000000000..11a478c375 --- /dev/null +++ 
b/module/os/macos/spl/spl-kstat.c @@ -0,0 +1,721 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * Copyright (C) 2014 Brendon Humphrey + * + */ + +/* + * Provides an implementation of kstat that is backed by OSX sysctls. + */ + +#include +#include +#include +#include +#include +#include + +/* + * We need to get dynamically allocated memory from the kernel allocator + * (Our needs are small, we wont blow the zone_map). + */ +extern void *kalloc(vm_size_t size); +extern void kfree(void *data, vm_size_t size); + +/* + * Statically declared toplevel OID that all kstats + * will hang off. + */ +struct sysctl_oid_list sysctl__kstat_children; +SYSCTL_DECL(_kstat); +SYSCTL_NODE(, OID_AUTO, kstat, CTLFLAG_RW, 0, "kstat tree"); + +/* + * Sysctl node tree structure. + * + * These are wired into the OSX sysctl structure + * and also stored a list/tree/whatever for easy + * location and destruction at shutdown time. + */ +typedef struct sysctl_tree_node { + char tn_kstat_name[KSTAT_STRLEN]; + struct sysctl_oid_list tn_children; + struct sysctl_oid tn_oid; + struct sysctl_tree_node *tn_next; +} sysctl_tree_node_t; + +/* + * Each named kstats consists of one or more named + * fields which are implemented as OIDs parented + * off the kstat OID. + * + * To implement the kstat interface, we need to be able + * to call the update() function on the kstat to + * allow the owner to populate the kstat values from + * internal data. + * + * To do this we need the address of the kstat_named_t + * which contains the data value, and the owning kstat_t. + * + * OIDs allow a single void* user argument, so we will + * use a structure that contains both values and + * point to that. + */ +typedef struct sysctl_leaf { + kstat_t *l_ksp; + kstat_named_t *l_named; + struct sysctl_oid l_oid; /* kstats are backed w/sysctl */ + char l_name[64]; /* Name of the related sysctl */ + int l_oid_registered; /* !0 = registered */ +} sysctl_leaf_t; + +/* + * Extended kstat structure -- for internal use only. 
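+ * The embedded kstat_t must remain the first member: callers are handed
+ * the kstat_t pointer, and kstat_install()/kstat_delete() cast it back
+ * to the enclosing ekstat_t.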
+ */ +typedef struct ekstat { + kstat_t e_ks; /* the kstat itself */ + size_t e_size; /* total allocation size */ + kthread_t *e_owner; /* thread holding this kstat */ + kcondvar_t e_cv; /* wait for owner == NULL */ + /* contains the named values from the kstat */ + struct sysctl_oid_list e_children; + struct sysctl_oid e_oid; /* the kstat is itself an OID */ + /* array of OIDs that implement the children */ + sysctl_leaf_t *e_vals; + uint64_t e_num_vals; /* size of e_vals array */ +} ekstat_t; + +struct sysctl_tree_node *tree_nodes = 0; +struct sysctl_oid *e_sysctl = 0; + +static void +kstat_set_string(char *dst, const char *src) +{ + bzero(dst, KSTAT_STRLEN); + (void) strlcpy(dst, src, KSTAT_STRLEN); +} + +static struct sysctl_oid * +get_oid_with_name(struct sysctl_oid_list *list, char *name) +{ + struct sysctl_oid *oidp; + + SLIST_FOREACH(oidp, list, oid_link) { + if (strcmp(name, oidp->oid_name) == 0) { + return (oidp); + } + } + + return (0); +} + +static void +init_oid_tree_node(struct sysctl_oid_list *parent, char *name, + sysctl_tree_node_t *node) +{ + strlcpy(node->tn_kstat_name, name, KSTAT_STRLEN); + + node->tn_oid.oid_parent = parent; + node->tn_oid.oid_link.sle_next = 0; + node->tn_oid.oid_number = OID_AUTO; + node->tn_oid.oid_arg2 = 0; + node->tn_oid.oid_name = &node->tn_kstat_name[0]; + node->tn_oid.oid_descr = ""; + node->tn_oid.oid_version = SYSCTL_OID_VERSION; + node->tn_oid.oid_refcnt = 0; + node->tn_oid.oid_handler = 0; + node->tn_oid.oid_kind = CTLTYPE_NODE|CTLFLAG_RW|CTLFLAG_OID2; + node->tn_oid.oid_fmt = "N"; + node->tn_oid.oid_arg1 = (void*)(&node->tn_children); + + sysctl_register_oid(&node->tn_oid); + + node->tn_next = tree_nodes; + tree_nodes = node; +} + +static struct sysctl_oid_list * +get_kstat_parent(struct sysctl_oid_list *root, char *module_name, + char *class_name) +{ + struct sysctl_oid *the_module = 0; + struct sysctl_oid *the_class = 0; + sysctl_tree_node_t *new_node = 0; + struct sysctl_oid_list *container = root; + + /* + * Locate/create the module + */ + the_module = get_oid_with_name(root, module_name); + + if (!the_module) { + new_node = kalloc(sizeof (sysctl_tree_node_t)); + bzero(new_node, sizeof (sysctl_tree_node_t)); + init_oid_tree_node(root, module_name, new_node); + the_module = &new_node->tn_oid; + } + + /* + * Locate/create the class + */ + container = the_module->oid_arg1; + the_class = get_oid_with_name(container, class_name); + + if (!the_class) { + new_node = kalloc(sizeof (sysctl_tree_node_t)); + bzero(new_node, sizeof (sysctl_tree_node_t)); + init_oid_tree_node(container, class_name, new_node); + the_class = &new_node->tn_oid; + } + + container = the_class->oid_arg1; + return (container); +} + +static int +kstat_handle_i64 SYSCTL_HANDLER_ARGS +{ + int error = 0; + sysctl_leaf_t *params = (sysctl_leaf_t *)(arg1); + kstat_named_t *named = params->l_named; + kstat_t *ksp = params->l_ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + if (lock && !MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + if (!error && req->newptr) { + /* + * Write request - first read add current values for the kstat + * (remember that is sysctl is likely only one of many + * values that make up the kstat). 
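+ * In other words: refresh the kstat via ks_update(KSTAT_READ),
+ * overwrite just this one field from the user buffer, then push the
+ * whole kstat back with ks_update(KSTAT_WRITE).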
+ */ + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + + /* Copy the new value from user space */ + (void) copyin(req->newptr, &named->value.i64, + sizeof (named->value.i64)); + + /* and invoke the update operation */ + if (ksp->ks_update) { + error = ksp->ks_update(ksp, KSTAT_WRITE); + } + } else { + /* + * Read request + */ + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + error = SYSCTL_OUT(req, &named->value.i64, sizeof (int64_t)); + } + + if (lock_needs_release) { + mutex_exit(lock); + } + + return (error); +} + +static int +kstat_handle_ui64 SYSCTL_HANDLER_ARGS +{ + int error = 0; + sysctl_leaf_t *params = (sysctl_leaf_t *)(arg1); + kstat_named_t *named = params->l_named; + kstat_t *ksp = params->l_ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + if (lock && !MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + if (!error && req->newptr) { + /* + * Write request - first read add current values for the kstat + * (remember that is sysctl is likely only one of many + * values that make up the kstat). + */ + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + + /* Copy the new value from user space */ + (void) copyin(req->newptr, &named->value.ui64, + sizeof (named->value.ui64)); + + /* and invoke the update operation */ + if (ksp->ks_update) { + error = ksp->ks_update(ksp, KSTAT_WRITE); + } + } else { + /* + * Read request + */ + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + error = SYSCTL_OUT(req, &named->value.ui64, sizeof (uint64_t)); + } + + if (lock_needs_release) { + mutex_exit(lock); + } + + return (error); +} + +static int +kstat_handle_string SYSCTL_HANDLER_ARGS +{ + int error = 0; + sysctl_leaf_t *params = (sysctl_leaf_t *)(arg1); + kstat_named_t *named = params->l_named; + kstat_t *ksp = params->l_ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + if (lock && !MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + if (!error && req->newptr) { + /* + * Write request - first read add current values for the kstat + * (remember that is sysctl is likely only one of many + * values that make up the kstat). + */ + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + + /* Copy the new value from user space */ + /* + * This should use copyinstr when copying in string from + * userland Fix this before attempting to use type STRING + * with kstat + */ + named->value.string.addr.ptr = (char *)(req->newptr); + named->value.string.len = strlen((char *)(req->newptr))+1; + + /* and invoke the update operation */ + if (ksp->ks_update) { + error = ksp->ks_update(ksp, KSTAT_WRITE); + } + } else { + /* + * Read request + */ + if (ksp->ks_update) { + ksp->ks_update(ksp, KSTAT_READ); + } + error = SYSCTL_OUT(req, named->value.string.addr.ptr, + named->value.string.len); + } + + if (lock_needs_release) { + mutex_exit(lock); + } + + return (error); +} + +kstat_t * +kstat_create(char *ks_module, int ks_instance, char *ks_name, char *ks_class, + uchar_t ks_type, ulong_t ks_ndata, uchar_t ks_flags) +{ + kstat_t *ksp = 0; + ekstat_t *e = 0; + size_t size = 0; + + /* + * Allocate memory for the new kstat header. 
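+ * The total size is remembered in e_size so that kstat_delete() can
+ * kfree() exactly what is kalloc()ed here.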
+ */ + size = sizeof (ekstat_t); + e = (ekstat_t *)kalloc(size); + bzero(e, size); + if (e == NULL) { + cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): " + "insufficient kernel memory", + ks_module, ks_instance, ks_name); + return (NULL); + } + e->e_size = size; + + cv_init(&e->e_cv, NULL, CV_DEFAULT, NULL); + + /* + * Initialize as many fields as we can. The caller may reset + * ks_lock, ks_update, ks_private, and ks_snapshot as necessary. + * Creators of virtual kstats may also reset ks_data. It is + * also up to the caller to initialize the kstat data section, + * if necessary. All initialization must be complete before + * calling kstat_install(). + */ + ksp = &e->e_ks; + + ksp->ks_crtime = gethrtime(); + kstat_set_string(ksp->ks_module, ks_module); + ksp->ks_instance = ks_instance; + kstat_set_string(ksp->ks_name, ks_name); + ksp->ks_type = ks_type; + kstat_set_string(ksp->ks_class, ks_class); + ksp->ks_flags = ks_flags | KSTAT_FLAG_INVALID; + ksp->ks_ndata = ks_ndata; + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_lock = 0; + + /* + * Initialise the sysctl that represents this kstat + */ + e->e_children.slh_first = 0; + + e->e_oid.oid_parent = get_kstat_parent(&sysctl__kstat_children, + ksp->ks_module, ksp->ks_class); + e->e_oid.oid_link.sle_next = 0; + e->e_oid.oid_number = OID_AUTO; + e->e_oid.oid_arg2 = 0; + e->e_oid.oid_name = ksp->ks_name; + e->e_oid.oid_descr = ""; + e->e_oid.oid_version = SYSCTL_OID_VERSION; + e->e_oid.oid_refcnt = 0; + e->e_oid.oid_handler = 0; + e->e_oid.oid_kind = CTLTYPE_NODE|CTLFLAG_RW|CTLFLAG_OID2; + e->e_oid.oid_fmt = "N"; + e->e_oid.oid_arg1 = (void*)(&e->e_children); + + /* If VIRTUAL we allocate memory to store data */ + if (ks_flags & KSTAT_FLAG_VIRTUAL) + ksp->ks_data = NULL; + else + ksp->ks_data = (void *)kmem_zalloc( + sizeof (kstat_named_t) * ks_ndata, KM_SLEEP); + + + sysctl_register_oid(&e->e_oid); + + return (ksp); +} + +void +kstat_install(kstat_t *ksp) +{ + ekstat_t *e = (ekstat_t *)ksp; + kstat_named_t *named_base = 0; + sysctl_leaf_t *vals_base = 0; + sysctl_leaf_t *params = 0; + int oid_permissions = CTLFLAG_RD; + + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + + if (ksp->ks_flags & KSTAT_FLAG_WRITABLE) { + oid_permissions |= CTLFLAG_RW; + } + + // Create the leaf node OID objects + e->e_vals = (sysctl_leaf_t *)kalloc(ksp->ks_ndata * + sizeof (sysctl_leaf_t)); + bzero(e->e_vals, ksp->ks_ndata * sizeof (sysctl_leaf_t)); + e->e_num_vals = ksp->ks_ndata; + + named_base = (kstat_named_t *)(ksp->ks_data); + vals_base = e->e_vals; + + for (int i = 0; i < ksp->ks_ndata; i++) { + int oid_valid = 1; + + kstat_named_t *named = &named_base[i]; + sysctl_leaf_t *val = &vals_base[i]; + + // Perform basic initialisation of the sysctl. + // + // The sysctl: kstat.... + snprintf(val->l_name, KSTAT_STRLEN, "%s", named->name); + + val->l_oid.oid_parent = &e->e_children; + val->l_oid.oid_link.sle_next = 0; + val->l_oid.oid_number = OID_AUTO; + val->l_oid.oid_arg2 = 0; + val->l_oid.oid_name = val->l_name; + val->l_oid.oid_descr = ""; + val->l_oid.oid_version = SYSCTL_OID_VERSION; + val->l_oid.oid_refcnt = 0; + + // Based on the kstat type flags, provide location + // of data item and associated type and handler + // flags to the sysctl. 
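+ // The 64-bit and string types need the kstat's ks_update hook, so
+ // each gets its own kalloc'd sysctl_leaf_t and a kstat_handle_*
+ // handler; the 32-bit and long types simply point the stock
+ // sysctl_handle_int/long handlers at the value in place.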
+ switch (named->data_type) { + case KSTAT_DATA_INT64: + params = (sysctl_leaf_t *)kalloc( + sizeof (sysctl_leaf_t)); + params->l_named = named; + params->l_ksp = ksp; + + val->l_oid.oid_handler = + kstat_handle_i64; + val->l_oid.oid_kind = CTLTYPE_QUAD | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "Q"; + val->l_oid.oid_arg1 = (void*)params; + params = 0; + break; + case KSTAT_DATA_UINT64: + params = (sysctl_leaf_t *)kalloc( + sizeof (sysctl_leaf_t)); + params->l_named = named; + params->l_ksp = ksp; + + val->l_oid.oid_handler = + kstat_handle_ui64; + val->l_oid.oid_kind = CTLTYPE_QUAD | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "Q"; + val->l_oid.oid_arg1 = (void*)params; + break; + case KSTAT_DATA_INT32: + val->l_oid.oid_handler = + sysctl_handle_int; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "I"; + val->l_oid.oid_arg1 = &named->value.i32; + break; + case KSTAT_DATA_UINT32: + val->l_oid.oid_handler = + sysctl_handle_int; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "IU"; + val->l_oid.oid_arg1 = + &named->value.ui32; + break; + case KSTAT_DATA_LONG: + val->l_oid.oid_handler = + sysctl_handle_long; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "L"; + val->l_oid.oid_arg1 = &named->value.l; + break; + case KSTAT_DATA_ULONG: + val->l_oid.oid_handler = + sysctl_handle_long; + val->l_oid.oid_kind = CTLTYPE_INT | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "L"; + val->l_oid.oid_arg1 = &named->value.ul; + break; + case KSTAT_DATA_STRING: + params = (sysctl_leaf_t *)kalloc( + sizeof (sysctl_leaf_t)); + params->l_named = named; + params->l_ksp = ksp; + val->l_oid.oid_handler = + kstat_handle_string; + val->l_oid.oid_kind = CTLTYPE_STRING | + oid_permissions | CTLFLAG_OID2; + val->l_oid.oid_fmt = "S"; + val->l_oid.oid_arg1 = (void*)params; + break; + + case KSTAT_DATA_CHAR: + default: + oid_valid = 0; + break; + } + + /* + * Finally publish the OID, provided that there were + * no issues initialising it. 
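+ * KSTAT_DATA_CHAR and any unrecognised data type leave oid_valid
+ * at 0, so those fields are skipped rather than registered.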
+ */ + if (oid_valid) { + sysctl_register_oid(&val->l_oid); + val->l_oid_registered = 1; + } else { + val->l_oid_registered = 0; + } + } + } + + ksp->ks_flags &= ~KSTAT_FLAG_INVALID; +} + +static void +remove_child_sysctls(ekstat_t *e) +{ + kstat_t *ksp = &e->e_ks; + kstat_named_t *named_base = (kstat_named_t *)(ksp->ks_data); + sysctl_leaf_t *vals_base = e->e_vals; + + for (int i = 0; i < ksp->ks_ndata; i++) { + if (vals_base[i].l_oid_registered) { + sysctl_unregister_oid(&vals_base[i].l_oid); + vals_base[i].l_oid_registered = 0; + } + + if (named_base[i].data_type == KSTAT_DATA_INT64 || + named_base[i].data_type == KSTAT_DATA_UINT64 || + named_base[i].data_type == KSTAT_DATA_STRING) { + + sysctl_leaf_t *leaf = (sysctl_leaf_t *) + vals_base[i].l_oid.oid_arg1; + kfree(leaf, sizeof (sysctl_leaf_t)); + } + } +} + +void +kstat_delete(kstat_t *ksp) +{ + ekstat_t *e = (ekstat_t *)ksp; + kmutex_t *lock = ksp->ks_lock; + int lock_needs_release = 0; + + // destroy the sysctl + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + + if (lock && MUTEX_NOT_HELD(lock)) { + mutex_enter(lock); + lock_needs_release = 1; + } + + remove_child_sysctls(e); + + if (lock_needs_release) { + mutex_exit(lock); + } + } + + sysctl_unregister_oid(&e->e_oid); + + if (e->e_vals) { + kfree(e->e_vals, sizeof (sysctl_leaf_t) * e->e_num_vals); + } + + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, sizeof (kstat_named_t) * ksp->ks_ndata); + + cv_destroy(&e->e_cv); + kfree(e, e->e_size); +} + +void +kstat_named_setstr(kstat_named_t *knp, const char *src) +{ + if (knp->data_type != KSTAT_DATA_STRING) + panic("kstat_named_setstr('%p', '%p'): " + "named kstat is not of type KSTAT_DATA_STRING", + (void *)knp, (void *)src); + + KSTAT_NAMED_STR_PTR(knp) = (char *)src; + if (src != NULL) + KSTAT_NAMED_STR_BUFLEN(knp) = strlen(src) + 1; + else + KSTAT_NAMED_STR_BUFLEN(knp) = 0; +} + +void +kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type) +{ + kstat_set_string(knp->name, name); + knp->data_type = data_type; + + if (data_type == KSTAT_DATA_STRING) + kstat_named_setstr(knp, NULL); +} + + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ +} + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ +} + +void +kstat_runq_enter(kstat_io_t *kiop) +{ +} + +void +kstat_runq_exit(kstat_io_t *kiop) +{ +} + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ +} + +void +spl_kstat_init() +{ + /* + * Create the kstat root OID + */ + sysctl_register_oid(&sysctl__kstat); +} + +void +spl_kstat_fini() +{ + /* + * Destroy the kstat module/class/name tree + * + * Done in two passes, first unregisters all + * of the oids, second releases all the memory. + */ + + sysctl_tree_node_t *iter = tree_nodes; + while (iter) { + sysctl_tree_node_t *tn = iter; + iter = tn->tn_next; + sysctl_unregister_oid(&tn->tn_oid); + } + + while (tree_nodes) { + sysctl_tree_node_t *tn = tree_nodes; + tree_nodes = tn->tn_next; + kfree(tn, sizeof (sysctl_tree_node_t)); + } + + /* + * Destroy the root oid + */ + sysctl_unregister_oid(&sysctl__kstat); +} diff --git a/module/os/macos/spl/spl-list.c b/module/os/macos/spl/spl-list.c new file mode 100644 index 0000000000..ede7a29c42 --- /dev/null +++ b/module/os/macos/spl/spl-list.c @@ -0,0 +1,197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include + + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = node; \ + lnew->list_next = node->list_next; \ + node->list_next->list_prev = lnew; \ + node->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = node; \ + lnew->list_prev = node->list_prev; \ + node->list_prev->list_next = lnew; \ + node->list_prev = lnew; \ +} + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + lold->list_prev->list_next = lold->list_next; + lold->list_next->list_prev = lold->list_prev; + lold->list_next = lold->list_prev = NULL; +} + + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t 
*node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. + */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/module/os/macos/spl/spl-mutex.c b/module/os/macos/spl/spl-mutex.c new file mode 100644 index 0000000000..b7b47f42c4 --- /dev/null +++ b/module/os/macos/spl/spl-mutex.c @@ -0,0 +1,415 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013,2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Not defined in headers +extern boolean_t lck_mtx_try_lock(lck_mtx_t *lck); + + +static lck_attr_t *zfs_lock_attr = NULL; +static lck_grp_attr_t *zfs_group_attr = NULL; + +static lck_grp_t *zfs_mutex_group = NULL; + +uint64_t zfs_active_mutex = 0; + +#ifdef SPL_DEBUG_MUTEX +#include +static list_t mutex_list; +static kmutex_t mutex_list_mutex; + + +struct leak { + list_node_t mutex_leak_node; + +#define SPL_DEBUG_MUTEX_MAXCHAR 32 + char location_file[SPL_DEBUG_MUTEX_MAXCHAR]; + char location_function[SPL_DEBUG_MUTEX_MAXCHAR]; + uint64_t location_line; + void *mp; + + uint64_t wdlist_locktime; // time lock was taken + char wdlist_file[32]; // storing holder + uint64_t wdlist_line; +}; + +static int wdlist_exit = 0; + +void +spl_wdlist_settime(void *mpleak, uint64_t value) +{ + struct leak *leak = (struct leak *)mpleak; + if (!leak) + return; + leak->wdlist_locktime = value; +} + +inline static void +spl_wdlist_check(void *ignored) +{ + struct leak *mp; + printf("SPL: Mutex watchdog is alive\n"); + + while (!wdlist_exit) { + delay(hz * SPL_MUTEX_WATCHDOG_SLEEP); + uint64_t noe = gethrestime_sec(); + lck_mtx_lock((lck_mtx_t *)&mutex_list_mutex.m_lock); + for (mp = list_head(&mutex_list); + mp; + mp = list_next(&mutex_list, mp)) { + uint64_t locktime = mp->wdlist_locktime; + if ((locktime > 0) && (noe > locktime) && + noe - locktime >= SPL_MUTEX_WATCHDOG_TIMEOUT) { + printf("SPL: mutex 
(%p) held for %llus by " + "'%s':%llu\n", mp, noe - + mp->wdlist_locktime, mp->wdlist_file, + mp->wdlist_line); + } // if old + } // for all + lck_mtx_unlock((lck_mtx_t *)&mutex_list_mutex.m_lock); + } // while not exit + + printf("SPL: watchdog thread exit\n"); + wdlist_exit = 2; + thread_exit(); +} +#endif + + +int +spl_mutex_subsystem_init(void) +{ + zfs_lock_attr = lck_attr_alloc_init(); + zfs_group_attr = lck_grp_attr_alloc_init(); + zfs_mutex_group = lck_grp_alloc_init("zfs-mutex", zfs_group_attr); + +#ifdef SPL_DEBUG_MUTEX + { + unsigned char mutex[128]; + int i; + + memset(mutex, 0xAF, sizeof (mutex)); + lck_mtx_init((lck_mtx_t *)&mutex[0], zfs_mutex_group, + zfs_lock_attr); + for (i = sizeof (mutex) -1; i >= 0; i--) + if (mutex[i] != 0xAF) + break; + + printf("SPL: mutex size is %u\n", i+1); + + } + + list_create(&mutex_list, sizeof (struct leak), + offsetof(struct leak, mutex_leak_node)); + lck_mtx_init((lck_mtx_t *)&mutex_list_mutex.m_lock, zfs_mutex_group, + zfs_lock_attr); + mutex_list_mutex.m_initialised = MUTEX_INIT; + + (void) thread_create(NULL, 0, spl_wdlist_check, 0, 0, 0, 0, 92); +#endif + return (0); +} + + + +void +spl_mutex_subsystem_fini(void) +{ +#ifdef SPL_DEBUG_MUTEX + uint64_t total = 0; + printf("Dumping leaked mutex allocations...\n"); + + wdlist_exit = 1; + + mutex_enter(&mutex_list_mutex); + while (1) { + struct leak *leak, *runner; + uint32_t found; + + leak = list_head(&mutex_list); + + if (leak) { + list_remove(&mutex_list, leak); + } + if (!leak) + break; + + // Run through list and count up how many times this leak is + // found, removing entries as we go. + for (found = 1, runner = list_head(&mutex_list); + runner; + runner = runner ? list_next(&mutex_list, runner) : + list_head(&mutex_list)) { + + if (strcmp(leak->location_file, runner->location_file) + == 0 && strcmp(leak->location_function, + runner->location_function) == 0 && + leak->location_line == runner->location_line) { + // Same place + found++; + list_remove(&mutex_list, runner); + FREE(runner, M_TEMP); + runner = NULL; + } // if same + + } // for all nodes + + printf(" mutex %p : %s %s %llu : # leaks: %u\n", + leak->mp, + leak->location_file, + leak->location_function, + leak->location_line, + found); + + FREE(leak, M_TEMP); + total += found; + + } + mutex_exit(&mutex_list_mutex); + printf("Dumped %llu leaked allocations. 
Wait for watchdog " + "to exit..\n", total); + + while (wdlist_exit != 2) + delay(hz>>4); + + lck_mtx_destroy((lck_mtx_t *)&mutex_list_mutex.m_lock, zfs_mutex_group); + list_destroy(&mutex_list); +#endif + + lck_attr_free(zfs_lock_attr); + zfs_lock_attr = NULL; + + lck_grp_attr_free(zfs_group_attr); + zfs_group_attr = NULL; + + lck_grp_free(zfs_mutex_group); + zfs_mutex_group = NULL; +} + + + +#ifdef SPL_DEBUG_MUTEX +void +spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc, + const char *file, const char *fn, int line) +#else +void +spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc) +#endif +{ + ASSERT(type != MUTEX_SPIN); + ASSERT(ibc == NULL); + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, !=, MUTEX_INIT); +#endif + + lck_mtx_init((lck_mtx_t *)&mp->m_lock, zfs_mutex_group, zfs_lock_attr); + mp->m_owner = NULL; + + atomic_inc_64(&zfs_active_mutex); + +#ifdef SPL_DEBUG_MUTEX + mp->m_initialised = MUTEX_INIT; + + struct leak *leak; + + MALLOC(leak, struct leak *, + sizeof (struct leak), M_TEMP, M_WAITOK); + + if (leak) { + bzero(leak, sizeof (struct leak)); + strlcpy(leak->location_file, file, SPL_DEBUG_MUTEX_MAXCHAR); + strlcpy(leak->location_function, fn, SPL_DEBUG_MUTEX_MAXCHAR); + leak->location_line = line; + leak->mp = mp; + + mutex_enter(&mutex_list_mutex); + list_link_init(&leak->mutex_leak_node); + list_insert_tail(&mutex_list, leak); + mp->leak = leak; + mutex_exit(&mutex_list_mutex); + } + leak->wdlist_locktime = 0; + leak->wdlist_file[0] = 0; + leak->wdlist_line = 0; +#endif +} + +void +spl_mutex_destroy(kmutex_t *mp) +{ + if (!mp) + return; + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + + if (mp->m_owner != 0) + panic("SPL: releasing held mutex"); + + lck_mtx_destroy((lck_mtx_t *)&mp->m_lock, zfs_mutex_group); + + atomic_dec_64(&zfs_active_mutex); + +#ifdef SPL_DEBUG_MUTEX + mp->m_initialised = MUTEX_DESTROYED; + + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + mutex_enter(&mutex_list_mutex); + list_remove(&mutex_list, leak); + mp->leak = NULL; + mutex_exit(&mutex_list_mutex); + FREE(leak, M_TEMP); + } +#endif +} + + + +#ifdef SPL_DEBUG_MUTEX +void +spl_mutex_enter(kmutex_t *mp, char *file, int line) +#else +void +spl_mutex_enter(kmutex_t *mp) +#endif +{ +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + + if (mp->m_owner == current_thread()) + panic("mutex_enter: locking against myself!"); + +#ifdef DEBUG + if (*((uint64_t *)mp) == 0xdeadbeefdeadbeef) { + panic("SPL: mutex_enter"); + } +#endif + + lck_mtx_lock((lck_mtx_t *)&mp->m_lock); + mp->m_owner = current_thread(); + +#ifdef SPL_DEBUG_MUTEX + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + leak->wdlist_locktime = gethrestime_sec(); + strlcpy(leak->wdlist_file, file, sizeof (leak->wdlist_file)); + leak->wdlist_line = line; + } +#endif + +} + +void +spl_mutex_exit(kmutex_t *mp) +{ +#ifdef DEBUG + if (*((uint64_t *)mp) == 0xdeadbeefdeadbeef) { + panic("SPL: mutex_exit"); + } +#endif + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + +#ifdef SPL_DEBUG_MUTEX + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + uint64_t locktime = leak->wdlist_locktime; + uint64_t noe = gethrestime_sec(); + if ((locktime > 0) && (noe > locktime) && + noe - locktime >= SPL_MUTEX_WATCHDOG_TIMEOUT) { + printf("SPL: mutex (%p) finally released after %llus " + "by '%s':%llu\n", leak, noe - leak->wdlist_locktime, + leak->wdlist_file, leak->wdlist_line); + } 
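+ // Clear the watchdog bookkeeping so spl_wdlist_check() no longer
+ // treats this mutex as held.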
+ leak->wdlist_locktime = 0; + leak->wdlist_file[0] = 0; + leak->wdlist_line = 0; + } +#endif + mp->m_owner = NULL; + lck_mtx_unlock((lck_mtx_t *)&mp->m_lock); +} + + +int +spl_mutex_tryenter(kmutex_t *mp) +{ + int held; + +#ifdef SPL_DEBUG_MUTEX + VERIFY3U(mp->m_initialised, ==, MUTEX_INIT); +#endif + + if (mp->m_owner == current_thread()) + panic("mutex_tryenter: locking against myself!"); + + held = lck_mtx_try_lock((lck_mtx_t *)&mp->m_lock); + if (held) { + mp->m_owner = current_thread(); + +#ifdef SPL_DEBUG_MUTEX + if (mp->leak) { + struct leak *leak = (struct leak *)mp->leak; + leak->wdlist_locktime = gethrestime_sec(); + strlcpy(leak->wdlist_file, "tryenter", + sizeof (leak->wdlist_file)); + leak->wdlist_line = 123; + } +#endif + + } + return (held); +} + +int +spl_mutex_owned(kmutex_t *mp) +{ + return (mp->m_owner == current_thread()); +} + +struct kthread * +spl_mutex_owner(kmutex_t *mp) +{ + return (mp->m_owner); +} diff --git a/module/os/macos/spl/spl-osx.c b/module/os/macos/spl/spl-osx.c new file mode 100644 index 0000000000..b65cb66e4d --- /dev/null +++ b/module/os/macos/spl/spl-osx.c @@ -0,0 +1,488 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _task_user_ +#include + +#include +#include + +extern int system_inshutdown; + +static utsname_t utsname_static = { { 0 } }; + +unsigned int max_ncpus = 0; +uint64_t total_memory = 0; +uint64_t real_total_memory = 0; + +// Size in bytes of the memory allocated in seg_kmem +extern uint64_t segkmem_total_mem_allocated; + +extern char hostname[MAXHOSTNAMELEN]; + +utsname_t * +utsname(void) +{ + return (&utsname_static); +} + +/* + * Solaris delay is in ticks (hz) and Darwin uses microsecs + * 1 HZ is 10 milliseconds + */ +void +osx_delay(int ticks) +{ + if (ticks < 2) { + // IODelay spins and takes microseconds as an argument + // don't spend more than 10msec spinning. 
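+ // e.g. ticks == 1 -> IODelay(10000), i.e. a 10 msec busy-wait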
+ IODelay(ticks * 10000); + return; + } + + // ticks are 10 msec units + int64_t ticks_to_go = (int64_t)ticks * 10LL; + // zfs_lbolt() is in 10 mec units + int64_t start_tick = (int64_t)zfs_lbolt(); + int64_t end_tick = start_tick + (int64_t)ticks_to_go; + + do { + IOSleep(ticks_to_go); + int64_t cur_tick = (int64_t)zfs_lbolt(); + ticks_to_go = (end_tick - cur_tick); + } while (ticks_to_go > 0); + +} + + +uint32_t +zone_get_hostid(void *zone) +{ + size_t len; + uint32_t myhostid = 0; + + len = sizeof (myhostid); + sysctlbyname("kern.hostid", &myhostid, &len, NULL, 0); + return (myhostid); +} + +extern void *(*__ihook_malloc)(size_t size); +extern void (*__ihook_free)(void *); + +const char * +spl_panicstr(void) +{ + return (NULL); +} + +int +spl_system_inshutdown(void) +{ + return (system_inshutdown); +} + +#include +typedef struct mach_header_64 kernel_mach_header_t; +#include +typedef struct nlist_64 kernel_nlist_t; + +typedef struct segment_command_64 kernel_segment_command_t; + +typedef struct _loaded_kext_summary { + char name[KMOD_MAX_NAME]; + uuid_t uuid; + uint64_t address; + uint64_t size; + uint64_t version; + uint32_t loadTag; + uint32_t flags; + uint64_t reference_list; +} OSKextLoadedKextSummary; + +typedef struct _loaded_kext_summary_header { + uint32_t version; + uint32_t entry_size; + uint32_t numSummaries; + uint32_t reserved; /* explicit alignment for gdb */ + OSKextLoadedKextSummary summaries[0]; +} OSKextLoadedKextSummaryHeader; + +extern OSKextLoadedKextSummaryHeader * gLoadedKextSummaries; + +typedef struct _cframe_t { + struct _cframe_t *prev; + uintptr_t caller; +#if PRINT_ARGS_FROM_STACK_FRAME + unsigned args[0]; +#endif +} cframe_t; + +extern kernel_mach_header_t _mh_execute_header; + +extern kmod_info_t *kmod; /* the list of modules */ + +extern addr64_t kvtophys(vm_offset_t va); + +static int +panic_print_macho_symbol_name(kernel_mach_header_t *mh, vm_address_t search, + const char *module_name) +{ + kernel_nlist_t *sym = NULL; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + unsigned int i; + char *strings, *bestsym = NULL; + vm_address_t bestaddr = 0, diff, curdiff; + + /* + * Assume that if it's loaded and linked into the kernel, + * it's a valid Mach-O + */ + cmd = (struct load_command *)&mh[1]; + for (i = 0; i < mh->ncmds; i++) { + if (cmd->cmd == LC_SEGMENT_64) { + kernel_segment_command_t *orig_sg = + (kernel_segment_command_t *)cmd; + + if (strncmp(SEG_TEXT, orig_sg->segname, + sizeof (orig_sg->segname)) == 0) + orig_ts = orig_sg; + else if (strncmp(SEG_LINKEDIT, orig_sg->segname, + sizeof (orig_sg->segname)) == 0) + orig_le = orig_sg; + /* pre-Lion i386 kexts have a single unnamed segment */ + else if (strncmp("", orig_sg->segname, + sizeof (orig_sg->segname)) == 0) + orig_ts = orig_sg; + } else if (cmd->cmd == LC_SYMTAB) + orig_st = (struct symtab_command *)cmd; + + cmd = (struct load_command *)((uintptr_t)cmd + cmd->cmdsize); + } + + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) + return (0); + + if ((search < orig_ts->vmaddr) || + (search >= orig_ts->vmaddr + orig_ts->vmsize)) { + /* search out of range for this mach header */ + return (0); + } + + sym = (kernel_nlist_t *)(uintptr_t)(orig_le->vmaddr + + orig_st->symoff - orig_le->fileoff); + strings = (char *)(uintptr_t)(orig_le->vmaddr + + orig_st->stroff - orig_le->fileoff); + diff = search; + + for (i = 0; i < orig_st->nsyms; i++) { + if (sym[i].n_type & N_STAB) continue; + + if (sym[i].n_value <= 
search) { + curdiff = search - (vm_address_t)sym[i].n_value; + if (curdiff < diff) { + diff = curdiff; + bestaddr = sym[i].n_value; + bestsym = strings + sym[i].n_un.n_strx; + } + } + } + + if (bestsym != NULL) { + if (diff != 0) { + printf("%s : %s + 0x%lx", module_name, bestsym, + (unsigned long)diff); + } else { + printf("%s : %s", module_name, bestsym); + } + return (1); + } + return (0); +} + + +static void +panic_print_kmod_symbol_name(vm_address_t search) +{ + uint_t i; + + if (gLoadedKextSummaries == NULL) + return; + for (i = 0; i < gLoadedKextSummaries->numSummaries; ++i) { + OSKextLoadedKextSummary *summary = + gLoadedKextSummaries->summaries + i; + + if ((search >= summary->address) && + (search < (summary->address + summary->size))) { + kernel_mach_header_t *header = + (kernel_mach_header_t *)(uintptr_t)summary->address; + if (panic_print_macho_symbol_name(header, search, + summary->name) == 0) { + printf("%s + %llu", summary->name, + (unsigned long)search - summary->address); + } + break; + } + } +} + + +static void +panic_print_symbol_name(vm_address_t search) +{ + /* try searching in the kernel */ + if (panic_print_macho_symbol_name(&_mh_execute_header, + search, "mach_kernel") == 0) { + /* that failed, now try to search for the right kext */ + panic_print_kmod_symbol_name(search); + } +} + + +void +spl_backtrace(char *thesignal) +{ + void *stackptr; + + printf("SPL: backtrace \"%s\"\n", thesignal); + +#if defined(__i386__) + __asm__ volatile("movl %%ebp, %0" : "=m" (stackptr)); +#elif defined(__x86_64__) + __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); +#endif + + int frame_index; + int nframes = 16; + cframe_t *frame = (cframe_t *)stackptr; + + for (frame_index = 0; frame_index < nframes; frame_index++) { + vm_offset_t curframep = (vm_offset_t)frame; + if (!curframep) + break; + if (curframep & 0x3) { + printf("SPL: Unaligned frame\n"); + break; + } + if (!kvtophys(curframep) || + !kvtophys(curframep + sizeof (cframe_t) - 1)) { + printf("SPL: No mapping exists for frame pointer\n"); + break; + } + printf("SPL: %p : 0x%lx ", frame, frame->caller); + panic_print_symbol_name((vm_address_t)frame->caller); + printf("\n"); + frame = frame->prev; + } +} + +int +getpcstack(uintptr_t *pcstack, int pcstack_limit) +{ + int depth = 0; + void *stackptr; + +#if defined(__i386__) + __asm__ volatile("movl %%ebp, %0" : "=m" (stackptr)); +#elif defined(__x86_64__) + __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); +#endif + + int frame_index; + int nframes = pcstack_limit; + cframe_t *frame = (cframe_t *)stackptr; + + for (frame_index = 0; frame_index < nframes; frame_index++) { + vm_offset_t curframep = (vm_offset_t)frame; + if (!curframep) + break; + if (curframep & 0x3) { + break; + } + if (!kvtophys(curframep) || + !kvtophys(curframep + sizeof (cframe_t) - 1)) { + break; + } + pcstack[depth++] = frame->caller; + frame = frame->prev; + } + + return (depth); +} + +void +print_symbol(uintptr_t symbol) +{ + printf("SPL: "); + panic_print_symbol_name((vm_address_t)(symbol)); + printf("\n"); +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + int ret = 0; + + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) + bcopy(from, to, len); + else + ret = copyin((user_addr_t)from, (void *)to, len); + + return (ret); +} + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + int ret = 0; + + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + bcopy(from, to, len); + } 
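+ /* Not a kernel-internal ioctl: 'to' is a user address, hand it to copyout(). */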
else { + ret = copyout(from, (user_addr_t)to, len); + } + + return (ret); +} + +/* + * Technically, this call does not exist in illumos, but we use it for + * consistency. + */ +int +ddi_copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) +{ + int ret; + size_t local_done; + +#undef copyinstr + ret = copyinstr((user_addr_t)uaddr, kaddr, len, &local_done); + if (done != NULL) + *done = local_done; + return (ret); +} + +kern_return_t +spl_start(kmod_info_t *ki, void *d) +{ + printf("SPL: loading\n"); + + int ncpus; + size_t len = sizeof (ncpus); + + /* + * Boot load time is excessively early, so we have to wait + * until certain subsystems are available. Surely there is + * a more elegant way to do this wait? + */ + + while (current_proc() == NULL) { + printf("SPL: waiting for kernel init...\n"); + delay(hz>>1); + } + + while (1) { + len = sizeof (total_memory); + sysctlbyname("hw.memsize", &total_memory, &len, NULL, 0); + if (total_memory != 0) break; + + printf("SPL: waiting for sysctl...\n"); + delay(hz>>1); + } + + sysctlbyname("hw.logicalcpu_max", &max_ncpus, &len, NULL, 0); + if (!max_ncpus) max_ncpus = 1; + + /* + * Setting the total memory to physmem * 50% here, since kmem is + * not in charge of all memory and we need to leave some room for + * the OS X allocator. We internally add pressure if we step over it + */ + real_total_memory = total_memory; + total_memory = total_memory * 50ULL / 100ULL; + physmem = total_memory / PAGE_SIZE; + + len = sizeof (utsname_static.sysname); + sysctlbyname("kern.ostype", &utsname_static.sysname, &len, NULL, 0); + + /* + * For some reason, (CTLFLAG_KERN is not set) looking up hostname + * returns 1. So we set it to uuid just to give it *something*. + * As it happens, ZFS sets the nodename on init. + */ + len = sizeof (utsname_static.nodename); + sysctlbyname("kern.uuid", &utsname_static.nodename, &len, NULL, 0); + + len = sizeof (utsname_static.release); + sysctlbyname("kern.osrelease", &utsname_static.release, &len, NULL, 0); + + len = sizeof (utsname_static.version); + sysctlbyname("kern.version", &utsname_static.version, &len, NULL, 0); + + strlcpy(utsname_static.nodename, hostname, + sizeof (utsname_static.nodename)); + + spl_mutex_subsystem_init(); + spl_kmem_init(total_memory); + spl_vnode_init(); + spl_kmem_thread_init(); + spl_kmem_mp_init(); + + return (KERN_SUCCESS); +} + +kern_return_t +spl_stop(kmod_info_t *ki, void *d) +{ + spl_kmem_thread_fini(); + spl_vnode_fini(); + spl_taskq_fini(); + spl_rwlock_fini(); + spl_tsd_fini(); + spl_kmem_fini(); + spl_kstat_fini(); + spl_mutex_subsystem_fini(); + + return (KERN_SUCCESS); +} diff --git a/module/os/macos/spl/spl-policy.c b/module/os/macos/spl/spl-policy.c new file mode 100644 index 0000000000..1b63dd9b57 --- /dev/null +++ b/module/os/macos/spl/spl-policy.c @@ -0,0 +1,178 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include + +int +spl_priv_check_cred(kauth_cred_t cred, int priv, __unused int flags) +{ + int error; + + if (kauth_cred_getuid(cred) == 0) { + error = 0; + goto out; + } + + /* + * The default is deny, so if no policies have granted it, reject + * with a privilege error here. + */ + error = EPERM; +out: + return (error); +} + +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_UNMOUNT, 0)); +} + +int +secpolicy_nfs(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_NFS_DAEMON, 0)); +} + +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_ZFS_POOL_CONFIG, 0)); +} + +int +secpolicy_zfs(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_MOUNT, 0)); +} + +int +secpolicy_zinject(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_ZFS_INJECT, 0)); +} + +int +secpolicy_vnode_any_access(const cred_t *cr, vnode_t *vp, uid_t owner) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_access2(const cred_t *cr, vnode_t *vp, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, int flags, + int unlocked_access(void *, int, cred_t *), + void *node) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_stky_modify(const cred_t *cred) +{ + return (EPERM); +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, vattr_t *vap, const vattr_t *ovap, + cred_t *cr) +{ + // FIXME + return (0); +} + +int +secpolicy_vnode_remove(struct vnode *vp, const cred_t *cr) +{ + return (0); +} + +int +secpolicy_vnode_create_gid(const cred_t *cred) +{ + return (0); +} + +int +secpolicy_vnode_setids_setgids(struct vnode *vp, const cred_t *cr, + gid_t gid) +{ + return (0); +} + +int +secpolicy_vnode_setdac(struct vnode *vp, const cred_t *cr, uid_t u) +{ + return (0); +} + +int +secpolicy_vnode_chown(struct vnode *vp, const cred_t *cr, uid_t u) +{ + return (0); +} + +int +secpolicy_vnode_setid_retain(const cred_t *cr, int fal) +{ + return (0); +} + +int +secpolicy_xvattr(vattr_t *vap, uid_t uid, const cred_t *cr, mode_t mod) +{ + return (0); +} + +int +secpolicy_setid_clear(vattr_t *vap, const cred_t *cr) +{ + return (0); +} + +int +secpolicy_basic_link(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_fs_mount_clearopts(const cred_t *cr, struct mount *mp) +{ + return (0); +} + +int +secpolicy_fs_mount(const cred_t *cr, struct vnode *vp, struct mount *mp) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_MOUNT, 0)); +} diff --git a/module/os/macos/spl/spl-proc.c b/module/os/macos/spl/spl-proc.c new file mode 100644 index 0000000000..9c90559f0f --- /dev/null +++ b/module/os/macos/spl/spl-proc.c @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +struct proc { + void *nothing; +}; + +struct proc p0 = {0}; diff --git a/module/os/macos/spl/spl-proc_list.c b/module/os/macos/spl/spl-proc_list.c new file mode 100644 index 0000000000..4228f31339 --- /dev/null +++ b/module/os/macos/spl/spl-proc_list.c @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +void +seq_printf(struct seq_file *m, const char *fmt, ...) +{ +} + +void +procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{ +} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} diff --git a/module/os/macos/spl/spl-processor.c b/module/os/macos/spl/spl-processor.c new file mode 100644 index 0000000000..6587aa7aee --- /dev/null +++ b/module/os/macos/spl/spl-processor.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include + +extern int cpu_number(void); + +uint32_t +getcpuid() +{ + return ((uint32_t)cpu_number()); +} + +uint64_t +spl_cpuid_features(void) +{ + i386_cpu_info_t *info; + + info = cpuid_info(); + return (info->cpuid_features); +} + +uint64_t +spl_cpuid_leaf7_features(void) +{ + i386_cpu_info_t *info; + + info = cpuid_info(); + return (info->cpuid_leaf7_features); +} diff --git a/module/os/macos/spl/spl-rwlock.c b/module/os/macos/spl/spl-rwlock.c new file mode 100644 index 0000000000..22fc260080 --- /dev/null +++ b/module/os/macos/spl/spl-rwlock.c @@ -0,0 +1,397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +static lck_attr_t *zfs_rwlock_attr = NULL; +static lck_grp_attr_t *zfs_rwlock_group_attr = NULL; +static lck_grp_t *zfs_rwlock_group = NULL; + +uint64_t zfs_active_rwlock = 0; + +#ifdef SPL_DEBUG_RWLOCK +#include +static list_t rwlock_list; +static kmutex_t rwlock_list_mutex; +struct leak { + list_node_t rwlock_leak_node; + +#define SPL_DEBUG_RWLOCK_MAXCHAR 32 + char location_file[SPL_DEBUG_RWLOCK_MAXCHAR]; + char location_function[SPL_DEBUG_RWLOCK_MAXCHAR]; + uint64_t location_line; + void *mp; + + uint64_t wdlist_locktime; // time lock was taken + char wdlist_file[32]; // storing holder + uint64_t wdlist_line; +}; + +#endif + +/* + * We run rwlock with DEBUG on for now, as it protects against + * uninitialised access etc, and almost no cost. 
+ */ +#ifndef DEBUG +#define DEBUG +#endif + +#ifdef DEBUG +int +rw_isinit(krwlock_t *rwlp) +{ + if (rwlp->rw_pad != 0x012345678) + return (0); + return (1); +} +#endif + + +#ifdef SPL_DEBUG_RWLOCK +void +rw_initx(krwlock_t *rwlp, char *name, krw_type_t type, __unused void *arg, + const char *file, const char *fn, int line) +#else +void +rw_init(krwlock_t *rwlp, char *name, krw_type_t type, __unused void *arg) +#endif +{ + ASSERT(type != RW_DRIVER); + +#ifdef DEBUG + VERIFY3U(rwlp->rw_pad, !=, 0x012345678); +#endif + + lck_rw_init((lck_rw_t *)&rwlp->rw_lock[0], + zfs_rwlock_group, zfs_rwlock_attr); + rwlp->rw_owner = NULL; + rwlp->rw_readers = 0; +#ifdef DEBUG + rwlp->rw_pad = 0x012345678; +#endif + atomic_inc_64(&zfs_active_rwlock); + +#ifdef SPL_DEBUG_RWLOCK + struct leak *leak; + + MALLOC(leak, struct leak *, + sizeof (struct leak), M_TEMP, M_WAITOK); + + if (leak) { + bzero(leak, sizeof (struct leak)); + strlcpy(leak->location_file, file, SPL_DEBUG_RWLOCK_MAXCHAR); + strlcpy(leak->location_function, fn, SPL_DEBUG_RWLOCK_MAXCHAR); + leak->location_line = line; + leak->mp = rwlp; + + mutex_enter(&rwlock_list_mutex); + list_link_init(&leak->rwlock_leak_node); + list_insert_tail(&rwlock_list, leak); + rwlp->leak = leak; + mutex_exit(&rwlock_list_mutex); + } + leak->wdlist_locktime = 0; + leak->wdlist_file[0] = 0; + leak->wdlist_line = 0; +#endif +} + +void +rw_destroy(krwlock_t *rwlp) +{ +#ifdef DEBUG + VERIFY3U(rwlp->rw_pad, ==, 0x012345678); +#endif + + lck_rw_destroy((lck_rw_t *)&rwlp->rw_lock[0], zfs_rwlock_group); +#ifdef DEBUG + rwlp->rw_pad = 0x99; +#endif + atomic_dec_64(&zfs_active_rwlock); + ASSERT(rwlp->rw_owner == NULL); + ASSERT(rwlp->rw_readers == 0); + +#ifdef SPL_DEBUG_RWLOCK + if (rwlp->leak) { + struct leak *leak = (struct leak *)rwlp->leak; + mutex_enter(&rwlock_list_mutex); + list_remove(&rwlock_list, leak); + rwlp->leak = NULL; + mutex_exit(&rwlock_list_mutex); + FREE(leak, M_TEMP); + } +#endif +} + +void +rw_enter(krwlock_t *rwlp, krw_t rw) +{ +#ifdef DEBUG + if (rwlp->rw_pad != 0x012345678) + panic("rwlock %p not initialised\n", rwlp); +#endif + + if (rw == RW_READER) { + lck_rw_lock_shared((lck_rw_t *)&rwlp->rw_lock[0]); + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); + ASSERT(rwlp->rw_owner == 0); + } else { + if (rwlp->rw_owner == current_thread()) + panic("rw_enter: locking against myself!"); + lck_rw_lock_exclusive((lck_rw_t *)&rwlp->rw_lock[0]); + ASSERT(rwlp->rw_owner == 0); + ASSERT(rwlp->rw_readers == 0); + rwlp->rw_owner = current_thread(); + } +} + +/* + * kernel private from osfmk/kern/locks.h + */ +extern boolean_t lck_rw_try_lock(lck_rw_t *lck, lck_rw_type_t lck_rw_type); + +int +rw_tryenter(krwlock_t *rwlp, krw_t rw) +{ + int held = 0; + +#ifdef DEBUG + if (rwlp->rw_pad != 0x012345678) + panic("rwlock %p not initialised\n", rwlp); +#endif + + if (rw == RW_READER) { + held = lck_rw_try_lock((lck_rw_t *)&rwlp->rw_lock[0], + LCK_RW_TYPE_SHARED); + if (held) + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); + } else { + if (rwlp->rw_owner == current_thread()) + panic("rw_tryenter: locking against myself!"); + held = lck_rw_try_lock((lck_rw_t *)&rwlp->rw_lock[0], + LCK_RW_TYPE_EXCLUSIVE); + if (held) + rwlp->rw_owner = current_thread(); + } + + return (held); +} + +/* + * It appears a difference between Darwin's + * lck_rw_lock_shared_to_exclusive() and Solaris's rw_tryupgrade() and + * FreeBSD's sx_try_upgrade() is that on failure to upgrade, the prior + * held shared/reader lock is lost on Darwin, but retained on + * Solaris/FreeBSD. 
We could re-acquire the lock in this situation, + * but it enters a possibility of blocking, when tryupgrade is meant + * to be non-blocking. + * Also note that XNU's lck_rw_lock_shared_to_exclusive() is always + * blocking (when waiting on readers), which means we can not use it. + */ +int +rw_tryupgrade(krwlock_t *rwlp) +{ + int held = 0; + + if (rwlp->rw_owner == current_thread()) + panic("rw_enter: locking against myself!"); + + /* More readers than us? give up */ + if (rwlp->rw_readers != 1) + return (0); + + /* + * It is ON. We need to drop our READER lock, and try to + * grab the WRITER as quickly as possible. + */ + atomic_dec_32((volatile uint32_t *)&rwlp->rw_readers); + lck_rw_unlock_shared((lck_rw_t *)&rwlp->rw_lock[0]); + + /* Grab the WRITER lock */ + held = lck_rw_try_lock((lck_rw_t *)&rwlp->rw_lock[0], + LCK_RW_TYPE_EXCLUSIVE); + + if (held) { + /* Looks like we won */ + rwlp->rw_owner = current_thread(); + ASSERT(rwlp->rw_readers == 0); + return (1); + } + + /* + * The worst has happened, we failed to grab WRITE lock, either + * due to another WRITER lock, or, some READER came along. + * IllumOS implementation returns with the READER lock again + * so we need to grab it. + */ + rw_enter(rwlp, RW_READER); + return (0); + +} + +void +rw_exit(krwlock_t *rwlp) +{ + if (rwlp->rw_owner == current_thread()) { + rwlp->rw_owner = NULL; + ASSERT(rwlp->rw_readers == 0); + lck_rw_unlock_exclusive((lck_rw_t *)&rwlp->rw_lock[0]); + } else { + atomic_dec_32((volatile uint32_t *)&rwlp->rw_readers); + ASSERT(rwlp->rw_owner == 0); + lck_rw_unlock_shared((lck_rw_t *)&rwlp->rw_lock[0]); + } +} + + +int +rw_lock_held(krwlock_t *rwlp) +{ + /* + * ### not sure about this one ### + */ + return (rwlp->rw_owner == current_thread() || rwlp->rw_readers > 0); +} + +int +rw_write_held(krwlock_t *rwlp) +{ + return (rwlp->rw_owner == current_thread()); +} + +void +rw_downgrade(krwlock_t *rwlp) +{ + if (rwlp->rw_owner != current_thread()) + panic("SPL: rw_downgrade not WRITE lock held\n"); + rwlp->rw_owner = NULL; + lck_rw_lock_exclusive_to_shared((lck_rw_t *)&rwlp->rw_lock[0]); + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); +} + +int +spl_rwlock_init(void) +{ + zfs_rwlock_attr = lck_attr_alloc_init(); + zfs_rwlock_group_attr = lck_grp_attr_alloc_init(); + zfs_rwlock_group = lck_grp_alloc_init("zfs-rwlock", + zfs_rwlock_group_attr); + +#ifdef SPL_DEBUG_RWLOCK + list_create(&rwlock_list, sizeof (struct leak), + offsetof(struct leak, rwlock_leak_node)); + lck_mtx_init((lck_mtx_t *)&rwlock_list_mutex.m_lock, + zfs_rwlock_group, zfs_rwlock_attr); +#endif + + return (0); +} + +void +spl_rwlock_fini(void) +{ + +#ifdef SPL_DEBUG_RWLOCK + uint64_t total = 0; + printf("Dumping leaked rwlock allocations...\n"); + + mutex_enter(&rwlock_list_mutex); + while (1) { + struct leak *leak, *runner; + uint32_t found; + + leak = list_head(&rwlock_list); + + if (leak) { + list_remove(&rwlock_list, leak); + } + if (!leak) break; + + // Run through list and count up how many times this leak is + // found, removing entries as we go. + for (found = 1, runner = list_head(&rwlock_list); + runner; + runner = runner ? 
list_next(&rwlock_list, runner) : + list_head(&rwlock_list)) { + + if (strcmp(leak->location_file, runner->location_file) + == 0 && + strcmp(leak->location_function, + runner->location_function) == 0 && + leak->location_line == runner->location_line) { + // Same place + found++; + list_remove(&rwlock_list, runner); + FREE(runner, M_TEMP); + runner = NULL; + } // if same + + } // for all nodes + + printf(" rwlock %p : %s %s %llu : # leaks: %u\n", + leak->mp, + leak->location_file, + leak->location_function, + leak->location_line, + found); + + FREE(leak, M_TEMP); + total += found; + + } + mutex_exit(&rwlock_list_mutex); + printf("Dumped %llu leaked allocations.\n", total); + + lck_mtx_destroy((lck_mtx_t *)&rwlock_list_mutex.m_lock, + zfs_rwlock_group); + list_destroy(&rwlock_list); +#endif + + lck_grp_free(zfs_rwlock_group); + zfs_rwlock_group = NULL; + + lck_grp_attr_free(zfs_rwlock_group_attr); + zfs_rwlock_group_attr = NULL; + + lck_attr_free(zfs_rwlock_attr); + zfs_rwlock_attr = NULL; + + ASSERT3U(zfs_active_rwlock, ==, 0); +} diff --git a/module/os/macos/spl/spl-seg_kmem.c b/module/os/macos/spl/spl-seg_kmem.c new file mode 100644 index 0000000000..f2b36280f3 --- /dev/null +++ b/module/os/macos/spl/spl-seg_kmem.c @@ -0,0 +1,289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include + +#include +#include +// ugly: smd +#ifdef kmem_free +#undef kmem_free +#endif + +#include +#include +#include + +#include + +/* + * seg_kmem is the primary kernel memory segment driver. It + * maps the kernel heap [kernelheap, ekernelheap), module text, + * and all memory which was allocated before the VM was initialized + * into kas. + * + * Pages which belong to seg_kmem are hashed into &kvp vnode at + * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1. + * They must never be paged out since segkmem_fault() is a no-op to + * prevent recursive faults. + * + * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on + * __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86 + * supports relocation the #ifdef kludges can be removed. + * + * seg_kmem pages may be subject to relocation by page_relocate(), + * provided that the HAT supports it; if this is so, segkmem_reloc + * will be set to a nonzero value. All boot time allocated memory as + * well as static memory is considered off limits to relocation. + * Pages are "relocatable" if p_state does not have P_NORELOC set, so + * we request P_NORELOC pages for memory that isn't safe to relocate. + * + * The kernel heap is logically divided up into four pieces: + * + * heap32_arena is for allocations that require 32-bit absolute + * virtual addresses (e.g. 
code that uses 32-bit pointers/offsets). + * + * heap_core is for allocations that require 2GB *relative* + * offsets; in other words all memory from heap_core is within + * 2GB of all other memory from the same arena. This is a requirement + * of the addressing modes of some processors in supervisor code. + * + * heap_arena is the general heap arena. + * + * static_arena is the static memory arena. Allocations from it + * are not subject to relocation so it is safe to use the memory + * physical address as well as the virtual address (e.g. the VA to + * PA translations are static). Caches may import from static_arena; + * all other static memory allocations should use static_alloc_arena. + * + * On some platforms which have limited virtual address space, seg_kmem + * may share [kernelheap, ekernelheap) with seg_kp; if this is so, + * segkp_bitmap is non-NULL, and each bit represents a page of virtual + * address space which is actually seg_kp mapped. + */ + +/* + * Rough stubbed Port for XNU. + * + * Copyright (c) 2014 Brendon Humphrey (brendon.humphrey@mac.com) + */ + + +#ifdef _KERNEL +#define XNU_KERNEL_PRIVATE +#include +extern vm_map_t kernel_map; + +/* + * These extern prototypes has to be carefully checked against XNU source + * in case Apple changes them. They are not defined in the "allowed" parts + * of the kernel.framework + */ +typedef uint8_t vm_tag_t; + +/* + * Tag we use to identify memory we have allocated + * + * (VM_KERN_MEMORY_KEXT - mach_vm_statistics.h) + */ +#define SPL_TAG 6 + +/* + * In kernel lowlevel form of malloc. + */ +extern kern_return_t kernel_memory_allocate(vm_map_t map, void **addrp, + vm_size_t size, vm_offset_t mask, int flags, vm_tag_t tag); + +/* + * Free memory + */ +extern void kmem_free(vm_map_t map, void *addr, vm_size_t size); + +#endif /* _KERNEL */ + +typedef int page_t; + +void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); +void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); + +/* Total memory held allocated */ +uint64_t segkmem_total_mem_allocated = 0; + +/* primary kernel heap arena */ +vmem_t *heap_arena; + +/* qcaches for zio and abd arenas */ +vmem_t *zio_arena_parent; + +/* arena for allocating file data */ +vmem_t *zio_arena; + +/* and for allocation of zfs metadata */ +vmem_t *zio_metadata_arena; + +#ifdef _KERNEL +extern uint64_t total_memory; +uint64_t stat_osif_malloc_success = 0; +uint64_t stat_osif_free = 0; +uint64_t stat_osif_malloc_bytes = 0; +uint64_t stat_osif_free_bytes = 0; +#endif + +void * +osif_malloc(uint64_t size) +{ +#ifdef _KERNEL + void *tr; + + kern_return_t kr = kernel_memory_allocate(kernel_map, + &tr, size, PAGESIZE, 0, SPL_TAG); + + if (kr == KERN_SUCCESS) { + atomic_inc_64(&stat_osif_malloc_success); + atomic_add_64(&segkmem_total_mem_allocated, size); + atomic_add_64(&stat_osif_malloc_bytes, size); + return (tr); + } else { + // well, this can't really happen, kernel_memory_allocate + // would panic instead + return (NULL); + } +#else + return (malloc(size)); +#endif +} + +void +osif_free(void* buf, uint64_t size) +{ +#ifdef _KERNEL + kmem_free(kernel_map, buf, size); + atomic_inc_64(&stat_osif_free); + atomic_sub_64(&segkmem_total_mem_allocated, size); + atomic_add_64(&stat_osif_free_bytes, size); +#else + free(buf); +#endif /* _KERNEL */ +} + +/* + * Configure vmem, such that the heap arena is fed, + * and drains to the kernel low level allocator. 
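+ *
+ * Roughly: heap_arena -> segkmem_alloc()/segkmem_free() ->
+ * osif_malloc()/osif_free() -> kernel_memory_allocate()/kmem_free() in XNU.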
+ */ +void +kernelheap_init() +{ + heap_arena = vmem_init("heap", NULL, 0, PAGESIZE, segkmem_alloc, + segkmem_free); +} + + +void +kernelheap_fini(void) +{ + vmem_fini(heap_arena); +} + +void * +segkmem_alloc(vmem_t *vmp, size_t size, int maybe_unmasked_vmflag) +{ + return (osif_malloc(size)); +} + +void +segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +{ + osif_free(inaddr, size); + // since this is mainly called by spl_root_arena and free_arena, + // do we really want to wake up a waiter, just because we have + // transferred from one to the other? + // we already have vmem_add_a_gibibyte waking up waiters + // so specializing here seems wasteful + // (originally included in vmem_experiments) + // cv_signal(&vmp->vm_cv); +} + +/* + * OSX does not use separate heaps for the ZIO buffers, + * the ZFS code is structured such that the zio caches will + * fallback to using the kmem_default arena same + * as all the other caches. + */ +// smd: we nevertheless plumb in an arena with heap as parent, so that +// we can track stats and maintain the VM_ / qc settings differently +void +segkmem_zio_init() +{ + // note: from startup.c and vm_machparam: SEGZIOMINSIZE = 512M. + // and SEGZSIOMAXSIZE = 512G; if physmem is between the two, then + // segziosize is (physmem - SEGZIOMAXSIZE) / 2. + + // Illumos does not segregate zio_metadata_arena out of heap, + // almost exclusively for reasons involving panic dump data + // retention. However, parenting zio_metadata_arena to + // spl_root_arena and giving it its own qcaches provides better + // kstat observability *and* noticeably better performance in + // realworld (zfs/dmu) metadata-heavy activity. Additionally, + // the qcaches pester spl_heap_arena only for slabs 256k and bigger, + // and each of the qcache entries (powers of two from PAGESIZE to + // 64k) are *exact-fit* and therefore dramatically reduce internal + // fragmentation and more than pay off for the extra code and (tiny) + // extra data for holding the arenas' segment tables. + + extern vmem_t *spl_heap_arena; + + zio_arena_parent = vmem_create("zfs_qcache", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, spl_heap_arena, + 16 * 1024, VM_SLEEP | VMC_TIMEFREE); + + ASSERT(zio_arena_parent != NULL); + + zio_arena = vmem_create("zfs_file_data", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, zio_arena_parent, + 0, VM_SLEEP); + + zio_metadata_arena = vmem_create("zfs_metadata", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, zio_arena_parent, + 0, VM_SLEEP); + + ASSERT(zio_arena != NULL); + ASSERT(zio_metadata_arena != NULL); + + extern void spl_zio_no_grow_init(void); + spl_zio_no_grow_init(); +} + +void +segkmem_zio_fini(void) +{ + if (zio_arena) { + vmem_destroy(zio_arena); + } + if (zio_metadata_arena) { + vmem_destroy(zio_metadata_arena); + } + if (zio_arena_parent) { + vmem_destroy(zio_arena_parent); + } +} diff --git a/module/os/macos/spl/spl-taskq.c b/module/os/macos/spl/spl-taskq.c new file mode 100644 index 0000000000..7040c66f11 --- /dev/null +++ b/module/os/macos/spl/spl-taskq.c @@ -0,0 +1,2529 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * Copyright (C) 2015, 2020 Jorgen Lundman + */ + +/* + * Kernel task queues: general-purpose asynchronous task scheduling. + * + * A common problem in kernel programming is the need to schedule tasks + * to be performed later, by another thread. There are several reasons + * you may want or need to do this: + * + * (1) The task isn't time-critical, but your current code path is. + * + * (2) The task may require grabbing locks that you already hold. + * + * (3) The task may need to block (e.g. to wait for memory), but you + * cannot block in your current context. + * + * (4) Your code path can't complete because of some condition, but you can't + * sleep or fail, so you queue the task for later execution when condition + * disappears. + * + * (5) You just want a simple way to launch multiple tasks in parallel. + * + * Task queues provide such a facility. In its simplest form (used when + * performance is not a critical consideration) a task queue consists of a + * single list of tasks, together with one or more threads to service the + * list. There are some cases when this simple queue is not sufficient: + * + * (1) The task queues are very hot and there is a need to avoid data and lock + * contention over global resources. + * + * (2) Some tasks may depend on other tasks to complete, so they can't be put in + * the same list managed by the same thread. + * + * (3) Some tasks may block for a long time, and this should not block other + * tasks in the queue. + * + * To provide useful service in such cases we define a "dynamic task queue" + * which has an individual thread for each of the tasks. These threads are + * dynamically created as they are needed and destroyed when they are not in + * use. The API for managing task pools is the same as for managing task queues + * with the exception of a taskq creation flag TASKQ_DYNAMIC which tells that + * dynamic task pool behavior is desired. + * + * Dynamic task queues may also place tasks in the normal queue (called "backing + * queue") when task pool runs out of resources. Users of task queues may + * disallow such queued scheduling by specifying TQ_NOQUEUE in the dispatch + * flags. + * + * The backing task queue is also used for scheduling internal tasks needed for + * dynamic task queue maintenance. + * + * INTERFACES ================================================================== + * + * taskq_t *taskq_create(name, nthreads, pri, minalloc, maxall, flags); + * + * Create a taskq with specified properties. + * Possible 'flags': + * + * TASKQ_DYNAMIC: Create task pool for task management. If this flag is + * specified, 'nthreads' specifies the maximum number of threads in + * the task queue. Task execution order for dynamic task queues is + * not predictable. + * + * If this flag is not specified (default case) a + * single-list task queue is created with 'nthreads' threads + * servicing it. 
Entries in this queue are managed by + * taskq_ent_alloc() and taskq_ent_free() which try to keep the + * task population between 'minalloc' and 'maxalloc', but the + * latter limit is only advisory for TQ_SLEEP dispatches and the + * former limit is only advisory for TQ_NOALLOC dispatches. If + * TASKQ_PREPOPULATE is set in 'flags', the taskq will be + * prepopulated with 'minalloc' task structures. + * + * Since non-DYNAMIC taskqs are queues, tasks are guaranteed to be + * executed in the order they are scheduled if nthreads == 1. + * If nthreads > 1, task execution order is not predictable. + * + * TASKQ_PREPOPULATE: Prepopulate task queue with threads. + * Also prepopulate the task queue with 'minalloc' task structures. + * + * TASKQ_THREADS_CPU_PCT: This flag specifies that 'nthreads' should be + * interpreted as a percentage of the # of online CPUs on the + * system. The taskq subsystem will automatically adjust the + * number of threads in the taskq in response to CPU online + * and offline events, to keep the ratio. nthreads must be in + * the range [0,100]. + * + * The calculation used is: + * + * MAX((ncpus_online * percentage)/100, 1) + * + * This flag is not supported for DYNAMIC task queues. + * This flag is not compatible with TASKQ_CPR_SAFE. + * + * TASKQ_CPR_SAFE: This flag specifies that users of the task queue will + * use their own protocol for handling CPR issues. This flag is not + * supported for DYNAMIC task queues. This flag is not compatible + * with TASKQ_THREADS_CPU_PCT. + * + * The 'pri' field specifies the default priority for the threads that + * service all scheduled tasks. + * + * taskq_t *taskq_create_instance(name, instance, nthreads, pri, minalloc, + * maxall, flags); + * + * Like taskq_create(), but takes an instance number (or -1 to indicate + * no instance). + * + * taskq_t *taskq_create_proc(name, nthreads, pri, minalloc, maxall, proc, + * flags); + * + * Like taskq_create(), but creates the taskq threads in the specified + * system process. If proc != &p0, this must be called from a thread + * in that process. + * + * taskq_t *taskq_create_sysdc(name, nthreads, minalloc, maxall, proc, + * dc, flags); + * + * Like taskq_create_proc(), but the taskq threads will use the + * System Duty Cycle (SDC) scheduling class with a duty cycle of dc. + * + * void taskq_destroy(tap): + * + * Waits for any scheduled tasks to complete, then destroys the taskq. + * Caller should guarantee that no new tasks are scheduled in the closing + * taskq. + * + * taskqid_t taskq_dispatch(tq, func, arg, flags): + * + * Dispatches the task "func(arg)" to taskq. The 'flags' indicates whether + * the caller is willing to block for memory. The function returns an + * opaque value which is zero iff dispatch fails. If flags is TQ_NOSLEEP + * or TQ_NOALLOC and the task can't be dispatched, taskq_dispatch() fails + * and returns (taskqid_t)0. + * + * ASSUMES: func != NULL. + * + * Possible flags: + * TQ_NOSLEEP: Do not wait for resources; may fail. + * + * TQ_NOALLOC: Do not allocate memory; may fail. May only be used with + * non-dynamic task queues. + * + * TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to + * lack of available resources and fail. If this flag is not + * set, and the task pool is exhausted, the task may be scheduled + * in the backing queue. This flag may ONLY be used with dynamic + * task queues. + * + * NOTE: This flag should always be used when a task queue is used + * for tasks that may depend on each other for completion. 
+ * Enqueueing dependent tasks may create deadlocks. + * + * TQ_SLEEP: May block waiting for resources. May still fail for + * dynamic task queues if TQ_NOQUEUE is also specified, otherwise + * always succeed. + * + * TQ_FRONT: Puts the new task at the front of the queue. Be careful. + * + * NOTE: Dynamic task queues are much more likely to fail in + * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it + * is important to have backup strategies handling such failures. + * + * void taskq_dispatch_ent(tq, func, arg, flags, tqent) + * + * This is a light-weight form of taskq_dispatch(), that uses a + * preallocated taskq_ent_t structure for scheduling. As a + * result, it does not perform allocations and cannot ever fail. + * Note especially that it cannot be used with TASKQ_DYNAMIC + * taskqs. The memory for the tqent must not be modified or used + * until the function (func) is called. (However, func itself + * may safely modify or free this memory, once it is called.) + * Note that the taskq framework will NOT free this memory. + * + * void taskq_wait(tq): + * + * Waits for all previously scheduled tasks to complete. + * + * NOTE: It does not stop any new task dispatches. + * Do NOT call taskq_wait() from a task: it will cause deadlock. + * + * void taskq_suspend(tq) + * + * Suspend all task execution. Tasks already scheduled for a dynamic task + * queue will still be executed, but all new scheduled tasks will be + * suspended until taskq_resume() is called. + * + * int taskq_suspended(tq) + * + * Returns 1 if taskq is suspended and 0 otherwise. It is intended to + * ASSERT that the task queue is suspended. + * + * void taskq_resume(tq) + * + * Resume task queue execution. + * + * int taskq_member(tq, thread) + * + * Returns 1 if 'thread' belongs to taskq 'tq' and 0 otherwise. The + * intended use is to ASSERT that a given function is called in taskq + * context only. + * + * system_taskq + * + * Global system-wide dynamic task queue for common uses. It may be used by + * any subsystem that needs to schedule tasks and does not need to manage + * its own task queues. It is initialized quite early during system boot. + * + * IMPLEMENTATION ============================================================== + * + * This is schematic representation of the task queue structures. + * + * taskq: + * +-------------+ + * | tq_lock | +---< taskq_ent_free() + * +-------------+ | + * |... | | tqent: tqent: + * +-------------+ | +------------+ +------------+ + * | tq_freelist |-->| tqent_next |--> ... ->| tqent_next | + * +-------------+ +------------+ +------------+ + * |... | | ... | | ... | + * +-------------+ +------------+ +------------+ + * | tq_task | | + * | | +-------------->taskq_ent_alloc() + * +--------------------------------------------------------------------------+ + * | | | tqent tqent | + * | +---------------------+ +--> +------------+ +--> +------------+ | + * | | ... | | | func, arg | | | func, arg | | + * +>+---------------------+ <---|-+ +------------+ <---|-+ +------------+ | + * | tq_taskq.tqent_next | ----+ | | tqent_next | --->+ | | tqent_next |--+ + * +---------------------+ | +------------+ ^ | +------------+ + * +-| tq_task.tqent_prev | +--| tqent_prev | | +--| tqent_prev | ^ + * | +---------------------+ +------------+ | +------------+ | + * | |... | | ... | | | ... | | + * | +---------------------+ +------------+ | +------------+ | + * | ^ | | + * | | | | + * +--------------------------------------+--------------+ TQ_APPEND() -+ + * | | | + * |... 
| taskq_thread()-----+ + * +-------------+ + * | tq_buckets |--+-------> [ NULL ] (for regular task queues) + * +-------------+ | + * | DYNAMIC TASK QUEUES: + * | + * +-> taskq_bucket[nCPU] taskq_bucket_dispatch() + * +-------------------+ ^ + * +--->| tqbucket_lock | | + * | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | ^ + * | +-------------------+<--+--------+<--...+--------+ | + * | | ... | | thread | | thread | | + * | +-------------------+ +--------+ +--------+ | + * | +-------------------+ | + * taskq_dispatch()--+--->| tqbucket_lock | TQ_APPEND()------+ + * TQ_HASH() | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | + * | +-------------------+<--+--------+<--...+--------+ + * | | ... | | thread | | thread | + * | +-------------------+ +--------+ +--------+ + * +---> ... + * + * + * Task queues use tq_task field to link new entry in the queue. The queue is a + * circular doubly-linked list. Entries are put in the end of the list with + * TQ_APPEND() and processed from the front of the list by taskq_thread() in + * FIFO order. Task queue entries are cached in the free list managed by + * taskq_ent_alloc() and taskq_ent_free() functions. + * + * All threads used by task queues mark t_taskq field of the thread to + * point to the task queue. + * + * Taskq Thread Management ----------------------------------------------------- + * + * Taskq's non-dynamic threads are managed with several variables and flags: + * + * * tq_nthreads - The number of threads in taskq_thread() for the + * taskq. + * + * * tq_active - The number of threads not waiting on a CV in + * taskq_thread(); includes newly created threads + * not yet counted in tq_nthreads. + * + * * tq_nthreads_target + * - The number of threads desired for the taskq. + * + * * tq_flags & TASKQ_CHANGING + * - Indicates that tq_nthreads != tq_nthreads_target. + * + * * tq_flags & TASKQ_THREAD_CREATED + * - Indicates that a thread is being created in the taskq. + * + * During creation, tq_nthreads and tq_active are set to 0, and + * tq_nthreads_target is set to the number of threads desired. The + * TASKQ_CHANGING flag is set, and taskq_thread_create() is called to + * create the first thread. taskq_thread_create() increments tq_active, + * sets TASKQ_THREAD_CREATED, and creates the new thread. + * + * Each thread starts in taskq_thread(), clears the TASKQ_THREAD_CREATED + * flag, and increments tq_nthreads. It stores the new value of + * tq_nthreads as its "thread_id", and stores its thread pointer in the + * tq_threadlist at the (thread_id - 1). We keep the thread_id space + * densely packed by requiring that only the largest thread_id can exit during + * normal adjustment. The exception is during the destruction of the + * taskq; once tq_nthreads_target is set to zero, no new threads will be created + * for the taskq queue, so every thread can exit without any ordering being + * necessary. + * + * Threads will only process work if their thread id is <= tq_nthreads_target. + * + * When TASKQ_CHANGING is set, threads will check the current thread target + * whenever they wake up, and do whatever they can to apply its effects. 
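+ *
+ * For example (illustrative): shrinking a 4-thread taskq to 2 sets
+ * tq_nthreads_target = 2 and TASKQ_CHANGING; the threads with the two
+ * largest thread_ids exit as they wake up (largest first), and the flag
+ * is cleared once tq_nthreads matches the target again.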
+ * + * TASKQ_THREAD_CPU_PCT -------------------------------------------------------- + * + * When a taskq is created with TASKQ_THREAD_CPU_PCT, we store their requested + * percentage in tq_threads_ncpus_pct, start them off with the correct thread + * target, and add them to the taskq_cpupct_list for later adjustment. + * + * We register taskq_cpu_setup() to be called whenever a CPU changes state. It + * walks the list of TASKQ_THREAD_CPU_PCT taskqs, adjusts their nthread_target + * if need be, and wakes up all of the threads to process the change. + * + * Dynamic Task Queues Implementation ------------------------------------------ + * + * For a dynamic task queues there is a 1-to-1 mapping between a thread and + * taskq_ent_structure. Each entry is serviced by its own thread and each thread + * is controlled by a single entry. + * + * Entries are distributed over a set of buckets. To avoid using modulo + * arithmetics the number of buckets is 2^n and is determined as the nearest + * power of two roundown of the number of CPUs in the system. Tunable + * variable 'taskq_maxbuckets' limits the maximum number of buckets. Each entry + * is attached to a bucket for its lifetime and can't migrate to other buckets. + * + * Entries that have scheduled tasks are not placed in any list. The dispatch + * function sets their "func" and "arg" fields and signals the corresponding + * thread to execute the task. Once the thread executes the task it clears the + * "func" field and places an entry on the bucket cache of free entries pointed + * by "tqbucket_freelist" field. ALL entries on the free list should have "func" + * field equal to NULL. The free list is a circular doubly-linked list identical + * in structure to the tq_task list above, but entries are taken from it in LIFO + * order - the last freed entry is the first to be allocated. The + * taskq_bucket_dispatch() function gets the most recently used entry from the + * free list, sets its "func" and "arg" fields and signals a worker thread. + * + * After executing each task a per-entry thread taskq_d_thread() places its + * entry on the bucket free list and goes to a timed sleep. If it wakes up + * without getting new task it removes the entry from the free list and destroys + * itself. The thread sleep time is controlled by a tunable variable + * `taskq_thread_timeout'. + * + * There are various statistics kept in the bucket which allows for later + * analysis of taskq usage patterns. Also, a global copy of taskq creation and + * death statistics is kept in the global taskq data structure. Since thread + * creation and death happen rarely, updating such global data does not present + * a performance problem. + * + * NOTE: Threads are not bound to any CPU and there is absolutely no association + * between the bucket and actual thread CPU, so buckets are used only to + * split resources and reduce resource contention. Having threads attached + * to the CPU denoted by a bucket may reduce number of times the job + * switches between CPUs. + * + * Current algorithm creates a thread whenever a bucket has no free + * entries. It would be nice to know how many threads are in the running + * state and don't create threads if all CPUs are busy with existing + * tasks, but it is unclear how such strategy can be implemented. + * + * Currently buckets are created statically as an array attached to task + * queue. On some system with nCPUs < max_ncpus it may waste system + * memory. 
One solution may be allocation of buckets when they are first + * touched, but it is not clear how useful it is. + * + * SUSPEND/RESUME implementation ----------------------------------------------- + * + * Before executing a task taskq_thread() (executing non-dynamic task + * queues) obtains taskq's thread lock as a reader. The taskq_suspend() + * function gets the same lock as a writer blocking all non-dynamic task + * execution. The taskq_resume() function releases the lock allowing + * taskq_thread to continue execution. + * + * For dynamic task queues, each bucket is marked as TQBUCKET_SUSPEND by + * taskq_suspend() function. After that taskq_bucket_dispatch() always + * fails, so that taskq_dispatch() will either enqueue tasks for a + * suspended backing queue or fail if TQ_NOQUEUE is specified in dispatch + * flags. + * + * NOTE: taskq_suspend() does not immediately block any tasks already + * scheduled for dynamic task queues. It only suspends new tasks + * scheduled after taskq_suspend() was called. + * + * taskq_member() function works by comparing a thread t_taskq pointer with + * the passed thread pointer. + * + * LOCKS and LOCK Hierarchy ---------------------------------------------------- + * + * There are three locks used in task queues: + * + * 1) The taskq_t's tq_lock, protecting global task queue state. + * + * 2) Each per-CPU bucket has a lock for bucket management. + * + * 3) The global taskq_cpupct_lock, which protects the list of + * TASKQ_THREADS_CPU_PCT taskqs. + * + * If both (1) and (2) are needed, tq_lock should be taken *after* the bucket + * lock. + * + * If both (1) and (3) are needed, tq_lock should be taken *after* + * taskq_cpupct_lock. + * + * DEBUG FACILITIES ------------------------------------------------------------ + * + * For DEBUG kernels it is possible to induce random failures to + * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of + * taskq_dmtbf and taskq_smtbf tunables control the mean time between induced + * failures for dynamic and static task queues respectively. + * + * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics. + * + * TUNABLES -------------------------------------------------------------------- + * + * system_taskq_size - Size of the global system_taskq. + * This value is multiplied by nCPUs to determine + * actual size. + * Default value: 64 + * + * taskq_minimum_nthreads_max + * - Minimum size of the thread list for a taskq. + * Useful for testing different thread pool + * sizes by overwriting tq_nthreads_target. + * + * taskq_thread_timeout - Maximum idle time for taskq_d_thread() + * Default value: 5 minutes + * + * taskq_maxbuckets - Maximum number of buckets in any task queue + * Default value: 128 + * + * taskq_search_depth - Maximum # of buckets searched for a free entry + * Default value: 4 + * + * taskq_dmtbf - Mean time between induced dispatch failures + * for dynamic task queues. + * Default value: UINT_MAX (no induced failures) + * + * taskq_smtbf - Mean time between induced dispatch failures + * for static task queues. + * Default value: UINT_MAX (no induced failures) + * + * CONDITIONAL compilation ----------------------------------------------------- + * + * TASKQ_STATISTIC - If set will enable bucket statistic (default). 
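+ *
+ * EXAMPLE ---------------------------------------------------------------------
+ *
+ * An illustrative sketch only (my_func and my_arg are placeholders, and
+ * minclsyspri is assumed to be the usual kernel thread priority):
+ *
+ *	tq = taskq_create("my_tq", 4, minclsyspri, 4, 8, TASKQ_PREPOPULATE);
+ *	id = taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP);
+ *	if (id == 0)
+ *		... dispatch failed ...
+ *	taskq_wait(tq);		(waits for everything dispatched so far)
+ *	taskq_destroy(tq);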
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For throttlefree */ +#include +#include +#include + +static kmem_cache_t *taskq_ent_cache, *taskq_cache; + +static uint_t taskq_tsd; + +/* + * Pseudo instance numbers for taskqs without explicitly provided instance. + */ +static vmem_t *taskq_id_arena; + +/* Global system task queue for common use */ +taskq_t *system_taskq = NULL; +taskq_t *system_delay_taskq = NULL; + +/* + * Maximum number of entries in global system taskq is + * system_taskq_size * max_ncpus + */ +#define SYSTEM_TASKQ_SIZE 64 +int system_taskq_size = SYSTEM_TASKQ_SIZE; + +/* + * Minimum size for tq_nthreads_max; useful for those who want to play around + * with increasing a taskq's tq_nthreads_target. + */ +int taskq_minimum_nthreads_max = 1; + +/* + * We want to ensure that when taskq_create() returns, there is at least + * one thread ready to handle requests. To guarantee this, we have to wait + * for the second thread, since the first one cannot process requests until + * the second thread has been created. + */ +#define TASKQ_CREATE_ACTIVE_THREADS 2 + +/* Maximum percentage allowed for TASKQ_THREADS_CPU_PCT */ +#define TASKQ_CPUPCT_MAX_PERCENT 1000 +int taskq_cpupct_max_percent = TASKQ_CPUPCT_MAX_PERCENT; + +/* + * Dynamic task queue threads that don't get any work within + * taskq_thread_timeout destroy themselves + */ +#define TASKQ_THREAD_TIMEOUT (60 * 5) +int taskq_thread_timeout = TASKQ_THREAD_TIMEOUT; + +#define TASKQ_MAXBUCKETS 128 +int taskq_maxbuckets = TASKQ_MAXBUCKETS; + +/* + * When a bucket has no available entries another buckets are tried. + * taskq_search_depth parameter limits the amount of buckets that we search + * before failing. This is mostly useful in systems with many CPUs where we may + * spend too much time scanning busy buckets. + */ +#define TASKQ_SEARCH_DEPTH 4 +int taskq_search_depth = TASKQ_SEARCH_DEPTH; + +/* + * Hashing function: mix various bits of x. May be pretty much anything. + */ +#define TQ_HASH(x) ((x) ^ ((x) >> 11) ^ ((x) >> 17) ^ ((x) ^ 27)) + +/* + * We do not create any new threads when the system is low on memory and start + * throttling memory allocations. The following macro tries to estimate such + * condition. + */ +#ifdef __APPLE__ +#define ENOUGH_MEMORY() (!spl_vm_pool_low()) +#else +#define ENOUGH_MEMORY() (freemem > throttlefree) +#endif + +/* + * Static functions. + */ +static taskq_t *taskq_create_common(const char *, int, int, pri_t, int, + int, proc_t *, uint_t, uint_t); +static void taskq_thread(void *); +static void taskq_d_thread(taskq_ent_t *); +static void taskq_bucket_extend(void *); +static int taskq_constructor(void *, void *, int); +static void taskq_destructor(void *, void *); +static int taskq_ent_constructor(void *, void *, int); +static void taskq_ent_destructor(void *, void *); +static taskq_ent_t *taskq_ent_alloc(taskq_t *, int); +static void taskq_ent_free(taskq_t *, taskq_ent_t *); +static int taskq_ent_exists(taskq_t *, task_func_t, void *); +static taskq_ent_t *taskq_bucket_dispatch(taskq_bucket_t *, task_func_t, + void *); + +/* + * Task queues kstats. 
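+ *
+ * These are installed by taskq_create_common() below via
+ * kstat_create("unix", instance, tq->tq_name, ...), using class "taskq" for
+ * regular queues and "taskq_d" for dynamic ones.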
+ */ +struct taskq_kstat { + kstat_named_t tq_pid; + kstat_named_t tq_tasks; + kstat_named_t tq_executed; + kstat_named_t tq_maxtasks; + kstat_named_t tq_totaltime; + kstat_named_t tq_nalloc; + kstat_named_t tq_nactive; + kstat_named_t tq_pri; + kstat_named_t tq_nthreads; +} taskq_kstat = { + { "pid", KSTAT_DATA_UINT64 }, + { "tasks", KSTAT_DATA_UINT64 }, + { "executed", KSTAT_DATA_UINT64 }, + { "maxtasks", KSTAT_DATA_UINT64 }, + { "totaltime", KSTAT_DATA_UINT64 }, + { "nactive", KSTAT_DATA_UINT64 }, + { "nalloc", KSTAT_DATA_UINT64 }, + { "priority", KSTAT_DATA_UINT64 }, + { "threads", KSTAT_DATA_UINT64 }, +}; + +struct taskq_d_kstat { + kstat_named_t tqd_pri; + kstat_named_t tqd_btasks; + kstat_named_t tqd_bexecuted; + kstat_named_t tqd_bmaxtasks; + kstat_named_t tqd_bnalloc; + kstat_named_t tqd_bnactive; + kstat_named_t tqd_btotaltime; + kstat_named_t tqd_hits; + kstat_named_t tqd_misses; + kstat_named_t tqd_overflows; + kstat_named_t tqd_tcreates; + kstat_named_t tqd_tdeaths; + kstat_named_t tqd_maxthreads; + kstat_named_t tqd_nomem; + kstat_named_t tqd_disptcreates; + kstat_named_t tqd_totaltime; + kstat_named_t tqd_nalloc; + kstat_named_t tqd_nfree; +} taskq_d_kstat = { + { "priority", KSTAT_DATA_UINT64 }, + { "btasks", KSTAT_DATA_UINT64 }, + { "bexecuted", KSTAT_DATA_UINT64 }, + { "bmaxtasks", KSTAT_DATA_UINT64 }, + { "bnalloc", KSTAT_DATA_UINT64 }, + { "bnactive", KSTAT_DATA_UINT64 }, + { "btotaltime", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "overflows", KSTAT_DATA_UINT64 }, + { "tcreates", KSTAT_DATA_UINT64 }, + { "tdeaths", KSTAT_DATA_UINT64 }, + { "maxthreads", KSTAT_DATA_UINT64 }, + { "nomem", KSTAT_DATA_UINT64 }, + { "disptcreates", KSTAT_DATA_UINT64 }, + { "totaltime", KSTAT_DATA_UINT64 }, + { "nalloc", KSTAT_DATA_UINT64 }, + { "nfree", KSTAT_DATA_UINT64 }, +}; + +static kmutex_t taskq_kstat_lock; +static kmutex_t taskq_d_kstat_lock; +static int taskq_kstat_update(kstat_t *, int); +static int taskq_d_kstat_update(kstat_t *, int); + +/* + * List of all TASKQ_THREADS_CPU_PCT taskqs. + */ +static list_t taskq_cpupct_list; /* protected by cpu_lock */ + +/* + * Collect per-bucket statistic when TASKQ_STATISTIC is defined. + */ +#define TASKQ_STATISTIC 1 + +#if TASKQ_STATISTIC +#define TQ_STAT(b, x) b->tqbucket_stat.x++ +#else +#define TQ_STAT(b, x) +#endif + +/* + * Random fault injection. + */ +uint_t taskq_random; +uint_t taskq_dmtbf = UINT_MAX; /* mean time between injected failures */ +uint_t taskq_smtbf = UINT_MAX; /* mean time between injected failures */ + +/* + * TQ_NOSLEEP dispatches on dynamic task queues are always allowed to fail. + * + * TQ_NOSLEEP dispatches on static task queues can't arbitrarily fail because + * they could prepopulate the cache and make sure that they do not use more + * then minalloc entries. So, fault injection in this case insures that + * either TASKQ_PREPOPULATE is not set or there are more entries allocated + * than is specified by minalloc. TQ_NOALLOC dispatches are always allowed + * to fail, but for simplicity we treat them identically to TQ_NOSLEEP + * dispatches. 
+ */ +#ifdef DEBUG +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & TQ_NOSLEEP) && \ + taskq_random < 1771875 / taskq_dmtbf) { \ + return (0); \ + } + +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & (TQ_NOSLEEP | TQ_NOALLOC)) && \ + (!(tq->tq_flags & TASKQ_PREPOPULATE) || \ + (tq->tq_nalloc > tq->tq_minalloc)) && \ + (taskq_random < (1771875 / taskq_smtbf))) { \ + mutex_exit(&tq->tq_lock); \ + return (0); \ + } +#else +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) +#endif + +#define IS_EMPTY(l) (((l).tqent_prev == (l).tqent_next) && \ + ((l).tqent_prev == &(l))) + +/* + * Append `tqe' in the end of the doubly-linked list denoted by l. + */ +#define TQ_APPEND(l, tqe) { \ + tqe->tqent_next = &l; \ + tqe->tqent_prev = l.tqent_prev; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} +/* + * Prepend 'tqe' to the beginning of l + */ +#define TQ_PREPEND(l, tqe) { \ + tqe->tqent_next = l.tqent_next; \ + tqe->tqent_prev = &l; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} + +/* + * Schedule a task specified by func and arg into the task queue entry tqe. + */ +#define TQ_DO_ENQUEUE(tq, tqe, func, arg, front) { \ + ASSERT(MUTEX_HELD(&tq->tq_lock)); \ + _NOTE(CONSTCOND) \ + if (front) { \ + TQ_PREPEND(tq->tq_task, tqe); \ + } else { \ + TQ_APPEND(tq->tq_task, tqe); \ + } \ + tqe->tqent_func = (func); \ + tqe->tqent_arg = (arg); \ + tq->tq_tasks++; \ + if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \ + tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed; \ + cv_signal(&tq->tq_dispatch_cv); \ + DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \ +} + +#define TQ_ENQUEUE(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 0) + +#define TQ_ENQUEUE_FRONT(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 1) + +/* + * Do-nothing task which may be used to prepopulate thread caches. 
+ */ +/*ARGSUSED*/ +void +nulltask(void *unused) +{ +} + +/*ARGSUSED*/ +static int +taskq_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_t *tq = buf; + + bzero(tq, sizeof (taskq_t)); + + mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); + cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_exit_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); + + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_destructor(void *buf, void *cdrarg) +{ + taskq_t *tq = buf; + + ASSERT(tq->tq_nthreads == 0); + ASSERT(tq->tq_buckets == NULL); + ASSERT(tq->tq_tcreates == 0); + ASSERT(tq->tq_tdeaths == 0); + + mutex_destroy(&tq->tq_lock); + rw_destroy(&tq->tq_threadlock); + cv_destroy(&tq->tq_dispatch_cv); + cv_destroy(&tq->tq_exit_cv); + cv_destroy(&tq->tq_wait_cv); + cv_destroy(&tq->tq_maxalloc_cv); +} + +/*ARGSUSED*/ +static int +taskq_ent_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_ent_t *tqe = buf; + + tqe->tqent_thread = NULL; + cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL); +#ifdef __APPLE__ + /* Simulate TS_STOPPED */ + mutex_init(&tqe->tqent_thread_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tqe->tqent_thread_cv, NULL, CV_DEFAULT, NULL); +#endif /* __APPLE__ */ + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_ent_destructor(void *buf, void *cdrarg) +{ + taskq_ent_t *tqe = buf; + + ASSERT(tqe->tqent_thread == NULL); + cv_destroy(&tqe->tqent_cv); +#ifdef __APPLE__ + /* See comment in taskq_d_thread(). */ + mutex_destroy(&tqe->tqent_thread_lock); + cv_destroy(&tqe->tqent_thread_cv); +#endif /* __APPLE__ */ +} + +int +spl_taskq_init(void) +{ + tsd_create(&taskq_tsd, NULL); + + taskq_ent_cache = kmem_cache_create("taskq_ent_cache", + sizeof (taskq_ent_t), 0, taskq_ent_constructor, + taskq_ent_destructor, NULL, NULL, NULL, 0); + taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t), + 0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0); + taskq_id_arena = vmem_create("taskq_id_arena", + (void *)1, INT32_MAX, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + list_create(&taskq_cpupct_list, sizeof (taskq_t), + offsetof(taskq_t, tq_cpupct_link)); + + mutex_init(&taskq_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&taskq_d_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +void +spl_taskq_fini(void) +{ + mutex_destroy(&taskq_d_kstat_lock); + mutex_destroy(&taskq_kstat_lock); + + if (taskq_cache) { + kmem_cache_destroy(taskq_cache); + taskq_cache = NULL; + } + if (taskq_ent_cache) { + kmem_cache_destroy(taskq_ent_cache); + taskq_ent_cache = NULL; + } + + list_destroy(&taskq_cpupct_list); + + vmem_destroy(taskq_id_arena); + + tsd_destroy(&taskq_tsd); +} + + + + +static void +taskq_update_nthreads(taskq_t *tq, uint_t ncpus) +{ + uint_t newtarget = TASKQ_THREADS_PCT(ncpus, tq->tq_threads_ncpus_pct); + +#ifndef __APPLE__ + ASSERT(MUTEX_HELD(&cpu_lock)); +#endif + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* We must be going from non-zero to non-zero; no exiting. 
*/ + ASSERT3U(tq->tq_nthreads_target, !=, 0); + ASSERT3U(newtarget, !=, 0); + + ASSERT3U(newtarget, <=, tq->tq_nthreads_max); + if (newtarget != tq->tq_nthreads_target) { + tq->tq_flags |= TASKQ_CHANGING; + tq->tq_nthreads_target = newtarget; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + } +} + +#ifndef __APPLE__ +/* No dynamic CPU add/remove in XNU, so we can just use static ncpu math */ + +/* called during task queue creation */ +static void +taskq_cpupct_install(taskq_t *tq, cpupart_t *cpup) +{ + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + + mutex_enter(&cpu_lock); + mutex_enter(&tq->tq_lock); + tq->tq_cpupart = cpup->cp_id; + taskq_update_nthreads(tq, cpup->cp_ncpus); + mutex_exit(&tq->tq_lock); + + list_insert_tail(&taskq_cpupct_list, tq); + mutex_exit(&cpu_lock); +} + +static void +taskq_cpupct_remove(taskq_t *tq) +{ + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + + mutex_enter(&cpu_lock); + list_remove(&taskq_cpupct_list, tq); + mutex_exit(&cpu_lock); +} + +/*ARGSUSED*/ +static int +taskq_cpu_setup(cpu_setup_t what, int id, void *arg) +{ + taskq_t *tq; + cpupart_t *cp = cpu[id]->cpu_part; + uint_t ncpus = cp->cp_ncpus; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(ncpus > 0); + + switch (what) { + case CPU_OFF: + case CPU_CPUPART_OUT: + /* offlines are called *before* the cpu is offlined. */ + if (ncpus > 1) + ncpus--; + break; + + case CPU_ON: + case CPU_CPUPART_IN: + break; + + default: + return (0); /* doesn't affect cpu count */ + } + + for (tq = list_head(&taskq_cpupct_list); tq != NULL; + tq = list_next(&taskq_cpupct_list, tq)) { + + mutex_enter(&tq->tq_lock); + /* + * If the taskq is part of the cpuset which is changing, + * update its nthreads_target. + */ + if (tq->tq_cpupart == cp->cp_id) { + taskq_update_nthreads(tq, ncpus); + } + mutex_exit(&tq->tq_lock); + } + return (0); +} + +void +taskq_mp_init(void) +{ + mutex_enter(&cpu_lock); + register_cpu_setup_func(taskq_cpu_setup, NULL); + /* + * Make sure we're up to date. At this point in boot, there is only + * one processor set, so we only have to update the current CPU. + */ + (void) taskq_cpu_setup(CPU_ON, CPU->cpu_id, NULL); + mutex_exit(&cpu_lock); +} +#endif /* __APPLE__ */ + + +/* + * Create global system dynamic task queue. + */ +void +system_taskq_init(void) +{ + system_taskq = taskq_create_common("system_taskq", 0, + system_taskq_size * max_ncpus, minclsyspri, 4, 512, &p0, 0, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); + system_delay_taskq = taskq_create("system_delay_taskq", max_ncpus, + minclsyspri, 0, 0, 0); +} + + +void +system_taskq_fini(void) +{ + if (system_taskq) + taskq_destroy(system_delay_taskq); + if (system_taskq) + taskq_destroy(system_taskq); + system_taskq = NULL; +} + +/* + * taskq_ent_alloc() + * + * Allocates a new taskq_ent_t structure either from the free list or from the + * cache. Returns NULL if it can't be allocated. + * + * Assumes: tq->tq_lock is held. + */ +static taskq_ent_t * +taskq_ent_alloc(taskq_t *tq, int flags) +{ + int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + taskq_ent_t *tqe; + clock_t wait_time; + clock_t wait_rv; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* + * TQ_NOALLOC allocations are allowed to use the freelist, even if + * we are below tq_minalloc. 
+ */ +again: if ((tqe = tq->tq_freelist) != NULL && + ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) { + tq->tq_freelist = tqe->tqent_next; + } else { + if (flags & TQ_NOALLOC) + return (NULL); + + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (kmflags & KM_NOSLEEP) + return (NULL); + + /* + * We don't want to exceed tq_maxalloc, but we can't + * wait for other tasks to complete (and thus free up + * task structures) without risking deadlock with + * the caller. So, we just delay for one second + * to throttle the allocation rate. If we have tasks + * complete before one second timeout expires then + * taskq_ent_free will signal us and we will + * immediately retry the allocation (reap free). + */ + wait_time = ddi_get_lbolt() + hz; + while (tq->tq_freelist == NULL) { + tq->tq_maxalloc_wait++; + wait_rv = cv_timedwait(&tq->tq_maxalloc_cv, + &tq->tq_lock, wait_time); + tq->tq_maxalloc_wait--; + if (wait_rv == -1) + break; + } + if (tq->tq_freelist) + goto again; /* reap freelist */ + + } + mutex_exit(&tq->tq_lock); + + tqe = kmem_cache_alloc(taskq_ent_cache, kmflags); + + mutex_enter(&tq->tq_lock); + if (tqe != NULL) + tq->tq_nalloc++; + } + return (tqe); +} + +/* + * taskq_ent_free() + * + * Free taskq_ent_t structure by either putting it on the free list or freeing + * it to the cache. + * + * Assumes: tq->tq_lock is held. + */ +static void +taskq_ent_free(taskq_t *tq, taskq_ent_t *tqe) +{ + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + tqe->tqent_next = tq->tq_freelist; + tq->tq_freelist = tqe; + } else { + tq->tq_nalloc--; + mutex_exit(&tq->tq_lock); + kmem_cache_free(taskq_ent_cache, tqe); + mutex_enter(&tq->tq_lock); + } + + if (tq->tq_maxalloc_wait) + cv_signal(&tq->tq_maxalloc_cv); +} + +/* + * taskq_ent_exists() + * + * Return 1 if taskq already has entry for calling 'func(arg)'. + * + * Assumes: tq->tq_lock is held. + */ +static int +taskq_ent_exists(taskq_t *tq, task_func_t func, void *arg) +{ + taskq_ent_t *tqe; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + for (tqe = tq->tq_task.tqent_next; tqe != &tq->tq_task; + tqe = tqe->tqent_next) + if ((tqe->tqent_func == func) && (tqe->tqent_arg == arg)) + return (1); + return (0); +} + +/* + * Dispatch a task "func(arg)" to a free entry of bucket b. + * + * Assumes: no bucket locks is held. + * + * Returns: a pointer to an entry if dispatch was successful. + * NULL if there are no free entries or if the bucket is suspended. + */ +static taskq_ent_t * +taskq_bucket_dispatch(taskq_bucket_t *b, task_func_t func, void *arg) +{ + taskq_ent_t *tqe; + + ASSERT(MUTEX_NOT_HELD(&b->tqbucket_lock)); + ASSERT(func != NULL); + + mutex_enter(&b->tqbucket_lock); + + ASSERT(b->tqbucket_nfree != 0 || IS_EMPTY(b->tqbucket_freelist)); + ASSERT(b->tqbucket_nfree == 0 || !IS_EMPTY(b->tqbucket_freelist)); + + /* + * Get en entry from the freelist if there is one. + * Schedule task into the entry. 
+ */ + if ((b->tqbucket_nfree != 0) && + !(b->tqbucket_flags & TQBUCKET_SUSPEND)) { + tqe = b->tqbucket_freelist.tqent_prev; + + ASSERT(tqe != &b->tqbucket_freelist); + ASSERT(tqe->tqent_thread != NULL); + + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + b->tqbucket_nalloc++; + b->tqbucket_nfree--; + tqe->tqent_func = func; + tqe->tqent_arg = arg; + TQ_STAT(b, tqs_hits); + cv_signal(&tqe->tqent_cv); + DTRACE_PROBE2(taskq__d__enqueue, taskq_bucket_t *, b, + taskq_ent_t *, tqe); + } else { + tqe = NULL; + TQ_STAT(b, tqs_misses); + } + mutex_exit(&b->tqbucket_lock); + return (tqe); +} + +/* + * Dispatch a task. + * + * Assumes: func != NULL + * + * Returns: NULL if dispatch failed. + * non-NULL if task dispatched successfully. + * Actual return value is the pointer to taskq entry that was used to + * dispatch a task. This is useful for debugging. + */ +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_bucket_t *bucket = NULL; /* Which bucket needs extension */ + taskq_ent_t *tqe = NULL; + taskq_ent_t *tqe1; + uint_t bsize; + + ASSERT(tq != NULL); + ASSERT(func != NULL); + + if (!(tq->tq_flags & TASKQ_DYNAMIC)) { + /* + * TQ_NOQUEUE flag can't be used with non-dynamic task queues. + */ + ASSERT(!(flags & TQ_NOQUEUE)); + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags); + + if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) { + mutex_exit(&tq->tq_lock); + return (0); + } + /* Make sure we start without any flags */ + tqe->tqent_un.tqent_flags = 0; + + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } + mutex_exit(&tq->tq_lock); + return ((taskqid_t)tqe); + } + + /* + * Dynamic taskq dispatching. + */ + ASSERT(!(flags & (TQ_NOALLOC | TQ_FRONT))); + TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flags); + + bsize = tq->tq_nbuckets; + + if (bsize == 1) { + /* + * In a single-CPU case there is only one bucket, so get + * entry directly from there. + */ + if ((tqe = taskq_bucket_dispatch(tq->tq_buckets, func, arg)) + != NULL) + return ((taskqid_t)tqe); /* Fastpath */ + bucket = tq->tq_buckets; + } else { + int loopcount; + taskq_bucket_t *b; + // uintptr_t h = ((uintptr_t)CPU + (uintptr_t)arg) >> 3; + uintptr_t h = ((uintptr_t)(cpu_number()<<3) + + (uintptr_t)arg) >> 3; + + h = TQ_HASH(h); + + /* + * The 'bucket' points to the original bucket that we hit. If we + * can't allocate from it, we search other buckets, but only + * extend this one. + */ + b = &tq->tq_buckets[h & (bsize - 1)]; + ASSERT(b->tqbucket_taskq == tq); /* Sanity check */ + + /* + * Do a quick check before grabbing the lock. If the bucket does + * not have free entries now, chances are very small that it + * will after we take the lock, so we just skip it. + */ + if (b->tqbucket_nfree != 0) { + if ((tqe = taskq_bucket_dispatch(b, func, arg)) != NULL) + return ((taskqid_t)tqe); /* Fastpath */ + } else { + TQ_STAT(b, tqs_misses); + } + + bucket = b; + loopcount = MIN(taskq_search_depth, bsize); + /* + * If bucket dispatch failed, search loopcount number of buckets + * before we give up and fail. 
+ */ + do { + b = &tq->tq_buckets[++h & (bsize - 1)]; + ASSERT(b->tqbucket_taskq == tq); /* Sanity check */ + loopcount--; + + if (b->tqbucket_nfree != 0) { + tqe = taskq_bucket_dispatch(b, func, arg); + } else { + TQ_STAT(b, tqs_misses); + } + } while ((tqe == NULL) && (loopcount > 0)); + } + + /* + * At this point we either scheduled a task and (tqe != NULL) or failed + * (tqe == NULL). Try to recover from fails. + */ + + /* + * For KM_SLEEP dispatches, try to extend the bucket and retry dispatch. + */ + if ((tqe == NULL) && !(flags & TQ_NOSLEEP)) { + /* + * taskq_bucket_extend() may fail to do anything, but this is + * fine - we deal with it later. If the bucket was successfully + * extended, there is a good chance that taskq_bucket_dispatch() + * will get this new entry, unless someone is racing with us and + * stealing the new entry from under our nose. + * taskq_bucket_extend() may sleep. + */ + taskq_bucket_extend(bucket); + TQ_STAT(bucket, tqs_disptcreates); + if ((tqe = taskq_bucket_dispatch(bucket, func, arg)) != NULL) + return ((taskqid_t)tqe); + } + + ASSERT(bucket != NULL); + + /* + * Since there are not enough free entries in the bucket, add a + * taskq entry to extend it in the background using backing queue + * (unless we already have a taskq entry to perform that extension). + */ + mutex_enter(&tq->tq_lock); + if (!taskq_ent_exists(tq, taskq_bucket_extend, bucket)) { + if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) { + TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket); + } else { + TQ_STAT(bucket, tqs_nomem); + } + } + + /* + * Dispatch failed and we can't find an entry to schedule a task. + * Revert to the backing queue unless TQ_NOQUEUE was asked. + */ + if ((tqe == NULL) && !(flags & TQ_NOQUEUE)) { + if ((tqe = taskq_ent_alloc(tq, flags)) != NULL) { + TQ_ENQUEUE(tq, tqe, func, arg); + } else { + TQ_STAT(bucket, tqs_nomem); + } + } + mutex_exit(&tq->tq_lock); + + return ((taskqid_t)tqe); +} + +/* + * FIXME, Linux has added the ability to start taskq with a given + * delay. + */ +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + clock_t timo; + + /* If it has already expired, just dispatch */ + timo = expire_time - ddi_get_lbolt(); + if (timo <= 0) + return (taskq_dispatch(tq, func, arg, flags)); + + /* Insert delayed code here: */ + return (0); +} + +void +taskq_init_ent(taskq_ent_t *t) +{ + memset(t, 0, sizeof (*t)); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *tqe) +{ + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + tqe->tqent_un.tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } + mutex_exit(&tq->tq_lock); +} + +/* + * Allow our caller to ask if there are tasks pending on the queue. + */ +boolean_t +taskq_empty(taskq_t *tq) +{ + boolean_t rv; + + mutex_enter(&tq->tq_lock); + rv = (tq->tq_task.tqent_next == &tq->tq_task) && (tq->tq_active == 0); + mutex_exit(&tq->tq_lock); + + return (rv); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (IS_EMPTY(*t)); +} + +/* + * Wait for all pending tasks to complete. + * Calling taskq_wait from a task will cause deadlock. 
+ */ +void +taskq_wait(taskq_t *tq) +{ +#ifndef __APPLE__ + ASSERT(tq != curthread->t_taskq); +#endif + + if (tq == NULL) + return; + + mutex_enter(&tq->tq_lock); + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + mutex_exit(&tq->tq_lock); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + while (b->tqbucket_nalloc > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + } + } +} + +/* + * ZOL implements taskq_wait_id() that can wait for a specific + * taskq to finish, rather than all active taskqs. Until it is + * implemented, we wait for all to complete. + */ +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + return (taskq_wait(tq)); +} + +/* + * Suspend execution of tasks. + * + * Tasks in the queue part will be suspended immediately upon return from this + * function. Pending tasks in the dynamic part will continue to execute, but all + * new tasks will be suspended. + */ +void +taskq_suspend(taskq_t *tq) +{ + rw_enter(&tq->tq_threadlock, RW_WRITER); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + b->tqbucket_flags |= TQBUCKET_SUSPEND; + mutex_exit(&b->tqbucket_lock); + } + } + /* + * Mark task queue as being suspended. Needed for taskq_suspended(). + */ + mutex_enter(&tq->tq_lock); + ASSERT(!(tq->tq_flags & TASKQ_SUSPENDED)); + tq->tq_flags |= TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); +} + +/* + * returns: 1 if tq is suspended, 0 otherwise. + */ +int +taskq_suspended(taskq_t *tq) +{ + return ((tq->tq_flags & TASKQ_SUSPENDED) != 0); +} + +/* + * Resume taskq execution. + */ +void +taskq_resume(taskq_t *tq) +{ + ASSERT(RW_WRITE_HELD(&tq->tq_threadlock)); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + b->tqbucket_flags &= ~TQBUCKET_SUSPEND; + mutex_exit(&b->tqbucket_lock); + } + } + mutex_enter(&tq->tq_lock); + ASSERT(tq->tq_flags & TASKQ_SUSPENDED); + tq->tq_flags &= ~TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); + + rw_exit(&tq->tq_threadlock); +} + +int +taskq_member(taskq_t *tq, kthread_t *thread) +{ + return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, thread)); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} + +/* + * Cancel an already dispatched task given the task id. Still pending tasks + * will be immediately canceled, and if the task is active the function will + * block until it completes. Preallocated tasks which are canceled must be + * freed by the caller. + */ +int +taskq_cancel_id(taskq_t *tq, taskqid_t id) +{ + // taskq_t *task = (taskq_t *) id; + + /* So we want to tell task to stop, and wait until it does */ + if (!EMPTY_TASKQ(tq)) + taskq_wait(tq); + + return (0); +} + +/* + * Creates a thread in the taskq. We only allow one outstanding create at + * a time. We drop and reacquire the tq_lock in order to avoid blocking other + * taskq activity while thread_create() or lwp_kernel_create() run. + * + * The first time we're called, we do some additional setup, and do not + * return until there are enough threads to start servicing requests. 
+ */ +static void +taskq_thread_create(taskq_t *tq) +{ + kthread_t *t; + const boolean_t first = (tq->tq_nthreads == 0); + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + ASSERT(tq->tq_nthreads < tq->tq_nthreads_target); + ASSERT(!(tq->tq_flags & TASKQ_THREAD_CREATED)); + + tq->tq_flags |= TASKQ_THREAD_CREATED; + tq->tq_active++; + mutex_exit(&tq->tq_lock); + + /* + * With TASKQ_DUTY_CYCLE the new thread must have an LWP + * as explained in ../disp/sysdc.c (for the msacct data). + * Otherwise simple kthreads are preferred. + */ + if ((tq->tq_flags & TASKQ_DUTY_CYCLE) != 0) { + /* Enforced in taskq_create_common */ + printf("SPL: taskq_thread_create(TASKQ_DUTY_CYCLE) seen\n"); +#ifndef __APPLE__ + ASSERT3P(tq->tq_proc, !=, &p0); + t = lwp_kernel_create(tq->tq_proc, taskq_thread, tq, TS_RUN, + tq->tq_pri); +#endif + } else { + t = thread_create(NULL, 0, taskq_thread, tq, 0, tq->tq_proc, + TS_RUN, tq->tq_pri); + } + + if (!first) { + mutex_enter(&tq->tq_lock); + return; + } + + /* + * We know the thread cannot go away, since tq cannot be + * destroyed until creation has completed. We can therefore + * safely dereference t. + */ + if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { +#ifdef __APPLE__ + mutex_enter(&tq->tq_lock); + taskq_update_nthreads(tq, max_ncpus); + mutex_exit(&tq->tq_lock); +#else + taskq_cpupct_install(tq, t->t_cpupart); +#endif + } + mutex_enter(&tq->tq_lock); + + /* Wait until we can service requests. */ + while (tq->tq_nthreads != tq->tq_nthreads_target && + tq->tq_nthreads < TASKQ_CREATE_ACTIVE_THREADS) { + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + } +} + +/* + * Common "sleep taskq thread" function, which handles CPR stuff, as well + * as giving a nice common point for debuggers to find inactive threads. + */ +static clock_t +taskq_thread_wait(taskq_t *tq, kmutex_t *mx, kcondvar_t *cv, + callb_cpr_t *cprinfo, clock_t timeout) +{ + clock_t ret = 0; + + if (!(tq->tq_flags & TASKQ_CPR_SAFE)) { + CALLB_CPR_SAFE_BEGIN(cprinfo); + } + if ((signed long)timeout < 0) + cv_wait(cv, mx); + else + ret = cv_reltimedwait(cv, mx, timeout, TR_CLOCK_TICK); + + if (!(tq->tq_flags & TASKQ_CPR_SAFE)) { + CALLB_CPR_SAFE_END(cprinfo, mx); + } + + return (ret); +} + +/* + * Worker thread for processing task queue. + */ +static void +taskq_thread(void *arg) +{ + int thread_id; + + taskq_t *tq = arg; + taskq_ent_t *tqe; + callb_cpr_t cprinfo; + hrtime_t start, end; + boolean_t freeit; + + CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, + tq->tq_name); + + tsd_set(taskq_tsd, tq); + mutex_enter(&tq->tq_lock); + thread_id = ++tq->tq_nthreads; + ASSERT(tq->tq_flags & TASKQ_THREAD_CREATED); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + tq->tq_flags &= ~TASKQ_THREAD_CREATED; + + VERIFY3S(thread_id, <=, tq->tq_nthreads_max); + + if (tq->tq_nthreads_max == 1) + tq->tq_thread = (kthread_t *)curthread; + else + tq->tq_threadlist[thread_id - 1] = (kthread_t *)curthread; + + /* Allow taskq_create_common()'s taskq_thread_create() to return. */ + if (tq->tq_nthreads == TASKQ_CREATE_ACTIVE_THREADS) + cv_broadcast(&tq->tq_wait_cv); + + for (;;) { + if (tq->tq_flags & TASKQ_CHANGING) { + /* See if we're no longer needed */ + if (thread_id > tq->tq_nthreads_target) { + /* + * To preserve the one-to-one mapping between + * thread_id and thread, we must exit from + * highest thread ID to least. + * + * However, if everyone is exiting, the order + * doesn't matter, so just exit immediately. 
+ * (this is safe, since you must wait for + * nthreads to reach 0 after setting + * tq_nthreads_target to 0) + */ + if (thread_id == tq->tq_nthreads || + tq->tq_nthreads_target == 0) + break; + + /* Wait for higher thread_ids to exit */ + (void) taskq_thread_wait(tq, &tq->tq_lock, + &tq->tq_exit_cv, &cprinfo, -1); + continue; + } + + /* + * If no thread is starting taskq_thread(), we can + * do some bookkeeping. + */ + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) { + /* Check if we've reached our target */ + if (tq->tq_nthreads == tq->tq_nthreads_target) { + tq->tq_flags &= ~TASKQ_CHANGING; + cv_broadcast(&tq->tq_wait_cv); + } + /* Check if we need to create a thread */ + if (tq->tq_nthreads < tq->tq_nthreads_target) { + taskq_thread_create(tq); + continue; /* tq_lock was dropped */ + } + } + } + if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) { + if (--tq->tq_active == 0) + cv_broadcast(&tq->tq_wait_cv); + (void) taskq_thread_wait(tq, &tq->tq_lock, + &tq->tq_dispatch_cv, &cprinfo, -1); + tq->tq_active++; + continue; + } + + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + mutex_exit(&tq->tq_lock); + + /* + * For prealloc'd tasks, we don't free anything. We + * have to check this now, because once we call the + * function for a prealloc'd taskq, we can't touch the + * tqent any longer (calling the function returns the + * ownershp of the tqent back to caller of + * taskq_dispatch.) + */ + if ((!(tq->tq_flags & TASKQ_DYNAMIC)) && + (tqe->tqent_un.tqent_flags & TQENT_FLAG_PREALLOC)) { + /* clear pointers to assist assertion checks */ + tqe->tqent_next = tqe->tqent_prev = NULL; + freeit = B_FALSE; + } else { + freeit = B_TRUE; + } + + rw_enter(&tq->tq_threadlock, RW_READER); + start = gethrtime(); + DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq, + taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE2(taskq__exec__end, taskq_t *, tq, + taskq_ent_t *, tqe); + end = gethrtime(); + rw_exit(&tq->tq_threadlock); + + mutex_enter(&tq->tq_lock); + tq->tq_totaltime += end - start; + tq->tq_executed++; + + if (freeit) + taskq_ent_free(tq, tqe); + } + + if (tq->tq_nthreads_max == 1) + tq->tq_thread = NULL; + else + tq->tq_threadlist[thread_id - 1] = NULL; + + /* We're exiting, and therefore no longer active */ + ASSERT(tq->tq_active > 0); + tq->tq_active--; + + ASSERT(tq->tq_nthreads > 0); + tq->tq_nthreads--; + + /* Wake up anyone waiting for us to exit */ + cv_broadcast(&tq->tq_exit_cv); + if (tq->tq_nthreads == tq->tq_nthreads_target) { + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) + tq->tq_flags &= ~TASKQ_CHANGING; + + cv_broadcast(&tq->tq_wait_cv); + } + + tsd_set(taskq_tsd, NULL); + + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); + +} + +/* + * Worker per-entry thread for dynamic dispatches. + */ +static void +taskq_d_thread(taskq_ent_t *tqe) +{ + taskq_bucket_t *bucket = tqe->tqent_un.tqent_bucket; + taskq_t *tq = bucket->tqbucket_taskq; + kmutex_t *lock = &bucket->tqbucket_lock; + kcondvar_t *cv = &tqe->tqent_cv; + callb_cpr_t cprinfo; + clock_t w; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, tq->tq_name); + +#ifdef __APPLE__ + /* + * There's no way in Mac OS X KPI to create a thread + * in a suspended state (TS_STOPPED). So instead we + * use tqent_thread as a flag and wait for it to get + * initialized. 
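+ *
+ * taskq_bucket_extend() stores the sentinel 0xCEDEC0DE in tqent_thread
+ * before calling thread_create() and overwrites it with the real thread
+ * pointer once the entry is on the bucket free list, so we simply wait here
+ * until that hand-off has happened.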
+ */ + mutex_enter(&tqe->tqent_thread_lock); + while (tqe->tqent_thread == (kthread_t *)0xCEDEC0DE) + cv_wait(&tqe->tqent_thread_cv, &tqe->tqent_thread_lock); + mutex_exit(&tqe->tqent_thread_lock); +#endif + + mutex_enter(lock); + + for (;;) { + /* + * If a task is scheduled (func != NULL), execute it, otherwise + * sleep, waiting for a job. + */ + if (tqe->tqent_func != NULL) { + hrtime_t start; + hrtime_t end; + + ASSERT(bucket->tqbucket_nalloc > 0); + + /* + * It is possible to free the entry right away before + * actually executing the task so that subsequent + * dispatches may immediately reuse it. But this, + * effectively, creates a two-length queue in the entry + * and may lead to a deadlock if the execution of the + * current task depends on the execution of the next + * scheduled task. So, we keep the entry busy until the + * task is processed. + */ + + mutex_exit(lock); + start = gethrtime(); + DTRACE_PROBE3(taskq__d__exec__start, taskq_t *, tq, + taskq_bucket_t *, bucket, taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE3(taskq__d__exec__end, taskq_t *, tq, + taskq_bucket_t *, bucket, taskq_ent_t *, tqe); + end = gethrtime(); + mutex_enter(lock); + bucket->tqbucket_totaltime += end - start; + + /* + * Return the entry to the bucket free list. + */ + tqe->tqent_func = NULL; + TQ_APPEND(bucket->tqbucket_freelist, tqe); + bucket->tqbucket_nalloc--; + bucket->tqbucket_nfree++; + ASSERT(!IS_EMPTY(bucket->tqbucket_freelist)); + /* + * taskq_wait() waits for nalloc to drop to zero on + * tqbucket_cv. + */ + cv_signal(&bucket->tqbucket_cv); + } + + /* + * At this point the entry must be in the bucket free list - + * either because it was there initially or because it just + * finished executing a task and put itself on the free list. + */ + ASSERT(bucket->tqbucket_nfree > 0); + /* + * Go to sleep unless we are closing. + * If a thread is sleeping too long, it dies. + */ + if (! (bucket->tqbucket_flags & TQBUCKET_CLOSE)) { + w = taskq_thread_wait(tq, lock, cv, + &cprinfo, taskq_thread_timeout * hz); + } + + /* + * At this point we may be in two different states: + * + * (1) tqent_func is set which means that a new task is + * dispatched and we need to execute it. + * + * (2) Thread is sleeping for too long or we are closing. In + * both cases destroy the thread and the entry. + */ + + /* If func is NULL we should be on the freelist. */ + ASSERT((tqe->tqent_func != NULL) || + (bucket->tqbucket_nfree > 0)); + /* If func is non-NULL we should be allocated */ + ASSERT((tqe->tqent_func == NULL) || + (bucket->tqbucket_nalloc > 0)); + + /* Check freelist consistency */ + ASSERT((bucket->tqbucket_nfree > 0) || + IS_EMPTY(bucket->tqbucket_freelist)); + ASSERT((bucket->tqbucket_nfree == 0) || + !IS_EMPTY(bucket->tqbucket_freelist)); + + if ((tqe->tqent_func == NULL) && + ((w == -1) || (bucket->tqbucket_flags & TQBUCKET_CLOSE))) { + /* + * This thread is sleeping for too long or we are + * closing - time to die. + * Thread creation/destruction happens rarely, + * so grabbing the lock is not a big performance issue. + * The bucket lock is dropped by CALLB_CPR_EXIT(). + */ + + /* Remove the entry from the free list. 
*/ + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + ASSERT(bucket->tqbucket_nfree > 0); + bucket->tqbucket_nfree--; + + TQ_STAT(bucket, tqs_tdeaths); + cv_signal(&bucket->tqbucket_cv); + tqe->tqent_thread = NULL; + mutex_enter(&tq->tq_lock); + tq->tq_tdeaths++; + mutex_exit(&tq->tq_lock); + CALLB_CPR_EXIT(&cprinfo); + kmem_cache_free(taskq_ent_cache, tqe); + thread_exit(); + } + } +} + + +/* + * Taskq creation. May sleep for memory. + * Always use automatically generated instances to avoid kstat name space + * collisions. + */ + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, &p0, 0, flags | TASKQ_NOINSTANCE)); +} + +/* + * Create an instance of task queue. It is legal to create task queues with the + * same name and different instances. + * + * taskq_create_instance is used by ddi_taskq_create() where it gets the + * instance from ddi_get_instance(). In some cases the instance is not + * initialized and is set to -1. This case is handled as if no instance was + * passed at all. + */ +taskq_t * +taskq_create_instance(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + ASSERT((instance >= 0) || (instance == -1)); + + if (instance < 0) { + flags |= TASKQ_NOINSTANCE; + } + + return (taskq_create_common(name, instance, nthreads, + pri, minalloc, maxalloc, &p0, 0, flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, proc_t *proc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); +#ifndef __APPLE__ + ASSERT(proc->p_flag & SSYS); +#endif + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, proc, 0, flags | TASKQ_NOINSTANCE)); +} + +taskq_t * +taskq_create_sysdc(const char *name, int nthreads, int minalloc, + int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); +#ifndef __APPLE__ + ASSERT(proc->p_flag & SSYS); +#endif + return (taskq_create_common(name, 0, nthreads, minclsyspri, minalloc, + maxalloc, proc, dc, flags | TASKQ_NOINSTANCE | TASKQ_DUTY_CYCLE)); +} + +static taskq_t * +taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); +#ifdef __APPLE__ + uint_t ncpus = max_ncpus; +#else + uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus); +#endif + uint_t bsize; /* # of buckets - always power of 2 */ + int max_nthreads; + + /* + * We are not allowed to use TASKQ_DYNAMIC with taskq_dispatch_ent() + * but that is done by spa.c - so we will simply mask DYNAMIC out. + */ + flags &= ~TASKQ_DYNAMIC; + + /* + * TASKQ_DYNAMIC, TASKQ_CPR_SAFE and TASKQ_THREADS_CPU_PCT are all + * mutually incompatible. 
+ */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_CPR_SAFE)); + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_THREADS_CPU_PCT)); + IMPLY((flags & TASKQ_CPR_SAFE), !(flags & TASKQ_THREADS_CPU_PCT)); + + /* Cannot have DYNAMIC with DUTY_CYCLE */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_DUTY_CYCLE)); + + /* Cannot have DUTY_CYCLE with a p0 kernel process */ + IMPLY((flags & TASKQ_DUTY_CYCLE), proc != &p0); + + /* Cannot have DC_BATCH without DUTY_CYCLE */ + ASSERT((flags & (TASKQ_DUTY_CYCLE|TASKQ_DC_BATCH)) != TASKQ_DC_BATCH); + + ASSERT(proc != NULL); + + bsize = 1 << (highbit(ncpus) - 1); + ASSERT(bsize >= 1); + bsize = MIN(bsize, taskq_maxbuckets); + + if (flags & TASKQ_DYNAMIC) { + ASSERT3S(nthreads, >=, 1); + tq->tq_maxsize = nthreads; + + /* For dynamic task queues use just one backup thread */ + nthreads = max_nthreads = 1; + + } else if (flags & TASKQ_THREADS_CPU_PCT) { + uint_t pct; + ASSERT3S(nthreads, >=, 0); + pct = nthreads; + + if (pct > taskq_cpupct_max_percent) + pct = taskq_cpupct_max_percent; + + /* + * If you're using THREADS_CPU_PCT, the process for the + * taskq threads must be curproc. This allows any pset + * binding to be inherited correctly. If proc is &p0, + * we won't be creating LWPs, so new threads will be assigned + * to the default processor set. + */ + /* ASSERT(curproc == proc || proc == &p0); */ + tq->tq_threads_ncpus_pct = pct; + nthreads = 1; /* corrected in taskq_thread_create() */ + max_nthreads = TASKQ_THREADS_PCT(max_ncpus, pct); + + } else { + ASSERT3S(nthreads, >=, 1); + max_nthreads = nthreads; + } + + if (max_nthreads < taskq_minimum_nthreads_max) + max_nthreads = taskq_minimum_nthreads_max; + + /* + * Make sure the name is 0-terminated, and conforms to the rules for + * C indentifiers + */ + (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1); + strident_canon(tq->tq_name, TASKQ_NAMELEN + 1); + + tq->tq_flags = flags | TASKQ_CHANGING; + tq->tq_active = 0; + tq->tq_instance = instance; + tq->tq_nthreads_target = nthreads; + tq->tq_nthreads_max = max_nthreads; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nbuckets = bsize; + tq->tq_proc = proc; + tq->tq_pri = pri; + tq->tq_DC = dc; + list_link_init(&tq->tq_cpupct_link); + + if (max_nthreads > 1) + tq->tq_threadlist = kmem_alloc( + sizeof (kthread_t *) * max_nthreads, KM_SLEEP); + + mutex_enter(&tq->tq_lock); + if (flags & TASKQ_PREPOPULATE) { + while (minalloc-- > 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + } + + /* + * Before we start creating threads for this taskq, take a + * zone hold so the zone can't go away before taskq_destroy + * makes sure all the taskq threads are gone. This hold is + * similar in purpose to those taken by zthread_create(). + */ +#ifndef __APPLE__ + zone_hold(tq->tq_proc->p_zone); +#endif + /* + * Create the first thread, which will create any other threads + * necessary. taskq_thread_create will not return until we have + * enough threads to be able to process requests. 
+ */ + taskq_thread_create(tq); + mutex_exit(&tq->tq_lock); + + if (flags & TASKQ_DYNAMIC) { + taskq_bucket_t *bucket = kmem_zalloc(sizeof (taskq_bucket_t) * + bsize, KM_SLEEP); + int b_id; + + tq->tq_buckets = bucket; + + /* Initialize each bucket */ + for (b_id = 0; b_id < bsize; b_id++, bucket++) { + mutex_init(&bucket->tqbucket_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&bucket->tqbucket_cv, NULL, CV_DEFAULT, NULL); + bucket->tqbucket_taskq = tq; + bucket->tqbucket_freelist.tqent_next = + bucket->tqbucket_freelist.tqent_prev = + &bucket->tqbucket_freelist; + if (flags & TASKQ_PREPOPULATE) + taskq_bucket_extend(bucket); + } + } + + /* + * Install kstats. + * We have two cases: + * 1) Instance is provided to taskq_create_instance(). In this case it + * should be >= 0 and we use it. + * + * 2) Instance is not provided and is automatically generated + */ + if (flags & TASKQ_NOINSTANCE) { + instance = tq->tq_instance = + (int)(uintptr_t)vmem_alloc(taskq_id_arena, 1, VM_SLEEP); + } + + if (flags & TASKQ_DYNAMIC) { + if ((tq->tq_kstat = kstat_create("unix", instance, + tq->tq_name, "taskq_d", KSTAT_TYPE_NAMED, + sizeof (taskq_d_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + tq->tq_kstat->ks_lock = &taskq_d_kstat_lock; + tq->tq_kstat->ks_data = &taskq_d_kstat; + tq->tq_kstat->ks_update = taskq_d_kstat_update; + tq->tq_kstat->ks_private = tq; + kstat_install(tq->tq_kstat); + } + } else { + if ((tq->tq_kstat = kstat_create("unix", instance, tq->tq_name, + "taskq", KSTAT_TYPE_NAMED, + sizeof (taskq_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + tq->tq_kstat->ks_lock = &taskq_kstat_lock; + tq->tq_kstat->ks_data = &taskq_kstat; + tq->tq_kstat->ks_update = taskq_kstat_update; + tq->tq_kstat->ks_private = tq; + kstat_install(tq->tq_kstat); + } + } + + return (tq); +} + +/* + * taskq_destroy(). + * + * Assumes: by the time taskq_destroy is called no one will use this task queue + * in any way and no one will try to dispatch entries in it. + */ +void +taskq_destroy(taskq_t *tq) +{ + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE)); + + /* + * Destroy kstats. + */ + if (tq->tq_kstat != NULL) { + kstat_delete(tq->tq_kstat); + tq->tq_kstat = NULL; + } + + /* + * Destroy instance if needed. + */ + if (tq->tq_flags & TASKQ_NOINSTANCE) { + vmem_free(taskq_id_arena, (void *)(uintptr_t)(tq->tq_instance), + 1); + tq->tq_instance = 0; + } + + /* + * Unregister from the cpupct list. + */ +#ifndef __APPLE__ + if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { + taskq_cpupct_remove(tq); + } +#endif + + /* + * Wait for any pending entries to complete. + */ + taskq_wait(tq); + + mutex_enter(&tq->tq_lock); + ASSERT((tq->tq_task.tqent_next == &tq->tq_task) && + (tq->tq_active == 0)); + + /* notify all the threads that they need to exit */ + tq->tq_nthreads_target = 0; + + tq->tq_flags |= TASKQ_CHANGING; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + + while (tq->tq_nthreads != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + + if (tq->tq_nthreads_max != 1) + kmem_free(tq->tq_threadlist, sizeof (kthread_t *) * + tq->tq_nthreads_max); + + tq->tq_minalloc = 0; + while (tq->tq_nalloc != 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + + mutex_exit(&tq->tq_lock); + + /* + * Mark each bucket as closing and wakeup all sleeping threads. 
+ */ + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + taskq_ent_t *tqe; + + mutex_enter(&b->tqbucket_lock); + + b->tqbucket_flags |= TQBUCKET_CLOSE; + /* Wakeup all sleeping threads */ + + for (tqe = b->tqbucket_freelist.tqent_next; + tqe != &b->tqbucket_freelist; tqe = tqe->tqent_next) + cv_signal(&tqe->tqent_cv); + + ASSERT(b->tqbucket_nalloc == 0); + + /* + * At this point we waited for all pending jobs to complete (in + * both the task queue and the bucket and no new jobs should + * arrive. Wait for all threads to die. + */ + while (b->tqbucket_nfree > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + mutex_destroy(&b->tqbucket_lock); + cv_destroy(&b->tqbucket_cv); + } + + if (tq->tq_buckets != NULL) { + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + kmem_free(tq->tq_buckets, + sizeof (taskq_bucket_t) * tq->tq_nbuckets); + + /* Cleanup fields before returning tq to the cache */ + tq->tq_buckets = NULL; + tq->tq_tcreates = 0; + tq->tq_tdeaths = 0; + } else { + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + } + + /* + * Now that all the taskq threads are gone, we can + * drop the zone hold taken in taskq_create_common + */ +#ifndef __APPLE__ + zone_rele(tq->tq_proc->p_zone); +#endif + + tq->tq_threads_ncpus_pct = 0; + tq->tq_totaltime = 0; + tq->tq_tasks = 0; + tq->tq_maxtasks = 0; + tq->tq_executed = 0; + kmem_cache_free(taskq_cache, tq); +} + +/* + * Extend a bucket with a new entry on the free list and attach a worker thread + * to it. + * + * Argument: pointer to the bucket. + * + * This function may quietly fail. It is only used by taskq_dispatch() which + * handles such failures properly. + */ +static void +taskq_bucket_extend(void *arg) +{ + taskq_ent_t *tqe; + taskq_bucket_t *b = (taskq_bucket_t *)arg; + taskq_t *tq = b->tqbucket_taskq; + int nthreads; +#ifdef __APPLE__ + kthread_t *thread; +#endif + + if (! ENOUGH_MEMORY()) { + TQ_STAT(b, tqs_nomem); + return; + } + + mutex_enter(&tq->tq_lock); + + /* + * Observe global taskq limits on the number of threads. + */ + if (tq->tq_tcreates++ - tq->tq_tdeaths > tq->tq_maxsize) { + tq->tq_tcreates--; + mutex_exit(&tq->tq_lock); + return; + } + mutex_exit(&tq->tq_lock); + + tqe = kmem_cache_alloc(taskq_ent_cache, KM_NOSLEEP); + + if (tqe == NULL) { + mutex_enter(&tq->tq_lock); + TQ_STAT(b, tqs_nomem); + tq->tq_tcreates--; + mutex_exit(&tq->tq_lock); + return; + } + + ASSERT(tqe->tqent_thread == NULL); + + tqe->tqent_un.tqent_bucket = b; + +#ifdef __APPLE__ + /* + * There's no way in Mac OS X KPI to create a thread + * in a suspended state (TS_STOPPED). So instead we + * use tqent_thread as a flag and the thread must wait + * for it to be initialized (below). + */ + tqe->tqent_thread = (kthread_t *)0xCEDEC0DE; + thread = thread_create(NULL, 0, (void (*)(void *))taskq_d_thread, + tqe, 0, pp0, TS_RUN, tq->tq_pri); +#else + + /* + * Create a thread in a TS_STOPPED state first. If it is successfully + * created, place the entry on the free list and start the thread. + */ + tqe->tqent_thread = thread_create(NULL, 0, taskq_d_thread, tqe, + 0, tq->tq_proc, TS_STOPPED, tq->tq_pri); +#endif /* __APPLE__ */ + + /* + * Once the entry is ready, link it to the the bucket free list. 
+ */ + mutex_enter(&b->tqbucket_lock); + tqe->tqent_func = NULL; + TQ_APPEND(b->tqbucket_freelist, tqe); + b->tqbucket_nfree++; + TQ_STAT(b, tqs_tcreates); + +#if TASKQ_STATISTIC + nthreads = b->tqbucket_stat.tqs_tcreates - + b->tqbucket_stat.tqs_tdeaths; + b->tqbucket_stat.tqs_maxthreads = MAX(nthreads, + b->tqbucket_stat.tqs_maxthreads); +#endif + + mutex_exit(&b->tqbucket_lock); + /* + * Start the stopped thread. + */ +#ifdef __APPLE__ + mutex_enter(&tqe->tqent_thread_lock); + tqe->tqent_thread = thread; + cv_signal(&tqe->tqent_thread_cv); + mutex_exit(&tqe->tqent_thread_lock); +#else + thread_lock(tqe->tqent_thread); + tqe->tqent_thread->t_taskq = tq; + tqe->tqent_thread->t_schedflag |= TS_ALLSTART; + setrun_locked(tqe->tqent_thread); + thread_unlock(tqe->tqent_thread); +#endif /* __APPLE__ */ +} + +static int +taskq_kstat_update(kstat_t *ksp, int rw) +{ + struct taskq_kstat *tqsp = &taskq_kstat; + taskq_t *tq = ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (EACCES); + +#ifdef __APPLE__ + tqsp->tq_pid.value.ui64 = 0; /* kernel_task'd pid is 0 */ +#else + tqsp->tq_pid.value.ui64 = proc_pid(tq->tq_proc->p_pid); +#endif + tqsp->tq_tasks.value.ui64 = tq->tq_tasks; + tqsp->tq_executed.value.ui64 = tq->tq_executed; + tqsp->tq_maxtasks.value.ui64 = tq->tq_maxtasks; + tqsp->tq_totaltime.value.ui64 = tq->tq_totaltime; + tqsp->tq_nactive.value.ui64 = tq->tq_active; + tqsp->tq_nalloc.value.ui64 = tq->tq_nalloc; + tqsp->tq_pri.value.ui64 = tq->tq_pri; + tqsp->tq_nthreads.value.ui64 = tq->tq_nthreads; + return (0); +} + +static int +taskq_d_kstat_update(kstat_t *ksp, int rw) +{ + struct taskq_d_kstat *tqsp = &taskq_d_kstat; + taskq_t *tq = ksp->ks_private; + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + + tqsp->tqd_btasks.value.ui64 = tq->tq_tasks; + tqsp->tqd_bexecuted.value.ui64 = tq->tq_executed; + tqsp->tqd_bmaxtasks.value.ui64 = tq->tq_maxtasks; + tqsp->tqd_bnalloc.value.ui64 = tq->tq_nalloc; + tqsp->tqd_bnactive.value.ui64 = tq->tq_active; + tqsp->tqd_btotaltime.value.ui64 = tq->tq_totaltime; + tqsp->tqd_pri.value.ui64 = tq->tq_pri; + + tqsp->tqd_hits.value.ui64 = 0; + tqsp->tqd_misses.value.ui64 = 0; + tqsp->tqd_overflows.value.ui64 = 0; + tqsp->tqd_tcreates.value.ui64 = 0; + tqsp->tqd_tdeaths.value.ui64 = 0; + tqsp->tqd_maxthreads.value.ui64 = 0; + tqsp->tqd_nomem.value.ui64 = 0; + tqsp->tqd_disptcreates.value.ui64 = 0; + tqsp->tqd_totaltime.value.ui64 = 0; + tqsp->tqd_nalloc.value.ui64 = 0; + tqsp->tqd_nfree.value.ui64 = 0; + + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + tqsp->tqd_hits.value.ui64 += b->tqbucket_stat.tqs_hits; + tqsp->tqd_misses.value.ui64 += b->tqbucket_stat.tqs_misses; + tqsp->tqd_overflows.value.ui64 += b->tqbucket_stat.tqs_overflow; + tqsp->tqd_tcreates.value.ui64 += b->tqbucket_stat.tqs_tcreates; + tqsp->tqd_tdeaths.value.ui64 += b->tqbucket_stat.tqs_tdeaths; + tqsp->tqd_maxthreads.value.ui64 += + b->tqbucket_stat.tqs_maxthreads; + tqsp->tqd_nomem.value.ui64 += b->tqbucket_stat.tqs_nomem; + tqsp->tqd_disptcreates.value.ui64 += + b->tqbucket_stat.tqs_disptcreates; + tqsp->tqd_totaltime.value.ui64 += b->tqbucket_totaltime; + tqsp->tqd_nalloc.value.ui64 += b->tqbucket_nalloc; + tqsp->tqd_nfree.value.ui64 += b->tqbucket_nfree; + } + return (0); +} + +int +EMPTY_TASKQ(taskq_t *tq) +{ +#ifdef _KERNEL + return ((tq)->tq_task.tqent_next == &(tq)->tq_task); +#else + return (tq->tq_task.tqent_next == &tq->tq_task || tq->tq_active == 0); +#endif +} diff --git 
a/module/os/macos/spl/spl-thread.c b/module/os/macos/spl/spl-thread.c new file mode 100644 index 0000000000..886190cba7 --- /dev/null +++ b/module/os/macos/spl/spl-thread.c @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013, 2020 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +uint64_t zfs_threads = 0; + +kthread_t * +spl_thread_create( + caddr_t stk, + size_t stksize, + void (*proc)(void *), + void *arg, + size_t len, + /* struct proc *pp, */ + int state, +#ifdef SPL_DEBUG_THREAD + char *filename, + int line, +#endif + pri_t pri) +{ + kern_return_t result; + thread_t thread; + +#ifdef SPL_DEBUG_THREAD + printf("Start thread pri %d by '%s':%d\n", pri, + filename, line); +#endif + + result = kernel_thread_start((thread_continue_t)proc, arg, &thread); + + if (result != KERN_SUCCESS) + return (NULL); + + /* Improve the priority when asked to do so */ + if (pri > minclsyspri) { + thread_precedence_policy_data_t policy; + + /* + * kernel priorities (osfmk/kern/sched.h) + * + * 96 Reserved (real-time) + * 95 Kernel mode only + * A + * + + * (16 levels) + * + + * V + * 80 Kernel mode only + * 79 System high priority + * + * spl/include/sys/sysmacros.h + * #define maxclsyspri 89 + * #define minclsyspri 81 BASEPRI_KERNEL + * #define defclsyspri 81 BASEPRI_KERNEL + * + * Calling policy.importance = 10 will create + * a default pri (81) at pri (91). + * + * So asking for pri (85) we do 85-81 = 4. + * + * IllumOS priorities are: + * #define MAXCLSYSPRI 99 + * #define MINCLSYSPRI 60 + */ + + policy.importance = (pri - minclsyspri); + + thread_policy_set(thread, + THREAD_PRECEDENCE_POLICY, + (thread_policy_t)&policy, + THREAD_PRECEDENCE_POLICY_COUNT); + } + + thread_deallocate(thread); + + atomic_inc_64(&zfs_threads); + + return ((kthread_t *)thread); +} + +kthread_t * +spl_current_thread(void) +{ + thread_t cur_thread = current_thread(); + return ((kthread_t *)cur_thread); +} + +void +spl_thread_exit(void) +{ + atomic_dec_64(&zfs_threads); + + tsd_thread_exit(); + (void) thread_terminate(current_thread()); +} + + +/* + * IllumOS has callout.c - place it here until we find a better place + */ +callout_id_t +timeout_generic(int type, void (*func)(void *), void *arg, + hrtime_t expiration, hrtime_t resolution, int flags) +{ + struct timespec ts; + hrt2ts(expiration, &ts); + bsd_timeout(func, arg, &ts); + /* + * bsd_untimeout() requires func and arg to cancel the timeout, so + * pass it back as the callout_id. 
If we one day were to implement + * untimeout_generic() they would pass it back to us + */ + return ((callout_id_t)arg); +} diff --git a/module/os/macos/spl/spl-time.c b/module/os/macos/spl/spl-time.c new file mode 100644 index 0000000000..151691d60b --- /dev/null +++ b/module/os/macos/spl/spl-time.c @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include + +/* + * gethrtime() provides high-resolution timestamps with + * machine-dependent origin Hence its primary use is to specify + * intervals. + */ + +static hrtime_t +zfs_abs_to_nano(uint64_t elapsed) +{ + static mach_timebase_info_data_t sTimebaseInfo = { 0, 0 }; + + /* + * If this is the first time we've run, get the timebase. + * We can use denom == 0 to indicate that sTimebaseInfo is + * uninitialised because it makes no sense to have a zero + * denominator in a fraction. + */ + + if (sTimebaseInfo.denom == 0) { + (void) clock_timebase_info(&sTimebaseInfo); + } + + /* + * Convert to nanoseconds. + * return (elapsed * (uint64_t)sTimebaseInfo.numer) / + * (uint64_t)sTimebaseInfo.denom; + * + * Provided the final result is representable in 64 bits the + * following maneuver will deliver that result without intermediate + * overflow. 
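The split is exact for integer division: writing elapsed = eta32*2^32 + eps32 and numer*eta32 = q*denom + r gives (elapsed*numer)/denom = q*2^32 + (r*2^32 + numer*eps32)/denom. A small standalone sketch (not kernel code; the numer/denom pairs are invented, real values come from the timebase) that checks the same arithmetic against a 128-bit reference using the compiler's __uint128_t extension:

/* Standalone check of the split-multiply used by zfs_abs_to_nano(). */
#include <stdint.h>
#include <stdio.h>

static uint64_t
split_scale(uint64_t elapsed, uint32_t numer, uint32_t denom)
{
	uint64_t eta32 = elapsed >> 32;
	uint64_t eps32 = elapsed & 0x00000000ffffffffULL;
	uint64_t mu64 = (uint64_t)numer * eta32;
	uint64_t lambda64 = (uint64_t)numer * eps32;
	uint64_t q32 = mu64 / denom;
	uint64_t r32 = mu64 - (q32 * denom);	/* mu64 % denom */

	return ((q32 << 32) + ((r32 << 32) + lambda64) / denom);
}

int
main(void)
{
	/* Invented numer/denom pairs for illustration only. */
	struct { uint32_t n, d; } tb[] = { { 125, 3 }, { 1, 1 }, { 24, 25 } };
	uint64_t ticks[] = { 0, 1, 123456789ULL, 1ULL << 40, 0xfffffffffULL };

	for (int i = 0; i < 3; i++) {
		for (int j = 0; j < 5; j++) {
			uint64_t got = split_scale(ticks[j], tb[i].n, tb[i].d);
			uint64_t ref = (uint64_t)(((__uint128_t)ticks[j] *
			    tb[i].n) / tb[i].d);
			printf("%llu ticks * %u/%u -> %llu ns (%s)\n",
			    (unsigned long long)ticks[j], tb[i].n, tb[i].d,
			    (unsigned long long)got,
			    got == ref ? "matches" : "MISMATCH");
		}
	}
	return (0);
}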
+ */ + if (sTimebaseInfo.denom == sTimebaseInfo.numer) + return (elapsed); + else if (sTimebaseInfo.denom == 1) + return (elapsed * (uint64_t)sTimebaseInfo.numer); + else { + /* Decompose elapsed = eta32 * 2^32 + eps32: */ + uint64_t eta32 = elapsed >> 32; + uint64_t eps32 = elapsed & 0x00000000ffffffffLL; + + uint32_t numer = sTimebaseInfo.numer; + uint32_t denom = sTimebaseInfo.denom; + + /* Form product of elapsed64 (decomposed) and numer: */ + uint64_t mu64 = numer * eta32; + uint64_t lambda64 = numer * eps32; + + /* Divide the constituents by denom: */ + uint64_t q32 = mu64/denom; + uint64_t r32 = mu64 - (q32 * denom); /* mu64 % denom */ + + return ((q32 << 32) + ((r32 << 32) + lambda64) / denom); + } +} + + +hrtime_t +gethrtime(void) +{ + static uint64_t start = 0; + if (start == 0) + start = mach_absolute_time(); + return (zfs_abs_to_nano(mach_absolute_time() - start)); +} + + +void +gethrestime(struct timespec *ts) +{ + nanotime(ts); +} + +time_t +gethrestime_sec(void) +{ + struct timeval tv; + + microtime(&tv); + return (tv.tv_sec); +} + +void +hrt2ts(hrtime_t hrt, struct timespec *tsp) +{ + uint32_t sec, nsec, tmp; + + tmp = (uint32_t)(hrt >> 30); + sec = tmp - (tmp >> 2); + sec = tmp - (sec >> 5); + sec = tmp + (sec >> 1); + sec = tmp - (sec >> 6) + 7; + sec = tmp - (sec >> 3); + sec = tmp + (sec >> 1); + sec = tmp + (sec >> 3); + sec = tmp + (sec >> 4); + tmp = (sec << 7) - sec - sec - sec; + tmp = (tmp << 7) - tmp - tmp - tmp; + tmp = (tmp << 7) - tmp - tmp - tmp; + nsec = (uint32_t)hrt - (tmp << 9); + while (nsec >= NANOSEC) { + nsec -= NANOSEC; + sec++; + } + tsp->tv_sec = (time_t)sec; + tsp->tv_nsec = nsec; +} diff --git a/module/os/macos/spl/spl-tsd.c b/module/os/macos/spl/spl-tsd.c new file mode 100644 index 0000000000..6ca970a9f9 --- /dev/null +++ b/module/os/macos/spl/spl-tsd.c @@ -0,0 +1,389 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2014 Jorgen Lundman + * + * A thread will call tsd_create(&key, dtor) to allocate a new + * "variable" placement, called a "key". In illumos, this is the index + * into an array of dtors. (If dtor is passed as NULL, TSD internally + * set it to an empty function). So if the dtor array[i] is NULL, it + * is "free" and can be allocated. (returned as *key = i). + * illumos will grow this dtor array with realloc when required. + * Then Any Thread can set a value on this "key index", and this value + * is specific to each thread by calling tsd_set(key, value). + * And can be retrieved with tsd_get(key). + * When tsd_destroy(key) is called, we need to loop through all + * threads different "values", and call the dtor on each one. 
+ * Likewise, we need to know when a thread exists, so we can clean up + * the values (by calling dtor for each one) so we patch into the + * thread_exit() call, to also call tsd_thread_exit(). + * + * In OsX, we build an array of the dtors, and return the key index, + * this is to store the dtor, and know which "key" values are valid. + * Then we build an AVL tree, indexed by , to store + * each thread's value. This allows us to do key access quick. + * On thread_exit, we iterate the dtor array, and for each key + * remove . + * On tsd_destroy(key), we use AVL find nearest with , then + * avl_next as long as key remains the same, to remove each thread value. + * + * Note a key of "0" is considered "invalid" in IllumOS, so we return + * a "1" based index, even though internally it is 0 based. + * + */ + +#include +#include +#include +#include +#include + +/* Initial size of array, and realloc growth size */ +#define TSD_ALLOC_SIZE 10 + +/* array of dtors, allocated in init */ +static dtor_func_t *tsd_dtor_array = NULL; +static uint32_t tsd_dtor_size = 0; +static avl_tree_t tsd_tree; + +struct spl_tsd_node_s +{ + /* The index/key */ + uint_t tsd_key; + thread_t tsd_thread; + + /* The payload */ + void *tsd_value; + + /* Internal mumbo */ + avl_node_t tsd_link_node; +}; +typedef struct spl_tsd_node_s spl_tsd_node_t; + +static kmutex_t spl_tsd_mutex; + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. + * When setting the tsd initially it will be slower due to additional + * required locking and potential memory allocations. + * If the value is set to NULL, we also release it. + */ +int +tsd_set(uint_t key, void *value) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + uint_t i; + + /* Invalid key values? */ + if ((key < 1) || + (key >= tsd_dtor_size)) { + return (EINVAL); + } + + i = key - 1; + + /* + * First handle the easy case, already has a node/value + * so we just need to find it, update it. + */ + + search.tsd_key = i; + search.tsd_thread = current_thread(); + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + mutex_exit(&spl_tsd_mutex); + + if (entry) { + + /* If value is set to NULL, release it as well */ + if (value == NULL) { + mutex_enter(&spl_tsd_mutex); + avl_remove(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + kmem_free(entry, sizeof (*entry)); + return (0); + } + entry->tsd_value = value; + return (0); + } + + /* No node, we need to create a new one and insert it. */ + /* But if the value is NULL, then why create one eh? */ + if (value == NULL) + return (0); + + entry = kmem_alloc(sizeof (spl_tsd_node_t), KM_SLEEP); + + entry->tsd_key = i; + entry->tsd_thread = current_thread(); + entry->tsd_value = value; + + mutex_enter(&spl_tsd_mutex); + avl_add(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + return (0); +} + +/* + * tsd_get - get thread specific data for specified thread + * @key: lookup key + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get_by_thread(uint_t key, thread_t thread) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + uint_t i; + + /* Invalid key values? 
*/ + if ((key < 1) || + (key >= tsd_dtor_size)) { + return (NULL); + } + + i = key - 1; + + search.tsd_key = i; + search.tsd_thread = thread; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + mutex_exit(&spl_tsd_mutex); + + return (entry ? entry->tsd_value : NULL); +} + +void * +tsd_get(uint_t key) +{ + return (tsd_get_by_thread(key, current_thread())); +} + +static void +tsd_internal_dtor(void *value) +{ +} + +/* + * Create TSD for a pid and fill in key with unique value, remember the dtor + * + * We cheat and create an entry with pid=0, to keep the dtor. + */ +void +tsd_create(uint_t *keyp, dtor_func_t dtor) +{ + uint_t i; + + if (*keyp) + return; + + // Iterate the dtor_array, looking for first NULL + for (i = 0; i < TSD_ALLOC_SIZE; i++) { + if (tsd_dtor_array[i] == NULL) break; + } + + /* Do we need to grow the list? */ + if (i >= tsd_dtor_size) { + printf("SPL: tsd list growing not implemented\n"); + return; + } + + if (dtor == NULL) + dtor = tsd_internal_dtor; + + tsd_dtor_array[i] = dtor; + + *keyp = i + 1; +} + +void +tsd_destroy(uint_t *keyp) +{ + spl_tsd_node_t *entry = NULL, *next = NULL; + spl_tsd_node_t search; + avl_index_t loc; + dtor_func_t dtor = NULL; + uint_t i; + + /* Invalid key values? */ + if ((*keyp < 1) || + (*keyp >= tsd_dtor_size)) { + return; + } + + i = *keyp - 1; + *keyp = 0; + + ASSERT(tsd_dtor_array[i] != NULL); + + dtor = tsd_dtor_array[i]; + tsd_dtor_array[i] = NULL; + + /* + * For each thread; + * if it has a value + * call the dtor + */ + search.tsd_key = i; + search.tsd_thread = NULL; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + + /* + * "entry" should really be NULL here, as we searched for the + * NULL thread + */ + if (entry == NULL) + entry = avl_nearest(&tsd_tree, loc, AVL_AFTER); + + /* Now, free node, and go to next, as long as the key matches */ + while (entry && (entry->tsd_key == i)) { + next = AVL_NEXT(&tsd_tree, entry); + + /* If we have a value, call the dtor for this thread */ + if (entry->tsd_value) + dtor(entry->tsd_value); + + avl_remove(&tsd_tree, entry); + + kmem_free(entry, sizeof (*entry)); + + entry = next; + } + + mutex_exit(&spl_tsd_mutex); +} + + + +/* + * A thread is exiting, clear out any tsd values it might have. 
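tsd_thread_exit() below is that cleanup hook. From a consumer's point of view, the interface described in the comments at the top of this file boils down to: tsd_create() hands out a 1-based key and records the destructor, tsd_set()/tsd_get() attach and fetch a per-thread value under that key, and tsd_destroy() (or thread exit) runs the destructor for every thread that still holds a value. A short hypothetical usage sketch, with invented my_* names and a made-up per-thread state struct:

typedef struct my_state {
	uint64_t	ms_calls;	/* hypothetical per-thread counter */
} my_state_t;

static uint_t my_tsd_key = 0;		/* 0 == not yet created */

static void
my_state_dtor(void *value)
{
	/* Runs once per thread still holding a value at destroy/exit. */
	kmem_free(value, sizeof (my_state_t));
}

void
my_module_init(void)
{
	tsd_create(&my_tsd_key, my_state_dtor);	/* key becomes >= 1 */
}

my_state_t *
my_state_get(void)
{
	my_state_t *s = tsd_get(my_tsd_key);

	if (s == NULL) {
		s = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
		VERIFY0(tsd_set(my_tsd_key, s));  /* binds to current thread */
	}
	return (s);
}

void
my_module_fini(void)
{
	tsd_destroy(&my_tsd_key);	/* dtor fires for every thread's value */
}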
+ */ +void +tsd_thread_exit(void) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + int i; + + search.tsd_thread = current_thread(); + + /* For all defined dtor/values */ + for (i = 0; i < tsd_dtor_size; i++) { + + /* If not allocated, skip */ + if (tsd_dtor_array[i] == NULL) continue; + + /* Find out of this thread has a value */ + search.tsd_key = i; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + if (entry) avl_remove(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + if (entry == NULL) continue; + + /* If we have a value, call dtor */ + if (entry->tsd_value) + tsd_dtor_array[i](entry->tsd_value); + + kmem_free(entry, sizeof (*entry)); + } // for all i +} + +static int +tsd_tree_cmp(const void *arg1, const void *arg2) +{ + const spl_tsd_node_t *node1 = arg1; + const spl_tsd_node_t *node2 = arg2; + if (node1->tsd_key > node2->tsd_key) + return (1); + if (node1->tsd_key < node2->tsd_key) + return (-1); + if (node1->tsd_thread > node2->tsd_thread) + return (1); + if (node1->tsd_thread < node2->tsd_thread) + return (-1); + return (0); +} + +int +spl_tsd_init(void) +{ + tsd_dtor_array = kmem_zalloc(sizeof (dtor_func_t) * TSD_ALLOC_SIZE, + KM_SLEEP); + tsd_dtor_size = TSD_ALLOC_SIZE; + + mutex_init(&spl_tsd_mutex, NULL, MUTEX_DEFAULT, NULL); + avl_create(&tsd_tree, tsd_tree_cmp, + sizeof (spl_tsd_node_t), + offsetof(spl_tsd_node_t, tsd_link_node)); + return (0); +} + + +uint64_t +spl_tsd_size(void) +{ + return (avl_numnodes(&tsd_tree)); +} + +void +spl_tsd_fini(void) +{ + spl_tsd_node_t *entry = NULL; + void *cookie = NULL; + + printf("SPL: tsd unloading %llu\n", spl_tsd_size()); + + mutex_enter(&spl_tsd_mutex); + cookie = NULL; + while ((entry = avl_destroy_nodes(&tsd_tree, &cookie))) { + kmem_free(entry, sizeof (*entry)); + } + mutex_exit(&spl_tsd_mutex); + + avl_destroy(&tsd_tree); + mutex_destroy(&spl_tsd_mutex); + + kmem_free(tsd_dtor_array, sizeof (dtor_func_t) * tsd_dtor_size); + tsd_dtor_size = 0; +} diff --git a/module/os/macos/spl/spl-vmem.c b/module/os/macos/spl/spl-vmem.c new file mode 100644 index 0000000000..f54a0b133f --- /dev/null +++ b/module/os/macos/spl/spl-vmem.c @@ -0,0 +1,3935 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2017 Sean Doran + */ + +/* + * Big Theory Statement for the virtual memory allocator. 
+ * + * For a more complete description of the main ideas, see: + * + * Jeff Bonwick and Jonathan Adams, + * + * Magazines and vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources. + * + * Proceedings of the 2001 Usenix Conference. + * Available as http://www.usenix.org/event/usenix01/bonwick.html + * + * + * 1. General Concepts + * ------------------- + * + * 1.1 Overview + * ------------ + * We divide the kernel address space into a number of logically distinct + * pieces, or *arenas*: text, data, heap, stack, and so on. Within these + * arenas we often subdivide further; for example, we use heap addresses + * not only for the kernel heap (kmem_alloc() space), but also for DVMA, + * bp_mapin(), /dev/kmem, and even some device mappings like the TOD chip. + * The kernel address space, therefore, is most accurately described as + * a tree of arenas in which each node of the tree *imports* some subset + * of its parent. The virtual memory allocator manages these arenas and + * supports their natural hierarchical structure. + * + * 1.2 Arenas + * ---------- + * An arena is nothing more than a set of integers. These integers most + * commonly represent virtual addresses, but in fact they can represent + * anything at all. For example, we could use an arena containing the + * integers minpid through maxpid to allocate process IDs. vmem_create() + * and vmem_destroy() create and destroy vmem arenas. In order to + * differentiate between arenas used for adresses and arenas used for + * identifiers, the VMC_IDENTIFIER flag is passed to vmem_create(). This + * prevents identifier exhaustion from being diagnosed as general memory + * failure. + * + * 1.3 Spans + * --------- + * We represent the integers in an arena as a collection of *spans*, or + * contiguous ranges of integers. For example, the kernel heap consists + * of just one span: [kernelheap, ekernelheap). Spans can be added to an + * arena in two ways: explicitly, by vmem_add(), or implicitly, by + * importing, as described in Section 1.5 below. + * + * 1.4 Segments + * ------------ + * Spans are subdivided into *segments*, each of which is either allocated + * or free. A segment, like a span, is a contiguous range of integers. + * Each allocated segment [addr, addr + size) represents exactly one + * vmem_alloc(size) that returned addr. Free segments represent the space + * between allocated segments. If two free segments are adjacent, we + * coalesce them into one larger segment; that is, if segments [a, b) and + * [b, c) are both free, we merge them into a single segment [a, c). + * The segments within a span are linked together in increasing-address order + * so we can easily determine whether coalescing is possible. + * + * Segments never cross span boundaries. When all segments within + * an imported span become free, we return the span to its source. + * + * 1.5 Imported Memory + * ------------------- + * As mentioned in the overview, some arenas are logical subsets of + * other arenas. For example, kmem_va_arena (a virtual address cache + * that satisfies most kmem_slab_create() requests) is just a subset + * of heap_arena (the kernel heap) that provides caching for the most + * common slab sizes. When kmem_va_arena runs out of virtual memory, + * it *imports* more from the heap; we say that heap_arena is the + * *vmem source* for kmem_va_arena. vmem_create() allows you to + * specify any existing vmem arena as the source for your new arena. 
+ * Topologically, since every arena is a child of at most one source, + * the set of all arenas forms a collection of trees. + * + * 1.6 Constrained Allocations + * --------------------------- + * Some vmem clients are quite picky about the kind of address they want. + * For example, the DVMA code may need an address that is at a particular + * phase with respect to some alignment (to get good cache coloring), or + * that lies within certain limits (the addressable range of a device), + * or that doesn't cross some boundary (a DMA counter restriction) -- + * or all of the above. vmem_xalloc() allows the client to specify any + * or all of these constraints. + * + * 1.7 The Vmem Quantum + * -------------------- + * Every arena has a notion of 'quantum', specified at vmem_create() time, + * that defines the arena's minimum unit of currency. Most commonly the + * quantum is either 1 or PAGESIZE, but any power of 2 is legal. + * All vmem allocations are guaranteed to be quantum-aligned. + * + * 1.8 Quantum Caching + * ------------------- + * A vmem arena may be so hot (frequently used) that the scalability of vmem + * allocation is a significant concern. We address this by allowing the most + * common allocation sizes to be serviced by the kernel memory allocator, + * which provides low-latency per-cpu caching. The qcache_max argument to + * vmem_create() specifies the largest allocation size to cache. + * + * 1.9 Relationship to Kernel Memory Allocator + * ------------------------------------------- + * Every kmem cache has a vmem arena as its slab supplier. The kernel memory + * allocator uses vmem_alloc() and vmem_free() to create and destroy slabs. + * + * + * 2. Implementation + * ----------------- + * + * 2.1 Segment lists and markers + * ----------------------------- + * The segment structure (vmem_seg_t) contains two doubly-linked lists. + * + * The arena list (vs_anext/vs_aprev) links all segments in the arena. + * In addition to the allocated and free segments, the arena contains + * special marker segments at span boundaries. Span markers simplify + * coalescing and importing logic by making it easy to tell both when + * we're at a span boundary (so we don't coalesce across it), and when + * a span is completely free (its neighbors will both be span markers). + * + * Imported spans will have vs_import set. + * + * The next-of-kin list (vs_knext/vs_kprev) links segments of the same type: + * (1) for allocated segments, vs_knext is the hash chain linkage; + * (2) for free segments, vs_knext is the freelist linkage; + * (3) for span marker segments, vs_knext is the next span marker. + * + * 2.2 Allocation hashing + * ---------------------- + * We maintain a hash table of all allocated segments, hashed by address. + * This allows vmem_free() to discover the target segment in constant time. + * vmem_update() periodically resizes hash tables to keep hash chains short. + * + * 2.3 Freelist management + * ----------------------- + * We maintain power-of-2 freelists for free segments, i.e. free segments + * of size >= 2^n reside in vmp->vm_freelist[n]. To ensure constant-time + * allocation, vmem_xalloc() looks not in the first freelist that *might* + * satisfy the allocation, but in the first freelist that *definitely* + * satisfies the allocation (unless VM_BESTFIT is specified, or all larger + * freelists are empty). 
For example, a 1000-byte allocation will be + * satisfied not from the 512..1023-byte freelist, whose members *might* + * contains a 1000-byte segment, but from a 1024-byte or larger freelist, + * the first member of which will *definitely* satisfy the allocation. + * This ensures that vmem_xalloc() works in constant time. + * + * We maintain a bit map to determine quickly which freelists are non-empty. + * vmp->vm_freemap & (1 << n) is non-zero iff vmp->vm_freelist[n] is non-empty. + * + * The different freelists are linked together into one large freelist, + * with the freelist heads serving as markers. Freelist markers simplify + * the maintenance of vm_freemap by making it easy to tell when we're taking + * the last member of a freelist (both of its neighbors will be markers). + * + * 2.4 Vmem Locking + * ---------------- + * For simplicity, all arena state is protected by a per-arena lock. + * For very hot arenas, use quantum caching for scalability. + * + * 2.5 Vmem Population + * ------------------- + * Any internal vmem routine that might need to allocate new segment + * structures must prepare in advance by calling vmem_populate(), which + * will preallocate enough vmem_seg_t's to get is through the entire + * operation without dropping the arena lock. + * + * 2.6 Auditing + * ------------ + * If KMF_AUDIT is set in kmem_flags, we audit vmem allocations as well. + * Since virtual addresses cannot be scribbled on, there is no equivalent + * in vmem to redzone checking, deadbeef, or other kmem debugging features. + * Moreover, we do not audit frees because segment coalescing destroys the + * association between an address and its segment structure. Auditing is + * thus intended primarily to keep track of who's consuming the arena. + * Debugging support could certainly be extended in the future if it proves + * necessary, but we do so much live checking via the allocation hash table + * that even non-DEBUG systems get quite a bit of sanity checking already. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VMEM_INITIAL 21 /* early vmem arenas */ +#define VMEM_SEG_INITIAL 800 + +/* + * Adding a new span to an arena requires two segment structures: one to + * represent the span, and one to represent the free segment it contains. + */ +#define VMEM_SEGS_PER_SPAN_CREATE 2 + +/* + * Allocating a piece of an existing segment requires 0-2 segment structures + * depending on how much of the segment we're allocating. + * + * To allocate the entire segment, no new segment structures are needed; we + * simply move the existing segment structure from the freelist to the + * allocation hash table. + * + * To allocate a piece from the left or right end of the segment, we must + * split the segment into two pieces (allocated part and remainder), so we + * need one new segment structure to represent the remainder. + * + * To allocate from the middle of a segment, we need two new segment strucures + * to represent the remainders on either side of the allocated part. + */ +#define VMEM_SEGS_PER_EXACT_ALLOC 0 +#define VMEM_SEGS_PER_LEFT_ALLOC 1 +#define VMEM_SEGS_PER_RIGHT_ALLOC 1 +#define VMEM_SEGS_PER_MIDDLE_ALLOC 2 + +/* + * vmem_populate() preallocates segment structures for vmem to do its work. + * It must preallocate enough for the worst case, which is when we must import + * a new span and then allocate from the middle of it. 
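To make the freelist selection rule from section 2.3 above concrete, a standalone sketch of just the index arithmetic (it deliberately ignores the vm_freemap non-empty check, VM_BESTFIT, and everything else vmem_xalloc() handles); highbit() here mimics the SPL definition of "highest set bit plus one":

/* Standalone illustration of power-of-2 freelist selection. */
#include <stdint.h>
#include <stdio.h>

static int
highbit(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t sizes[] = { 512, 1000, 1024, 8192 - 512 };

	for (int i = 0; i < 4; i++) {
		uint64_t size = sizes[i];
		int resident = highbit(size) - 1; /* list holding segs this size */
		int definite;			  /* first list that always fits */

		if ((size & (size - 1)) == 0)
			definite = resident;	  /* power of two: same list */
		else
			definite = resident + 1;  /* otherwise one list up */

		printf("size %llu: free segs of this size sit on freelist[%d] "
		    "(%llu..%llu); allocate from freelist[%d] and up\n",
		    (unsigned long long)size, resident,
		    (unsigned long long)(1ULL << resident),
		    (unsigned long long)((1ULL << (resident + 1)) - 1),
		    definite);
	}
	return (0);
}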
+ */ +#define VMEM_SEGS_PER_ALLOC_MAX \ +(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC) + +/* + * The segment structures themselves are allocated from vmem_seg_arena, so + * we have a recursion problem when vmem_seg_arena needs to populate itself. + * We address this by working out the maximum number of segment structures + * this act will require, and multiplying by the maximum number of threads + * that we'll allow to do it simultaneously. + * + * The worst-case segment consumption to populate vmem_seg_arena is as + * follows (depicted as a stack trace to indicate why events are occurring): + * + * (In order to lower the fragmentation in the heap_arena, we specify a + * minimum import size for the vmem_metadata_arena which is the same size + * as the kmem_va quantum cache allocations. This causes the worst-case + * allocation from the vmem_metadata_arena to be 3 segments.) + * + * vmem_alloc(vmem_seg_arena) -> 2 segs (span create + exact alloc) + * segkmem_alloc(vmem_metadata_arena) + * vmem_alloc(vmem_metadata_arena) -> 3 segs (span create + left alloc) + * vmem_alloc(heap_arena) -> 1 seg (left alloc) + * page_create() + * hat_memload() + * kmem_cache_alloc() + * kmem_slab_create() + * vmem_alloc(hat_memload_arena) -> 2 segs (span create + exact alloc) + * segkmem_alloc(heap_arena) + * vmem_alloc(heap_arena) -> 1 seg (left alloc) + * page_create() + * hat_memload() -> (hat layer won't recurse further) + * + * The worst-case consumption for each arena is 3 segment structures. + * Of course, a 3-seg reserve could easily be blown by multiple threads. + * Therefore, we serialize all allocations from vmem_seg_arena (which is OK + * because they're rare). We cannot allow a non-blocking allocation to get + * tied up behind a blocking allocation, however, so we use separate locks + * for VM_SLEEP and VM_NOSLEEP allocations. Similarly, VM_PUSHPAGE allocations + * must not block behind ordinary VM_SLEEPs. In addition, if the system is + * panicking then we must keep enough resources for panic_thread to do its + * work. Thus we have at most four threads trying to allocate from + * vmem_seg_arena, and each thread consumes at most three segment structures, + * so we must maintain a 12-seg reserve. + */ +#define VMEM_POPULATE_RESERVE 12 + +/* + * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures + * so that it can satisfy the worst-case allocation *and* participate in + * worst-case allocation from vmem_seg_arena. 
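Plugging in the constants above: VMEM_SEGS_PER_ALLOC_MAX = VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC = 2 + 2 = 4, so the VMEM_MINFREE defined just below works out to 12 + 4 = 16 preallocated vmem_seg_t structures per populated arena.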
+ */ +#define VMEM_MINFREE (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX) + +static vmem_t vmem0[VMEM_INITIAL]; +static vmem_t *vmem_populator[VMEM_INITIAL]; +static uint32_t vmem_id; +static uint32_t vmem_populators; +static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL]; +static vmem_seg_t *vmem_segfree; +static kmutex_t vmem_list_lock; +static kmutex_t vmem_segfree_lock; +static kmutex_t vmem_sleep_lock; +static kmutex_t vmem_nosleep_lock; +static kmutex_t vmem_pushpage_lock; +static kmutex_t vmem_panic_lock; +static kmutex_t vmem_xnu_alloc_lock; +static vmem_t *vmem_list; +static vmem_t *vmem_metadata_arena; +static vmem_t *vmem_seg_arena; +static vmem_t *vmem_hash_arena; +static vmem_t *vmem_vmem_arena; +vmem_t *spl_default_arena; // The bottom-most arena for SPL +static vmem_t *spl_default_arena_parent; // dummy arena as a placeholder +#define VMEM_BUCKETS 13 +#define VMEM_BUCKET_LOWBIT 12 +#define VMEM_BUCKET_HIBIT 24 +static vmem_t *vmem_bucket_arena[VMEM_BUCKETS]; +vmem_t *spl_heap_arena; +static void *spl_heap_arena_initial_alloc; +static size_t spl_heap_arena_initial_alloc_size = 0; +#define NUMBER_OF_ARENAS_IN_VMEM_INIT 21 +/* vmem_update() every 15 seconds */ +static struct timespec vmem_update_interval = {15, 0}; +uint32_t vmem_mtbf; /* mean time between failures [default: off] */ +size_t vmem_seg_size = sizeof (vmem_seg_t); + +// must match with include/sys/vmem_impl.h +static vmem_kstat_t vmem_kstat_template = { + { "mem_inuse", KSTAT_DATA_UINT64 }, + { "mem_import", KSTAT_DATA_UINT64 }, + { "mem_total", KSTAT_DATA_UINT64 }, + { "vmem_source", KSTAT_DATA_UINT32 }, + { "alloc", KSTAT_DATA_UINT64 }, + { "free", KSTAT_DATA_UINT64 }, + { "wait", KSTAT_DATA_UINT64 }, + { "fail", KSTAT_DATA_UINT64 }, + { "lookup", KSTAT_DATA_UINT64 }, + { "search", KSTAT_DATA_UINT64 }, + { "populate_fail", KSTAT_DATA_UINT64 }, + { "contains", KSTAT_DATA_UINT64 }, + { "contains_search", KSTAT_DATA_UINT64 }, + { "parent_alloc", KSTAT_DATA_UINT64 }, + { "parent_free", KSTAT_DATA_UINT64 }, + { "threads_waiting", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, +}; + + +/* + * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k'). 
+ */ +#define VMEM_INSERT(vprev, vsp, type) \ +{ \ +vmem_seg_t *_vnext = (vprev)->vs_##type##next; \ +(vsp)->vs_##type##next = (_vnext); \ +(vsp)->vs_##type##prev = (vprev); \ +(vprev)->vs_##type##next = (vsp); \ +(_vnext)->vs_##type##prev = (vsp); \ +} + +#define VMEM_DELETE(vsp, type) \ +{ \ +vmem_seg_t *_vprev = (vsp)->vs_##type##prev; \ +vmem_seg_t *_vnext = (vsp)->vs_##type##next; \ +(_vprev)->vs_##type##next = (_vnext); \ +(_vnext)->vs_##type##prev = (_vprev); \ +} + +// vmem thread block count +uint64_t spl_vmem_threads_waiting = 0; + +// number of allocations > minalloc +uint64_t spl_bucket_non_pow2_allocs = 0; + +// allocator kstats +uint64_t spl_vmem_unconditional_allocs = 0; +uint64_t spl_vmem_unconditional_alloc_bytes = 0; +uint64_t spl_vmem_conditional_allocs = 0; +uint64_t spl_vmem_conditional_alloc_bytes = 0; +uint64_t spl_vmem_conditional_alloc_deny = 0; +uint64_t spl_vmem_conditional_alloc_deny_bytes = 0; + +// bucket allocator kstat +uint64_t spl_xat_success = 0; +uint64_t spl_xat_late_success = 0; +uint64_t spl_xat_late_success_nosleep = 0; +uint64_t spl_xat_pressured = 0; +uint64_t spl_xat_bailed = 0; +uint64_t spl_xat_bailed_contended = 0; +uint64_t spl_xat_lastalloc = 0; +uint64_t spl_xat_lastfree = 0; +uint64_t spl_xat_forced = 0; +uint64_t spl_xat_sleep = 0; +uint64_t spl_xat_late_deny = 0; +uint64_t spl_xat_no_waiters = 0; +uint64_t spl_xft_wait = 0; + +uint64_t spl_vba_parent_memory_appeared = 0; +uint64_t spl_vba_parent_memory_blocked = 0; +uint64_t spl_vba_hiprio_blocked = 0; +uint64_t spl_vba_cv_timeout = 0; +uint64_t spl_vba_loop_timeout = 0; +uint64_t spl_vba_cv_timeout_blocked = 0; +uint64_t spl_vba_loop_timeout_blocked = 0; +uint64_t spl_vba_sleep = 0; +uint64_t spl_vba_loop_entries = 0; + +// bucket minimum span size tunables +uint64_t spl_bucket_tunable_large_span = 0; +uint64_t spl_bucket_tunable_small_span = 0; + +// for XAT & XATB visibility into VBA queue +static _Atomic uint32_t spl_vba_threads[VMEM_BUCKETS] = { 0 }; +static uint32_t + vmem_bucket_id_to_bucket_number[NUMBER_OF_ARENAS_IN_VMEM_INIT] = { 0 }; +boolean_t spl_arc_no_grow(size_t, boolean_t, kmem_cache_t **); +_Atomic uint64_t spl_arc_no_grow_bits = 0; +uint64_t spl_arc_no_grow_count = 0; + +// compare span ages this many steps from the head of the freelist +uint64_t spl_frag_max_walk = 1000; +uint64_t spl_frag_walked_out = 0; +uint64_t spl_frag_walk_cnt = 0; + +extern void spl_free_set_emergency_pressure(int64_t p); +extern uint64_t segkmem_total_mem_allocated; +extern uint64_t total_memory; + +extern void IOSleep(unsigned milliseconds); + +/* + * Get a vmem_seg_t from the global segfree list. + */ +static vmem_seg_t * +vmem_getseg_global(void) +{ + vmem_seg_t *vsp; + + mutex_enter(&vmem_segfree_lock); + if ((vsp = vmem_segfree) != NULL) + vmem_segfree = vsp->vs_knext; + mutex_exit(&vmem_segfree_lock); + + if (vsp != NULL) + vsp->vs_span_createtime = 0; + + return (vsp); +} + +/* + * Put a vmem_seg_t on the global segfree list. + */ +static void +vmem_putseg_global(vmem_seg_t *vsp) +{ + mutex_enter(&vmem_segfree_lock); + vsp->vs_knext = vmem_segfree; + vmem_segfree = vsp; + mutex_exit(&vmem_segfree_lock); +} + +/* + * Get a vmem_seg_t from vmp's segfree list. + */ +static vmem_seg_t * +vmem_getseg(vmem_t *vmp) +{ + vmem_seg_t *vsp; + + ASSERT(vmp->vm_nsegfree > 0); + + vsp = vmp->vm_segfree; + vmp->vm_segfree = vsp->vs_knext; + vmp->vm_nsegfree--; + + return (vsp); +} + +/* + * Put a vmem_seg_t on vmp's segfree list. 
+ */ +static void +vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp) +{ + vsp->vs_knext = vmp->vm_segfree; + vmp->vm_segfree = vsp; + vmp->vm_nsegfree++; +} + + +/* + * Add vsp to the appropriate freelist, at the appropriate location, + * keeping the freelist sorted by age. + */ + +#define dprintf(...) + +/* + * return true when we continue the for loop in + * vmem_freelist_insert_sort_by_time + */ +static inline bool +flist_sort_compare(bool newfirst, + const vmem_seg_t *vhead, + const vmem_seg_t *nextlist, + vmem_seg_t *p, vmem_seg_t *to_insert) +{ + /* + * vsp is the segment we are inserting into the freelist + * p is a freelist poniter or an element inside a non-empty freelist + * if we return false, then vsp is inserted immedaitely after p, + */ + + // always enter the for loop if we're at the front of a flist + if (p == vhead) + return (true); + + const vmem_seg_t *n = p->vs_knext; + + if (n == nextlist || n == NULL) { + // if we are at the tail of the flist, then + // insert vsp between p and n + return (false); + } + + if (n->vs_import == true && to_insert->vs_import == false) { + /* + * put non-imported segments before imported segments + * no matter what their respective create times are, + * thereby making imported segments more likely "age out" + */ + return (false); // inserts to_insert between p and n + } + + if (newfirst == true) { + if (n->vs_span_createtime < to_insert->vs_span_createtime) { + // n is older than me, so insert me between p and n + return (false); + } + } else { + if (n->vs_span_createtime > to_insert->vs_span_createtime) { + // n is newer than me, so insert me between p and n + return (false); + } + } + // continue iterating + return (true); +} + +static void +vmem_freelist_insert_sort_by_time(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(vmp->vm_cflags & VMC_TIMEFREE); + ASSERT(vsp->vs_span_createtime > 0); + + const bool newfirst = 0 == (vmp->vm_cflags & VMC_OLDFIRST); + + const uint64_t abs_max_walk_steps = 1ULL << 30ULL; + uint32_t max_walk_steps = (uint32_t)MIN(spl_frag_max_walk, + abs_max_walk_steps); + + vmem_seg_t *vprev; + + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + + /* + * in vmem_create_common() the freelists are arranged: + * freelist[0].vs_kprev = NULL, freelist[VMEM_FREELISTS].vs_knext = NULL + * freelist[1].vs_kprev = freelist[0], freelist[1].vs_knext = + * freelist[2] ... + * from vmem_freelist_insert(): + * VS_SIZE is the segment size (->vs_end - ->vs_start), so say 8k-512 + * highbit is the higest bit set PLUS 1, so in this case would be the + * 16k list. so below, vprev is therefore pointing to the 8k list + * in vmem_alloc, the unconstrained allocation takes, for a 8k-512 + * block: vsp = flist[8k].vs_knext + * and calls vmem_seg_create() which sends any leftovers from vsp + * to vmem_freelist_insert + * + * vmem_freelist_insert would take the seg (as above, 8k-512 size), + * vprev points to the 16k list, and VMEM_INSERT(vprev, vsp, k) + * inserts the segment immediately after + * + * so vmem_seg_create(...8k-512...) pushes to the head of the 8k list, + * and vmem_alloc(...8-512k...) will pull from the head of the 8k list + * + * below we may want to push to the TAIL of the 8k list, which is + * just before flist[16k]. 
+ */ + + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + + int my_listnum = highbit(VS_SIZE(vsp)) - 1; + + ASSERT(my_listnum >= 1); + ASSERT(my_listnum < VMEM_FREELISTS); + + int next_listnum = my_listnum + 1; + + const vmem_seg_t *nextlist = + (vmem_seg_t *)&vmp->vm_freelist[next_listnum]; + + ASSERT(vsp->vs_span_createtime != 0); + if (vsp->vs_span_createtime == 0) { + printf("SPL: %s: WARNING: vsp->vs_span_createtime == 0 (%s)!\n", + __func__, vmp->vm_name); + } + + // continuing our example, starts with p at flist[8k] + // and n at the following freelist entry + + const vmem_seg_t *vhead = vprev; + vmem_seg_t *p = vprev; + vmem_seg_t *n = p->vs_knext; + + // walk from the freelist head looking for + // a segment whose creation time is earlier than + // the segment to be inserted's creation time, + // then insert before that segment. + + for (uint32_t step = 0; + flist_sort_compare(newfirst, vhead, nextlist, p, vsp) == true; + step++) { + // iterating while predecessor pointer p was created + // at a later tick than funcarg vsp. + // + // below we set p to n and update n. + ASSERT(n != NULL); + if (n == nextlist) { + dprintf("SPL: %s: at marker (%s)(steps: %u) " + "p->vs_start, end == %lu, %lu\n", + __func__, vmp->vm_name, step, + (uintptr_t)p->vs_start, (uintptr_t)p->vs_end); + // IOSleep(1); + // the next entry is the next marker (e.g. 16k marker) + break; + } + if (n->vs_start == 0) { + // from vmem_freelist_delete, this is a head + dprintf("SPL: %s: n->vs_start == 0 (%s)(steps: %u) " + "p->vs_start, end == %lu, %lu\n", + __func__, vmp->vm_name, step, + (uintptr_t)p->vs_start, (uintptr_t)p->vs_end); + // IOSleep(1); + break; + } + if (step >= max_walk_steps) { + ASSERT(nextlist->vs_kprev != NULL); + // we have walked far enough. + // put this segment at the tail of the freelist. + if (nextlist->vs_kprev != NULL) { + n = (vmem_seg_t *)nextlist; + p = nextlist->vs_kprev; + } + dprintf("SPL: %s: walked out (%s)\n", __func__, + vmp->vm_name); + // IOSleep(1); + atomic_inc_64(&spl_frag_walked_out); + break; + } + if (n->vs_knext == NULL) { + dprintf("SPL: %s: n->vs_knext == NULL (my_listnum " + "== %d)\n", __func__, my_listnum); + // IOSleep(1); + break; + } + p = n; + n = n->vs_knext; + atomic_inc_64(&spl_frag_walk_cnt); + } + + ASSERT(p != NULL); + + // insert segment between p and n + + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(p, vsp, k); + + cv_broadcast(&vmp->vm_cv); +} + +/* + * Add vsp to the appropriate freelist. + */ +static void +vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + + if (vmp->vm_cflags & VMC_TIMEFREE) { + vmem_freelist_insert_sort_by_time(vmp, vsp); + return; + } + + vmem_seg_t *vprev; + + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(vprev, vsp, k); + + cv_broadcast(&vmp->vm_cv); +} + +/* + * Take vsp from the freelist. + */ +static void +vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + ASSERT(vsp->vs_type == VMEM_FREE); + + if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) { + /* + * The segments on both sides of 'vsp' are freelist heads, + * so taking vsp leaves the freelist at vsp->vs_kprev empty. 
+ */ + ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev)); + vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev); + } + VMEM_DELETE(vsp, k); +} + +/* + * Add vsp to the allocated-segment hash table and update kstats. + */ +static void +vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t **bucket; + + vsp->vs_type = VMEM_ALLOC; + bucket = VMEM_HASH(vmp, vsp->vs_start); + vsp->vs_knext = *bucket; + *bucket = vsp; + + if (vmem_seg_size == sizeof (vmem_seg_t)) { + // vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack, + // VMEM_STACK_DEPTH); + // vsp->vs_thread = curthread; + vsp->vs_depth = 0; + vsp->vs_thread = 0; + vsp->vs_timestamp = gethrtime(); + } else { + vsp->vs_depth = 0; + } + + vmp->vm_kstat.vk_alloc.value.ui64++; + vmp->vm_kstat.vk_mem_inuse.value.ui64 += VS_SIZE(vsp); +} + +/* + * Remove vsp from the allocated-segment hash table and update kstats. + */ +static vmem_seg_t * +vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size) +{ + vmem_seg_t *vsp, **prev_vspp; + + prev_vspp = VMEM_HASH(vmp, addr); + while ((vsp = *prev_vspp) != NULL) { + if (vsp->vs_start == addr) { + *prev_vspp = vsp->vs_knext; + break; + } + vmp->vm_kstat.vk_lookup.value.ui64++; + prev_vspp = &vsp->vs_knext; + } + + if (vsp == NULL) + panic("vmem_hash_delete(%p, %lx, %lu): bad free " + "(name: %s, addr, size)", + (void *)vmp, addr, size, vmp->vm_name); + if (VS_SIZE(vsp) != size) + panic("vmem_hash_delete(%p, %lx, %lu): (%s) wrong size" + "(expect %lu)", + (void *)vmp, addr, size, vmp->vm_name, VS_SIZE(vsp)); + + vmp->vm_kstat.vk_free.value.ui64++; + vmp->vm_kstat.vk_mem_inuse.value.ui64 -= size; + + return (vsp); +} + +/* + * Create a segment spanning the range [start, end) and add it to the arena. + */ +static vmem_seg_t * +vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) +{ + vmem_seg_t *newseg = vmem_getseg(vmp); + + newseg->vs_start = start; + newseg->vs_end = end; + newseg->vs_type = 0; + newseg->vs_import = 0; + newseg->vs_span_createtime = 0; + + VMEM_INSERT(vprev, newseg, a); + + return (newseg); +} + +/* + * Remove segment vsp from the arena. + */ +static void +vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(vsp->vs_type != VMEM_ROTOR); + VMEM_DELETE(vsp, a); + + vmem_putseg(vmp, vsp); +} + +/* + * Add the span [vaddr, vaddr + size) to vmp and update kstats. + */ +static vmem_seg_t * +vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import) +{ + vmem_seg_t *newseg, *span; + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((start | end) & (vmp->vm_quantum - 1)) + panic("vmem_span_create(%p, %p, %lu): misaligned (%s)", + (void *)vmp, vaddr, size, vmp->vm_name); + + span = vmem_seg_create(vmp, vmp->vm_seg0.vs_aprev, start, end); + span->vs_type = VMEM_SPAN; + span->vs_import = import; + + hrtime_t t = 0; + if (vmp->vm_cflags & VMC_TIMEFREE) { + t = gethrtime(); + } + span->vs_span_createtime = t; + + VMEM_INSERT(vmp->vm_seg0.vs_kprev, span, k); + + newseg = vmem_seg_create(vmp, span, start, end); + newseg->vs_span_createtime = t; + + vmem_freelist_insert(vmp, newseg); + + if (import) + vmp->vm_kstat.vk_mem_import.value.ui64 += size; + vmp->vm_kstat.vk_mem_total.value.ui64 += size; + + return (newseg); +} + +/* + * Remove span vsp from vmp and update kstats. 
+ */ +static void +vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t *span = vsp->vs_aprev; + size_t size = VS_SIZE(vsp); + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + ASSERT(span->vs_type == VMEM_SPAN); + + if (span->vs_import) + vmp->vm_kstat.vk_mem_import.value.ui64 -= size; + vmp->vm_kstat.vk_mem_total.value.ui64 -= size; + + VMEM_DELETE(span, k); + + vmem_seg_destroy(vmp, vsp); + vmem_seg_destroy(vmp, span); +} + +/* + * Allocate the subrange [addr, addr + size) from segment vsp. + * If there are leftovers on either side, place them on the freelist. + * Returns a pointer to the segment representing [addr, addr + size). + */ +static vmem_seg_t * +vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) +{ + uintptr_t vs_start = vsp->vs_start; + uintptr_t vs_end = vsp->vs_end; + size_t vs_size = vs_end - vs_start; + size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + uintptr_t addr_end = addr + realsize; + + ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0); + ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0); + ASSERT(vsp->vs_type == VMEM_FREE); + ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1); + ASSERT(addr - 1 <= addr_end - 1); + + hrtime_t parent_seg_span_createtime = vsp->vs_span_createtime; + + /* + * If we're allocating from the start of the segment, and the + * remainder will be on the same freelist, we can save quite + * a bit of work. + */ + if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + vsp->vs_start = addr_end; + vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size); + vsp->vs_span_createtime = parent_seg_span_createtime; + vmem_hash_insert(vmp, vsp); + return (vsp); + } + + vmem_freelist_delete(vmp, vsp); + + if (vs_end != addr_end) { + vmem_seg_t *v = vmem_seg_create(vmp, vsp, addr_end, vs_end); + v->vs_span_createtime = parent_seg_span_createtime; + vmem_freelist_insert(vmp, v); + } + + if (vs_start != addr) { + vmem_seg_t *v = + vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr); + v->vs_span_createtime = parent_seg_span_createtime; + vmem_freelist_insert(vmp, v); + } + + vsp->vs_start = addr; + vsp->vs_end = addr + size; + + vsp->vs_span_createtime = parent_seg_span_createtime; + + vmem_hash_insert(vmp, vsp); + return (vsp); +} + +/* + * Returns 1 if we are populating, 0 otherwise. + * Call it if we want to prevent recursion from HAT. + */ +int +vmem_is_populator() +{ + return (mutex_owner(&vmem_sleep_lock) == curthread || + mutex_owner(&vmem_nosleep_lock) == curthread || + mutex_owner(&vmem_pushpage_lock) == curthread || + mutex_owner(&vmem_panic_lock) == curthread); +} + +/* + * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures. + */ +static int +vmem_populate(vmem_t *vmp, int vmflag) +{ + char *p; + vmem_seg_t *vsp; + ssize_t nseg; + size_t size; + kmutex_t *lp; + int i; + + while (vmp->vm_nsegfree < VMEM_MINFREE && + (vsp = vmem_getseg_global()) != NULL) + vmem_putseg(vmp, vsp); + + if (vmp->vm_nsegfree >= VMEM_MINFREE) + return (1); + + /* + * If we're already populating, tap the reserve. 
+ */ + if (vmem_is_populator()) { + ASSERT(vmp->vm_cflags & VMC_POPULATOR); + return (1); + } + + mutex_exit(&vmp->vm_lock); + + // if (panic_thread == curthread) + // lp = &vmem_panic_lock; + // else + + if (vmflag & VM_NOSLEEP) + lp = &vmem_nosleep_lock; + else if (vmflag & VM_PUSHPAGE) + lp = &vmem_pushpage_lock; + else + lp = &vmem_sleep_lock; + + mutex_enter(lp); + + nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE; + size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum); + nseg = size / vmem_seg_size; + + /* + * The following vmem_alloc() may need to populate vmem_seg_arena + * and all the things it imports from. When doing so, it will tap + * each arena's reserve to prevent recursion (see the block comment + * above the definition of VMEM_POPULATE_RESERVE). + */ + p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_KMFLAGS); + if (p == NULL) { + mutex_exit(lp); + mutex_enter(&vmp->vm_lock); + vmp->vm_kstat.vk_populate_fail.value.ui64++; + return (0); + } + + /* + * Restock the arenas that may have been depleted during population. + */ + for (i = 0; i < vmem_populators; i++) { + mutex_enter(&vmem_populator[i]->vm_lock); + while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE) + vmem_putseg(vmem_populator[i], + (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + mutex_exit(&vmem_populator[i]->vm_lock); + } + + mutex_exit(lp); + mutex_enter(&vmp->vm_lock); + + /* + * Now take our own segments. + */ + ASSERT(nseg >= VMEM_MINFREE); + while (vmp->vm_nsegfree < VMEM_MINFREE) + vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + /* + * Give the remainder to charity. + */ + while (nseg > 0) + vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + return (1); +} + +/* + * Advance a walker from its previous position to 'afterme'. + * Note: may drop and reacquire vmp->vm_lock. + */ +static void +vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme) +{ + vmem_seg_t *vprev = walker->vs_aprev; + vmem_seg_t *vnext = walker->vs_anext; + vmem_seg_t *vsp = NULL; + + VMEM_DELETE(walker, a); + + if (afterme != NULL) + VMEM_INSERT(afterme, walker, a); + + /* + * The walker segment's presence may have prevented its neighbors + * from coalescing. If so, coalesce them now. + */ + if (vprev->vs_type == VMEM_FREE) { + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vnext->vs_start); + ASSERT(vprev->vs_span_createtime == + vnext->vs_span_createtime); + vmem_freelist_delete(vmp, vnext); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vnext->vs_end; + vmem_freelist_insert(vmp, vprev); + vmem_seg_destroy(vmp, vnext); + } + vsp = vprev; + } else if (vnext->vs_type == VMEM_FREE) { + vsp = vnext; + } + + /* + * vsp could represent a complete imported span, + * in which case we must return it to the source. + */ + if (vsp != NULL && vsp->vs_aprev->vs_import && + vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + void *vaddr = (void *)vsp->vs_start; + size_t size = VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_freelist_delete(vmp, vsp); + vmem_span_destroy(vmp, vsp); + vmp->vm_kstat.vk_parent_free.value.ui64++; + mutex_exit(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + mutex_enter(&vmp->vm_lock); + } +} + +/* + * VM_NEXTFIT allocations deliberately cycle through all virtual addresses + * in an arena, so that we avoid reusing addresses for as long as possible. + * This helps to catch used-after-freed bugs. 
It's also the perfect policy + * for allocating things like process IDs, where we want to cycle through + * all values in order. + */ +static void * +vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + vmem_seg_t *vsp, *rotor; + uintptr_t addr; + size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + size_t vs_size; + + mutex_enter(&vmp->vm_lock); + + if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) { + mutex_exit(&vmp->vm_lock); + return (NULL); + } + + /* + * The common case is that the segment right after the rotor is free, + * and large enough that extracting 'size' bytes won't change which + * freelist it's on. In this case we can avoid a *lot* of work. + * Instead of the normal vmem_seg_alloc(), we just advance the start + * address of the victim segment. Instead of moving the rotor, we + * create the new segment structure *behind the rotor*, which has + * the same effect. And finally, we know we don't have to coalesce + * the rotor's neighbors because the new segment lies between them. + */ + rotor = &vmp->vm_rotor; + vsp = rotor->vs_anext; + if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize && + P2SAMEHIGHBIT(vs_size, vs_size - realsize)) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + addr = vsp->vs_start; + vsp->vs_start = addr + realsize; + hrtime_t t = vsp->vs_span_createtime; + vmem_hash_insert(vmp, + vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size)); + vsp->vs_span_createtime = t; + mutex_exit(&vmp->vm_lock); + return ((void *)addr); + } + + /* + * Starting at the rotor, look for a segment large enough to + * satisfy the allocation. + */ + for (;;) { + atomic_inc_64(&vmp->vm_kstat.vk_search.value.ui64); + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + vsp = vsp->vs_anext; + if (vsp == rotor) { + /* + * We've come full circle. One possibility is that the + * there's actually enough space, but the rotor itself + * is preventing the allocation from succeeding because + * it's sitting between two free segments. Therefore, + * we advance the rotor and see if that liberates a + * suitable segment. + */ + vmem_advance(vmp, rotor, rotor->vs_anext); + vsp = rotor->vs_aprev; + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + /* + * If there's a lower arena we can import from, or it's + * a VM_NOSLEEP allocation, let vmem_xalloc() handle it. + * Otherwise, wait until another thread frees something. + */ + if (vmp->vm_source_alloc != NULL || + (vmflag & VM_NOSLEEP)) { + mutex_exit(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, + vmflag & (VM_KMFLAGS | VM_NEXTFIT))); + } + atomic_inc_64(&vmp->vm_kstat.vk_wait.value.ui64); + atomic_inc_64( + &vmp->vm_kstat.vk_threads_waiting.value.ui64); + atomic_inc_64(&spl_vmem_threads_waiting); + if (spl_vmem_threads_waiting > 1) + printf("SPL: %s: waiting for %lu sized alloc " + "after full circle of %s, waiting " + "threads %llu, total threads waiting " + "= %llu.\n", + __func__, size, vmp->vm_name, + vmp->vm_kstat.vk_threads_waiting.value.ui64, + spl_vmem_threads_waiting); + cv_wait(&vmp->vm_cv, &vmp->vm_lock); + atomic_dec_64(&spl_vmem_threads_waiting); + atomic_dec_64( + &vmp->vm_kstat.vk_threads_waiting.value.ui64); + vsp = rotor->vs_anext; + } + } + + /* + * We found a segment. Extract enough space to satisfy the allocation. 
+ */ + addr = vsp->vs_start; + vsp = vmem_seg_alloc(vmp, vsp, addr, size); + ASSERT(vsp->vs_type == VMEM_ALLOC && + vsp->vs_start == addr && vsp->vs_end == addr + size); + + /* + * Advance the rotor to right after the newly-allocated segment. + * That's where the next VM_NEXTFIT allocation will begin searching. + */ + vmem_advance(vmp, rotor, vsp); + mutex_exit(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Checks if vmp is guaranteed to have a size-byte buffer somewhere on its + * freelist. If size is not a power-of-2, it can return a false-negative. + * + * Used to decide if a newly imported span is superfluous after re-acquiring + * the arena lock. + */ +static int +vmem_canalloc(vmem_t *vmp, size_t size) +{ + int hb; + int flist = 0; + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1ULL << hb)); + + return (flist); +} + +// Convenience functions for use when gauging +// allocation ability when not holding the lock. +// These are unreliable because vmp->vm_freemap is +// liable to change immediately after being examined. +int +vmem_canalloc_lock(vmem_t *vmp, size_t size) +{ + mutex_enter(&vmp->vm_lock); + int i = vmem_canalloc(vmp, size); + mutex_exit(&vmp->vm_lock); + return (i); +} + +int +vmem_canalloc_atomic(vmem_t *vmp, size_t size) +{ + int hb; + int flist = 0; + + ulong_t freemap = + __c11_atomic_load((_Atomic ulong_t *)&vmp->vm_freemap, + __ATOMIC_SEQ_CST); + + if (ISP2(size)) + flist = lowbit(P2ALIGN(freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(freemap, 1ULL << hb)); + + return (flist); +} + +static inline uint64_t +spl_vmem_xnu_useful_bytes_free(void) +{ + extern volatile unsigned int vm_page_free_wanted; + extern volatile unsigned int vm_page_free_count; + extern volatile unsigned int vm_page_free_min; + + if (vm_page_free_wanted > 0) + return (0); + + uint64_t bytes_free = (uint64_t)vm_page_free_count * (uint64_t)PAGESIZE; + uint64_t bytes_min = (uint64_t)vm_page_free_min * (uint64_t)PAGESIZE; + + if (bytes_free <= bytes_min) + return (0); + + uint64_t useful_free = bytes_free - bytes_min; + + return (useful_free); +} + +uint64_t +vmem_xnu_useful_bytes_free(void) +{ + return (spl_vmem_xnu_useful_bytes_free()); +} + + +static void * +spl_vmem_malloc_unconditionally_unlocked(size_t size) +{ + extern void *osif_malloc(uint64_t); + atomic_inc_64(&spl_vmem_unconditional_allocs); + atomic_add_64(&spl_vmem_unconditional_alloc_bytes, size); + return (osif_malloc(size)); +} + +static void * +spl_vmem_malloc_unconditionally(size_t size) +{ + mutex_enter(&vmem_xnu_alloc_lock); + void *m = spl_vmem_malloc_unconditionally_unlocked(size); + mutex_exit(&vmem_xnu_alloc_lock); + return (m); +} + +static void * +spl_vmem_malloc_if_no_pressure(size_t size) +{ + // The mutex serializes concurrent callers, providing time for + // the variables in spl_vmem_xnu_useful_bytes_free() to be updated. 
+ mutex_enter(&vmem_xnu_alloc_lock); + if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, 1024ULL*1024ULL))) { + extern void *osif_malloc(uint64_t); + void *p = osif_malloc(size); + if (p != NULL) { + spl_vmem_conditional_allocs++; + spl_vmem_conditional_alloc_bytes += size; + } + mutex_exit(&vmem_xnu_alloc_lock); + return (p); + } else { + spl_vmem_conditional_alloc_deny++; + spl_vmem_conditional_alloc_deny_bytes += size; + mutex_exit(&vmem_xnu_alloc_lock); + return (NULL); + } +} + +/* + * Allocate size bytes at offset phase from an align boundary such that the + * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) + * that does not straddle a nocross-aligned boundary. + */ +void * +vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, + size_t nocross, void *minaddr, void *maxaddr, int vmflag) +{ + vmem_seg_t *vsp; + vmem_seg_t *vbest = NULL; + uintptr_t addr, taddr, start, end; + uintptr_t align = (align_arg != 0) ? align_arg : vmp->vm_quantum; + void *vaddr, *xvaddr = NULL; + size_t xsize; + int hb, flist, resv; + uint32_t mtbf; + + if ((align | phase | nocross) & (vmp->vm_quantum - 1)) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters not vm_quantum aligned", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if (nocross != 0 && + (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "overconstrained allocation", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if (phase >= align || (align & (align - 1)) != 0 || + (nocross & (nocross - 1)) != 0) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters inconsistent or invalid", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + mutex_enter(&vmp->vm_lock); + for (;;) { + if (vmp->vm_nsegfree < VMEM_MINFREE && + !vmem_populate(vmp, vmflag)) + break; +do_alloc: + /* + * highbit() returns the highest bit + 1, which is exactly + * what we want: we want to search the first freelist whose + * members are *definitely* large enough to satisfy our + * allocation. However, there are certain cases in which we + * want to look at the next-smallest freelist (which *might* + * be able to satisfy the allocation): + * + * (1) The size is exactly a power of 2, in which case + * the smaller freelist is always big enough; + * + * (2) All other freelists are empty; + * + * (3) We're in the highest possible freelist, which is + * always empty (e.g. the 4GB freelist on 32-bit systems); + * + * (4) We're doing a best-fit or first-fit allocation. + */ + if ((size & (size - 1)) == 0) { + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + } else { + hb = highbit(size); + if ((vmp->vm_freemap >> hb) == 0 || + hb == VMEM_FREELISTS || + (vmflag & (VM_BESTFIT | VM_FIRSTFIT))) + hb--; + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + for (vbest = NULL, vsp = (flist == 0) ? NULL : + vmp->vm_freelist[flist - 1].vs_knext; + vsp != NULL; vsp = vsp->vs_knext) { + atomic_inc_64(&vmp->vm_kstat.vk_search.value.ui64); + if (vsp->vs_start == 0) { + /* + * We're moving up to a larger freelist, + * so if we've already found a candidate, + * the fit can't possibly get any better. + */ + if (vbest != NULL) + break; + /* + * Find the next non-empty freelist. 
+ */ + flist = lowbit(P2ALIGN(vmp->vm_freemap, + VS_SIZE(vsp))); + if (flist-- == 0) + break; + vsp = (vmem_seg_t *)&vmp->vm_freelist[flist]; + ASSERT(vsp->vs_knext->vs_type == VMEM_FREE); + continue; + } + if (vsp->vs_end - 1 < (uintptr_t)minaddr) + continue; + if (vsp->vs_start > (uintptr_t)maxaddr - 1) + continue; + start = MAX(vsp->vs_start, (uintptr_t)minaddr); + end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1; + taddr = P2PHASEUP(start, align, phase); + if (P2BOUNDARY(taddr, size, nocross)) + taddr += + P2ROUNDUP(P2NPHASE(taddr, nocross), align); + if ((taddr - start) + size > end - start || + (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest))) + continue; + vbest = vsp; + addr = taddr; + if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size) + break; + } + if (vbest != NULL) + break; + ASSERT(xvaddr == NULL); + if (size == 0) + panic("vmem_xalloc(): size == 0"); + if (vmp->vm_source_alloc != NULL && nocross == 0 && + minaddr == NULL && maxaddr == NULL) { + size_t aneeded, asize; + size_t aquantum = MAX(vmp->vm_quantum, + vmp->vm_source->vm_quantum); + size_t aphase = phase; + if ((align > aquantum) && + !(vmp->vm_cflags & VMC_XALIGN)) { + aphase = (P2PHASE(phase, aquantum) != 0) ? + align - vmp->vm_quantum : align - aquantum; + ASSERT(aphase >= phase); + } + aneeded = MAX(size + aphase, vmp->vm_min_import); + asize = P2ROUNDUP(aneeded, aquantum); + + if (asize < size) { + /* + * The rounding induced overflow; return NULL + * if we are permitted to fail the allocation + * (and explicitly panic if we aren't). + */ + if ((vmflag & VM_NOSLEEP) && + !(vmflag & VM_PANIC)) { + mutex_exit(&vmp->vm_lock); + return (NULL); + } + + panic("vmem_xalloc(): size overflow"); + } + + /* + * Determine how many segment structures we'll consume. + * The calculation must be precise because if we're + * here on behalf of vmem_populate(), we are taking + * segments from a very limited reserve. + */ + if (size == asize && !(vmp->vm_cflags & VMC_XALLOC)) + resv = VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_EXACT_ALLOC; + else if (phase == 0 && + align <= vmp->vm_source->vm_quantum) + resv = VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_LEFT_ALLOC; + else + resv = VMEM_SEGS_PER_ALLOC_MAX; + + ASSERT(vmp->vm_nsegfree >= resv); + vmp->vm_nsegfree -= resv; /* reserve our segs */ + mutex_exit(&vmp->vm_lock); + if (vmp->vm_cflags & VMC_XALLOC) { + // size_t oasize = asize; + vaddr = ((vmem_ximport_t *) + vmp->vm_source_alloc)(vmp->vm_source, + &asize, align, vmflag & VM_KMFLAGS); + // ASSERT(asize >= oasize); + ASSERT(P2PHASE(asize, + vmp->vm_source->vm_quantum) == 0); + ASSERT(!(vmp->vm_cflags & VMC_XALIGN) || + IS_P2ALIGNED(vaddr, align)); + } else { + atomic_inc_64( + &vmp->vm_kstat.vk_parent_alloc.value.ui64); + vaddr = vmp->vm_source_alloc(vmp->vm_source, + asize, vmflag & (VM_KMFLAGS | VM_NEXTFIT)); + } + mutex_enter(&vmp->vm_lock); + vmp->vm_nsegfree += resv; /* claim reservation */ + aneeded = size + align - vmp->vm_quantum; + aneeded = P2ROUNDUP(aneeded, vmp->vm_quantum); + if (vaddr != NULL) { + /* + * Since we dropped the vmem lock while + * calling the import function, other + * threads could have imported space + * and made our import unnecessary. In + * order to save space, we return + * excess imports immediately. + */ + // but if there are threads waiting below, + // do not return the excess import, rather + // wake those threads up so they can use it. 
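+			// Concretely (from the two branches below): the span
+			// is handed back to the parent only when nobody is
+			// recorded in vk_threads_waiting and the arena can
+			// already satisfy this request; if threads are
+			// waiting, the excess is kept, counted in vk_excess,
+			// and the arena condvar is broadcast instead.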
+ if (asize > aneeded && + vmp->vm_source_free != NULL && + vmp->vm_kstat.vk_threads_waiting.value.ui64 + == 0 && vmem_canalloc(vmp, aneeded)) { + ASSERT(resv >= + VMEM_SEGS_PER_MIDDLE_ALLOC); + xvaddr = vaddr; + xsize = asize; + goto do_alloc; + } else if ( + vmp->vm_kstat.vk_threads_waiting.value.ui64 + > 0) { + vmp->vm_kstat.vk_excess.value.ui64++; + cv_broadcast(&vmp->vm_cv); + } + vbest = vmem_span_create(vmp, vaddr, asize, 1); + addr = P2PHASEUP(vbest->vs_start, align, phase); + break; + } else if (vmem_canalloc(vmp, aneeded)) { + /* + * Our import failed, but another thread + * added sufficient free memory to the arena + * to satisfy our request. Go back and + * grab it. + */ + ASSERT(resv >= VMEM_SEGS_PER_MIDDLE_ALLOC); + goto do_alloc; + } + } + + /* + * If the requestor chooses to fail the allocation attempt + * rather than reap wait and retry - get out of the loop. + */ + if (vmflag & VM_ABORT) + break; + mutex_exit(&vmp->vm_lock); + +#if 0 + if (vmp->vm_cflags & VMC_IDENTIFIER) + kmem_reap_idspace(); + else + kmem_reap(); +#endif + + mutex_enter(&vmp->vm_lock); + if (vmflag & VM_NOSLEEP) + break; + atomic_inc_64(&vmp->vm_kstat.vk_wait.value.ui64); + atomic_inc_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + atomic_inc_64(&spl_vmem_threads_waiting); + if (spl_vmem_threads_waiting > 0) { + printf("SPL: %s: vmem waiting for %lu sized alloc " + "for %s, waiting threads %llu, total threads " + "waiting = %llu\n", + __func__, size, vmp->vm_name, + vmp->vm_kstat.vk_threads_waiting.value.ui64, + spl_vmem_threads_waiting); + extern int64_t spl_free_set_and_wait_pressure(int64_t, + boolean_t, clock_t); + extern int64_t spl_free_manual_pressure_wrapper(void); + mutex_exit(&vmp->vm_lock); + // release other waiting threads + spl_free_set_pressure(0); + int64_t target_pressure = size * + spl_vmem_threads_waiting; + int64_t delivered_pressure = + spl_free_set_and_wait_pressure(target_pressure, + TRUE, USEC2NSEC(500)); + printf("SPL: %s: pressure %lld targeted, %lld " + "delivered\n", __func__, target_pressure, + delivered_pressure); + mutex_enter(&vmp->vm_lock); + } + cv_wait(&vmp->vm_cv, &vmp->vm_lock); + atomic_dec_64(&spl_vmem_threads_waiting); + atomic_dec_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + } + if (vbest != NULL) { + ASSERT(vbest->vs_type == VMEM_FREE); + ASSERT(vbest->vs_knext != vbest); + /* re-position to end of buffer */ + if (vmflag & VM_ENDALLOC) { + addr += ((vbest->vs_end - (addr + size)) / align) * + align; + } + (void) vmem_seg_alloc(vmp, vbest, addr, size); + mutex_exit(&vmp->vm_lock); + if (xvaddr) { + atomic_inc_64(&vmp->vm_kstat.vk_parent_free.value.ui64); + vmp->vm_source_free(vmp->vm_source, xvaddr, xsize); + } + ASSERT(P2PHASE(addr, align) == phase); + ASSERT(!P2BOUNDARY(addr, size, nocross)); + ASSERT(addr >= (uintptr_t)minaddr); + ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); + return ((void *)addr); + } + if (0 == (vmflag & VM_NO_VBA)) { + vmp->vm_kstat.vk_fail.value.ui64++; + } + mutex_exit(&vmp->vm_lock); + if (vmflag & VM_PANIC) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "cannot satisfy mandatory allocation", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + ASSERT(xvaddr == NULL); + return (NULL); +} + +/* + * Free the segment [vaddr, vaddr + size), where vaddr was a constrained + * allocation. vmem_xalloc() and vmem_xfree() must always be paired because + * both routines bypass the quantum caches. 
+ */ +void +vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) +{ + vmem_seg_t *vsp, *vnext, *vprev; + + mutex_enter(&vmp->vm_lock); + + vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size); + vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum); + + /* + * Attempt to coalesce with the next segment. + */ + vnext = vsp->vs_anext; + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vsp->vs_end == vnext->vs_start); + vmem_freelist_delete(vmp, vnext); + vsp->vs_end = vnext->vs_end; + vmem_seg_destroy(vmp, vnext); + } + + /* + * Attempt to coalesce with the previous segment. + */ + vprev = vsp->vs_aprev; + if (vprev->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vsp->vs_start); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vsp->vs_end; + vmem_seg_destroy(vmp, vsp); + vsp = vprev; + } + + /* + * If the entire span is free, return it to the source. + */ + if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + vaddr = (void *)vsp->vs_start; + size = VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_span_destroy(vmp, vsp); + vmp->vm_kstat.vk_parent_free.value.ui64++; + mutex_exit(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + } else { + vmem_freelist_insert(vmp, vsp); + mutex_exit(&vmp->vm_lock); + } +} + +/* + * Allocate size bytes from arena vmp. Returns the allocated address + * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, + * and may also specify best-fit, first-fit, or next-fit allocation policy + * instead of the default instant-fit policy. VM_SLEEP allocations are + * guaranteed to succeed. + */ +void * +vmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + vmem_seg_t *vsp; + uintptr_t addr; + int hb; + int flist = 0; + uint32_t mtbf; + + if (size - 1 < vmp->vm_qcache_max) + return (kmem_cache_alloc(vmp->vm_qcache[(size - 1) >> + vmp->vm_qshift], vmflag & VM_KMFLAGS)); + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + if (vmflag & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + if (vmflag & (VM_BESTFIT | VM_FIRSTFIT)) + return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0, + NULL, NULL, vmflag)); + if (vmp->vm_cflags & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + /* + * Unconstrained instant-fit allocation from the segment list. + */ + mutex_enter(&vmp->vm_lock); + + if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) { + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + if (flist-- == 0) { + mutex_exit(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, vmflag)); + } + + ASSERT(size <= (1UL << flist)); + vsp = vmp->vm_freelist[flist].vs_knext; + addr = vsp->vs_start; + if (vmflag & VM_ENDALLOC) { + addr += vsp->vs_end - (addr + size); + } + (void) vmem_seg_alloc(vmp, vsp, addr, size); + mutex_exit(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Free the segment [vaddr, vaddr + size). + */ +void +vmem_free(vmem_t *vmp, void *vaddr, size_t size) +{ + if (size - 1 < vmp->vm_qcache_max) + kmem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], + vaddr); + else + vmem_xfree(vmp, vaddr, size); +} + +/* + * Determine whether arena vmp contains the segment [vaddr, vaddr + size). 
+ */ +int +vmem_contains(vmem_t *vmp, void *vaddr, size_t size) +{ + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + + mutex_enter(&vmp->vm_lock); + vmp->vm_kstat.vk_contains.value.ui64++; + for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) { + vmp->vm_kstat.vk_contains_search.value.ui64++; + ASSERT(vsp->vs_type == VMEM_SPAN); + if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1) + break; + } + mutex_exit(&vmp->vm_lock); + return (vsp != seg0); +} + +/* + * Add the span [vaddr, vaddr + size) to arena vmp. + */ +void * +vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag) +{ + if (vaddr == NULL || size == 0) + panic("vmem_add(%p, %p, %lu): bad arguments", + (void *)vmp, vaddr, size); + + ASSERT(!vmem_contains(vmp, vaddr, size)); + + mutex_enter(&vmp->vm_lock); + if (vmem_populate(vmp, vmflag)) + (void) vmem_span_create(vmp, vaddr, size, 0); + else + vaddr = NULL; + mutex_exit(&vmp->vm_lock); + return (vaddr); +} + +/* + * Walk the vmp arena, applying func to each segment matching typemask. + * If VMEM_REENTRANT is specified, the arena lock is dropped across each + * call to func(); otherwise, it is held for the duration of vmem_walk() + * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks + * are *not* necessarily consistent, so they may only be used when a hint + * is adequate. + */ +void +vmem_walk(vmem_t *vmp, int typemask, + void (*func)(void *, void *, size_t), void *arg) +{ + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t walker; + + if (typemask & VMEM_WALKER) + return; + + bzero(&walker, sizeof (walker)); + walker.vs_type = VMEM_WALKER; + + mutex_enter(&vmp->vm_lock); + VMEM_INSERT(seg0, &walker, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) { + if (vsp->vs_type & typemask) { + void *start = (void *)vsp->vs_start; + size_t size = VS_SIZE(vsp); + if (typemask & VMEM_REENTRANT) { + vmem_advance(vmp, &walker, vsp); + mutex_exit(&vmp->vm_lock); + func(arg, start, size); + mutex_enter(&vmp->vm_lock); + vsp = &walker; + } else { + func(arg, start, size); + } + } + } + vmem_advance(vmp, &walker, NULL); + mutex_exit(&vmp->vm_lock); +} + +/* + * Return the total amount of memory whose type matches typemask. Thus: + * + * typemask VMEM_ALLOC yields total memory allocated (in use). + * typemask VMEM_FREE yields total memory free (available). + * typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size. 
+ */ +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + int64_t size = 0; + + if (typemask & VMEM_ALLOC) + size += (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + if (typemask & VMEM_FREE) + size += (int64_t)vmp->vm_kstat.vk_mem_total.value.ui64 - + (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + if (size < 0) + size = 0; + + return ((size_t)size); +} + +size_t +vmem_size_locked(vmem_t *vmp, int typemask) +{ + boolean_t m = (mutex_owner(&vmp->vm_lock) == curthread); + + if (!m) + mutex_enter(&vmp->vm_lock); + size_t s = vmem_size(vmp, typemask); + if (!m) + mutex_exit(&vmp->vm_lock); + return (s); +} + +size_t +vmem_size_semi_atomic(vmem_t *vmp, int typemask) +{ + int64_t size = 0; + uint64_t inuse = 0; + uint64_t total = 0; + + __sync_swap(&total, vmp->vm_kstat.vk_mem_total.value.ui64); + __sync_swap(&inuse, vmp->vm_kstat.vk_mem_inuse.value.ui64); + + int64_t inuse_signed = (int64_t)inuse; + int64_t total_signed = (int64_t)total; + + if (typemask & VMEM_ALLOC) + size += inuse_signed; + if (typemask & VMEM_FREE) + size += total_signed - inuse_signed; + + if (size < 0) + size = 0; + + return ((size_t)size); +} + +size_t +spl_vmem_size(vmem_t *vmp, int typemask) +{ + return (vmem_size_locked(vmp, typemask)); +} + +/* + * Create an arena called name whose initial span is [base, base + size). + * The arena's natural unit of currency is quantum, so vmem_alloc() + * guarantees quantum-aligned results. The arena may import new spans + * by invoking afunc() on source, and may return those spans by invoking + * ffunc() on source. To make small allocations fast and scalable, + * the arena offers high-performance caching for each integer multiple + * of quantum up to qcache_max. + */ +static vmem_t * +vmem_create_common(const char *name, void *base, size_t size, size_t quantum, + void *(*afunc)(vmem_t *, size_t, int), + void (*ffunc)(vmem_t *, void *, size_t), + vmem_t *source, size_t qcache_max, int vmflag) +{ + int i; + size_t nqcache; + vmem_t *vmp, *cur, **vmpp; + vmem_seg_t *vsp; + vmem_freelist_t *vfp; + uint32_t id = atomic_inc_32_nv(&vmem_id); + + if (vmem_vmem_arena != NULL) { + vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t), + vmflag & VM_KMFLAGS); + } else { + ASSERT(id <= VMEM_INITIAL); + vmp = &vmem0[id - 1]; + } + + /* An identifier arena must inherit from another identifier arena */ + ASSERT(source == NULL || ((source->vm_cflags & VMC_IDENTIFIER) == + (vmflag & VMC_IDENTIFIER))); + + if (vmp == NULL) + return (NULL); + bzero(vmp, sizeof (vmem_t)); + + (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name); + mutex_init(&vmp->vm_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmp->vm_cv, NULL, CV_DEFAULT, NULL); + vmp->vm_cflags = vmflag; + vmflag &= VM_KMFLAGS; + + hrtime_t hrnow = gethrtime(); + + vmp->vm_createtime = hrnow; + + vmp->vm_quantum = quantum; + vmp->vm_qshift = highbit(quantum) - 1; + nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX); + + for (i = 0; i <= VMEM_FREELISTS; i++) { + vfp = &vmp->vm_freelist[i]; + vfp->vs_end = 1UL << i; + vfp->vs_knext = (vmem_seg_t *)(vfp + 1); + vfp->vs_kprev = (vmem_seg_t *)(vfp - 1); + } + + vmp->vm_freelist[0].vs_kprev = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0; + vmp->vm_hash_table = vmp->vm_hash0; + vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1; + vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); + + vsp = &vmp->vm_seg0; + vsp->vs_anext = vsp; + vsp->vs_aprev = vsp; + vsp->vs_knext = vsp; + vsp->vs_kprev = vsp; + vsp->vs_type = VMEM_SPAN; + 
vsp->vs_span_createtime = hrnow; + + vsp = &vmp->vm_rotor; + vsp->vs_type = VMEM_ROTOR; + VMEM_INSERT(&vmp->vm_seg0, vsp, a); + + bcopy(&vmem_kstat_template, &vmp->vm_kstat, sizeof (vmem_kstat_t)); + + vmp->vm_id = id; + if (source != NULL) + vmp->vm_kstat.vk_source_id.value.ui32 = source->vm_id; + vmp->vm_source = source; + vmp->vm_source_alloc = afunc; + vmp->vm_source_free = ffunc; + + /* + * Some arenas (like vmem_metadata and kmem_metadata) cannot + * use quantum caching to lower fragmentation. Instead, we + * increase their imports, giving a similar effect. + */ + if (vmp->vm_cflags & VMC_NO_QCACHE) { + if (qcache_max > VMEM_NQCACHE_MAX && ISP2(qcache_max)) { + vmp->vm_min_import = qcache_max; + } else { + vmp->vm_min_import = + VMEM_QCACHE_SLABSIZE(nqcache << vmp->vm_qshift); + } + nqcache = 0; + } + + if (nqcache != 0) { + ASSERT(!(vmflag & VM_NOSLEEP)); + vmp->vm_qcache_max = nqcache << vmp->vm_qshift; + for (i = 0; i < nqcache; i++) { + char buf[VMEM_NAMELEN + 21]; + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%lu", + vmp->vm_name, (i + 1) * quantum); + vmp->vm_qcache[i] = kmem_cache_create(buf, + (i + 1) * quantum, quantum, NULL, NULL, NULL, + NULL, vmp, KMC_QCACHE | KMC_NOTOUCH); + } + } + + if ((vmp->vm_ksp = kstat_create("vmem", vmp->vm_id, vmp->vm_name, + "vmem", KSTAT_TYPE_NAMED, sizeof (vmem_kstat_t) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { + vmp->vm_ksp->ks_data = &vmp->vm_kstat; + kstat_install(vmp->vm_ksp); + } + + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != NULL) + vmpp = &cur->vm_next; + *vmpp = vmp; + mutex_exit(&vmem_list_lock); + + if (vmp->vm_cflags & VMC_POPULATOR) { + ASSERT(vmem_populators < VMEM_INITIAL); + vmem_populator[atomic_inc_32_nv(&vmem_populators) - 1] = vmp; + mutex_enter(&vmp->vm_lock); + (void) vmem_populate(vmp, vmflag | VM_PANIC); + mutex_exit(&vmp->vm_lock); + } + + if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) { + vmem_destroy(vmp); + return (NULL); + } + + return (vmp); +} + +vmem_t * +vmem_xcreate(const char *name, void *base, size_t size, size_t quantum, + vmem_ximport_t *afunc, vmem_free_t *ffunc, vmem_t *source, + size_t qcache_max, int vmflag) +{ + ASSERT(!(vmflag & (VMC_POPULATOR | VMC_XALLOC))); + vmflag &= ~(VMC_POPULATOR | VMC_XALLOC); + + return (vmem_create_common(name, base, size, quantum, + (vmem_alloc_t *)afunc, ffunc, source, qcache_max, + vmflag | VMC_XALLOC)); +} + +vmem_t * +vmem_create(const char *name, void *base, size_t size, size_t quantum, + vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source, + size_t qcache_max, int vmflag) +{ + ASSERT(!(vmflag & (VMC_XALLOC | VMC_XALIGN))); + vmflag &= ~(VMC_XALLOC | VMC_XALIGN); + + return (vmem_create_common(name, base, size, quantum, + afunc, ffunc, source, qcache_max, vmflag)); +} + +/* + * Destroy arena vmp. + */ +void +vmem_destroy(vmem_t *vmp) +{ + vmem_t *cur, **vmpp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t *vsp, *anext; + size_t leaked; + + /* + * set vm_nsegfree to zero because vmem_free_span_list + * would have already freed vm_segfree. + */ + vmp->vm_nsegfree = 0; + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != vmp) + vmpp = &cur->vm_next; + *vmpp = vmp->vm_next; + mutex_exit(&vmem_list_lock); + + leaked = vmem_size(vmp, VMEM_ALLOC); + if (leaked != 0) + printf("SPL: vmem_destroy('%s'): leaked %lu %s\n", + vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? 
+ "identifiers" : "bytes"); + + if (vmp->vm_hash_table != vmp->vm_hash0) + vmem_free(vmem_hash_arena, vmp->vm_hash_table, + (vmp->vm_hash_mask + 1) * sizeof (void *)); + + /* + * Give back the segment structures for anything that's left in the + * arena, e.g. the primary spans and their free segments. + */ + VMEM_DELETE(&vmp->vm_rotor, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) { + anext = vsp->vs_anext; + vmem_putseg_global(vsp); + } + + while (vmp->vm_nsegfree > 0) + vmem_putseg_global(vmem_getseg(vmp)); + + kstat_delete(vmp->vm_ksp); + + mutex_destroy(&vmp->vm_lock); + cv_destroy(&vmp->vm_cv); + vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t)); +} + + +/* + * Destroy arena vmp. + */ +void +vmem_destroy_internal(vmem_t *vmp) +{ + vmem_t *cur, **vmpp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t *vsp, *anext; + size_t leaked; + + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != vmp) + vmpp = &cur->vm_next; + *vmpp = vmp->vm_next; + mutex_exit(&vmem_list_lock); + + leaked = vmem_size(vmp, VMEM_ALLOC); + if (leaked != 0) + printf("SPL: vmem_destroy('%s'): leaked %lu %s\n", + vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? + "identifiers" : "bytes"); + + if (vmp->vm_hash_table != vmp->vm_hash0) + if (vmem_hash_arena != NULL) + vmem_free(vmem_hash_arena, vmp->vm_hash_table, + (vmp->vm_hash_mask + 1) * sizeof (void *)); + + /* + * Give back the segment structures for anything that's left in the + * arena, e.g. the primary spans and their free segments. + */ + VMEM_DELETE(&vmp->vm_rotor, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) { + anext = vsp->vs_anext; + vmem_putseg_global(vsp); + } + + while (vmp->vm_nsegfree > 0) + vmem_putseg_global(vmem_getseg(vmp)); + + if (!(vmp->vm_cflags & VMC_IDENTIFIER) && + vmem_size(vmp, VMEM_ALLOC) != 0) + printf("SPL: vmem_destroy('%s'): STILL %lu bytes at " + "kstat_delete() time\n", + vmp->vm_name, vmem_size(vmp, VMEM_ALLOC)); + + kstat_delete(vmp->vm_ksp); + + mutex_destroy(&vmp->vm_lock); + cv_destroy(&vmp->vm_cv); + + // Alas, to free, requires access to "vmem_vmem_arena" the very thing + // we release first. + // vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t)); +} + +/* + * Only shrink vmem hashtable if it is 1<vm_kstat.vk_alloc.value.ui64 - + vmp->vm_kstat.vk_free.value.ui64); + + new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2)); + old_size = vmp->vm_hash_mask + 1; + + if ((old_size >> vmem_rescale_minshift) <= new_size && + new_size <= (old_size << 1)) + return; + + new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *), + VM_NOSLEEP); + if (new_table == NULL) + return; + bzero(new_table, new_size * sizeof (void *)); + + mutex_enter(&vmp->vm_lock); + + old_size = vmp->vm_hash_mask + 1; + old_table = vmp->vm_hash_table; + + vmp->vm_hash_mask = new_size - 1; + vmp->vm_hash_table = new_table; + vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); + + for (h = 0; h < old_size; h++) { + vsp = old_table[h]; + while (vsp != NULL) { + uintptr_t addr = vsp->vs_start; + vmem_seg_t *next_vsp = vsp->vs_knext; + vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr); + vsp->vs_knext = *hash_bucket; + *hash_bucket = vsp; + vsp = next_vsp; + } + } + + mutex_exit(&vmp->vm_lock); + + if (old_table != vmp->vm_hash0) + vmem_free(vmem_hash_arena, old_table, + old_size * sizeof (void *)); +} + +/* + * Perform periodic maintenance on all vmem arenas. 
+ */ + +void +vmem_update(void *dummy) +{ + vmem_t *vmp; + + mutex_enter(&vmem_list_lock); + for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) { + /* + * If threads are waiting for resources, wake them up + * periodically so they can issue another kmem_reap() + * to reclaim resources cached by the slab allocator. + */ + cv_broadcast(&vmp->vm_cv); + + /* + * Rescale the hash table to keep the hash chains short. + */ + vmem_hash_rescale(vmp); + } + mutex_exit(&vmem_list_lock); + + (void) bsd_timeout(vmem_update, dummy, &vmem_update_interval); +} + +void +vmem_qcache_reap(vmem_t *vmp) +{ + int i; + + /* + * Reap any quantum caches that may be part of this vmem. + */ + for (i = 0; i < VMEM_NQCACHE_MAX; i++) + if (vmp->vm_qcache[i]) + kmem_cache_reap_now(vmp->vm_qcache[i]); +} + +/* given a size, return the appropriate vmem_bucket_arena[] entry */ + +static inline uint16_t +vmem_bucket_number(size_t size) +{ + // For VMEM_BUCKET_HIBIT == 12, + // vmem_bucket_arena[n] holds allocations from 2^[n+11]+1 to 2^[n+12], + // so for [n] = 0, 2049-4096, for [n]=5 65537-131072, + // for [n]=7 (256k+1)-512k + // set hb: 512k == 19, 256k+1 == 19, 256k == 18, ... + const int hb = highbit(size-1); + + int bucket = hb - VMEM_BUCKET_LOWBIT; + + // very large allocations go into the 16 MiB bucket + if (hb > VMEM_BUCKET_HIBIT) + bucket = VMEM_BUCKET_HIBIT - VMEM_BUCKET_LOWBIT; + + // very small allocations go into the 4 kiB bucket + if (bucket < 0) + bucket = 0; + + return (bucket); +} + +static inline vmem_t * +vmem_bucket_arena_by_size(size_t size) +{ + uint16_t bucket = vmem_bucket_number(size); + + return (vmem_bucket_arena[bucket]); +} + +vmem_t * +spl_vmem_bucket_arena_by_size(size_t size) +{ + return (vmem_bucket_arena_by_size(size)); +} + +static inline void +vmem_bucket_wake_all_waiters(void) +{ + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + const int bucket = i - VMEM_BUCKET_LOWBIT; + vmem_t *bvmp = vmem_bucket_arena[bucket]; + cv_broadcast(&bvmp->vm_cv); + } + cv_broadcast(&spl_heap_arena->vm_cv); +} + +/* + * xnu_alloc_throttled_bail() : spin looking for memory + * + */ + +static inline void * +xnu_alloc_throttled_bail(uint64_t now_ticks, vmem_t *calling_vmp, + size_t size, int vmflags) +{ + // spin looking for memory + const uint64_t bigtarget = MAX(size, 16ULL*1024ULL*1024ULL); + static volatile _Atomic bool alloc_lock = false; + static volatile _Atomic uint64_t force_time = 0; + + uint64_t timeout_ticks = hz / 2; + if (vmflags & VM_PUSHPAGE) + timeout_ticks = hz / 4; + + uint64_t timeout_time = now_ticks + timeout_ticks; + + for (uint32_t suspends = 0, blocked_suspends = 0, + try_no_pressure = 0; /* empty */; /* empty */) { + if (force_time + timeout_ticks > timeout_time) { + // another thread has forced an allocation + // by timing out. push our deadline into the future. + timeout_time = force_time + timeout_ticks; + } + if (alloc_lock) { + blocked_suspends++; + IOSleep(1); + } else if (spl_vmem_xnu_useful_bytes_free() >= bigtarget) { + bool f = false; + // if alloc_lock == f then alloc_lock = true and result + // is true otherwise result is false and f = true + if (!__c11_atomic_compare_exchange_strong(&alloc_lock, + &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + /* + * avoid (highly unlikely) data race on + * alloc_lock. if alloc_lock has become true + * while we were in the else if expression + * then we effectively optimize away the + * (relaxed) load of alloc_lock (== true) + * into f and continue. 
+ */ + continue; + } + // alloc_lock is now visible as true to all threads + try_no_pressure++; + void *m = spl_vmem_malloc_if_no_pressure(size); + if (m != NULL) { + uint64_t ticks = zfs_lbolt() - now_ticks; + printf("SPL: %s returning %llu bytes after " + "%llu ticks (hz=%u, seconds = %llu), " + "%u suspends, %u blocked, %u tries (%s)\n", + __func__, (uint64_t)size, + ticks, hz, ticks/hz, suspends, + blocked_suspends, try_no_pressure, + calling_vmp->vm_name); + // atomic seq cst, so is published to all + // threads + alloc_lock = false; + return (m); + } else { + alloc_lock = false; + spl_free_set_emergency_pressure(bigtarget); + suspends++; + IOSleep(1); + } + } else if (zfs_lbolt() > timeout_time) { + bool f = false; + if (!__c11_atomic_compare_exchange_strong(&alloc_lock, + &f, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + // avoid (highly unlikely) data race on + // alloc_lock as above + continue; + } + void *mp = spl_vmem_malloc_unconditionally(size); + uint64_t now = zfs_lbolt(); + uint64_t ticks = now - now_ticks; + force_time = now; + printf("SPL: %s TIMEOUT %llu bytes after " + "%llu ticks (hz=%u, seconds=%llu), " + "%u suspends, %u blocked, %u tries (%s)\n", + __func__, (uint64_t)size, + ticks, hz, ticks/hz, suspends, + blocked_suspends, try_no_pressure, + calling_vmp->vm_name); + alloc_lock = false; + atomic_inc_64(&spl_xat_forced); + return (mp); + } else { + spl_free_set_emergency_pressure(bigtarget); + suspends++; + IOSleep(1); + } + } +} + +static void * +xnu_alloc_throttled(vmem_t *bvmp, size_t size, int vmflag) +{ + // the caller is one of the bucket arenas. + // null_vmp will be spl_default_arena_parent, which is + // just a placeholder. + + uint64_t now = zfs_lbolt(); + const uint64_t entry_now = now; + + void *m = spl_vmem_malloc_if_no_pressure(size); + + if (m != NULL) { + atomic_inc_64(&spl_xat_success); + spl_xat_lastalloc = gethrtime(); + // wake up waiters on all the arena condvars + // since there is apparently no memory shortage. + vmem_bucket_wake_all_waiters(); + return (m); + } else { + spl_free_set_emergency_pressure((int64_t)size); + } + + if (vmflag & VM_PANIC) { + // force an allocation now to avoid a panic + spl_xat_lastalloc = gethrtime(); + spl_free_set_emergency_pressure(4LL * (int64_t)size); + void *p = spl_vmem_malloc_unconditionally(size); + // p cannot be NULL (unconditional kernel malloc always works + // or panics) + // therefore: success, wake all waiters on alloc|free condvar + // wake up arena waiters to let them know there is memory + // available in the arena; let waiters on other bucket arenas + // continue sleeping. + cv_broadcast(&bvmp->vm_cv); + return (p); + } + + if (vmflag & VM_NOSLEEP) { + spl_free_set_emergency_pressure(MAX(2LL * (int64_t)size, + 16LL*1024LL*1024LL)); + /* cheating a bit, but not really waiting */ + kpreempt(KPREEMPT_SYNC); + void *p = spl_vmem_malloc_if_no_pressure(size); + if (p != NULL) { + atomic_inc_64(&spl_xat_late_success_nosleep); + cv_broadcast(&bvmp->vm_cv); + spl_xat_lastalloc = gethrtime(); + } + // if p == NULL, then there will be an increment in + // the fail kstat + return (p); + } + + /* + * Loop for a while trying to satisfy VM_SLEEP allocations. + * + * If we are able to allocate memory, then return the pointer. + * + * We return NULL if some other thread's activity has caused + * sufficient memory to appear in this arena that we can satisfy + * the allocation. 
+ * + * We call xnu_alloc_throttle_bail() after a few milliseconds of + * waiting; it will either return a pointer to newly allocated + * memory or NULL. We return the result. + * + */ + + const uint32_t bucket_number = + vmem_bucket_id_to_bucket_number[bvmp->vm_id]; + static volatile _Atomic uint32_t waiters = 0; + + waiters++; + + if (waiters == 1UL) + atomic_inc_64(&spl_xat_no_waiters); + + static _Atomic uint32_t max_waiters_seen = 0; + + if (waiters > max_waiters_seen) { + max_waiters_seen = waiters; + printf("SPL: %s: max_waiters_seen increased to %u\n", __func__, + max_waiters_seen); + } + + boolean_t local_xat_pressured = false; + + for (; /* empty */; /* empty */) { + clock_t wait_time = USEC2NSEC(500UL * MAX(waiters, 1UL)); + mutex_enter(&bvmp->vm_lock); + spl_xat_sleep++; + if (local_xat_pressured) { + spl_xat_pressured++; + local_xat_pressured = false; + } + (void) cv_timedwait_hires(&bvmp->vm_cv, &bvmp->vm_lock, + wait_time, 0, 0); + mutex_exit(&bvmp->vm_lock); + now = zfs_lbolt(); + // We may be here because of a broadcast to &vmp->vm_cv, + // causing xnu to schedule all the sleepers in priority-weighted + // FIFO order. Because of the mutex_exit(), the sections below + // here may be entered concurrently. + // spl_vmem_malloc_if_no_pressure does a mutex, so avoid calling + // it unless there is a chance it will succeed. + if (spl_vmem_xnu_useful_bytes_free() > (MAX(size, + 16ULL*1024ULL*1024ULL))) { + void *a = spl_vmem_malloc_if_no_pressure(size); + if (a != NULL) { + atomic_inc_64(&spl_xat_late_success); + spl_xat_lastalloc = gethrtime(); + waiters--; + // Wake up all waiters on the bucket arena + // locks, since the system apparently has + // memory again. + vmem_bucket_wake_all_waiters(); + return (a); + } else { + // Probably vm_page_free_count changed while + // we were in the mutex queue in + // spl_vmem_malloc_if_no_pressure(). There is + // therefore no point in doing the bail-out + // check below, so go back to the top of the + // for loop. + atomic_inc_64(&spl_xat_late_deny); + continue; + } + } + if (now > entry_now + hz / 4 || + spl_vba_threads[bucket_number] > 1UL) { + // If there are other threads waiting for us + // in vba() then when we satisfy this allocation, + // we satisfy more than one thread, so invoke XATB(). + // Otherwise, if we have had no luck for 250 ms, then + // switch to XATB() which is much more aggressive. + if (spl_vba_threads[bucket_number] > 1UL) + atomic_inc_64(&spl_xat_bailed_contended); + atomic_inc_64(&spl_xat_bailed); + static _Atomic uint32_t bailing_threads = 0; + static _Atomic uint32_t max_bailers_seen = 0; + bailing_threads++; + if (bailing_threads > max_bailers_seen) { + max_bailers_seen = bailing_threads; + printf("SPL: %s: max_bailers_seen increased " + "to %u\n", __func__, max_bailers_seen); + } + void *b = + xnu_alloc_throttled_bail(now, bvmp, size, vmflag); + bailing_threads--; + spl_xat_lastalloc = gethrtime(); + // wake up waiters on the arena lock, + // since they now have memory they can use. 
+			cv_broadcast(&bvmp->vm_cv);
+			// open turnstile after having bailed, rather
+			// than before
+			waiters--;
+			return (b);
+		} else if (now - entry_now > 0 &&
+		    ((now - entry_now) % (hz/10))) {
+			spl_free_set_emergency_pressure(MAX(size,
+			    16LL*1024LL*1024LL));
+			local_xat_pressured = true;
+		}
+	}
+}
+
+static void
+xnu_free_throttled(vmem_t *vmp, void *vaddr, size_t size)
+{
+	extern void osif_free(void *, uint64_t);
+
+	// Serialize behind a (short) spin-sleep delay, giving
+	// xnu time to do freelist management and
+	// PT teardowns.
+
+	// In the usual case there is only one thread in this function,
+	// so we proceed waitlessly to osif_free().
+
+	// When there are multiple threads here, we delay the 2nd and later.
+
+	// Explicit race:
+	// The osif_free() is not protected by the vmem_xnu_alloc_lock
+	// mutex; that is just used for implementing the delay. Consequently,
+	// the waiters on the same lock in spl_vmem_malloc_if_no_pressure may
+	// falsely see too small a value for vm_page_free_count. We don't
+	// care, in part because xnu performs poorly when doing
+	// free-then-allocate anyway.
+
+	// a_waiters gauges the loop exit checking and sleep duration;
+	// it is a count of the number of threads trying to do work
+	// in this function.
+	static volatile _Atomic uint32_t a_waiters = 0;
+
+	// is_freeing protects the osif_free() call; see comment below
+	static volatile _Atomic bool is_freeing = false;
+
+	a_waiters++;	// generates "lock incl ..."
+
+	static _Atomic uint32_t max_waiters_seen = 0;
+
+	if (a_waiters > max_waiters_seen) {
+		max_waiters_seen = a_waiters;
+		printf("SPL: %s: max_waiters_seen increased to %u\n",
+		    __func__, max_waiters_seen);
+	}
+
+	for (uint32_t iter = 0; a_waiters > 1UL; iter++) {
+		// there is more than one thread here, so suspend and
+		// sleep for 1 ms
+		atomic_inc_64(&spl_xft_wait);
+		IOSleep(1);
+		// If we are growing old in this loop, then see if
+		// anyone else is still in osif_free. If not,
+		// we can exit.
+		if (iter >= a_waiters) {
+			// if is_freeing == f, then set is_freeing to true with
+			// release semantics (i.e. "push" it to other cores)
+			// then break; otherwise, set f to true relaxedly (i.e.,
+			// optimize it out)
+			bool f = false;
+			if (__c11_atomic_compare_exchange_weak(&is_freeing,
+			    &f, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
+				break;
+			}
+		}
+	}
+	// If there is more than one thread in this function, osif_free() is
+	// protected by is_freeing. Release it after the osif_free()
+	// call has been made and the lastfree bookkeeping has been done.
+	osif_free(vaddr, size);
+	spl_xat_lastfree = gethrtime();
+	is_freeing = false;
+	a_waiters--;
+	kpreempt(KPREEMPT_SYNC);
+	// since we just gave back xnu enough to satisfy an allocation
+	// in at least the smaller buckets, let's wake up anyone in
+	// the cv_wait() in vmem_xalloc([bucket_#], ...)
+	vmem_bucket_wake_all_waiters();
+}
+
+// return true if the bit was unset before the atomic OR, i.e. we took
+// this bucket's bit lock here.
+static inline bool
+vba_atomic_lock_bucket(volatile _Atomic uint16_t *bbap, uint16_t bucket_bit)
+{
+
+	// We use a test-and-set of the appropriate bit
+	// in buckets_busy_allocating; if it was not set,
+	// then break out of the loop.
+	//
+	// This compiles into an orl, cmpxchgw instruction pair.
+	// The return from __c11_atomic_fetch_or() is the
+	// previous value of buckets_busy_allocating.
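+	//
+	// Worked example: with bucket_bit == 0x0010, a previous value of
+	// 0x0013 means the bit was already set, the OR changes nothing and
+	// we report failure; a previous value of 0x0003 means we have just
+	// set the bit and now hold this bucket's bit lock.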
+ + uint16_t prev = + __c11_atomic_fetch_or(bbap, bucket_bit, __ATOMIC_SEQ_CST); + if (prev & bucket_bit) + return (false); // we did not acquire the bit lock here + else + return (true); // we turned the bit from 0 to 1 +} + +static void * +vmem_bucket_alloc(vmem_t *null_vmp, size_t size, const int vmflags) +{ + + if (vmflags & VM_NO_VBA) + return (NULL); + + // caller is spl_heap_arena looking for memory. + // null_vmp will be spl_default_arena_parent, and so + // is just a placeholder. + + vmem_t *calling_arena = spl_heap_arena; + + static volatile _Atomic uint32_t hipriority_allocators = 0; + boolean_t local_hipriority_allocator = false; + + if (0 != (vmflags & (VM_PUSHPAGE | VM_NOSLEEP | VM_PANIC | VM_ABORT))) { + local_hipriority_allocator = true; + hipriority_allocators++; + } + + if (!ISP2(size)) + atomic_inc_64(&spl_bucket_non_pow2_allocs); + + vmem_t *bvmp = vmem_bucket_arena_by_size(size); + + // there are 13 buckets, so use a 16-bit scalar to hold + // a set of bits, where each bit corresponds to an in-progress + // vmem_alloc(bucket, ...) below. + + static volatile _Atomic uint16_t buckets_busy_allocating = 0; + const uint16_t bucket_number = vmem_bucket_number(size); + const uint16_t bucket_bit = (uint16_t)1 << bucket_number; + + spl_vba_threads[bucket_number]++; + + static volatile _Atomic uint32_t waiters = 0; + + // First, if we are VM_SLEEP, check for memory, try some pressure, + // and if that doesn't work, force entry into the loop below. + + bool loop_once = false; + + if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && + ! vmem_canalloc_atomic(bvmp, size)) { + if (spl_vmem_xnu_useful_bytes_free() < (MAX(size, + 16ULL*1024ULL*1024ULL))) { + spl_free_set_emergency_pressure(size); + IOSleep(1); + if (!vmem_canalloc_atomic(bvmp, size) && + (spl_vmem_xnu_useful_bytes_free() < (MAX(size, + 16ULL*1024ULL*1024ULL)))) { + loop_once = true; + } + } + } + + // spin-sleep: if we would need to go to the xnu allocator. + // + // We want to avoid a burst of allocs from bucket_heap's children + // successively hitting a low-memory condition, or alternatively + // each successfully importing memory from xnu when they can share + // a single import. + // + // We also want to take advantage of any memory that becomes available + // in bucket_heap. + // + // If there is more than one thread in this function (~ few percent) + // then the subsequent threads are put into the loop below. They + // can escape the loop if they are [1]non-waiting allocations, or + // [2]if they become the only waiting thread, or + // [3]if the cv_timedwait_hires returns -1 (which represents EWOULDBLOCK + // from msleep() which gets it from _sleep()'s THREAD_TIMED_OUT) + // allocating in the bucket, or [4]if this thread has (rare condition) + // spent a quarter of a second in the loop. 
+ + if (waiters++ > 1 || loop_once) { + atomic_inc_64(&spl_vba_loop_entries); + } + + static _Atomic uint32_t max_waiters_seen = 0; + + if (waiters > max_waiters_seen) { + max_waiters_seen = waiters; + printf("SPL: %s: max_waiters_seen increased to %u\n", __func__, + max_waiters_seen); + } + + // local counters, to be added atomically to global kstat variables + uint64_t local_memory_blocked = 0, local_cv_timeout = 0; + uint64_t local_loop_timeout = 0; + uint64_t local_cv_timeout_blocked = 0, local_loop_timeout_blocked = 0; + uint64_t local_sleep = 0, local_hipriority_blocked = 0; + + const uint64_t loop_ticks = 25; // a tick is 10 msec, so 250 msec + const uint64_t hiprio_loop_ticks = 4; // 40 msec + + for (uint64_t entry_time = zfs_lbolt(), + loop_timeout = entry_time + loop_ticks, + hiprio_timeout = entry_time + hiprio_loop_ticks, timedout = 0; + waiters > 1UL || loop_once; /* empty */) { + loop_once = false; + // non-waiting allocations should proceeed to vmem_alloc() + // immediately + if (vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { + break; + } + if (vmem_canalloc_atomic(bvmp, size)) { + // We can probably vmem_alloc(bvmp, size, vmflags). + // At worst case it will give us a NULL and we will + // end up on the vmp's cv_wait. + // + // We can have threads with different bvmp + // taking this exit, and will proceed concurrently. + // + // However, we should protect against a burst of + // callers hitting the same bvmp before the allocation + // results are reflected in + // vmem_canalloc_atomic(bvmp, ...) + if (local_hipriority_allocator == false && + hipriority_allocators > 0) { + // more high priority allocations are wanted, + // so this thread stays here + local_hipriority_blocked++; + } else if (vba_atomic_lock_bucket( + &buckets_busy_allocating, bucket_bit)) { + // we are not being blocked by another allocator + // to the same bucket, or any higher priority + // allocator + atomic_inc_64(&spl_vba_parent_memory_appeared); + break; + // The vmem_alloc() should return extremely + // quickly from an INSTANTFIT allocation that + // canalloc predicts will succeed. + } else { + // another thread is trying to use the free + // memory in the bucket_## arena; there might + // still be free memory there after its + // allocation is completed, and there might be + // excess in the bucket_heap arena, so stick + // around in this loop. + local_memory_blocked++; + cv_broadcast(&bvmp->vm_cv); + } + } + if (timedout > 0) { + if (local_hipriority_allocator == false && + hipriority_allocators > 0) { + local_hipriority_blocked++; + } else if (vba_atomic_lock_bucket( + &buckets_busy_allocating, bucket_bit)) { + if (timedout & 1) + local_cv_timeout++; + if (timedout & 6 || zfs_lbolt() >= loop_timeout) + local_loop_timeout++; + break; + } else { + if (timedout & 1) { + local_cv_timeout_blocked++; + } + if (timedout & 6) { + local_loop_timeout_blocked++; + } else if (zfs_lbolt() > loop_timeout) { + timedout |= 2; + } + // flush the current thread in xat() out of + // xat()'s for() loop and into xat_bail() + cv_broadcast(&bvmp->vm_cv); + } + } + // The bucket is already allocating, or the bucket needs + // more memory to satisfy vmem_allocat(bvmp, size, VM_NOSLEEP), + // or we want to give the bucket some time to acquire more + // memory. 
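+		// As the code below shows, the wait is a cv_timedwait_hires
+		// on the bucket heap's condvar: roughly 30 ms in the common
+		// case, shortened to about 1 ms once we have timed out or
+		// seen the bucket blocked, after which memory is rechecked
+		// at the top of the loop.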
+ // substitute for the vmp arena's cv_wait in vmem_xalloc() + // (vmp is the bucket_heap AKA spl_heap_arena) + mutex_enter(&calling_arena->vm_lock); + local_sleep++; + if (local_sleep >= 1000ULL) { + atomic_add_64(&spl_vba_sleep, local_sleep - 1ULL); + local_sleep = 1ULL; + atomic_add_64(&spl_vba_cv_timeout_blocked, + local_cv_timeout_blocked); + local_cv_timeout_blocked = 0; + atomic_add_64(&spl_vba_loop_timeout_blocked, + local_loop_timeout_blocked); + local_loop_timeout_blocked = 0; + atomic_add_64(&spl_vba_hiprio_blocked, + local_hipriority_blocked); + local_hipriority_blocked = 0; + if (local_memory_blocked > 1ULL) { + atomic_add_64(&spl_vba_parent_memory_blocked, + local_memory_blocked - 1ULL); + local_memory_blocked = 1ULL; + } + } + clock_t wait_time = MSEC2NSEC(30); + if (timedout > 0 || local_memory_blocked > 0) { + wait_time = MSEC2NSEC(1); + } + int ret = cv_timedwait_hires(&calling_arena->vm_cv, + &calling_arena->vm_lock, + wait_time, 0, 0); + // We almost certainly have exited because of a + // signal/broadcast, but maybe just timed out. + // Either way, recheck memory. + mutex_exit(&calling_arena->vm_lock); + if (ret == -1) { + // cv_timedwait_hires timer expired + timedout |= 1; + cv_broadcast(&bvmp->vm_cv); + } else if ((timedout & 2) == 0) { + // we were awakened; check to see if we have been + // in the for loop for a long time + uint64_t n = zfs_lbolt(); + if (n > loop_timeout) { + timedout |= 2; + extern uint64_t real_total_memory; + spl_free_set_emergency_pressure( + real_total_memory / 64LL); + // flush the current thread in xat() out of + // xat()'s for() loop and into xat_bail() + cv_broadcast(&bvmp->vm_cv); + } else if (local_hipriority_allocator && + n > hiprio_timeout && waiters > 1UL) { + timedout |= 4; + } + } + } + + /* + * Turn on the exclusion bit in buckets_busy_allocating, to + * prevent multiple threads from calling vmem_alloc() on the + * same bucket arena concurrently rather than serially. + * + * This principally reduces the liklihood of asking xnu for + * more memory when other memory is or becomes available. + * + * This exclusion only applies to VM_SLEEP allocations; + * others (VM_PANIC, VM_NOSLEEP, VM_ABORT) will go to + * vmem_alloc() concurrently with any other threads. + * + * Since we aren't doing a test-and-set operation like above, + * we can just use |= and &= below and get correct atomic + * results, instead of using: + * + * __c11_atomic_fetch_or(&buckets_busy_allocating, + * bucket_bit, __ATOMIC_SEQ_CST); + * with the &= down below being written as + * __c11_atomic_fetch_and(&buckets_busy_allocating, + * ~bucket_bit, __ATOMIC_SEQ_CST); + * + * and this makes a difference with no optimization either + * compiling the whole file or with __attribute((optnone)) + * in front of the function decl. In particular, the non- + * optimized version that uses the builtin __c11_atomic_fetch_{and,or} + * preserves the C program order in the machine language output, + * inersting cmpxchgws, while all optimized versions, and the + * non-optimized version using the plainly-written version, reorder + * the "orw regr, memory" and "andw register, memory" (these are atomic + * RMW operations in x86-64 when the memory is naturally aligned) so + * that the strong memory model x86-64 promise that later loads see the + * results of earlier stores. 
+ * + * clang+llvm simply are good at optimizing _Atomics and + * the optimized code differs only in line numbers and + * among all three approaches (as plainly written, using + * the __c11_atomic_fetch_{or,and} with sequential consistency, + * or when compiling with at least -O optimization so an + * atomic_or_16(&buckets_busy_allocating) built with GCC intrinsics + * is actually inlined rather than a function call). + * + */ + + // in case we left the loop by being the only waiter, stop the + // next thread arriving from leaving the for loop because + // vmem_canalloc(bvmp, that_thread's_size) is true. + + buckets_busy_allocating |= bucket_bit; + + // update counters + if (local_sleep > 0) + atomic_add_64(&spl_vba_sleep, local_sleep); + if (local_memory_blocked > 0) + atomic_add_64(&spl_vba_parent_memory_blocked, + local_memory_blocked); + if (local_cv_timeout > 0) + atomic_add_64(&spl_vba_cv_timeout, local_cv_timeout); + if (local_cv_timeout_blocked > 0) + atomic_add_64(&spl_vba_cv_timeout_blocked, + local_cv_timeout_blocked); + if (local_loop_timeout > 0) + atomic_add_64(&spl_vba_loop_timeout, local_loop_timeout); + if (local_loop_timeout_blocked > 0) + atomic_add_64(&spl_vba_loop_timeout_blocked, + local_loop_timeout_blocked); + if (local_hipriority_blocked > 0) + atomic_add_64(&spl_vba_hiprio_blocked, + local_hipriority_blocked); + + // There is memory in this bucket, or there are no other waiters, + // or we aren't a VM_SLEEP allocation, or we iterated out of the + // for loop. + // vmem_alloc() and vmem_xalloc() do their own mutex serializing + // on bvmp->vm_lock, so we don't have to here. + // + // vmem_alloc may take some time to return (especially for VM_SLEEP + // allocations where we did not take the vm_canalloc(bvmp...) break out + // of the for loop). Therefore, if we didn't enter the for loop at all + // because waiters was 0 when we entered this function, + // subsequent callers will enter the for loop. + + void *m = vmem_alloc(bvmp, size, vmflags); + + // allow another vmem_canalloc() through for this bucket + // by atomically turning off the appropriate bit + + /* + * Except clang+llvm DTRT because of _Atomic, could be written as: + * __c11_atomic_fetch_and(&buckets_busy_allocating, + * ~bucket_bit, __ATOMIC_SEQ_CST); + * + * On processors with more relaxed memory models, it might be + * more efficient to do so with release semantics here, and + * in the atomic |= above, with acquire semantics in the bit tests, + * but on the other hand it may be hard to do better than clang+llvm. + */ + + buckets_busy_allocating &= ~bucket_bit; + + if (local_hipriority_allocator) + hipriority_allocators--; + + // if we got an allocation, wake up the arena cv waiters + // to let them try to exit the for(;;) loop above and + // exit the cv_wait() in vmem_xalloc(vmp, ...) 
+ + if (m != NULL) { + cv_broadcast(&calling_arena->vm_cv); + } + + waiters--; + spl_vba_threads[bucket_number]--; + return (m); +} + +static void +vmem_bucket_free(vmem_t *null_vmp, void *vaddr, size_t size) +{ + vmem_t *calling_arena = spl_heap_arena; + + vmem_free(vmem_bucket_arena_by_size(size), vaddr, size); + + // wake up arena waiters to let them try an alloc + cv_broadcast(&calling_arena->vm_cv); +} + +static inline int64_t +vmem_bucket_arena_free(uint16_t bucket) +{ + VERIFY(bucket < VMEM_BUCKETS); + return ((int64_t)vmem_size_semi_atomic(vmem_bucket_arena[bucket], + VMEM_FREE)); +} + +static inline int64_t +vmem_bucket_arena_used(int bucket) +{ + VERIFY(bucket < VMEM_BUCKETS); + return ((int64_t)vmem_size_semi_atomic(vmem_bucket_arena[bucket], + VMEM_ALLOC)); +} + + +int64_t +vmem_buckets_size(int typemask) +{ + int64_t total_size = 0; + + for (int i = 0; i < VMEM_BUCKETS; i++) { + int64_t u = vmem_bucket_arena_used(i); + int64_t f = vmem_bucket_arena_free(i); + if (typemask & VMEM_ALLOC) + total_size += u; + if (typemask & VMEM_FREE) + total_size += f; + } + if (total_size < 0) + total_size = 0; + + return ((size_t)total_size); +} + +static uint64_t +spl_validate_bucket_span_size(uint64_t val) +{ + if (!ISP2(val)) { + printf("SPL: %s: WARNING %llu is not a power of two, " + "not changing.\n", __func__, val); + return (0); + } + if (val < 128ULL*1024ULL || val > 16ULL*1024ULL*1024ULL) { + printf("SPL: %s: WARNING %llu is out of range [128k - 16M], " + "not changing.\n", __func__, val); + return (0); + } + return (val); +} + +static inline void +spl_modify_bucket_span_size(int bucket, uint64_t size) +{ + vmem_t *bvmp = vmem_bucket_arena[bucket]; + + mutex_enter(&bvmp->vm_lock); + bvmp->vm_min_import = size; + mutex_exit(&bvmp->vm_lock); +} + +static inline void +spl_modify_bucket_array() +{ + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + // i = 12, bucket = 0, contains allocs from 8192 to 16383 bytes, + // and should never ask xnu for < 16384 bytes, so as to avoid + // asking xnu for a non-power-of-two size. 
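+		// Concretely, the span size applied below is
+		// MAX(spl_bucket_tunable_{small,large}_span,
+		// 2 * bucket_alloc_minimum_size), so a tunable that has been
+		// set too low can never cause a bucket to request a span
+		// from xnu smaller than twice its own power-of-two label.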
+		const int bucket = i - VMEM_BUCKET_LOWBIT;
+		const uint32_t bucket_alloc_minimum_size = 1UL << (uint32_t)i;
+		const uint32_t bucket_parent_alloc_minimum_size =
+		    bucket_alloc_minimum_size * 2UL;
+
+		switch (i) {
+			// see vmem_init() below for details
+		case 16:
+		case 17:
+			spl_modify_bucket_span_size(bucket,
+			    MAX(spl_bucket_tunable_small_span,
+			    bucket_parent_alloc_minimum_size));
+			break;
+		default:
+			spl_modify_bucket_span_size(bucket,
+			    MAX(spl_bucket_tunable_large_span,
+			    bucket_parent_alloc_minimum_size));
+			break;
+		}
+	}
+}
+
+static inline void
+spl_printf_bucket_span_sizes(void)
+{
+	// this doesn't have to be super-exact
+	printf("SPL: %s: ", __func__);
+	for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) {
+		int bnum = i - VMEM_BUCKET_LOWBIT;
+		vmem_t *bvmp = vmem_bucket_arena[bnum];
+		printf("%llu ", (uint64_t)bvmp->vm_min_import);
+	}
+	printf("\n");
+}
+
+static inline void
+spl_set_bucket_spans(uint64_t l, uint64_t s)
+{
+	if (spl_validate_bucket_span_size(l) &&
+	    spl_validate_bucket_span_size(s)) {
+		atomic_swap_64(&spl_bucket_tunable_large_span, l);
+		atomic_swap_64(&spl_bucket_tunable_small_span, s);
+		spl_modify_bucket_array();
+	}
+}
+
+void
+spl_set_bucket_tunable_large_span(uint64_t size)
+{
+	uint64_t s = 0;
+
+	mutex_enter(&vmem_xnu_alloc_lock);
+	atomic_swap_64(&s, spl_bucket_tunable_small_span);
+	spl_set_bucket_spans(size, s);
+	mutex_exit(&vmem_xnu_alloc_lock);
+
+	spl_printf_bucket_span_sizes();
+}
+
+void
+spl_set_bucket_tunable_small_span(uint64_t size)
+{
+	uint64_t l = 0;
+
+	mutex_enter(&vmem_xnu_alloc_lock);
+	atomic_swap_64(&l, spl_bucket_tunable_large_span);
+	spl_set_bucket_spans(l, size);
+	mutex_exit(&vmem_xnu_alloc_lock);
+
+	spl_printf_bucket_span_sizes();
+}
+
+static void *
+spl_vmem_default_alloc(vmem_t *vmp, size_t size, int vmflags)
+{
+	extern void *osif_malloc(uint64_t);
+	return (osif_malloc(size));
+}
+
+static void
+spl_vmem_default_free(vmem_t *vmp, void *vaddr, size_t size)
+{
+	extern void osif_free(void *, uint64_t);
+	osif_free(vaddr, size);
+}
+
+vmem_t *
+vmem_init(const char *heap_name,
+    void *heap_start, size_t heap_size, size_t heap_quantum,
+    void *(*heap_alloc)(vmem_t *, size_t, int),
+    void (*heap_free)(vmem_t *, void *, size_t))
+{
+	uint32_t id;
+	int nseg = VMEM_SEG_INITIAL;
+	vmem_t *heap;
+
+	// XNU mutexes need initialisation
+	mutex_init(&vmem_list_lock, "vmem_list_lock", MUTEX_DEFAULT,
+	    NULL);
+	mutex_init(&vmem_segfree_lock, "vmem_segfree_lock", MUTEX_DEFAULT,
+	    NULL);
+	mutex_init(&vmem_sleep_lock, "vmem_sleep_lock", MUTEX_DEFAULT,
+	    NULL);
+	mutex_init(&vmem_nosleep_lock, "vmem_nosleep_lock", MUTEX_DEFAULT,
+	    NULL);
+	mutex_init(&vmem_pushpage_lock, "vmem_pushpage_lock", MUTEX_DEFAULT,
+	    NULL);
+	mutex_init(&vmem_panic_lock, "vmem_panic_lock", MUTEX_DEFAULT,
+	    NULL);
+	mutex_init(&vmem_xnu_alloc_lock, "vmem_xnu_alloc_lock", MUTEX_DEFAULT,
+	    NULL);
+
+	while (--nseg >= 0)
+		vmem_putseg_global(&vmem_seg0[nseg]);
+
+	/*
+	 * On OSX we ultimately have to use the OS allocator
+	 * as the source and sink of memory as it is allocated
+	 * and freed.
+	 *
+	 * The spl_default_arena_parent is needed in order to provide a
+	 * base arena with an always-NULL afunc and ffunc in order to
+	 * end the searches done by vmem_[x]alloc and vmem_xfree; it
+	 * serves no other purpose; its stats will always be zero.
+	 *
+	 */
+
+	// id 0
+	spl_default_arena_parent = vmem_create("spl_default_arena_parent",
+	    NULL, 0, heap_quantum, NULL, NULL, NULL, 0, VM_SLEEP);
+
+	// illumos/openzfs has a gigantic pile of memory that it can use
+	// for its first arena;
+	// o3x is not so lucky, so we start with this
+	static char initial_default_block[16ULL*1024ULL*1024ULL]
+	    __attribute__((aligned(4096))) = { 0 };
+
+	// The default arena is very low-bandwidth; it supplies the initial
+	// large allocation for the heap arena below, and it serves as the
+	// parent of the vmem_metadata arena. It will typically do only 2
+	// or 3 parent_alloc calls (to spl_vmem_default_alloc) in total.
+
+	spl_default_arena = vmem_create("spl_default_arena", // id 1
+	    initial_default_block, 16ULL*1024ULL*1024ULL,
+	    heap_quantum, spl_vmem_default_alloc, spl_vmem_default_free,
+	    spl_default_arena_parent, 16ULL*1024ULL*1024ULL,
+	    VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE);
+
+	VERIFY(spl_default_arena != NULL);
+
+	// The bucket arenas satisfy allocations & frees from the bucket heap
+	// that are dispatched to the bucket whose power-of-two label is the
+	// smallest allocation that vmem_bucket_alloc() will ask for.
+	//
+	// The bucket arenas in turn exchange memory with XNU's allocator/freer
+	// in large spans (~ 1 MiB is stable on all systems but creates bucket
+	// fragmentation)
+	//
+	// Segregating by size constrains internal fragmentation within the
+	// bucket and provides kstat.vmem visibility and span-size policy to
+	// be applied to particular buckets (notably the sources of most
+	// allocations, see the comments below)
+	//
+	// For VMEM_BUCKET_HIBIT == 12,
+	// vmem_bucket_arena[n] holds allocations from 2^[n+11]+1 to 2^[n+12],
+	// so for [n] = 0, 2049-4096, for [n]=5 65537-131072,
+	// for [n]=7 (256k+1)-512k
+	//
+	// so "kstat.vmem.vmem.bucket_1048576" should be read as the bucket
+	// arena containing allocations 1 MiB and smaller, but larger
+	// than 512 kiB.
+
+	// create arenas for the VMEM_BUCKETS, id 2 - id 14
+
+	extern uint64_t real_total_memory;
+	VERIFY3U(real_total_memory, >=, 1024ULL*1024ULL*1024ULL);
+
+	// adjust minimum bucket span size for memory size
+	// see comments in the switch below
+	// large span: 1 MiB and bigger on large-memory (> 32 GiB) systems
+	// small span: 256 kiB and bigger on large-memory systems
+	const uint64_t k = 1024ULL;
+	const uint64_t qm = 256ULL * k;
+	const uint64_t m = 1024ULL * k;
+	const uint64_t big = MAX(real_total_memory / (k * 32ULL), m);
+	const uint64_t small = MAX(real_total_memory / (k * 128ULL), qm);
+	spl_bucket_tunable_large_span = MIN(big, 16ULL * m);
+	spl_bucket_tunable_small_span = small;
+	printf("SPL: %s: real_total_memory %llu, large spans %llu, small "
+	    "spans %llu\n", __func__, real_total_memory,
+	    spl_bucket_tunable_large_span, spl_bucket_tunable_small_span);
+	char *buf = vmem_alloc(spl_default_arena, VMEM_NAMELEN + 21, VM_SLEEP);
+	for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) {
+		size_t minimum_allocsize = 0;
+		const uint64_t bucket_largest_size = (1ULL << (uint64_t)i);
+		(void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%llu",
+		    "bucket", bucket_largest_size);
+		dprintf("SPL: %s creating arena %s (i == %d)\n", __func__, buf,
+		    i);
+		switch (i) {
+		case 15:
+		case 16:
+			/*
+			 * With the arrival of abd, the 2^15 (== 32768) and 2^16
+			 * buckets are by far the most busy, holding
+			 * respectively the qcache spans of kmem_va (the
+			 * kmem_alloc et al.
heap) and zfs_qcache (notably the + * source for the abd_chunk arena) + * + * The lifetime of early (i.e., after import and mount) + * allocations can be highly variable, leading + * to persisting fragmentation from the first eviction + * after arc has grown large. This can happen if, for + * example, there substantial import and mounting (and + * mds/mdworker and backupd scanning) activity before a + * user logs in and starts demanding memory in userland + * (e.g. by firing up a browser or mail app). + * + * Crucially, this makes it difficult to give back + * memory to xnu without holding the ARC size down for + * long periods of time. + * + * We can mitigate this by exchanging smaller + * amounts of memory with xnu for these buckets. + * There are two downsides: xnu's memory + * freelist will be prone to greater + * fragmentation, which will affect all + * allocation and free activity using xnu's + * allocator including kexts other than our; and + * we are likely to have more waits in the throttled + * alloc function, as more threads are likely to require + * slab importing into the kmem layer and fewer threads + * can be satisfied by a small allocation vs a large + * one. + * + * The import sizes are sysadmin-tunable by setting + * kstat.spl.misc.spl_misc.spl_tunable_small_span + * to a power-of-two number of bytes in zsysctl.conf + * should a sysadmin prefer non-early allocations to + * be larger or smaller depending on system performance + * and workload. + * + * However, a zfs booting system must use the defaults + * here for the earliest allocations, therefore they. + * should be only large enough to protect system + * performance if the sysadmin never changes the tunable + * span sizes. + */ + minimum_allocsize = MAX(spl_bucket_tunable_small_span, + bucket_largest_size * 4); + break; + default: + /* + * These buckets are all relatively low bandwidth and + * with relatively uniform lifespans for most + * allocations (borrowed arc buffers dominate). + * They should be large enough that they do not + * pester xnu. + */ + minimum_allocsize = MAX(spl_bucket_tunable_large_span, + bucket_largest_size * 4); + break; + } + dprintf("SPL: %s setting bucket %d (%d) to size %llu\n", + __func__, i, (int)(1 << i), (uint64_t)minimum_allocsize); + const int bucket_number = i - VMEM_BUCKET_LOWBIT; + vmem_t *b = vmem_create(buf, NULL, 0, heap_quantum, + xnu_alloc_throttled, xnu_free_throttled, + spl_default_arena_parent, minimum_allocsize, + VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE | VMC_TIMEFREE); + VERIFY(b != NULL); + b->vm_min_import = minimum_allocsize; + b->vm_source = b; + vmem_bucket_arena[bucket_number] = b; + vmem_bucket_id_to_bucket_number[b->vm_id] = bucket_number; + } + + vmem_free(spl_default_arena, buf, VMEM_NAMELEN + 21); + // spl_heap_arena, the bucket heap, is the primary interface + // to the vmem system + + // all arenas not rooted to vmem_metadata will be rooted to + // spl_heap arena. + + spl_heap_arena = vmem_create("bucket_heap", // id 15 + NULL, 0, heap_quantum, + vmem_bucket_alloc, vmem_bucket_free, spl_default_arena_parent, 0, + VM_SLEEP | VMC_TIMEFREE | VMC_OLDFIRST); + + VERIFY(spl_heap_arena != NULL); + + // add a fixed-sized allocation to spl_heap_arena; this reduces the + // need to talk to the bucket arenas by a substantial margin + // (kstat.vmem.vmem.bucket_heap.{alloc+free} is much greater than + // kstat.vmem.vmem.bucket_heap.parent_{alloc+free}, and improves with + // increasing initial fixed allocation size. 
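A quick worked example of the default span formulas used just above (editorial arithmetic only, reusing the hunk's own expressions): on an 8 GiB machine the large span is MIN(MAX(8 GiB / 32768, 1 MiB), 16 MiB) = 1 MiB and the small span is MAX(8 GiB / 131072, 256 kiB) = 256 kiB; on a 64 GiB machine they come out to 2 MiB and 512 kiB respectively.

#include <stdint.h>
#include <stdio.h>

#define EX_MAX(a, b)    ((a) > (b) ? (a) : (b))
#define EX_MIN(a, b)    ((a) < (b) ? (a) : (b))

int
main(void)
{
    const uint64_t k = 1024ULL, qm = 256ULL * k, m = 1024ULL * k;
    const uint64_t mems[] = { 8ULL << 30, 16ULL << 30, 64ULL << 30 };

    for (int i = 0; i < 3; i++) {
        uint64_t big = EX_MAX(mems[i] / (k * 32ULL), m);
        uint64_t small = EX_MAX(mems[i] / (k * 128ULL), qm);

        printf("%llu GiB: large %llu kiB, small %llu kiB\n",
            (unsigned long long)(mems[i] >> 30),
            (unsigned long long)(EX_MIN(big, 16ULL * m) / k),
            (unsigned long long)(small / k));
    }
    return (0);
}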
+ + const size_t mib = 1024ULL * 1024ULL; + const size_t gib = 1024ULL * mib; + size_t resv_size = 128ULL * mib; + extern uint64_t real_total_memory; + + if (real_total_memory >= 4ULL * gib) + resv_size = 256ULL * mib; + if (real_total_memory >= 8ULL * gib) + resv_size = 512ULL * mib; + if (real_total_memory >= 16ULL * gib) + resv_size = gib; + + printf("SPL: %s adding fixed allocation of %llu to the bucket_heap\n", + __func__, (uint64_t)resv_size); + + spl_heap_arena_initial_alloc = vmem_add(spl_heap_arena, + vmem_alloc(spl_default_arena, resv_size, VM_SLEEP), + resv_size, VM_SLEEP); + + VERIFY(spl_heap_arena_initial_alloc != NULL); + + spl_heap_arena_initial_alloc_size = resv_size; + + // kstat.vmem.vmem.heap : kmem_cache_alloc() and similar calls + // to handle in-memory datastructures other than arc and zio buffers. + + heap = vmem_create(heap_name, // id 16 + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, spl_heap_arena, 0, + VM_SLEEP); + + VERIFY(heap != NULL); + + // Root all the low bandwidth metadata arenas to the default arena. + // The vmem_metadata allocations will all be 32 kiB or larger, + // and the total allocation will generally cap off around 24 MiB. + + vmem_metadata_arena = vmem_create("vmem_metadata", // id 17 + NULL, 0, heap_quantum, vmem_alloc, vmem_free, spl_default_arena, + 8 * PAGESIZE, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); + + VERIFY(vmem_metadata_arena != NULL); + + vmem_seg_arena = vmem_create("vmem_seg", // id 18 + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP | VMC_POPULATOR); + + VERIFY(vmem_seg_arena != NULL); + + vmem_hash_arena = vmem_create("vmem_hash", // id 19 + NULL, 0, 8, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP); + + VERIFY(vmem_hash_arena != NULL); + + vmem_vmem_arena = vmem_create("vmem_vmem", // id 20 + vmem0, sizeof (vmem0), 1, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP); + + VERIFY(vmem_vmem_arena != NULL); + + // 21 (0-based) vmem_create before this line. - macroized + // NUMBER_OF_ARENAS_IN_VMEM_INIT + for (id = 0; id < vmem_id; id++) { + (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t), + 1, 0, 0, &vmem0[id], &vmem0[id + 1], + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + } + + printf("SPL: starting vmem_update() thread\n"); + vmem_update(NULL); + + return (heap); +} + +struct free_slab { + vmem_t *vmp; + size_t slabsize; + void *slab; + list_node_t next; +}; +static list_t freelist; + +static void vmem_fini_freelist(void *vmp, void *start, size_t size) +{ + struct free_slab *fs; + + MALLOC(fs, struct free_slab *, sizeof (struct free_slab), M_TEMP, + M_WAITOK); + fs->vmp = vmp; + fs->slabsize = size; + fs->slab = start; + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); +} + +void +vmem_free_span_list(void) +{ + int total = 0; + int total_count = 0; + struct free_slab *fs; + int release = 1; + + while ((fs = list_head(&freelist))) { + total_count++; + total += fs->slabsize; + list_remove(&freelist, fs); + for (int id = 0; id < VMEM_INITIAL; id++) { + if (&vmem0[id] == fs->slab) { + release = 0; + break; + } + } + if (release) + fs->vmp->vm_source_free(fs->vmp, fs->slab, + fs->slabsize); + release = 1; + FREE(fs, M_TEMP); + } +} + +static void +vmem_fini_void(void *vmp, void *start, uint32_t size) +{ +} + +void +vmem_fini(vmem_t *heap) +{ + struct free_slab *fs; + uint64_t total; + + bsd_untimeout(vmem_update, NULL); + + printf("SPL: %s: stopped vmem_update. 
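The vmem_fini_freelist()/vmem_free_span_list() pair above implements a collect-then-release pattern: the walk callback may not free spans directly, so it only records them on a list that is drained afterwards (the real code additionally skips spans that point into the static vmem0[] array). A minimal userland sketch of the same pattern, using a plain singly linked list instead of the SPL list_t/MALLOC facilities, follows; all names here are editorial.

#include <stdlib.h>

struct ex_span {
    void           *addr;
    size_t          size;
    struct ex_span *next;
};

static struct ex_span *ex_head;

/* walk callback: only record the span, never free it here */
static void
ex_collect(void *addr, size_t size)
{
    struct ex_span *s = malloc(sizeof (*s));

    s->addr = addr;
    s->size = size;
    s->next = ex_head;
    ex_head = s;
}

/* drained after the walk completes, when it is safe to release spans */
static void
ex_release_all(void (*release)(void *, size_t))
{
    while (ex_head != NULL) {
        struct ex_span *s = ex_head;

        ex_head = s->next;
        release(s->addr, s->size);
        free(s);
    }
}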
Creating list and walking " + "arenas.\n", __func__); + + /* Create a list of slabs to free by walking the list of allocs */ + list_create(&freelist, sizeof (struct free_slab), + offsetof(struct free_slab, next)); + + /* Walk to list of allocations */ + + /* + * walking with VMEM_REENTRANT causes segment consolidation and + * freeing of spans the freelist contains a list of segments that + * are still allocated at the time of the walk; unfortunately the + * lists cannot be exact without complex multiple passes, locking, + * and a more complex vmem_fini_freelist(). + * + * Walking without VMEM_REENTRANT can produce a nearly-exact list + * of unfreed spans, which Illumos would then free directly after + * the list is complete. + * + * Unfortunately in O3X, that lack of exactness can lead to a panic + * caused by attempting to free to xnu memory that we already freed + * to xnu. Fortunately, we can get a sense of what would have been + * destroyed after the (non-reentrant) walking, and we printf that + * at the end of this function. + */ + + // Walk all still-alive arenas from leaves to the root + + vmem_walk(heap, VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, heap); + + vmem_walk(heap, VMEM_ALLOC, vmem_fini_freelist, heap); + + vmem_free_span_list(); + dprintf("\nSPL: %s destroying heap\n", __func__); + vmem_destroy(heap); // PARENT: spl_heap_arena + + printf("SPL: %s: walking spl_heap_arena, aka bucket_heap (pass 1)\n", + __func__); + + vmem_walk(spl_heap_arena, VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, + spl_heap_arena); + + printf("SPL: %s: calling vmem_xfree(spl_default_arena, ptr, %llu);\n", + __func__, (uint64_t)spl_heap_arena_initial_alloc_size); + + // forcibly remove the initial alloc from spl_heap_arena arena, whether + // or not it is empty. below this point, any activity on + // spl_default_arena other than a non-reentrant(!) walk and a destroy + // is unsafe (UAF or MAF). + // However, all the children of spl_heap_arena should now be destroyed. + + vmem_xfree(spl_default_arena, spl_heap_arena_initial_alloc, + spl_heap_arena_initial_alloc_size); + + printf("SPL: %s: walking spl_heap_arena, aka bucket_heap (pass 2)\n", + __func__); + + vmem_walk(spl_heap_arena, VMEM_ALLOC, vmem_fini_freelist, + spl_heap_arena); + vmem_free_span_list(); + + printf("SPL: %s: walking bucket arenas...\n", __func__); + + for (int i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + const int bucket = i - VMEM_BUCKET_LOWBIT; + vmem_walk(vmem_bucket_arena[bucket], + VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, + vmem_bucket_arena[bucket]); + + vmem_walk(vmem_bucket_arena[bucket], VMEM_ALLOC, + vmem_fini_freelist, vmem_bucket_arena[bucket]); + } + vmem_free_span_list(); + + dprintf("SPL: %s destroying spl_bucket_arenas...", __func__); + for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + vmem_t *vmpt = vmem_bucket_arena[i - VMEM_BUCKET_LOWBIT]; + dprintf(" %llu", (1ULL << i)); + vmem_destroy(vmpt); // parent: spl_default_arena_parent + } + dprintf("\n"); + + printf("SPL: %s: walking vmem metadata-related arenas...\n", __func__); + + vmem_walk(vmem_vmem_arena, VMEM_ALLOC | VMEM_REENTRANT, + vmem_fini_void, vmem_vmem_arena); + + vmem_walk(vmem_vmem_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_vmem_arena); + + vmem_free_span_list(); + + // We should not do VMEM_REENTRANT on vmem_seg_arena or + // vmem_hash_arena or below to avoid causing work in + // vmem_seg_arena and vmem_hash_arena. 
+ + vmem_walk(vmem_seg_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_seg_arena); + + vmem_free_span_list(); + + vmem_walk(vmem_hash_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_hash_arena); + vmem_free_span_list(); + + vmem_walk(vmem_metadata_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_metadata_arena); + + vmem_free_span_list(); + dprintf("SPL: %s walking the root arena (spl_default_arena)...\n", + __func__); + + vmem_walk(spl_default_arena, VMEM_ALLOC, + vmem_fini_freelist, spl_default_arena); + + vmem_free_span_list(); + + dprintf("SPL: %s destroying bucket heap\n", __func__); + // PARENT: spl_default_arena_parent (but depends on buckets) + vmem_destroy(spl_heap_arena); + + // destroying the vmem_vmem arena and any arena afterwards + // requires the use of vmem_destroy_internal(), which does + // not talk to vmem_vmem_arena like vmem_destroy() does. + // dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); + // vmem_destroy_internal(vmem_vmem_arena); + // parent: vmem_metadata_arena + + // destroying the seg arena means we must no longer + // talk to vmem_populate() + dprintf("SPL: %s destroying vmem_seg_arena\n", __func__); + vmem_destroy(vmem_seg_arena); + + // vmem_hash_arena may be freed-to in vmem_destroy_internal() + // so it should be just before the vmem_metadata_arena. + dprintf("SPL: %s destroying vmem_hash_arena\n", __func__); + vmem_destroy(vmem_hash_arena); // parent: vmem_metadata_arena + vmem_hash_arena = NULL; + + // XXX: if we panic on unload below here due to destroyed mutex, + // vmem_init() will need some reworking (e.g. have + // vmem_metadata_arena talk directly to xnu), or alternatively a + // vmem_destroy_internal_internal() function that does not touch + // vmem_hash_arena will need writing. + + dprintf("SPL: %s destroying vmem_metadata_arena\n", __func__); + vmem_destroy(vmem_metadata_arena); // parent: spl_default_arena + + dprintf("\nSPL: %s destroying spl_default_arena\n", __func__); + vmem_destroy(spl_default_arena); // parent: spl_default_arena_parent + dprintf("\nSPL: %s destroying spl_default_arena_parant\n", __func__); + vmem_destroy(spl_default_arena_parent); + + dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); + vmem_destroy_internal(vmem_vmem_arena); + + printf("SPL: arenas removed, now try destroying mutexes... "); + + printf("vmem_xnu_alloc_lock "); + mutex_destroy(&vmem_xnu_alloc_lock); + printf("vmem_panic_lock "); + mutex_destroy(&vmem_panic_lock); + printf("vmem_pushpage_lock "); + mutex_destroy(&vmem_pushpage_lock); + printf("vmem_nosleep_lock "); + mutex_destroy(&vmem_nosleep_lock); + printf("vmem_sleep_lock "); + mutex_destroy(&vmem_sleep_lock); + printf("vmem_segfree_lock "); + mutex_destroy(&vmem_segfree_lock); + printf("vmem_list_lock "); + mutex_destroy(&vmem_list_lock); + + printf("\nSPL: %s: walking list of live slabs at time of call to %s\n", + __func__, __func__); + + // annoyingly, some of these should be returned to xnu, but + // we have no idea which have already been freed to xnu, and + // freeing a second time results in a panic. 
+ + /* Now release the list of allocs to built above */ + total = 0; + uint64_t total_count = 0; + while ((fs = list_head(&freelist))) { + total_count++; + total += fs->slabsize; + list_remove(&freelist, fs); + // extern void segkmem_free(vmem_t *, void *, size_t); + // segkmem_free(fs->vmp, fs->slab, fs->slabsize); + FREE(fs, M_TEMP); + } + printf("SPL: WOULD HAVE released %llu bytes (%llu spans) from arenas\n", + total, total_count); + list_destroy(&freelist); + printf("SPL: %s: Brief delay for readability...\n", __func__); + delay(hz); + printf("SPL: %s: done!\n", __func__); +} + +/* + * return true if inuse is much smaller than imported + */ +static inline bool +bucket_fragmented(const uint16_t bn, const uint64_t now) +{ + + // early during uptime, just let buckets grow. + + if (now < 600 * hz) + return (false); + + // if there has been no pressure in the past five minutes, + // then we will just let the bucket grow. + + const uint64_t timeout = 5ULL * 60ULL * hz; + + if (spl_free_last_pressure_wrapper() + timeout < now) + return (false); + + const vmem_t *vmp = vmem_bucket_arena[bn]; + + const int64_t imported = + (int64_t)vmp->vm_kstat.vk_mem_import.value.ui64; + const int64_t inuse = + (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + const int64_t tiny = 64LL*1024LL*1024LL; + const int64_t small = tiny * 2LL; // 128 M + const int64_t medium = small * 2LL; // 256 + const int64_t large = medium * 2LL; // 512 + const int64_t huge = large * 2LL; // 1 G + const int64_t super_huge = huge * 2LL; // 2 + + const int64_t amount_free = imported - inuse; + + if (amount_free <= tiny || imported <= small) + return (false); + + const int64_t percent_free = (amount_free * 100LL) / imported; + + if (percent_free > 75LL) { + return (true); + } else if (imported <= medium) { + return (percent_free >= 50); + } else if (imported <= large) { + return (percent_free >= 33); + } else if (imported <= huge) { + return (percent_free >= 25); + } else if (imported <= super_huge) { + return (percent_free >= 15); + } else { + return (percent_free >= 10); + } +} + +/* + * return true if the bucket for size is fragmented + */ +static inline bool +spl_arc_no_grow_impl(const uint16_t b, const size_t size, + const boolean_t buf_is_metadata, kmem_cache_t **kc) +{ + static _Atomic uint8_t frag_suppression_counter[VMEM_BUCKETS] = { 0 }; + + const uint64_t now = zfs_lbolt(); + + const bool fragmented = bucket_fragmented(b, now); + + if (fragmented) { + if (size < 32768) { + // Don't suppress small qcached blocks when the + // qcache size (bucket_262144) is fragmented, + // since they will push everything else towards + // the tails of ARC lists without eating up a large + // amount of space themselves. 
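To make the sliding thresholds in bucket_fragmented() concrete, here is a standalone copy of just the percentage logic with two sample points (editorial illustration; it omits the uptime and memory-pressure gates the real function applies first):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
example_fragmented(int64_t imported, int64_t inuse)
{
    const int64_t tiny = 64LL << 20, small = tiny * 2, medium = small * 2;
    const int64_t large = medium * 2, huge = large * 2, super_huge = huge * 2;
    const int64_t amount_free = imported - inuse;

    if (amount_free <= tiny || imported <= small)
        return (false);

    const int64_t percent_free = (amount_free * 100LL) / imported;

    if (percent_free > 75LL)
        return (true);
    if (imported <= medium)
        return (percent_free >= 50);
    if (imported <= large)
        return (percent_free >= 33);
    if (imported <= huge)
        return (percent_free >= 25);
    if (imported <= super_huge)
        return (percent_free >= 15);
    return (percent_free >= 10);
}

int
main(void)
{
    /* 1 GiB imported, 600 MiB in use: ~41% free >= 25% -> fragmented */
    assert(example_fragmented(1LL << 30, 600LL << 20));
    /* 1 GiB imported, 900 MiB in use: ~12% free < 25% -> not fragmented */
    assert(!example_fragmented(1LL << 30, 900LL << 20));
    return (0);
}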
+ return (false); + } + const uint32_t b_bit = (uint32_t)1 << (uint32_t)b; + spl_arc_no_grow_bits |= b_bit; + const uint32_t sup_at_least_every = MIN(b_bit, 255); + const uint32_t sup_at_most_every = MAX(b_bit, 16); + const uint32_t sup_every = MIN(sup_at_least_every, + sup_at_most_every); + if (frag_suppression_counter[b] >= sup_every) { + frag_suppression_counter[b] = 0; + return (true); + } else { + frag_suppression_counter[b]++; + return (false); + } + } else { + const uint32_t b_bit = (uint32_t)1 << (uint32_t)b; + spl_arc_no_grow_bits &= ~b_bit; + } + + extern bool spl_zio_is_suppressed(const size_t, const uint64_t, + const boolean_t, kmem_cache_t **); + + return (spl_zio_is_suppressed(size, now, buf_is_metadata, kc)); +} + +static inline uint16_t +vmem_bucket_number_arc_no_grow(const size_t size) +{ + // qcaching on arc + if (size < 128*1024) + return (vmem_bucket_number(262144)); + else + return (vmem_bucket_number(size)); +} + +boolean_t +spl_arc_no_grow(size_t size, boolean_t buf_is_metadata, kmem_cache_t **zp) +{ + const uint16_t b = vmem_bucket_number_arc_no_grow(size); + + const bool rv = spl_arc_no_grow_impl(b, size, buf_is_metadata, zp); + + if (rv) { + atomic_inc_64(&spl_arc_no_grow_count); + } + + return ((boolean_t)rv); +} diff --git a/module/os/macos/spl/spl-vnode.c b/module/os/macos/spl/spl-vnode.c new file mode 100644 index 0000000000..df69fdf667 --- /dev/null +++ b/module/os/macos/spl/spl-vnode.c @@ -0,0 +1,496 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
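The suppression counter above turns a "fragmented" verdict into a duty cycle rather than a hard answer: with sup_every = MIN(MIN(1 << b, 255), MAX(1 << b, 16)), which works out to MIN(1 << b, 255), a fragmented bucket reports no-grow roughly once in sup_every queries, so higher-numbered buckets signal it less frequently, capped at about one in 255. The following editorial snippet just prints that effective period per bucket index:

#include <stdint.h>
#include <stdio.h>

#define EX_MIN(a, b)    ((a) < (b) ? (a) : (b))
#define EX_MAX(a, b)    ((a) > (b) ? (a) : (b))

int
main(void)
{
    for (uint32_t b = 0; b < 13; b++) {
        uint32_t b_bit = (uint32_t)1 << b;
        uint32_t sup_every = EX_MIN(EX_MIN(b_bit, 255U),
            EX_MAX(b_bit, 16U));

        printf("bucket %2u: no-grow about 1 in %u\n", b, sup_every);
    }
    return (0);
}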
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +int +vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, + struct vnode **vpp, enum create crwhy, mode_t umask) +{ + vfs_context_t vctx; + int fmode; + int error; + + fmode = filemode; + if (crwhy) + fmode |= O_CREAT; + // TODO I think this should be 'fmode' instead of 'filemode' + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_open(pnamep, filemode, createmode, 0, vpp, vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +int +vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, + struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp) +{ + char *path; + int pathlen = MAXPATHLEN; + int error; + + path = (char *)kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + error = vn_getpath(startvp, path, &pathlen); + if (error == 0) { + strlcat(path, pnamep, MAXPATHLEN); + error = vn_open(path, seg, filemode, createmode, vpp, crwhy, + umask); + } + + kmem_free(path, MAXPATHLEN); + return (error); +} + +extern errno_t vnode_rename(const char *, const char *, int, vfs_context_t); + +errno_t +vnode_rename(const char *from, const char *to, int flags, vfs_context_t vctx) +{ + /* + * We need proper KPI changes to be able to safely update + * the zpool.cache file. For now, we return EPERM. + */ + return (EPERM); +} + +int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + + error = vnode_rename(from, to, 0, vctx); + + (void) vfs_context_rele(vctx); + + return (error); +} + +extern errno_t vnode_remove(const char *, int, enum vtype, vfs_context_t); + +errno_t +vnode_remove(const char *name, int flag, enum vtype type, vfs_context_t vctx) +{ + /* + * Now that zed ZFS Event Daemon can handle the rename of zpool.cache + * we will silence this limitation, and look in zed.d/config.sync.sh + */ + return (EPERM); +} + + +int +vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) +{ + vfs_context_t vctx; + enum vtype type; + int error; + + type = dirflag == RMDIRECTORY ? 
VDIR : VREG; + + vctx = vfs_context_create((vfs_context_t)0); + + error = vnode_remove(fnamep, 0, type, vctx); + + (void) vfs_context_rele(vctx); + + return (error); +} + +int +VOP_SPACE(struct vnode *vp, int cmd, struct flock *fl, int flags, offset_t off, + cred_t *cr, void *ctx) +{ + int error = 0; +#ifdef F_PUNCHHOLE + if (cmd == F_FREESP) { + fpunchhole_t fpht; + fpht.fp_flags = 0; + fpht.fp_offset = fl->l_start; + fpht.fp_length = fl->l_len; + if (vnode_getwithref(vp) == 0) { + error = VNOP_IOCTL(vp, F_PUNCHHOLE, (caddr_t)&fpht, 0, + ctx); + (void) vnode_put(vp); + } + } +#endif + return (error); +} + +int +VOP_CLOSE(struct vnode *vp, int flag, int count, offset_t off, + void *cr, void *k) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_close(vp, flag & FWRITE, vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +int +VOP_FSYNC(struct vnode *vp, int flags, void* unused, void *uused2) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + error = VNOP_FSYNC(vp, (flags == FSYNC), vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +int +VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, void *x3, void *x4) +{ + vfs_context_t vctx; + int error; + + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_getattr(vp, vap, vctx); + (void) vfs_context_rele(vctx); + return (error); +} + +errno_t VNOP_LOOKUP(struct vnode *, struct vnode **, + struct componentname *, vfs_context_t); + +errno_t +VOP_LOOKUP(struct vnode *vp, struct vnode **vpp, + struct componentname *cn, vfs_context_t ct) +{ + return (VNOP_LOOKUP(vp, vpp, cn, ct)); +} + +#undef VFS_ROOT + +extern int VFS_ROOT(mount_t, struct vnode **, vfs_context_t); +int +spl_vfs_root(mount_t mount, struct vnode **vp) +{ + return (VFS_ROOT(mount, vp, vfs_context_current())); +} + +void +vfs_mountedfrom(struct mount *vfsp, char *osname) +{ + (void) copystr(osname, vfs_statfs(vfsp)->f_mntfromname, MNAMELEN - 1, + 0); +} + +static kmutex_t spl_getf_lock; +static list_t spl_getf_list; + +int +spl_vnode_init(void) +{ + mutex_init(&spl_getf_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&spl_getf_list, sizeof (struct spl_fileproc), + offsetof(struct spl_fileproc, f_next)); + return (0); +} + +void +spl_vnode_fini(void) +{ + mutex_destroy(&spl_getf_lock); + list_destroy(&spl_getf_list); +} + +#include +struct fileproc; + +extern int fp_drop(struct proc *p, int fd, struct fileproc *fp, int locked); +extern int fp_drop_written(struct proc *p, int fd, struct fileproc *fp, + int locked); +extern int fp_lookup(struct proc *p, int fd, struct fileproc **resultfp, + int locked); +extern int fo_read(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +extern int fo_write(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +extern int file_vnode_withvid(int, struct vnode **, uint32_t *); +extern int file_drop(int); + +/* + * getf(int fd) - hold a lock on a file descriptor, to be released by calling + * releasef(). On OSX we will also look up the vnode of the fd for calls + * to spl_vn_rdwr(). + */ +void * +getf(int fd) +{ + struct fileproc *fp = NULL; + struct spl_fileproc *sfp = NULL; + struct vnode *vp; + uint32_t vid; + + /* + * We keep the "fp" pointer as well, both for unlocking in releasef() + * and used in vn_rdwr(). 
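For context, a typical caller pairs getf()/releasef() around spl_vn_rdwr() as sketched below. This is an editorial illustration against the prototypes defined in this file, not part of the patch; it assumes the SPL headers are in scope and that the usual SPL conveniences kcred and RLIM64_INFINITY are available, and it reduces error handling to a single return code.

/* editorial usage sketch: hold the descriptor, do I/O, release it */
static int
example_read_fd(int fd, void *buf, ssize_t len)
{
    struct spl_fileproc *sfp = getf(fd);
    ssize_t resid = 0;
    int error;

    if (sfp == NULL)
        return (EBADF);

    error = spl_vn_rdwr(UIO_READ, sfp, (caddr_t)buf, len,
        0 /* offset */, UIO_SYSSPACE, 0 /* ioflag */,
        RLIM64_INFINITY, kcred, &resid);

    releasef(fd);
    return (error);
}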
+ */ + + sfp = kmem_alloc(sizeof (*sfp), KM_SLEEP); + if (!sfp) + return (NULL); + + if (fp_lookup(current_proc(), fd, &fp, 0 /* !locked */)) { + kmem_free(sfp, sizeof (*sfp)); + return (NULL); + } + + printf("current_proc %p: fd %d fp %p vp %p\n", current_proc(), + fd, fp, vp); + + sfp->f_vnode = vp; + sfp->f_fd = fd; + sfp->f_offset = 0; + sfp->f_proc = current_proc(); + sfp->f_fp = fp; + + /* Also grab vnode, so we can fish out the minor, for onexit */ + if (!file_vnode_withvid(fd, &vp, &vid)) { + sfp->f_vnode = vp; + if (vnode_vtype(vp) != VDIR) { + sfp->f_file = minor(vnode_specrdev(vp)); + } + file_drop(fd); + } + + mutex_enter(&spl_getf_lock); + list_insert_tail(&spl_getf_list, sfp); + mutex_exit(&spl_getf_lock); + + return (sfp); +} + +struct vnode * +getf_vnode(void *fp) +{ + struct spl_fileproc *sfp = (struct spl_fileproc *)fp; + struct vnode *vp = NULL; + uint32_t vid; + + if (!file_vnode_withvid(sfp->f_fd, &vp, &vid)) { + file_drop(sfp->f_fd); + } + + return (vp); +} + +void +releasef(int fd) +{ + struct spl_fileproc *fp = NULL; + struct proc *p; + + p = current_proc(); + mutex_enter(&spl_getf_lock); + for (fp = list_head(&spl_getf_list); fp != NULL; + fp = list_next(&spl_getf_list, fp)) { + if ((fp->f_proc == p) && fp->f_fd == fd) break; + } + mutex_exit(&spl_getf_lock); + if (!fp) + return; // Not found + + if (fp->f_writes) + fp_drop_written(p, fd, fp->f_fp, 0 /* !locked */); + else + fp_drop(p, fd, fp->f_fp, 0 /* !locked */); + + /* Remove node from the list */ + mutex_enter(&spl_getf_lock); + list_remove(&spl_getf_list, fp); + mutex_exit(&spl_getf_lock); + + /* Free the node */ + kmem_free(fp, sizeof (*fp)); +} + +/* + * getf()/releasef() IO handler. + */ +int spl_vn_rdwr(enum uio_rw rw, struct spl_fileproc *sfp, + caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, + int ioflag, rlim64_t ulimit, cred_t *cr, ssize_t *residp) +{ + uio_t *auio; + int spacetype; + int error = 0; + vfs_context_t vctx; + + spacetype = UIO_SEG_IS_USER_SPACE(seg) ? UIO_USERSPACE32 : UIO_SYSSPACE; + + vctx = vfs_context_create((vfs_context_t)0); + auio = uio_create(1, 0, spacetype, rw); + uio_reset(auio, offset, spacetype, rw); + uio_addiov(auio, (uint64_t)(uintptr_t)base, len); + + if (rw == UIO_READ) { + error = fo_read(sfp->f_fp, auio, ioflag, vctx); + } else { + error = fo_write(sfp->f_fp, auio, ioflag, vctx); + } + + if (residp) { + *residp = uio_resid(auio); + } else { + if (uio_resid(auio) && error == 0) + error = EIO; + } + + uio_free(auio); + vfs_context_rele(vctx); + + return (error); +} + +/* Regular vnode vn_rdwr */ +int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, + offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, + cred_t *cr, ssize_t *residp) +{ + uio_t *auio; + int spacetype; + int error = 0; + vfs_context_t vctx; + + spacetype = UIO_SEG_IS_USER_SPACE(seg) ? 
UIO_USERSPACE32 : UIO_SYSSPACE; + + vctx = vfs_context_create((vfs_context_t)0); + auio = uio_create(1, 0, spacetype, rw); + uio_reset(auio, offset, spacetype, rw); + uio_addiov(auio, (uint64_t)(uintptr_t)base, len); + + if (rw == UIO_READ) { + error = VNOP_READ(vp, auio, ioflag, vctx); + } else { + error = VNOP_WRITE(vp, auio, ioflag, vctx); + } + + if (residp) { + *residp = uio_resid(auio); + } else { + if (uio_resid(auio) && error == 0) + error = EIO; + } + + uio_free(auio); + vfs_context_rele(vctx); + + return (error); +} + +void +spl_rele_async(void *arg) +{ + struct vnode *vp = (struct vnode *)arg; + if (vp) vnode_put(vp); +} + +void +vn_rele_async(struct vnode *vp, void *taskq) +{ + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)spl_rele_async, vp, TQ_SLEEP) != 0); +} + +vfs_context_t +spl_vfs_context_kernel(void) +{ + return (vfs_context_kernel()); +} + +#undef build_path +extern int build_path(struct vnode *vp, char *buff, int buflen, int *outlen, + int flags, vfs_context_t ctx); + +int spl_build_path(struct vnode *vp, char *buff, int buflen, int *outlen, + int flags, vfs_context_t ctx) +{ + return (build_path(vp, buff, buflen, outlen, flags, ctx)); +} + +/* + * vnode_notify was moved from KERNEL_PRIVATE to KERNEL in 10.11, but to be + * backward compatible, we keep the wrapper for now. + */ +extern int vnode_notify(struct vnode *, uint32_t, struct vnode_attr *); +int +spl_vnode_notify(struct vnode *vp, uint32_t type, struct vnode_attr *vap) +{ + return (vnode_notify(vp, type, vap)); +} + +extern int vfs_get_notify_attributes(struct vnode_attr *vap); +int +spl_vfs_get_notify_attributes(struct vnode_attr *vap) +{ + return (vfs_get_notify_attributes(vap)); +} + +/* Root directory vnode for the system a.k.a. '/' */ +/* + * Must use vfs_rootvnode() to acquire a reference, and + * vnode_put() to release it + */ + +extern struct vnode *rootvnode; + +struct vnode * +getrootdir(void) +{ + struct vnode *rvnode; + + // Unfortunately, Apple's vfs_rootvnode() fails to check for + // NULL rootvp, and just panics. We aren't technically allowed to + // see rootvp, but in the interest of avoiding a panic... + if (rootvnode == NULL) + return (NULL); + + rvnode = vfs_rootvnode(); + if (rvnode) + vnode_put(rvnode); + return (rvnode); +} diff --git a/module/os/macos/spl/spl-xdr.c b/module/os/macos/spl/spl-xdr.c new file mode 100644 index 0000000000..4eb115017d --- /dev/null +++ b/module/os/macos/spl/spl-xdr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include + + +/* + * SPL's XDR mem implementation. 
+ * + * This is used by libnvpair to serialize/deserialize the name-value pair data + * structures into byte arrays in a well-defined and portable manner. + * + * These data structures are used by the DMU/ZFS to flexibly manipulate various + * information in memory and later serialize it/deserialize it to disk. + * Examples of usages include the pool configuration, lists of pool and dataset + * properties, etc. + * + * Reference documentation for the XDR representation and XDR operations can be + * found in RFC 1832 and xdr(3), respectively. + * + * === Implementation shortcomings === + * + * It is assumed that the following C types have the following sizes: + * + * char/unsigned char: 1 byte + * short/unsigned short: 2 bytes + * int/unsigned int: 4 bytes + * longlong_t/u_longlong_t: 8 bytes + * + * The C standard allows these types to be larger (and in the case of ints, + * shorter), so if that is the case on some compiler/architecture, the build + * will fail (on purpose). + * + * If someone wants to fix the code to work properly on such environments, then: + * + * 1) Preconditions should be added to xdrmem_enc functions to make sure the + * caller doesn't pass arguments which exceed the expected range. + * 2) Functions which take signed integers should be changed to properly do + * sign extension. + * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger + * problems than this implementation. + * + * It is also assumed that: + * + * 1) Chars have 8 bits. + * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned + * memcpy, memset and memcmp. + * 3) Arrays passed to xdr_array() are packed and the compiler/architecture + * supports element-sized-aligned memory accesses. + * 4) Negative integers are natively stored in two's complement binary + * representation. + * + * No checks are done for the 4 assumptions above, though. + * + * === Caller expectations === + * + * Existing documentation does not describe the semantics of XDR operations very + * well. Therefore, some assumptions about failure semantics will be made and + * will be described below: + * + * 1) If any encoding operation fails (e.g., due to lack of buffer space), the + * the stream should be considered valid only up to the encoding operation + * previous to the one that first failed. However, the stream size as returned + * by xdr_control() cannot be considered to be strictly correct (it may be + * bigger). + * + * Putting it another way, if there is an encoding failure it's undefined + * whether anything is added to the stream in that operation and therefore + * neither xdr_control() nor future encoding operations on the same stream can + * be relied upon to produce correct results. + * + * 2) If a decoding operation fails, it's undefined whether anything will be + * decoded into passed buffers/pointers during that operation, or what the + * values on those buffers will look like. + * + * Future decoding operations on the same stream will also have similar + * undefined behavior. + * + * 3) When the first decoding operation fails it is OK to trust the results of + * previous decoding operations on the same stream, as long as the caller + * expects a failure to be possible (e.g. due to end-of-stream). + * + * However, this is highly discouraged because the caller should know the + * stream size and should be coded to expect any decoding failure to be data + * corruption due to hardware, accidental or even malicious causes, which should + * be handled gracefully in all cases. 
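As a concrete illustration of the stream usage these rules describe, a caller encodes into a flat buffer and later decodes from the same buffer. The sketch below is editorial: it reaches the ops through the x_ops table that this file installs, whereas real consumers normally go through the xdr_*() wrappers in the SPL rpc/xdr.h headers, which are assumed to be in scope here.

/* editorial sketch: round-trip one value through an xdrmem stream */
static int
example_xdr_roundtrip(void)
{
    char buf[16];
    XDR xe, xd;
    unsigned int in = 0xdeadbeef, out = 0;

    xdrmem_create(&xe, buf, sizeof (buf), XDR_ENCODE);
    if (!xe.x_ops->xdr_u_int(&xe, &in))
        return (-1);

    xdrmem_create(&xd, buf, sizeof (buf), XDR_DECODE);
    if (!xd.x_ops->xdr_u_int(&xd, &out))
        return (-1);

    return (out == in ? 0 : -1);
}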
+ * + * In very rare situations where there are strong reasons to believe the data + * can be trusted to be valid and non-tampered with, then the caller may assume + * a decoding failure to be a bug (e.g. due to mismatched data types) and may + * fail non-gracefully. + * + * 4) Non-zero padding bytes will cause the decoding operation to fail. + * + * 5) Zero bytes on string types will also cause the decoding operation to fail. + * + * 6) It is assumed that either the pointer to the stream buffer given by the + * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int + * memory accesses. + * + * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. + * + * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user + * space or MMIO space), the computer may explode. + */ + +static struct xdr_ops xdrmem_encode_ops; +static struct xdr_ops xdrmem_decode_ops; + +void +xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op) +{ + switch (op) { + case XDR_ENCODE: + xdrs->x_ops = &xdrmem_encode_ops; + break; + case XDR_DECODE: + xdrs->x_ops = &xdrmem_decode_ops; + break; + default: + printf("SPL: Invalid op value: %d\n", op); + xdrs->x_ops = NULL; /* Let the caller know we failed */ + return; + } + + xdrs->x_op = op; + xdrs->x_addr = addr; + xdrs->x_addr_end = addr + size; + + if (xdrs->x_addr_end < xdrs->x_addr) { + printf("SPL: Overflow while creating xdrmem: %p, %u\n", addr, + size); + xdrs->x_ops = NULL; + } +} +EXPORT_SYMBOL(xdrmem_create); + +static bool_t +xdrmem_control(XDR *xdrs, int req, void *info) +{ + struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info; + + if (req != XDR_GET_BYTES_AVAIL) { + printf("SPL: Called with unknown request: %d\n", req); + return (FALSE); + } + + rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ + rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; + + return (TRUE); +} + +static bool_t +xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(xdrs->x_addr, cp, cnt); + + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + memset(xdrs->x_addr, 0, pad); + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + static uint32_t zero = 0; + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(cp, xdrs->x_addr, cnt); + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + /* An inverted memchr() would be useful here... 
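The opaque-byte codecs above round every count up to a multiple of four and zero-fill the tail, so a 5-byte field occupies 8 bytes on the wire with 3 zero pad bytes. A small editorial demonstration of that rule in isolation:

#include <stdio.h>
#include <string.h>

/* mirror of the "round up to 4, zero-fill the tail" rule in xdrmem_enc_bytes() */
static size_t
example_xdr_opaque_size(size_t cnt)
{
    return ((cnt + 3) & ~(size_t)3);
}

int
main(void)
{
    unsigned char wire[8];
    const char payload[5] = "abcde";
    size_t size = example_xdr_opaque_size(sizeof (payload));

    memcpy(wire, payload, sizeof (payload));
    memset(wire + sizeof (payload), 0, size - sizeof (payload));
    printf("5 payload bytes occupy %zu wire bytes\n", size); /* 8 */
    return (0);
}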
*/ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return (FALSE); + + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *((uint32_t *)xdrs->x_addr) = BE_32(val); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *val = BE_32(*((uint32_t *)xdrs->x_addr)); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + // BUILD_BUG_ON(sizeof(char) != 1); + val = *((unsigned char *) cp); + + return (xdrmem_enc_uint32(xdrs, val)); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + // BUILD_BUG_ON(sizeof(char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. + */ + if (val > 0xff) + return (FALSE); + + *((unsigned char *) cp) = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + // BUILD_BUG_ON(sizeof(unsigned short) != 2); + + return (xdrmem_enc_uint32(xdrs, *usp)); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + // BUILD_BUG_ON(sizeof(unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return (FALSE); + + *usp = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + // BUILD_BUG_ON(sizeof(unsigned) != 4); + + return (xdrmem_enc_uint32(xdrs, *up)); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + // BUILD_BUG_ON(sizeof(unsigned) != 4); + + return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + // BUILD_BUG_ON(sizeof(u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return (FALSE); + + return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + // BUILD_BUG_ON(sizeof(u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return (FALSE); + if (!xdrmem_dec_uint32(xdrs, &low)) + return (FALSE); + + *ullp = ((u_longlong_t)high << 32) | low; + + return (TRUE); +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return (FALSE); + + if (!xdrmem_enc_uint(xdrs, sizep)) + return (FALSE); + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return (FALSE); + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return (FALSE); + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return (FALSE); + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, + * 
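The array codecs above guard the size * elsize multiplication with "size > UINT_MAX / elsize" before allocating or iterating, so a hostile element count cannot wrap the allocation size. The same idiom in isolation (editorial example):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* true if count * elsize would overflow an unsigned int */
static bool
example_mul_overflows(unsigned int count, unsigned int elsize)
{
    return (elsize != 0 && count > UINT_MAX / elsize);
}

int
main(void)
{
    printf("%d\n", example_mul_overflows(0x20000000U, 8U)); /* 1 */
    printf("%d\n", example_mul_overflows(1000U, 8U));       /* 0 */
    return (0);
}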
xdr_array() allocates memory and *arrp points to it". + */ + if (*arrp == NULL) { + // BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return (FALSE); + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return (FALSE); + } + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return (FALSE); + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return (FALSE); + + return (xdrmem_enc_bytes(xdrs, *sp, len)); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return (FALSE); + + if (size > maxsize || size > UINT_MAX - 1) + return (FALSE); + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". + */ + if (*sp == NULL) { + // BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); + + *sp = kmem_alloc(size + 1, KM_NOSLEEP); + if (*sp == NULL) + return (FALSE); + + alloc = TRUE; + } + + if (!xdrmem_dec_bytes(xdrs, *sp, size)) + goto fail; + + if (kmemchr(*sp, 0, size) != NULL) + goto fail; + + (*sp)[size] = '\0'; + + return (TRUE); + +fail: + if (alloc) + kmem_free(*sp, size + 1); + + return (FALSE); +} + +static struct xdr_ops xdrmem_encode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_enc_char, + .xdr_u_short = xdrmem_enc_ushort, + .xdr_u_int = xdrmem_enc_uint, + .xdr_u_longlong_t = xdrmem_enc_ulonglong, + .xdr_opaque = xdrmem_enc_bytes, + .xdr_string = xdr_enc_string, + .xdr_array = xdr_enc_array +}; + +static struct xdr_ops xdrmem_decode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_dec_char, + .xdr_u_short = xdrmem_dec_ushort, + .xdr_u_int = xdrmem_dec_uint, + .xdr_u_longlong_t = xdrmem_dec_ulonglong, + .xdr_opaque = xdrmem_dec_bytes, + .xdr_string = xdr_dec_string, + .xdr_array = xdr_dec_array +}; diff --git a/module/os/macos/spl/spl-zlib.c b/module/os/macos/spl/spl-zlib.c new file mode 100644 index 0000000000..5aa92c324a --- /dev/null +++ b/module/os/macos/spl/spl-zlib.c @@ -0,0 +1,199 @@ +/* + * + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * + * Jean-loup Gailly + * Mark Adler + */ + +#include +#include +#include +#include + +#ifdef DEBUG_SUBSYSTEM +#undef DEBUG_SUBSYSTEM +#endif + +#define DEBUG_SUBSYSTEM SS_ZLIB + +static spl_kmem_cache_t *zlib_workspace_cache; + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * that improves the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). + */ +static void * +zlib_workspace_alloc(int flags) +{ + return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS))); +} + +static void +zlib_workspace_free(void *workspace) +{ + kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_compress_level); + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the compressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. 
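Taken together, a caller typically round-trips a buffer through these wrappers as sketched below. This is an editorial illustration against the prototypes defined in this file; Z_OK comes from zlib itself, and the caller must size the compression scratch buffer with the headroom (sourceLen plus ~0.1% plus 12 bytes) that the comment above requires.

/* editorial sketch: compress then decompress using this file's wrappers */
static int
example_zlib_roundtrip(const void *src, size_t srclen,
    void *scratch, size_t scratchlen, void *out, size_t outlen)
{
    size_t clen = scratchlen;   /* in: capacity, out: compressed size */
    size_t dlen = outlen;       /* in: capacity, out: uncompressed size */

    if (z_compress_level(scratch, &clen, src, srclen, 6) != Z_OK)
        return (-1);
    if (z_uncompress(out, &dlen, scratch, clen) != Z_OK)
        return (-1);

    return (dlen == srclen ? 0 : -1);
}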
+ */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, + size_t sourceLen) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_uncompress); + +int +spl_zlib_init(void) +{ + int size; + SENTRY; + + size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_VMEM | KMC_NOEMERGENCY); + if (!zlib_workspace_cache) + SRETURN(1); + + SRETURN(0); +} + +void +spl_zlib_fini(void) +{ + SENTRY; + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; + SEXIT; +} diff --git a/module/os/macos/zfs/.gitignore b/module/os/macos/zfs/.gitignore new file mode 100644 index 0000000000..aaec2f8ea2 --- /dev/null +++ b/module/os/macos/zfs/.gitignore @@ -0,0 +1,2 @@ +zfs +zfs.kext diff --git a/module/os/macos/zfs/Info.plist b/module/os/macos/zfs/Info.plist new file mode 100644 index 0000000000..761b080738 --- /dev/null +++ b/module/os/macos/zfs/Info.plist @@ -0,0 +1,115 @@ + + + + + BuildMachineOSBuild + 14C1514 + CFBundleDevelopmentRegion + English + CFBundleExecutable + zfs + CFBundleIdentifier + net.lundman.zfs + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + zfs + CFBundlePackageType + KEXT + CFBundleShortVersionString + 1.0 + CFBundleSignature + ???? + CFBundleVersion + 1.0.0 + DTCompiler + com.apple.compilers.llvm.clang.1_0 + DTPlatformBuild + 6C131e + DTPlatformVersion + GM + DTSDKBuild + 12F37 + DTSDKName + macosx10.8 + DTXcode + 0620 + DTXcodeBuild + 6C131e + IOKitPersonalities + + net.lundman.zfs + + CFBundleIdentifier + net.lundman.zfs + IOClass + net_lundman_zfs_zvol + IOMatchCategory + net_lundman_zfs_zvol + IOMediaIcon + + CFBundleIdentifier + net.lundman.zfs + IOBundleResourceFile + VolumeIcon.icns + + IOProviderClass + IOResources + IOResourceMatch + IOBSD + + net.lundman.zfs.ZFSDatasetProxy + + CFBundleIdentifier + net.lundman.zfs + IOClass + ZFSDatasetProxy + IOProbeScore + 1000 + IOMatchCategory + ZFSPool + IOProviderClass + ZFSPool + + net.lundman.zfs.ZFSDatasetScheme + + CFBundleIdentifier + net.lundman.zfs + IOClass + ZFSDatasetScheme + IOProbeScore + 5000 + IOMatchCategory + IOStorage + IOPropertyMatch + + Whole + + + IOProviderClass + IOMedia + + + NSHumanReadableCopyright + CDDL (ZFS), BSD (FreeBSD), Copyright © 2012-2020 OpenZFS on OS X. All rights reserved. 
+ OSBundleCompatibleVersion + 1.0.0 + OSBundleLibraries + + com.apple.iokit.IOStorageFamily + 1.6 + com.apple.kpi.bsd + 8.0.0 + com.apple.kpi.iokit + 8.0.0 + com.apple.kpi.libkern + 10.0 + com.apple.kpi.mach + 8.0.0 + com.apple.kpi.unsupported + 8.0.0 + net.lundman.kernel.dependencies + 12.5.0 + + + diff --git a/module/os/macos/zfs/InfoPlist.strings b/module/os/macos/zfs/InfoPlist.strings new file mode 100644 index 0000000000..0c67376eba --- /dev/null +++ b/module/os/macos/zfs/InfoPlist.strings @@ -0,0 +1,5 @@ + + + + + diff --git a/module/os/macos/zfs/Makefile.am b/module/os/macos/zfs/Makefile.am new file mode 100644 index 0000000000..2ff35cb52b --- /dev/null +++ b/module/os/macos/zfs/Makefile.am @@ -0,0 +1,345 @@ + +INFO_PLIST = Info.plist +PLIST_STRING = InfoPlist.strings + +ZFS_META_VERSION = @ZFS_META_VERSION@ +ZFS_DEBUG_STR = @ZFS_DEBUG_STR@ + +zfs_CPPFLAGS = \ + -Wall \ + -nostdinc \ + -mkernel \ + -fno-builtin-printf \ + -D__KERNEL__ \ + -D_KERNEL \ + -DKERNEL \ + -DKERNEL_PRIVATE \ + -DDRIVER_PRIVATE \ + -DNAMEDSTREAMS=1 \ + -DAPPLE \ + -DNeXT \ + -I$(top_srcdir)/include/os/macos/spl \ + -I$(top_srcdir)/include/os/macos/zfs \ + -I$(top_srcdir)/module/icp/include \ + -I$(top_srcdir)/include \ + -I@KERNEL_HEADERS@/Headers \ + -I@KERNEL_HEADERS@/PrivateHeaders + +zfs_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ + +zfs_CFLAGS = +zfs_CXXFLAGS = + +zfs_LDFLAGS = \ + -Xlinker \ + -kext \ + -nostdlib \ + -lkmodc++ \ + -lkmod \ + -lcc_kext + +zfs_LDADD = \ + $(top_builddir)/module/os/macos/spl/libspl.la + +zfs_LIBS = + +# If we don't set this to nothing, it adds "-lz -liconv" +LIBS = + +bin_PROGRAMS = zfs.kext +noinst_PROGRAMS = zfs + +zfs_kext_SOURCE = + +if TARGET_CPU_X86_64 +zfs_ASM_SOURCES_C = \ + ../../../icp/asm-x86_64/aes/aeskey.c \ + ../../../icp/algs/modes/gcm_pclmulqdq.c \ + ../../../zcommon/zfs_fletcher_intel.c \ + ../../../zcommon/zfs_fletcher_sse.c \ + ../../../zcommon/zfs_fletcher_avx512.c \ + ../../../zfs/vdev_raidz_math_sse2.c \ + ../../../zfs/vdev_raidz_math_ssse3.c \ + ../../../zfs/vdev_raidz_math_avx2.c \ + ../../../zfs/vdev_raidz_math_avx512f.c \ + ../../../zfs/vdev_raidz_math_avx512bw.c +zfs_ASM_SOURCES_AS = \ + ../../../icp/asm-x86_64/os/macos/aes/aes_amd64.S \ + ../../../icp/asm-x86_64/os/macos/aes/aes_aesni.S \ + ../../../icp/asm-x86_64/os/macos/modes/gcm_pclmulqdq.S \ + ../../../icp/asm-x86_64/os/macos/sha1/sha1-x86_64.S \ + ../../../icp/asm-x86_64/os/macos/sha2/sha256_impl.S \ + ../../../icp/asm-x86_64/os/macos/sha2/sha512_impl.S +else +zfs_ASM_SOURCES_C = +zfs_ASM_SOURCES_AS = +endif + +zfs_SOURCES = \ + ../../../zfs/abd.c \ + abd_os.c \ + ../../../zfs/aggsum.c \ + ../../../zfs/arc.c \ + arc_os.c \ + ../../../avl/avl.c \ + ../../../zfs/blkptr.c \ + ../../../zfs/bplist.c \ + ../../../zfs/bpobj.c \ + ../../../zfs/bptree.c \ + ../../../zfs/bqueue.c \ + ../../../zfs/btree.c \ + ../../../zcommon/cityhash.c \ + ../../../zfs/dbuf.c \ + ../../../zfs/dbuf_stats.c \ + ../../../zfs/ddt.c \ + ../../../zfs/ddt_zap.c \ + ../../../zfs/dmu.c \ + ../../../zfs/dmu_diff.c \ + ../../../zfs/dmu_object.c \ + ../../../zfs/dmu_objset.c \ + ../../../zfs/dmu_recv.c \ + ../../../zfs/dmu_redact.c \ + ../../../zfs/dmu_send.c \ + ../../../zfs/dmu_traverse.c \ + ../../../zfs/dmu_tx.c \ + ../../../zfs/dmu_zfetch.c \ + ../../../zfs/dnode.c \ + ../../../zfs/dnode_sync.c \ + ../../../zfs/dsl_bookmark.c \ + ../../../zfs/dsl_crypt.c \ + ../../../zfs/dsl_dataset.c \ + ../../../zfs/dsl_deadlist.c \ + ../../../zfs/dsl_deleg.c \ + ../../../zfs/dsl_destroy.c \ + ../../../zfs/dsl_dir.c \ + ../../../zfs/dsl_pool.c 
\ + ../../../zfs/dsl_prop.c \ + ../../../zfs/dsl_scan.c \ + ../../../zfs/dsl_synctask.c \ + ../../../zfs/dsl_userhold.c \ + ../../../zfs/edonr_zfs.c \ + ../../../zfs/fm.c \ + ../../../zfs/gzip.c \ + ../../../zfs/hkdf.c \ + ldi_osx.c \ + ldi_vnode.c \ + ldi_iokit.cpp \ + ../../../zfs/lz4.c \ + ../../../zfs/lzjb.c \ + ../../../zfs/metaslab.c \ + ../../../zfs/mmp.c \ + ../../../zfs/multilist.c \ + ../../../zfs/objlist.c \ + ../../../zfs/pathname.c \ + ../../../zfs/range_tree.c \ + ../../../zfs/refcount.c \ + ../../../zfs/rrwlock.c \ + ../../../zfs/sa.c \ + ../../../zfs/sha256.c \ + ../../../zfs/skein_zfs.c \ + ../../../zfs/spa.c \ + ../../../zfs/spa_boot.c \ + ../../../zfs/spa_checkpoint.c \ + ../../../zfs/spa_config.c \ + ../../../zfs/spa_errlog.c \ + ../../../zfs/spa_history.c \ + ../../../zfs/spa_log_spacemap.c \ + ../../../zfs/spa_misc.c \ + spa_misc_os.c \ + spa_stats.c \ + ../../../zfs/space_map.c \ + ../../../zfs/space_reftree.c \ + ../../../zfs/txg.c \ + ../../../zfs/uberblock.c \ + ../../../zfs/unique.c \ + ../../../zfs/vdev.c \ + ../../../zfs/vdev_cache.c \ + vdev_disk.c \ + vdev_file.c \ + ../../../zfs/vdev_indirect.c \ + ../../../zfs/vdev_indirect_births.c \ + ../../../zfs/vdev_indirect_mapping.c \ + ../../../zfs/vdev_initialize.c \ + ../../../zfs/vdev_label.c \ + ../../../zfs/vdev_mirror.c \ + ../../../zfs/vdev_missing.c \ + ../../../zfs/vdev_queue.c \ + ../../../zfs/vdev_raidz.c \ + ../../../zfs/vdev_raidz_math.c \ + ../../../zfs/vdev_raidz_math_scalar.c \ + ../../../zfs/vdev_removal.c \ + ../../../zfs/vdev_root.c \ + ../../../zfs/vdev_trim.c \ + ../../../zfs/zap.c \ + ../../../zfs/zap_leaf.c \ + ../../../zfs/zap_micro.c \ + ../../../zfs/zcp.c \ + ../../../zfs/zcp_get.c \ + ../../../zfs/zcp_global.c \ + ../../../zfs/zcp_iter.c \ + ../../../zfs/zcp_set.c \ + ../../../zfs/zcp_synctask.c \ + ../../../zfs/zfeature.c \ + ../../../zcommon/zfeature_common.c \ + zfs_acl.c \ + zfs_boot.cpp \ + ../../../zfs/zfs_byteswap.c \ + zfs_ctldir.c \ + zfs_debug.c \ + zfs_dir.c \ + ../../../zfs/zfs_fm.c \ + zfs_file_os.c \ + ../../../zfs/zfs_fuid.c \ + zfs_fuid_os.c \ + ../../../zfs/zfs_ioctl.c \ + zfs_ioctl_os.c \ + zfs_kstat_osx.c \ + ../../../zfs/zfs_log.c \ + ../../../zfs/zfs_onexit.c \ + zfs_osx.cpp \ + ../../../zfs/zfs_quota.c \ + ../../../zfs/zfs_ratelimit.c \ + ../../../zfs/zfs_replay.c \ + ../../../zfs/zfs_rlock.c \ + ../../../zfs/zfs_sa.c \ + zfs_vfsops.c \ + zfs_vnops.c \ + zfs_vnops_osx.c \ + zfs_vnops_osx_lib.c \ + zfs_znode.c \ + ../../../zfs/zil.c \ + ../../../zfs/zio.c \ + ../../../zfs/zio_checksum.c \ + zio_crypt.c \ + ../../../zfs/zio_compress.c \ + ../../../zfs/zio_inject.c \ + ../../../zfs/zle.c \ + ../../../zfs/zrlock.c \ + ../../../zfs/zthr.c \ + ../../../zfs/zvol.c \ + zvol_os.c \ + zvolIO.cpp \ + ZFSDatasetProxy.cpp \ + ZFSDatasetScheme.cpp \ + ZFSDataset.cpp \ + ZFSPool.cpp \ + ../../../nvpair/fnvpair.c \ + ../../../nvpair/nvpair.c \ + ../../../nvpair/nvpair_alloc_fixed.c \ + ../../../nvpair/nvpair_alloc_spl.c \ + ../../../unicode/u8_textprep.c \ + ../../../unicode/uconv.c \ + ../../../zcommon/zfs_comutil.c \ + ../../../zcommon/zfs_deleg.c \ + ../../../zcommon/zfs_fletcher.c \ + ../../../zcommon/zfs_fletcher_superscalar.c \ + ../../../zcommon/zfs_fletcher_superscalar4.c \ + ../../../zcommon/zfs_namecheck.c \ + ../../../zcommon/zfs_prop.c \ + ../../../zcommon/zpool_prop.c \ + ../../../zcommon/zprop_common.c \ + ../../../icp/api/kcf_cipher.c \ + ../../../icp/api/kcf_digest.c \ + ../../../icp/api/kcf_mac.c \ + ../../../icp/api/kcf_miscapi.c \ + ../../../icp/api/kcf_ctxops.c \ 
+ ../../../icp/core/kcf_callprov.c \ + ../../../icp/core/kcf_prov_tabs.c \ + ../../../icp/core/kcf_sched.c \ + ../../../icp/core/kcf_mech_tabs.c \ + ../../../icp/core/kcf_prov_lib.c \ + ../../../icp/spi/kcf_spi.c \ + ../../../icp/io/aes.c \ + ../../../icp/io/edonr_mod.c \ + ../../../icp/io/sha2_mod.c \ + ../../../icp/io/sha1_mod.c \ + ../../../icp/io/skein_mod.c \ + ../../../icp/os/modhash.c \ + ../../../icp/os/modconf.c \ + ../../../icp/algs/edonr/edonr.c \ + ../../../icp/algs/modes/cbc.c \ + ../../../icp/algs/modes/ccm.c \ + ../../../icp/algs/modes/ctr.c \ + ../../../icp/algs/modes/ecb.c \ + ../../../icp/algs/modes/gcm_generic.c \ + ../../../icp/algs/modes/gcm.c \ + ../../../icp/algs/modes/modes.c \ + ../../../icp/algs/sha2/sha2.c \ + ../../../icp/algs/skein/skein.c \ + ../../../icp/algs/skein/skein_block.c \ + ../../../icp/algs/skein/skein_iv.c \ + ../../../icp/algs/aes/aes_impl_aesni.c \ + ../../../icp/algs/aes/aes_impl_generic.c \ + ../../../icp/algs/aes/aes_impl_x86-64.c \ + ../../../icp/algs/aes/aes_impl.c \ + ../../../icp/algs/aes/aes_modes.c \ + ../../../icp/illumos-crypto.c \ + ../../../lua/lapi.c \ + ../../../lua/lauxlib.c \ + ../../../lua/lbaselib.c \ + ../../../lua/lcode.c \ + ../../../lua/lcompat.c \ + ../../../lua/lcorolib.c \ + ../../../lua/lctype.c \ + ../../../lua/ldebug.c \ + ../../../lua/ldo.c \ + ../../../lua/lfunc.c \ + ../../../lua/lgc.c \ + ../../../lua/llex.c \ + ../../../lua/lmem.c \ + ../../../lua/lobject.c \ + ../../../lua/lopcodes.c \ + ../../../lua/lparser.c \ + ../../../lua/lstate.c \ + ../../../lua/lstring.c \ + ../../../lua/lstrlib.c \ + ../../../lua/ltable.c \ + ../../../lua/ltablib.c \ + ../../../lua/ltm.c \ + ../../../lua/lvm.c \ + ../../../lua/lzio.c \ + ../../../lua/setjmp/setjmp.S \ + $(zfs_ASM_SOURCES_C) \ + $(zfs_ASM_SOURCES_AS) + +# Ensure these files are always built with -O2 to avoid stack overflow. 
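+# (GNU make substitution reference: $(CFLAGS:-O0%=-O2) rewrites any word
+# beginning with -O0 in CFLAGS to -O2 for just these two objects, leaving
+# all other flags untouched.)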
+../../../zfs/zfs-dsl_scan.$(OBJEXT): CFLAGS := $(CFLAGS:-O0%=-O2) +../../../lua/zfs-lvm.$(OBJEXT): CFLAGS := $(CFLAGS:-O0%=-O2) + +KERNEL_MODDIR= $(DESTDIR)@KERNEL_MODPREFIX@/zfs.kext + +dist_noinst_DATA = $(PLIST_STRING) $(INFO_PLIST) + +zfs.kext$(EXEEXT): zfs $(PLIST_STRING) $(INFO_PLIST) + @echo "" + @mkdir -p zfs.kext/Contents/Resources/English.lproj zfs.kext/Contents/MacOS + @cp -f $(INFO_PLIST) zfs.kext/Contents/ + /usr/libexec/PlistBuddy -c "Set :CFBundleShortVersionString $(ZFS_META_VERSION)" zfs.kext/Contents/Info.plist + /usr/libexec/PlistBuddy -c "Delete :OSBundleLibraries:net.lundman.kernel.dependencies" zfs.kext/Contents/Info.plist + /usr/libexec/PlistBuddy -c "Add :OSBundleLibraries:net.lundman.kernel.dependencies.$(ZFS_META_VERSION) string 12.5.0" zfs.kext/Contents/Info.plist + @cp -f $(PLIST_STRING) zfs.kext/Contents/Resources/English.lproj/ + @cp -f zfs zfs.kext/Contents/MacOS/ + @mkdir -p zfs.kext/Contents/PlugIns/KernelExports.kext/ + @cp -f $(top_srcdir)/module/os/macos/kernel/kernelexports zfs.kext/Contents/PlugIns/KernelExports.kext/KernelExports + @cp -f $(top_srcdir)/module/os/macos/kernel/Info.plist zfs.kext/Contents/PlugIns/KernelExports.kext/ + /usr/libexec/PlistBuddy -c "Set :CFBundleIdentifier net.lundman.kernel.dependencies.$(ZFS_META_VERSION)" zfs.kext/Contents/PlugIns/KernelExports.kext/Info.plist + /usr/libexec/PlistBuddy -c "Add :OSBundleRequired string Root" zfs.kext/Contents/Info.plist + cp -f $(top_srcdir)/module/os/macos/kernel/version.plist zfs.kext/Contents/PlugIns/KernelExports.kext/ + @kextlibs -unsupported -undef-symbols -xml zfs.kext/ || echo "Ignoring errors..(Most of these are expected)" | grep -v -f $(top_srcdir)/module/os/macos/kernel/zfs.exports + +install-exec-local: zfs.kext + rm -rf $(KERNEL_MODDIR) + mkdir -p $(KERNEL_MODDIR) + rsync -r zfs.kext/ $(KERNEL_MODDIR) + chown -R root:wheel $(KERNEL_MODDIR) || echo "Unable to chown root:wheel $(KERNEL_MODDIR)" + @echo + @echo "To load module: kextload -v $(KERNEL_MODDIR)" + @echo "To uninstall module: rm -rf $(KERNEL_MODDIR)" + @echo + +uninstall-am: + rm -rf $(KERNEL_MODDIR) + +clean: + rm -rf zfs.kext/ + rm -f *.o *.lo zfs diff --git a/module/os/macos/zfs/ZFSDataset.cpp b/module/os/macos/zfs/ZFSDataset.cpp new file mode 100644 index 0000000000..9dde5613f6 --- /dev/null +++ b/module/os/macos/zfs/ZFSDataset.cpp @@ -0,0 +1,867 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * ZFSDataset - proxy disk for legacy and com.apple.devicenode mounts. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(DEBUG) || defined(ZFS_DEBUG) +#ifdef dprintf +#undef dprintf +#endif +#define dprintf(fmt, ...) do { \ + IOLog("ZFSDataset %s " fmt "\n", __func__, ##__VA_ARGS__); \ +_NOTE(CONSTCOND) } while (0) +#else +#ifndef dprintf +#define dprintf(fmt, ...) do { } while (0); +#endif +#endif /* if DEBUG or ZFS_DEBUG */ + +#define DPRINTF_FUNC() do { dprintf(""); } while (0); + +OSDefineMetaClassAndStructors(ZFSDataset, IOMedia); + +#if 0 +/* XXX Only for debug tracing */ +bool +ZFSDataset::open(IOService *client, + IOOptionBits options, IOStorageAccess access) +{ + bool ret; + DPRINTF_FUNC(); + + ret = IOMedia::open(client, options, access); + + dprintf("ZFSDataset %s ret %d", ret); + return (ret); +} + +bool +ZFSDataset::isOpen(const IOService *forClient) const +{ + DPRINTF_FUNC(); + return (false); +} + +void +ZFSDataset::close(IOService *client, + IOOptionBits options) +{ + DPRINTF_FUNC(); + IOMedia::close(client, options); +} + +bool +ZFSDataset::handleOpen(IOService *client, + IOOptionBits options, void *access) +{ + bool ret; + DPRINTF_FUNC(); + + ret = IOMedia::handleOpen(client, options, access); + + dprintf("ZFSDataset %s ret %d", ret); + return (ret); +} + +bool +ZFSDataset::handleIsOpen(const IOService *client) const +{ + bool ret; + DPRINTF_FUNC(); + + ret = IOMedia::handleIsOpen(client); + + dprintf("ZFSDataset %s ret %d", ret); + return (ret); +} + +void +ZFSDataset::handleClose(IOService *client, + IOOptionBits options) +{ + DPRINTF_FUNC(); + IOMedia::handleClose(client, options); +} + +bool +ZFSDataset::attach(IOService *provider) +{ + DPRINTF_FUNC(); + return (IOMedia::attach(provider)); +} + +void +ZFSDataset::detach(IOService *provider) +{ + DPRINTF_FUNC(); + IOMedia::detach(provider); +} + +bool +ZFSDataset::start(IOService *provider) +{ + DPRINTF_FUNC(); + return (IOMedia::start(provider)); +} + +void +ZFSDataset::stop(IOService *provider) +{ + DPRINTF_FUNC(); + IOMedia::stop(provider); +} +#endif + +/* XXX Only for debug tracing */ +void +ZFSDataset::free() +{ + DPRINTF_FUNC(); + IOMedia::free(); +} + +/* + * Override init to call IOMedia init then setup properties. 
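+ *
+ * A device-characteristics dictionary advertising a 4096-byte physical and
+ * 512-byte logical block size is merged into the supplied properties before
+ * they are handed to IOMedia::init().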
+ */ +bool +ZFSDataset::init(UInt64 base, UInt64 size, + UInt64 preferredBlockSize, + IOMediaAttributeMask attributes, + bool isWhole, bool isWritable, + const char *contentHint, + OSDictionary *properties) +{ + OSDictionary *newProps = NULL, *deviceDict; + OSNumber *physSize, *logSize; +#if 0 + OSDictionary *protocolDict; + const OSSymbol *virtualSymbol, *internalSymbol; +#endif + bool ret; + + DPRINTF_FUNC(); + + /* Clone or create new properties dictionary */ + if (properties) newProps = OSDictionary::withDictionary(properties); + if (!newProps) newProps = OSDictionary::withCapacity(2); + + /* Allocate dictionaries, numbers, and string symbols */ + deviceDict = OSDictionary::withCapacity(2); +#if 0 + protocolDict = OSDictionary::withCapacity(2); +#endif + + physSize = OSNumber::withNumber(4096, 32); + logSize = OSNumber::withNumber(512, 32); + +#if 0 + kIOPropertyPhysicalInterconnectTypeVirtual + kIOPropertyPhysicalInterconnectTypeKey + kIOPropertyInterconnectFileKey + kIOPropertyInternalKey + kIOPropertyPhysicalInterconnectLocationKey + + kIOPropertyProtocolCharacteristicsKey + kIOPropertyMediumTypeKey + kIOPropertyLogicalBlockSizeKey + kIOPropertyPhysicalBlockSizeKey + kIOPropertyBytesPerPhysicalSectorKey + kIOPropertyDeviceCharacteristicsKey + kIOBlockStorageDeviceTypeKey + kIOBlockStorageDeviceTypeGeneric +#endif + +#if 0 + virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + internalSymbol = OSSymbol::withCString( + kIOPropertyInternalKey); +#endif + + /* Validate allocations */ + if (!newProps || !deviceDict || !physSize || !logSize +#if 0 + // || !protocolDict || !virtualSymbol || !internalSymbol +#endif + ) { + dprintf("symbol allocation failed"); + OSSafeReleaseNULL(newProps); + OSSafeReleaseNULL(deviceDict); +#if 0 + OSSafeReleaseNULL(protocolDict); +#endif + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); +#if 0 + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); +#endif + return (false); + } + + /* Setup device characteristics */ + deviceDict->setObject(kIOPropertyPhysicalBlockSizeKey, physSize); + deviceDict->setObject(kIOPropertyLogicalBlockSizeKey, logSize); + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); + +#if 0 + /* Setup protocol characteristics */ + protocolDict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol); + protocolDict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + internalSymbol); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); +#endif + + /* XXX Setup required IOMedia props */ + + /* Set new device and protocol dictionaries */ + if (newProps->setObject(kIOPropertyDeviceCharacteristicsKey, + deviceDict) == false +#if 0 + // || + // newProps->setObject(kIOPropertyProtocolCharacteristicsKey, + // protocolDict) == false +#endif + ) { + dprintf("setup properties failed"); + OSSafeReleaseNULL(newProps); + OSSafeReleaseNULL(deviceDict); +#if 0 + OSSafeReleaseNULL(protocolDict); +#endif + return (false); + } + OSSafeReleaseNULL(deviceDict); +#if 0 + OSSafeReleaseNULL(protocolDict); +#endif + + /* Call IOMedia init with size and newProps */ + ret = IOMedia::init(base, size, preferredBlockSize, + attributes, isWhole, isWritable, contentHint, + newProps); + OSSafeReleaseNULL(newProps); + + if (!ret) dprintf("IOMedia init failed"); + + return (ret); + +#if 0 + /* Get current device and protocol dictionaries */ + lockForArbitration(); + oldDeviceDict = OSDynamicCast(OSDictionary, + getProperty(kIOStorageDeviceCharacteristicsKey)); + 
oldProtocolDict = OSDynamicCast(OSDictionary, + getProperty(kIOStorageProtocolCharacteristicsKey)); + if (oldDeviceDict) oldDeviceDict->retain(); + if (oldProtocolDict) oldProtocolDict->retain(); + unlockForArbitration(); + + /* Clone existing dictionaries */ + if (oldDeviceDict) { + newDeviceDict = OSDictionary::withDict(oldDeviceDict); + OSSafeReleaseNULL(oldDeviceDict); + } + if (oldProtocolDict) { + newProtocolDict = OSDictionary::withDict(oldProtocolDict); + OSSafeReleaseNULL(oldDeviceDict); + } + + /* Make new if missing */ + if (!newDeviceDict) + newDeviceDict = OSDictionary::withCapacity(2); + if (!newProtocolDict) + newProtocolDict = OSDictionary::withCapacity(2); + + /* Setup device characteristics */ + newDeviceDict->setObject(kIOStoragePhysicalBlocksizeKey, physSize); + newDeviceDict->setObject(kIOStorageLogicalBlocksizeKey, logSize); + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); + + /* Setup protocol characteristics */ + newProtocolDict->setObject(kIOStorageProtocolInterconnectTypeKey, + virtualSymbol); + newProtocolDict->setObject(kIOStorageProtocolInterconnectNameKey, + internalSymbol); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); + + /* XXX Setup required IOMedia props */ + + /* Set new device and protocol dictionaries */ + lockForArbitration(); + setProperty(kIOStorageDeviceCharacteristicsKey, newDeviceDict); + setProperty(kIOStorageProtocolCharacteristicsKey, newProtocolDict); + unlockForArbitration(); + + /* Cleanup and return success */ + OSSafeReleaseNULL(newDeviceDict); + OSSafeReleaseNULL(newProtocolDict); + return (true); +#endif +} + +/* + * Set both the IOService name and the ZFS Dataset property. + */ +bool +ZFSDataset::setDatasetName(const char *name) +{ + OSDictionary *prevDict, *newDict = NULL; + OSString *datasetString; + const char *newname; + + if (!name || name[0] == '\0') { + dprintf("missing name"); + return (false); + } + + if ((newname = strrchr(name, '/')) == NULL) { + newname = name; + } else { + /* Advance beyond slash */ + newname++; + } + +#if 0 + size_t len; + /* Length of IOMedia name plus null terminator */ + len = (strlen(kZFSIOMediaPrefix) + strlen(name) + + strlen(kZFSIOMediaSuffix) + 1); + // len = strlen("ZFS ") + strlen(name) + strlen(" Media") + 1; + + newname = (char *)kmem_alloc(len, KM_SLEEP); +#endif + datasetString = OSString::withCString(name); + +#if 0 + nameString = OSString::withCString(newname); + if (newname == NULL || nameString == NULL) { + dprintf("couldn't make name strings"); + OSSafeReleaseNULL(nameString); + if (newname) kmem_free(newname, len); + return (false); + } +#else + if (datasetString == NULL) { + dprintf("couldn't make name strings"); + return (false); + } +#endif + +#if 0 + bzero(newname, len); + snprintf(newname, len, "%s%s%s", kZFSIOMediaPrefix, + name, kZFSIOMediaSuffix); + + ASSERT3U(strlen(newname), ==, len-1); +#endif + + /* Lock IORegistryEntry and get current prop dict */ + lockForArbitration(); + if ((prevDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey))) == NULL) { + /* Unlock IORegistryEntry */ + unlockForArbitration(); + dprintf("couldn't get prop dict"); + } + prevDict->retain(); + unlockForArbitration(); + + /* Clone existing dictionary */ + if (prevDict) { + if ((newDict = OSDictionary::withDictionary(prevDict)) == + NULL) { + dprintf("couldn't clone prop dict"); + } + OSSafeReleaseNULL(prevDict); + /* Non-fatal at the moment */ + } + + /* If prevDict did not exist or couldn't be copied, make new */ + if (!newDict 
&& (newDict = OSDictionary::withCapacity(1)) == NULL) { + dprintf("couldn't make new prop dict"); + } + + /* If we have a new or copied dict at this point */ + if (newDict) { + /* Add or replace dictionary Product Name string */ + if (newDict->setObject(kIOPropertyProductNameKey, + datasetString) == false) { + dprintf("couldn't set name"); + OSSafeReleaseNULL(datasetString); + // OSSafeReleaseNULL(nameString); + // kmem_free(newname, len); + OSSafeReleaseNULL(newDict); + return (false); + } + + /* Lock IORegistryEntry and replace prop dict */ + lockForArbitration(); + if (setProperty(kIOPropertyDeviceCharacteristicsKey, + newDict) == false) { + unlockForArbitration(); + dprintf("couldn't set name"); + OSSafeReleaseNULL(datasetString); + // OSSafeReleaseNULL(nameString); + // kmem_free(newname, len); + OSSafeReleaseNULL(newDict); + return (false); + } + unlockForArbitration(); + OSSafeReleaseNULL(newDict); + } + + /* Lock IORegistryEntry to replace property and set name */ + lockForArbitration(); + /* Assign plain ZFS Dataset name */ + setProperty(kZFSDatasetNameKey, datasetString); + /* Assign IOMedia name */ + // setName(name); + setName(newname); + + /* Unlock IORegistryEntry and cleanup allocations */ + unlockForArbitration(); + // kmem_free(newname, len); + // OSSafeReleaseNULL(nameString); + return (true); +} + +#if 0 +static inline uint64_t +get_objnum(const char *name) +{ + objset_t *os = NULL; + uint64_t objnum; + int error; + + if (!name) + return (0); + + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, FTAG, &os); + if (error != 0) { + dprintf("couldn't open dataset %d", error); + return (0); + } + + objnum = dmu_objset_id(os); + + dmu_objset_disown(os, FTAG); + + return (objnum); +} +#endif + +/* + * Create a proxy device, name it appropriately, and return it. 
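+ *
+ * The name is the full dataset name (e.g. "pool/fs") and size is the pool
+ * size in bytes; ZFSDatasetScheme::addDataset() is the expected caller.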
+ */ +ZFSDataset * +ZFSDataset::withDatasetNameAndSize(const char *name, uint64_t size) +{ + ZFSDataset *dataset = NULL; + objset_t *os = NULL; + OSString *uuidStr = NULL; + OSObject *property = NULL; + char uuid_cstr[37]; + uint64_t objnum, readonly, guid; +#if 0 + // uint64_t ref_size, avail_size, obj_count, obj_free; +#endif + uuid_t uuid; + int error; + bool isWritable; + + DPRINTF_FUNC(); + + if (!name || name[0] == '\0') { + dprintf("missing name"); + /* Nothing allocated or retained yet */ + return (NULL); + } + bzero(uuid_cstr, sizeof (uuid_cstr)); + +#if 0 + OSNumber *sizeNum = NULL; + property = copyProperty(kZFSPoolSizeKey, gIOServicePlane, + kIORegistryIterateRecursively|kIORegistryIterateParents); + if (!property) { + dprintf("couldn't get pool size"); + /* Nothing allocated or retained yet */ + return (NULL); + } + if ((sizeNum = OSDynamicCast(OSNumber, property)) == NULL) { + dprintf("couldn't cast pool size"); + goto error; + } + size = sizeNum->unsigned64BitValue(); + sizeNum = NULL; + OSSafeReleaseNULL(property); +#endif + + if (zfs_vfs_uuid_gen(name, uuid) != 0) { + dprintf("UUID gen failed"); + goto error; + } + // uuid_unparse(uuid, uuid_cstr); + zfs_vfs_uuid_unparse(uuid, uuid_cstr); + // snprintf(uuid_cstr, sizeof (uuid_cstr), ""); + + uuidStr = OSString::withCString(uuid_cstr); + if (!uuidStr) { + dprintf("uuidStr alloc failed"); + goto error; + } + + dataset = new ZFSDataset; + if (!dataset) { + dprintf("allocation failed"); + goto error; + } + + /* Own the dmu objset to get properties */ + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_FALSE, FTAG, &os); + if (error != 0) { + dprintf("couldn't open dataset %d", error); + goto error; + } + + /* Get the dsl_dir to lookup object number */ + objnum = dmu_objset_id(os); + +#if 0 + dmu_objset_space(os, &ref_size, &avail_size, &obj_count, &obj_free); +#endif + + // if (os->os_dsl_dataset) + // guid = dsl_dataset_phys(os->os_dsl_dataset)->ds_guid; + guid = dmu_objset_fsid_guid(os); + // dsl_prop_get_integer(name, "guid", &guid, NULL) != 0) { + + if (dsl_prop_get_integer(name, "readonly", &readonly, NULL) != 0) { + dmu_objset_disown(os, B_FALSE, FTAG); + dprintf("get readonly property failed"); + goto error; + } + // size = (1<<30); + // isWritable = true; + dmu_objset_disown(os, B_FALSE, FTAG); + +#if 0 + size = ref_size + avail_size; +#endif + + isWritable = (readonly == 0ULL); + + if (dataset->init(/* base */ 0, size, DEV_BSIZE, + /* attributes */ 0, /* isWhole */ false, isWritable, + kZFSContentHint, /* properties */ NULL) == false) { + dprintf("init failed"); + goto error; + } + + if (dataset->setDatasetName(name) == false) { + dprintf("invalid name"); + goto error; + } + + /* Set media UUID */ + dataset->setProperty(kIOMediaUUIDKey, uuidStr); + OSSafeReleaseNULL(uuidStr); + + return (dataset); + +error: + OSSafeReleaseNULL(property); + OSSafeReleaseNULL(uuidStr); + OSSafeReleaseNULL(dataset); + return (NULL); +} + +/* + * Compatibility method simulates a read but returns all zeros. 
+ */ +void +ZFSDataset::read(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOByteCount total, cur_len, done = 0; + addr64_t cur; + + DPRINTF_FUNC(); + if (!buffer) { + if (completion) complete(completion, kIOReturnInvalid, 0); + return; + } + + total = buffer->getLength(); + + /* XXX Get each physical segment of the buffer and zero it */ + while (done < total) { + cur_len = 0; + cur = buffer->getPhysicalSegment(done, &cur_len); + if (cur == 0) break; + if (cur_len != 0) bzero_phys(cur, cur_len); + done += cur_len; + ASSERT3U(done, <=, total); + } + ASSERT3U(done, ==, total); + + // if (!completion || !completion->action) { + if (!completion) { + dprintf("invalid completion"); + return; + } + +// (completion->action)(completion->target, completion->parameter, +// kIOReturnSuccess, total); + complete(completion, kIOReturnSuccess, total); +} + +/* + * Compatibility method simulates a write as a no-op. + */ +void +ZFSDataset::write(IOService *client, + UInt64 byteStart, IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOByteCount total; + DPRINTF_FUNC(); + + if (!buffer) { + if (completion) complete(completion, kIOReturnInvalid); + return; + } + + total = buffer->getLength(); + + // if (!completion || !completion->action) { + if (!completion) { + dprintf("invalid completion"); + return; + } + + /* XXX No-op, just return success */ +// (completion->action)(completion->target, completion->parameter, +// kIOReturnSuccess, total); + complete(completion, kIOReturnSuccess, total); +} + +#ifdef DEBUG +volatile SInt64 num_sync = 0; +#endif + +/* + * Compatibility method simulates a barrier sync as a no-op. + */ +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) +IOReturn +ZFSDataset::synchronize(IOService *client, + UInt64 byteStart, UInt64 byteCount, + IOStorageSynchronizeOptions options) +#else +IOReturn +ZFSDataset::synchronizeCache(IOService *client) +#endif +{ +#ifdef DEBUG + SInt64 cur_sync = 0; + DPRINTF_FUNC(); + cur_sync = OSIncrementAtomic64(&num_sync); + dprintf("sync called %lld times", cur_sync); +#endif + + /* XXX Needs to report success for mount_common() */ + return (kIOReturnSuccess); +} + +/* + * Compatibility method returns failure (unsupported). + */ +IOReturn +ZFSDataset::unmap(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options) +#else + UInt32 options) +#endif +{ + DPRINTF_FUNC(); + return (kIOReturnUnsupported); +} + +/* + * Compatibility method returns failure (no result). + */ +IOStorage * +ZFSDataset::copyPhysicalExtent(IOService *client, + UInt64 *byteStart, UInt64 *byteCount) +{ + DPRINTF_FUNC(); + return (0); + // return (IOMedia::copyPhysicalExtent(client, byteStart, byteCount)); +} + +/* + * Compatibility method simulates lock as a no-op. + */ +bool +ZFSDataset::lockPhysicalExtents(IOService *client) +{ + DPRINTF_FUNC(); + // return (IOMedia::unlockPhysicalExtents(client)); + return (true); +} + +/* + * Compatibility method simulates unlock as a no-op. + */ +void +ZFSDataset::unlockPhysicalExtents(IOService *client) +{ + DPRINTF_FUNC(); + // IOMedia::unlockPhysicalExtents(client); +} + +/* + * Compatibility method returns failure (unsupported). 
+ */ +#if defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) +IOReturn +ZFSDataset::setPriority(IOService *client, + IOStorageExtent *extents, UInt32 extentsCount, + IOStoragePriority priority) +{ + DPRINTF_FUNC(); + return (kIOReturnUnsupported); + // return (IOMedia::setPriority(client, extents, + // extentsCount, priority)); +} +#endif + +/* + * Compatibility method returns default system blocksize. + */ +UInt64 +ZFSDataset::getPreferredBlockSize() const +{ + DPRINTF_FUNC(); + return (DEV_BSIZE); + // return (IOMedia::getPreferredBlockSize()); +} + +/* XXX Only for debug tracing */ +UInt64 +ZFSDataset::getSize() const +{ + DPRINTF_FUNC(); + return (IOMedia::getSize()); +} + +/* XXX Only for debug tracing */ +UInt64 +ZFSDataset::getBase() const +{ + DPRINTF_FUNC(); + return (IOMedia::getBase()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isEjectable() const +{ + DPRINTF_FUNC(); + return (IOMedia::isEjectable()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isFormatted() const +{ + DPRINTF_FUNC(); + return (IOMedia::isFormatted()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isWhole() const +{ + DPRINTF_FUNC(); + return (IOMedia::isWhole()); +} + +/* XXX Only for debug tracing */ +bool +ZFSDataset::isWritable() const +{ + DPRINTF_FUNC(); + return (IOMedia::isWritable()); +} + +/* XXX Only for debug tracing */ +const char * +ZFSDataset::getContent() const +{ + DPRINTF_FUNC(); + return (IOMedia::getContent()); +} + +/* XXX Only for debug tracing */ +const char * +ZFSDataset::getContentHint() const +{ + DPRINTF_FUNC(); + return (IOMedia::getContentHint()); +} + +/* XXX Only for debug tracing */ +IOMediaAttributeMask +ZFSDataset::getAttributes() const +{ + DPRINTF_FUNC(); + return (IOMedia::getAttributes()); +} diff --git a/module/os/macos/zfs/ZFSDatasetProxy.cpp b/module/os/macos/zfs/ZFSDatasetProxy.cpp new file mode 100644 index 0000000000..4036bd74b1 --- /dev/null +++ b/module/os/macos/zfs/ZFSDatasetProxy.cpp @@ -0,0 +1,478 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ + +#include +#include +#include +#include + +#if defined(DEBUG) || defined(ZFS_DEBUG) +#ifdef dprintf +#undef dprintf +#endif +#define dprintf(fmt, ...) do { \ + IOLog("ZFSDatasetProxy %s " fmt "\n", __func__, ##__VA_ARGS__); \ +_NOTE(CONSTCOND) } while (0) +#else +#ifndef dprintf +#define dprintf(fmt, ...) 
do { } while (0); +#endif +#endif /* if DEBUG or ZFS_DEBUG */ + +#define DPRINTF_FUNC() do { dprintf(""); } while (0); + +/* block size is 512 B, count is 512 M blocks */ +#define ZFS_PROXY_DEV_BSIZE (UInt64)(1<<9) +#define ZFS_PROXY_DEV_BCOUNT (UInt64)(2<<29) +#define kZFSProxyGUIDKey "ZFS Pool GUID" +#define kZFSProxyReadOnlyKey "ZFS Pool Read-Only" + +OSDefineMetaClassAndStructors(ZFSDatasetProxy, IOBlockStorageDevice); + +void +ZFSDatasetProxy::free() +{ + char *str; + + /* vendor, revision, and info share a null char */ + if (vendorString) { + str = (char *)vendorString; + vendorString = 0; + if (revisionString == str) revisionString = 0; + if (infoString == str) infoString = 0; + IOFree(str, strlen(str)+1); + } + + /* Product string contains pool name */ + if (productString) { + str = (char *)productString; + productString = 0; + IOFree(str, strlen(str)+1); + } + + IOBlockStorageDevice::free(); +} + +bool +ZFSDatasetProxy::init(OSDictionary *properties) +{ + char *str = (char *)IOMalloc(1); + + if (!str) { + dprintf("string allocation failed\n"); + return (false); + } + str[0] = '\0'; + vendorString = str; + revisionString = str; + infoString = str; + + if (IOBlockStorageDevice::init(properties) == false) { + dprintf("BlockStorageDevice start failed"); + goto error; + } + + return (true); + +error: + if (str) { + vendorString = 0; + revisionString = 0; + infoString = 0; + IOFree(str, 1); + } + return (false); +} + +bool +ZFSDatasetProxy::start(IOService *provider) +{ + OSObject *property = NULL, *size = NULL; + OSString *nameString = NULL; + OSNumber *sizeNum = NULL; + OSDictionary *deviceDict = NULL, *protocolDict = NULL; + const OSSymbol *virtualSymbol = NULL, *internalSymbol = NULL; + const char *cstr = NULL; + char *pstring = NULL; + int plen = 0; + bool started = false; + + size = copyProperty(kZFSPoolSizeKey, gIOServicePlane, + (kIORegistryIterateRecursively|kIORegistryIterateParents)); + property = copyProperty(kZFSPoolNameKey, gIOServicePlane, + (kIORegistryIterateRecursively|kIORegistryIterateParents)); + + if (!size || !property) { + dprintf("couldn't get pool name or size"); + goto error; + } + + nameString = OSDynamicCast(OSString, property); + if (!nameString) { + dprintf("missing pool name"); + goto error; + } +#if 0 + /* Try hard to get the name string */ + do { + nameString = OSDynamicCast(OSString, property); + + if (nameString) nameString->retain(); + + if (!nameString) { + OSSymbol *nameSymbol; + nameSymbol = OSDynamicCast(OSSymbol, property); + if (!nameSymbol) { + dprintf("couldn't get name"); + goto error; + } + nameString = OSString::withCString( + nameSymbol->getCStringNoCopy()); + } + } while (0); +#endif + + sizeNum = OSDynamicCast(OSNumber, size); + if (!sizeNum) { + dprintf("invalid size"); + goto error; + } + _pool_bcount = sizeNum->unsigned64BitValue() / DEV_BSIZE; + sizeNum = 0; + size->release(); + size = 0; + + cstr = nameString->getCStringNoCopy(); + if (!cstr || (plen = strlen(cstr) + 1) == 1) { + goto error; + } + pstring = (char *)IOMalloc(plen); + if (!pstring) { + goto error; + } + snprintf(pstring, plen, "%s", cstr); + productString = pstring; + pstring = 0; + + if (IOBlockStorageDevice::start(provider) == false) { + dprintf("BlockStorageDevice start failed"); + goto error; + } + started = true; + + deviceDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey)); + if (deviceDict) { + /* Clone a new dictionary */ + deviceDict = OSDictionary::withDictionary(deviceDict); + if (!deviceDict) { + dprintf("dict clone 
failed"); + goto error; + } + } + + if (!deviceDict) { + dprintf("creating new device dict"); + deviceDict = OSDictionary::withCapacity(1); + } + + if (!deviceDict) { + dprintf("missing device dict"); + goto error; + } + + deviceDict->setObject(kIOPropertyProductNameKey, nameString); + OSSafeReleaseNULL(nameString); + + if (setProperty(kIOPropertyDeviceCharacteristicsKey, + deviceDict) == false) { + dprintf("device dict setProperty failed"); + goto error; + } + OSSafeReleaseNULL(deviceDict); + + protocolDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyProtocolCharacteristicsKey)); + if (protocolDict) { + /* Clone a new dictionary */ + protocolDict = OSDictionary::withDictionary(protocolDict); + if (!protocolDict) { + dprintf("dict clone failed"); + goto error; + } + } + + if (!protocolDict) { + dprintf("creating new protocol dict"); + protocolDict = OSDictionary::withCapacity(1); + } + + if (!protocolDict) { + dprintf("missing protocol dict"); + goto error; + } + + virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + internalSymbol = OSSymbol::withCString( + kIOPropertyInternalKey); + if (!virtualSymbol || !internalSymbol) { + dprintf("symbol alloc failed"); + goto error; + } + + protocolDict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol); + protocolDict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + internalSymbol); + + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); + + if (setProperty(kIOPropertyProtocolCharacteristicsKey, + protocolDict) == false) { + dprintf("protocol dict setProperty failed"); + goto error; + } + OSSafeReleaseNULL(protocolDict); + registerService(kIOServiceAsynchronous); + + return (true); + +error: + OSSafeReleaseNULL(size); + OSSafeReleaseNULL(property); + OSSafeReleaseNULL(deviceDict); + OSSafeReleaseNULL(protocolDict); + OSSafeReleaseNULL(nameString); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(internalSymbol); + if (pstring) IOFree(pstring, plen); + if (started) IOBlockStorageDevice::stop(provider); + return (false); +} + +/* XXX IOBlockStorageDevice */ +IOReturn +ZFSDatasetProxy::doSynchronizeCache(void) +{ + DPRINTF_FUNC(); + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + char zero[ZFS_PROXY_DEV_BSIZE]; + size_t len, cur, off = 0; + + DPRINTF_FUNC(); + + if (!buffer) { + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* Read vs. write */ + if (buffer->getDirection() == kIODirectionIn) { + /* Zero the read buffer */ + bzero(zero, ZFS_PROXY_DEV_BSIZE); + len = buffer->getLength(); + while (len > 0) { + cur = (len > ZFS_PROXY_DEV_BSIZE ? + ZFS_PROXY_DEV_BSIZE : len); + buffer->writeBytes(/* offset */ off, + /* buf */ zero, /* length */ cur); + off += cur; + len -= cur; + } + // dprintf("%s: read: %llu %llu", + // __func__, block, nblks); + IOStorage::complete(completion, kIOReturnSuccess, + buffer->getLength()); + return (kIOReturnSuccess); + } + + if (buffer->getDirection() != kIODirectionOut) { + dprintf("invalid direction %d", buffer->getDirection()); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* + * XXX For now this just returns error for all writes. + * If it turns out that mountroot/bdevvp try to + * verify writable status by reading a block and writing + * it back to disk, lie and say it succeeded. 
+ */ + dprintf("write: %llu %llu", block, nblks); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::doEjectMedia() +{ + DPRINTF_FUNC(); + /* XXX Called at shutdown, maybe return success? */ + return (kIOReturnError); +} + +IOReturn +ZFSDatasetProxy::doFormatMedia(UInt64 byteCapacity) +{ + DPRINTF_FUNC(); + /* XXX shouldn't need it */ + return (kIOReturnError); + // return (kIOReturnSuccess); +} + +UInt32 +ZFSDatasetProxy::doGetFormatCapacities(UInt64 *capacities, + UInt32 capacitiesMaxCount) const +{ + DPRINTF_FUNC(); + if (capacities && capacitiesMaxCount > 0) { + capacities[0] = (ZFS_PROXY_DEV_BSIZE * ZFS_PROXY_DEV_BCOUNT); + dprintf("capacity %llu", capacities[0]); + } + + /* Always inform caller of capacity count */ + return (1); +} + +/* Returns full pool name from instance private var */ +char * +ZFSDatasetProxy::getProductString() +{ + if (productString) dprintf("[%s]", productString); + /* Return class private string */ + return ((char *)productString); +} + +/* Returns readonly status from instance private var */ +IOReturn +ZFSDatasetProxy::reportWriteProtection(bool *isWriteProtected) +{ + DPRINTF_FUNC(); + if (isWriteProtected) *isWriteProtected = isReadOnly; + return (kIOReturnSuccess); +} + +/* These return class static string for all instances */ +char * +ZFSDatasetProxy::getVendorString() +{ + dprintf("[%s]", vendorString); + /* Return class static string */ + return ((char *)vendorString); +} +char * +ZFSDatasetProxy::getRevisionString() +{ + dprintf("[%s]", revisionString); + /* Return class static string */ + return ((char *)revisionString); +} +char * +ZFSDatasetProxy::getAdditionalDeviceInfoString() +{ + dprintf("[%s]", infoString); + /* Return class static string */ + return ((char *)infoString); +} + +/* Always return media present and unchanged */ +IOReturn +ZFSDatasetProxy::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + DPRINTF_FUNC(); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +/* Always report nonremovable and nonejectable */ +IOReturn +ZFSDatasetProxy::reportRemovability(bool *isRemoveable) +{ + DPRINTF_FUNC(); + if (isRemoveable) *isRemoveable = false; + return (kIOReturnSuccess); +} +IOReturn +ZFSDatasetProxy::reportEjectability(bool *isEjectable) +{ + DPRINTF_FUNC(); + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* Always report 512b blocksize */ +IOReturn +ZFSDatasetProxy::reportBlockSize(UInt64 *blockSize) +{ + DPRINTF_FUNC(); + if (!blockSize) + return (kIOReturnError); + + *blockSize = ZFS_PROXY_DEV_BSIZE; + return (kIOReturnSuccess); +} + +/* XXX Calculate from dev_bcount, should get size from objset */ +/* XXX Can issue message kIOMessageMediaParametersHaveChanged to update */ +IOReturn +ZFSDatasetProxy::reportMaxValidBlock(UInt64 *maxBlock) +{ + DPRINTF_FUNC(); + if (!maxBlock) + return (kIOReturnError); + + // *maxBlock = 0; + // *maxBlock = ZFS_PROXY_DEV_BCOUNT - 1; + *maxBlock = _pool_bcount - 1; + dprintf("maxBlock %llu", *maxBlock); + + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::getWriteCacheState(bool *enabled) +{ + dprintf("getCacheState\n"); + if (enabled) *enabled = true; + return (kIOReturnSuccess); +} + +IOReturn +ZFSDatasetProxy::setWriteCacheState(bool enabled) +{ + dprintf("setWriteCache\n"); + return (kIOReturnSuccess); +} diff --git a/module/os/macos/zfs/ZFSDatasetScheme.cpp b/module/os/macos/zfs/ZFSDatasetScheme.cpp new file mode 
100644 index 0000000000..6bba6e3e58 --- /dev/null +++ b/module/os/macos/zfs/ZFSDatasetScheme.cpp @@ -0,0 +1,1121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + * Copyright (c) 2017, Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(DEBUG) || defined(ZFS_DEBUG) +#ifdef dprintf +#undef dprintf +#endif +#define dprintf(fmt, ...) do { \ + IOLog("ZFSDatasetScheme %s " fmt "\n", __func__, ##__VA_ARGS__);\ +_NOTE(CONSTCOND) } while (0) +#else +#ifndef dprintf +#define dprintf(fmt, ...) do { } while (0); +#endif +#endif /* if DEBUG or ZFS_DEBUG */ + +static ZFSDatasetScheme * +zfs_osx_proxy_scheme_by_osname(const char *osname) +{ + ZFSDatasetScheme *scheme = NULL; + OSDictionary *matching; + OSObject *object; + OSString *str; + OSIterator *iter; + char *pool_name, *slash; + size_t len; + + slash = strchr(osname, '/'); + if (slash) { + len = (slash - osname) + 1; + } else { + len = strlen(osname) + 1; + } + + pool_name = (char *)kmem_alloc(len, KM_SLEEP); + if (!pool_name) { + dprintf("string alloc failed"); + return (NULL); + } + snprintf(pool_name, len, "%s", osname); + dprintf("pool_name [%s] from %s", pool_name, osname); + + matching = IOService::serviceMatching(kZFSDatasetSchemeClass); + if (!matching) { + dprintf("couldn't get match dict"); + kmem_free(pool_name, len); + return (NULL); + } + + /* Add the pool name for exact match */ + str = OSString::withCString(pool_name); + matching->setObject(kZFSPoolNameKey, str); + OSSafeReleaseNULL(str); + + object = IOService::copyMatchingService(matching); + + if (object && (scheme = OSDynamicCast(ZFSDatasetScheme, + object)) == NULL) { + object->release(); + } + object = NULL; + + if (scheme && ((str = OSDynamicCast(OSString, + scheme->getProperty(kZFSPoolNameKey))) == NULL || + str->isEqualTo(pool_name) == false)) { + scheme->release(); + scheme = NULL; + } + + if (!scheme) { + int i; + for (i = 0; i < 12; i++) { // up to 6s + iter = IOService::getMatchingServices(matching); + if (iter) break; + IOSleep(500); + } + + if (i) dprintf("%s: tried %d times\n", __func__, i); + + if (!iter) { + dprintf("couldn't get iterator"); + kmem_free(pool_name, len); + OSSafeReleaseNULL(matching); + return (NULL); + } + + while ((object = iter->getNextObject())) { + if (iter->isValid() == false) { + iter->reset(); + continue; + } + scheme = OSDynamicCast(ZFSDatasetScheme, object); + if (!scheme) continue; + + object = scheme->getProperty(kZFSPoolNameKey, + gIOServicePlane, kIORegistryIterateParents | + kIORegistryIterateRecursively); + if (!object) continue; + + str = OSDynamicCast(OSString, object); + if (!str) 
continue; + + if (str->isEqualTo(pool_name)) break; + + str = NULL; + object = NULL; + scheme = NULL; + } + + if (scheme) scheme->retain(); + OSSafeReleaseNULL(iter); + } + OSSafeReleaseNULL(matching); + kmem_free(pool_name, len); + pool_name = 0; + + if (scheme == NULL) { + dprintf("no matching pool proxy"); + } + return (scheme); + +#if 0 + spa_t *spa; + ZFSPool *pool = 0; + + if (!osname || osname[0] == '\0') { + dprintf("missing dataset argument"); + return (EINVAL); + } + + /* Lookup the pool spa */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(osname); + if (spa && spa->spa_iokit_proxy) { + pool = spa->spa_iokit_proxy->proxy; + if (pool) pool->retain(); + } + mutex_exit(&spa_namespace_lock); + + /* Need a pool proxy to attach to */ + if (!pool) { + dprintf("couldn't get pool proxy"); + return (EINVAL); + } + return (0); +#endif +} + +/* + * Get the proxy device by matching a property name and value. + * + * Inputs: + * property: const char string. + * value: const char string. + * + * Return: + * Pointer to proxy on success, NULL on error or missing. + */ +static ZFSDataset * +zfs_osx_proxy_lookup(const char *property, OSObject *value) +{ + OSIterator *iter = NULL; + OSDictionary *matching = NULL; + OSObject *next = NULL, *prop = NULL; + ZFSDataset *dataset = NULL; + + /* Validate arguments */ + if (!property || !value || property[0] == '\0') { + dprintf("invalid argument"); + return (NULL); + } + + /* + * Create the matching dictionary for class. + * Add property and value to match dict. + */ + matching = IOService::serviceMatching(kZFSDatasetClassKey); + if ((matching) == NULL || + (matching->setObject(property, value) == false)) { + dprintf("match dictionary create failed"); + OSSafeReleaseNULL(matching); + return (NULL); + } + + /* Try to copy if there is only one match */ + next = IOService::copyMatchingService(matching); + if (next != NULL && ((dataset = OSDynamicCast(ZFSDataset, + next)) != NULL) && + (prop = dataset->getProperty(property)) != NULL && + (prop->isEqualTo(value))) { + dprintf("quick matched dataset"); + OSSafeReleaseNULL(matching); + /* Leave retain taken by copyMatching */ + return (dataset); + } + /* Unretained references */ + prop = NULL; + dataset = NULL; + /* If set, it was retained by copyMatchingService */ + OSSafeReleaseNULL(next); + + iter = IOService::getMatchingServices(matching); + OSSafeReleaseNULL(matching); + if (iter == NULL) { + dprintf("iterator failed"); + return (NULL); + } + + while ((next = iter->getNextObject())) { + dataset = OSDynamicCast(ZFSDataset, next); + if (!dataset) continue; + + if ((prop = dataset->getProperty(property)) == NULL) { + dataset = NULL; + continue; + } + + if (prop->isEqualTo(value)) { + /* Take a reference on the match */ + dprintf("found match"); + dataset->retain(); + prop = NULL; + break; + } + + prop = NULL; + dataset = NULL; + } + /* Release iterator */ + OSSafeReleaseNULL(iter); + + /* Leave retain */ + return (dataset); +#if 0 + /* + * Copy (first) matching service. + * Cast service to proxy class. + */ + if ((service = IOService::copyMatchingService(matching)) == NULL || + (dataset = OSDynamicCast(ZFSDataset, service)) == NULL) { + dprintf("matching failed"); + OSSafeReleaseNULL(service); + return (NULL); + } + + /* Leave retain from copyMatchingService */ + return (dataset); +#endif +} + +/* + * Get the proxy device for a given dataset name. + * + * Input: + * osname: dataset name e.g. pool/dataset + * + * Return: + * Valid ZFSDataset service, or NULL on error or missing. 
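+ *
+ * The returned service carries a retain from the lookup; callers such as
+ * zfs_osx_proxy_get_bsdname() below release it with OSSafeReleaseNULL()
+ * when done.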
+ */ +ZFSDataset * +zfs_osx_proxy_get(const char *osname) +{ + ZFSDataset *dataset; + OSString *osstr; + + /* Validate arguments, osname is limited to MAXNAMELEN */ + if (!osname || osname[0] == '\0' || osname[0] == '/' || + strnlen(osname, MAXNAMELEN+1) == (MAXNAMELEN+1)) { + dprintf("invalid argument"); + return (NULL); + } + + osstr = OSString::withCString(osname); + if (!osstr) { + dprintf("string alloc failed"); + return (NULL); + } + + dataset = zfs_osx_proxy_lookup(kZFSDatasetNameKey, osstr); + OSSafeReleaseNULL(osstr); + + if (!dataset) { + dprintf("lookup failed"); + return (NULL); + } + + return (dataset); +} + +/* + * Get the proxy device for a given a device name or path. + * + * Input: + * devpath: BSD name as const char* string, e.g. "/dev/diskN" or "diskN" + * must be null-terminated + * + * Return: + * Valid ZFSDataset service, or NULL on error or missing. + */ +static ZFSDataset * +zfs_osx_proxy_from_devpath(const char *devpath) +{ + /* XXX No need to init, will be assigned */ + ZFSDataset *dataset; + OSString *bsdstr; + const char *bsdname; + + /* Validate arguments, devpath is limited to MAXPATHLEN */ + if (!devpath || devpath[0] == '\0' || + strnlen(devpath, MAXPATHLEN+1) == (MAXPATHLEN+1)) { + dprintf("invalid argument"); + return (NULL); + } + + /* If we have a path, remove prefix */ + if (strncmp(devpath, "/dev/", 5) == 0) { + bsdname = devpath + 5; + } else { + bsdname = devpath; + } + + /* Make sure we have (at least) "diskN" at this point */ + if (strncmp(bsdname, "disk", 4) != 0 || bsdname[4] == '\0') { + dprintf("invalid bsdname %s from %s", bsdname, devpath); + return (NULL); + } + + bsdstr = OSString::withCString(bsdname); + if (!bsdstr) { + dprintf("string alloc failed"); + return (NULL); + } + + dataset = zfs_osx_proxy_lookup(kIOBSDNameKey, bsdstr); + OSSafeReleaseNULL(bsdstr); + + if (!dataset) { + dprintf("lookup with %s failed", bsdname); + return (NULL); + } + + return (dataset); +} + +/* + * Given a dataset, get the desired property and write its + * value to the caller-supplied buffer. + * + * Inputs: + * dataset: valid ZFSDataset object, should be retained by + * caller. + * property: const char* of the desired property name key. + * value: char* buffer which should be at least 'len' bytes. + * len: length of value buffer. + * + * Return: + * 0 on success, positive int on error. + */ +static int +zfs_osx_proxy_get_prop_string(ZFSDataset *dataset, + const char *property, char *value, int len) +{ + OSObject *obj; + OSString *valueString; + + /* Validate arguments */ + if (!dataset || !property || !value || len == 0) { + dprintf("invalid argument"); + return (EINVAL); + } + + /* Lock proxy while getting property */ + dataset->lockForArbitration(); + obj = dataset->copyProperty(property); + dataset->unlockForArbitration(); + + if (!obj) { + dprintf("no property %s", property); + return (ENXIO); + } + + valueString = OSDynamicCast(OSString, obj); + /* Validate property value */ + if (!valueString) { + dprintf("couldn't cast value for %s", property); + OSSafeReleaseNULL(obj); + return (ENXIO); + } + + /* Write up to len bytes */ + snprintf(value, len, "%s", valueString->getCStringNoCopy()); + + /* Release string and proxy */ + valueString = 0; + OSSafeReleaseNULL(obj); + + return (0); +} + +extern "C" { + +/* + * Given a ZFS dataset name, get the proxy device and write the + * BSD Name to the caller-supplied buffer. + * + * Inputs: + * osname: dataset name as char* string, e.g. 
"pool/dataset" + * must be null-terminated + * bsdname: char* string buffer where bsdname will be written + * len: length of bsdname buffer + * + * Return: + * 0 on success, positive int errno on failure. + */ +int +zfs_osx_proxy_get_bsdname(const char *osname, + char *bsdname, int len) +{ + /* XXX No need to init, will be assigned */ + ZFSDataset *dataset; + int ret; + + /* Validate arguments */ + if (!osname || !bsdname || len == 0) { + dprintf("invalid argument"); + return (EINVAL); + } + + /* Get dataset proxy (takes a retain) */ + dataset = zfs_osx_proxy_get(osname); + if (!dataset) { + dprintf("no proxy matching %s", osname); + return (ENOENT); + } + + /* Get BSD name property and write to bsdname buffer */ + ret = zfs_osx_proxy_get_prop_string(dataset, + kIOBSDNameKey, bsdname, len); + OSSafeReleaseNULL(dataset); + + if (ret != 0) { + dprintf("ret %d", ret); + } + + return (ret); +} + +/* + * Given a device name or path, get the proxy device and write the + * ZFS Dataset name to the caller-supplied buffer. + * + * Inputs: + * devpath: BSD name as const char* string, e.g. "/dev/diskN" or "diskN" + * must be null-terminated + * osname: char* string buffer where osname will be written + * len: length of osname buffer + * + * Return: + * 0 on success, positive int errno on failure. + */ +int +zfs_osx_proxy_get_osname(const char *devpath, char *osname, int len) +{ + /* XXX No need to init, will be assigned */ + ZFSDataset *dataset; + int ret; + + /* Validate arguments */ + if (!devpath || !osname || len == 0) { + dprintf("invalid argument"); + return (EINVAL); + } + + /* Get dataset proxy (takes a retain) */ + dataset = zfs_osx_proxy_from_devpath(devpath); + if (!dataset) { + dprintf("no proxy matching %s", devpath); + return (ENOENT); + } + + /* Get dataset name property and write to osname buffer */ + ret = zfs_osx_proxy_get_prop_string(dataset, + kZFSDatasetNameKey, osname, len); + OSSafeReleaseNULL(dataset); + + if (ret != 0) { + dprintf("ret %d", ret); + } + + return (ret); +} + +/* + * Check if a dataset has a proxy device. + * + * Input: + * osname: dataset name e.g. pool/dataset + * + * Return: + * 1 if exists, 0 on error or missing. + */ +int +zfs_osx_proxy_exists(const char *osname) +{ + ZFSDataset *dataset; + + /* Get dataset proxy (takes a retain) */ + if ((dataset = zfs_osx_proxy_get(osname)) != NULL) { + OSSafeReleaseNULL(dataset); + return (1); + } + + return (0); +} + +/* + * Remove the proxy device for a given dataset name. + * + * Input: + * osname: dataset name e.g. pool/dataset + */ +void +zfs_osx_proxy_remove(const char *osname) +{ + ZFSDataset *dataset; + ZFSDatasetScheme *provider; + + /* Get dataset proxy (takes a retain) */ + dataset = zfs_osx_proxy_get(osname); + if (dataset == NULL) { + dprintf("couldn't get dataset"); + return; + } +#if 0 + /* Terminate and release retain */ + dataset->terminate(kIOServiceSynchronous | kIOServiceRequired); + OSSafeReleaseNULL(dataset); +#endif + provider = OSDynamicCast(ZFSDatasetScheme, + dataset->getProvider()); + if (!provider) { + dprintf("invalid provider"); + return; + } + + OSSafeReleaseNULL(dataset); + dprintf("removing %s", osname); + provider->removeDataset(osname, /* force */ true); +} + +/* + * Create a proxy device for a given dataset name, unless one exists. + * + * Input: + * osname: dataset name e.g. pool/dataset + * + * Return: + * 0 on success, or positive int on error. 
+ */ +int +zfs_osx_proxy_create(const char *osname) +{ + ZFSDatasetScheme *provider = NULL; + + if (!osname || osname[0] == '\0') { + dprintf("missing dataset argument"); + return (EINVAL); + } + + provider = zfs_osx_proxy_scheme_by_osname(osname); + if (provider == NULL) { + dprintf("can't get pool proxy"); + return (ENOENT); + } + + if (provider->addDataset(osname) == false) { + dprintf("couldn't add dataset"); + provider->release(); + return (ENXIO); + } + + provider->release(); + return (0); +} + +} /* extern "C" */ + +static SInt32 +orderHoles(const OSMetaClassBase *obj1, const OSMetaClassBase *obj2, + __unused void *context) +{ + OSNumber *num1, *num2; + + if (obj1 == NULL || + (num1 = OSDynamicCast(OSNumber, obj1)) == NULL) { + /* Push invalid OSNumbers to end of list */ + return (-1); + } + if (obj2 == NULL || + (num2 = OSDynamicCast(OSNumber, obj2)) == NULL) { + /* If both are non-OSNumber, same ordering */ + if (num1 == NULL) + return (0); + /* If num1 is a valid OSNumber, push num2 to end */ + return (1); + } + + /* + * A comparison result of the object: + *
 *
+ *  - a negative value if obj2 should precede obj1,
+ *  - a positive value if obj1 should precede obj2,
+ *  - and 0 if obj1 and obj2 have an equivalent ordering.
+ *
+ */ + if (num1->isEqualTo(num2)) + return (0); + + if (num1->unsigned32BitValue() < num2->unsigned32BitValue()) { + return (1); + } else { + return (-1); + } +} + +OSDefineMetaClassAndStructors(ZFSDatasetScheme, IOPartitionScheme); + +void +ZFSDatasetScheme::free() +{ + OSSafeReleaseNULL(_datasets); + OSSafeReleaseNULL(_holes); + _max_id = 0; + + IOPartitionScheme::free(); +} + +bool +ZFSDatasetScheme::init(OSDictionary *properties) +{ + _datasets = OSSet::withCapacity(1); + _holes = OSOrderedSet::withCapacity(1, orderHoles); + _max_id = 0; + + if (!_datasets || !_holes) { + dprintf("OSSet allocation failed"); + OSSafeReleaseNULL(_datasets); + OSSafeReleaseNULL(_holes); + return (false); + } + + OSDictionary *newProps = NULL; + if (properties) newProps = OSDictionary::withDictionary(properties); + if (!newProps) newProps = OSDictionary::withCapacity(2); + OSString *str; + str = OSString::withCString("IOGUIDPartitionScheme"); + newProps->setObject("IOClass", str); + OSSafeReleaseNULL(str); + str = OSString::withCString("GUID_partition_scheme"); + newProps->setObject("Content Mask", str); + OSSafeReleaseNULL(str); + + if (IOPartitionScheme::init(newProps) == false) { + dprintf("IOPartitionScheme init failed"); + OSSafeReleaseNULL(newProps); + OSSafeReleaseNULL(_datasets); + OSSafeReleaseNULL(_holes); + return (false); + } + OSSafeReleaseNULL(newProps); + + return (true); +} + +bool +ZFSDatasetScheme::start(IOService *provider) +{ + OSObject *pool_name; + + if (IOPartitionScheme::start(provider) == false) { + dprintf("IOPartitionScheme start failed"); + return (false); + } + + pool_name = getProperty(kZFSPoolNameKey, + gIOServicePlane, kIORegistryIterateRecursively| + kIORegistryIterateParents); + if (pool_name) { + setProperty(kZFSPoolNameKey, pool_name); + } + + // registerService(kIOServiceAsynchronous); + registerService(kIOServiceSynchronous); + + return (true); +} + +IOService * +ZFSDatasetScheme::probe(IOService *provider, SInt32 *score) +{ + OSObject *property; + IOService *parent; + + /* First ask IOPartitionScheme to probe */ + if (IOPartitionScheme::probe(provider, score) == 0) { + dprintf("IOPartitionScheme probe failed"); + return (0); + } + + /* Check for ZFS Pool Name first */ + property = getProperty(kZFSPoolNameKey, gIOServicePlane, + kIORegistryIterateRecursively|kIORegistryIterateParents); + if (!property) { + dprintf("no pool name"); + return (0); + } + + /* Make sure we have a target, and valid provider below */ + if (provider == NULL || + OSDynamicCast(IOMedia, provider) == NULL || + (parent = provider->getProvider()) == NULL) { + dprintf("invalid provider"); + return (0); + } + + /* Make sure provider is driver, and has valid provider below */ + if (OSDynamicCast(IOBlockStorageDriver, parent) == NULL || + (parent = parent->getProvider()) == NULL) { + dprintf("invalid parent"); + return (0); + } + + /* Make sure the parent provider is a proxy */ + if (OSDynamicCast(ZFSDatasetProxy, parent) == NULL) { + dprintf("invalid grandparent"); + return (0); + } + + /* Successful match */ + dprintf("Match"); + // *score = 5000; + return (this); +} + +uint32_t +ZFSDatasetScheme::getNextPartitionID() +{ + uint32_t ret_id = 0ULL; + + /* Try to lock, unless service is terminated */ + if (lockForArbitration(false) == false) { + dprintf("service is terminated"); + return (0ULL); + } + + /* If the partiton list is sparse (has holes) */ + if (_holes->getCount() != 0) { + OSNumber *id_num = OSDynamicCast(OSNumber, + _holes->getFirstObject()); + + /* Just in case the list is invalid */ 
+#ifdef DEBUG + if (!id_num) panic("invalid hole list"); +#endif + + if (id_num) { + id_num->retain(); + _holes->removeObject(id_num); + ret_id = id_num->unsigned32BitValue(); + OSSafeReleaseNULL(id_num); + goto out; + } + } + + /* If no holes were found, just get next id */ + ret_id = (_max_id += 1); + +out: + unlockForArbitration(); + return (ret_id); +} + +void ZFSDatasetScheme::returnPartitionID(uint32_t part_id) +{ + OSNumber *id_num = OSNumber::withNumber(part_id, 32); + + if (!id_num) dprintf("alloc failed"); + /* XXX Continue and try to decrement max_id if possible */ + + if (lockForArbitration(false) == false) { + dprintf("service is terminated"); + OSSafeReleaseNULL(id_num); + return; + } + + /* Decrementing highest part id */ + if (part_id == _max_id) { + /* First, decrement max */ + _max_id--; + /* no longer needed */ + OSSafeReleaseNULL(id_num); + + /* Now iterate down the hole list */ + while ((id_num = OSDynamicCast(OSNumber, + _holes->getLastObject()))) { + /* Only need to remove consecutive matches */ + if (id_num->unsigned32BitValue() != (_max_id)) { + break; + } + + /* Remove this num from hole list */ + id_num->retain(); + _holes->removeObject(id_num); + OSSafeReleaseNULL(id_num); + /* Decrement max */ + _max_id--; + } + /* Creating a new 'hole' in the ID namespace */ + } else { + /* Better have been able to allocate OSNum */ + if (!id_num) { + unlockForArbitration(); +#ifdef DEBUG + panic("ZFSDatasetScheme %s failed to return partID", + __func__); +#endif + return; + } + + /* + * OSOrderedSet only enforces ordering when + * using setObject(anObject) interface. + * Therefore _holes must not use setFirstObject, + * setLastObject, setObject(index, anObject) + */ + + /* Add a new OSNum to hole list */ + _holes->setObject(id_num); + OSSafeReleaseNULL(id_num); + } + + unlockForArbitration(); +} + +bool +ZFSDatasetScheme::addDataset(const char *osname) +{ + ZFSDataset *dataset; + OSObject *obj; + OSNumber *sizeNum; + char location[24]; + uint64_t size; + uint32_t part_id; + + obj = copyProperty(kZFSPoolSizeKey, gIOServicePlane, + kIORegistryIterateRecursively|kIORegistryIterateParents); + if (!obj) { + dprintf("missing pool size"); + return (false); + } + sizeNum = OSDynamicCast(OSNumber, obj); + if (!sizeNum) { + dprintf("invalid pool size"); + return (false); + } + size = sizeNum->unsigned64BitValue(); + sizeNum = 0; + OSSafeReleaseNULL(obj); + + part_id = getNextPartitionID(); + /* Only using non-zero partition ids */ + if (part_id == 0) { + dprintf("invalid partition ID"); + return (false); + } + snprintf(location, sizeof (location), "%u", part_id); + +#if 0 + OSString *locationStr; + locationStr = OSString::withCString(location); + if (!locationStr) { + dprintf("location string alloc failed"); + return (false); + } + OSSafeReleaseNULL(locationStr); +#endif + + dataset = ZFSDataset::withDatasetNameAndSize(osname, size); + if (!dataset) { + dprintf("couldn't add %s", osname); + return (false); + } + + /* Set location in plane and partiton ID property */ + dataset->setLocation(location); +#ifdef kIOMediaBaseKey + dataset->setProperty(kIOMediaBaseKey, 0ULL, 64); +#endif + dataset->setProperty(kIOMediaPartitionIDKey, part_id, 32); + + // This sets the "diskutil list -> TYPE" field + dataset->setProperty("Content", "ZFS Dataset"); + // This matches with Info.plist, so it calls zfs.util for NAME + dataset->setProperty("Content Hint", + "6A898CC3-1DD2-11B2-99A6-080020736631"); + + if (dataset->attach(this) == false) { + dprintf("attach failed"); + OSSafeReleaseNULL(dataset); + 
return (false); + } + + if (dataset->start(this) == false) { + dprintf("start failed"); + dataset->detach(this); + OSSafeReleaseNULL(dataset); + return (false); + } + + /* Protect the OSSet by taking IOService lock */ + lockForArbitration(); + _datasets->setObject(dataset); + unlockForArbitration(); + + // dataset->registerService(kIOServiceAsynchronous); + dataset->registerService(kIOServiceSynchronous); + + /* Adding to OSSet takes a retain */ + OSSafeReleaseNULL(dataset); + + return (true); +} + +bool +ZFSDatasetScheme::removeDataset(const char *osname, bool force) +{ + OSCollectionIterator *iter; + ZFSDataset *dataset = NULL; + OSNumber *partNum; + uint32_t part_id = 0; + bool locked; + + if ((locked = lockForArbitration(false)) == false) { + dprintf("couldn't lock terminated service"); + } + + iter = OSCollectionIterator::withCollection(_datasets); + if (!iter) { + dprintf("couldn't get dataset iterator"); + return (false); + } + + while ((dataset = OSDynamicCast(ZFSDataset, + iter->getNextObject())) != NULL) { + OSObject *property; + OSString *str; + + property = dataset->getProperty(kZFSDatasetNameKey); + if (!property) continue; + + str = OSDynamicCast(OSString, property); + if (!str) continue; + + if (str->isEqualTo(osname)) { + _datasets->removeObject(dataset); + break; + } + } + + if (!dataset) { + dprintf("couldn't get dataset"); + iter->release(); + return (false); + } + + dataset->retain(); + iter->release(); + iter = 0; + + if (locked) unlockForArbitration(); + + partNum = OSDynamicCast(OSNumber, + dataset->getProperty(kIOMediaPartitionIDKey)); + if (!partNum) { + dprintf("couldn't get partition number"); + } else { + part_id = partNum->unsigned32BitValue(); + } + + if (force) { + dataset->terminate(kIOServiceSynchronous| + kIOServiceRequired); + } else { + dataset->terminate(kIOServiceSynchronous); + } + + dataset->release(); + dataset = 0; + + /* Only return non-zero partition ids */ + if (part_id != 0) { + dprintf("terminated partition %u", part_id); + returnPartitionID(part_id); + } + + return (true); +} + +/* Compatibility shims */ +void +ZFSDatasetScheme::read(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOStorage::complete(completion, kIOReturnError, 0); +} + +void +ZFSDatasetScheme::write(IOService *client, + UInt64 byteStart, + IOMemoryDescriptor *buffer, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + IOStorage::complete(completion, kIOReturnError, 0); +} + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) +IOReturn +ZFSDatasetScheme::synchronize(IOService *client, + UInt64 byteStart, + UInt64 byteCount, + IOStorageSynchronizeOptions options) +#else +IOReturn +ZFSDatasetScheme::synchronizeCache(IOService *client) +#endif +{ + return (kIOReturnUnsupported); +} + +IOReturn +ZFSDatasetScheme::unmap(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + IOStorageUnmapOptions options) +#else + UInt32 options) +#endif +{ + return (kIOReturnUnsupported); +} + +bool +ZFSDatasetScheme::lockPhysicalExtents(IOService *client) +{ + return (false); +} + +IOStorage * +ZFSDatasetScheme::copyPhysicalExtent(IOService *client, + UInt64 * byteStart, + UInt64 * byteCount) +{ + return (NULL); +} + +void +ZFSDatasetScheme::unlockPhysicalExtents(IOService *client) +{ +} + +#if 
defined(MAC_OS_X_VERSION_10_10) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_10) +IOReturn +ZFSDatasetScheme::setPriority(IOService *client, + IOStorageExtent *extents, + UInt32 extentsCount, + IOStoragePriority priority) +{ + return (kIOReturnUnsupported); +} +#endif diff --git a/module/os/macos/zfs/ZFSPool.cpp b/module/os/macos/zfs/ZFSPool.cpp new file mode 100644 index 0000000000..9d62b204f6 --- /dev/null +++ b/module/os/macos/zfs/ZFSPool.cpp @@ -0,0 +1,881 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Evan Susarret. All rights reserved. + */ + +#include +#include +#include +#include +#include + +extern "C" { +#include +#include +#include +} /* extern "C" */ + +#include + +#if defined(DEBUG) || defined(ZFS_DEBUG) +#ifdef dprintf +#undef dprintf +#endif +#define dprintf(fmt, ...) do { \ + printf("ZFSPool %s " fmt "\n", __func__, ##__VA_ARGS__); \ +_NOTE(CONSTCOND) } while (0) +#else +#ifndef dprintf +#define dprintf(fmt, ...) do { } while (0); +#endif +#endif /* if DEBUG or ZFS_DEBUG */ + +#define DPRINTF_FUNC() do { dprintf("%s\n", __func__); } while (0); + +#if 0 +/* block size is 512 B, count is 512 M blocks */ +#define ZFS_POOL_DEV_BSIZE (UInt64)(1<<9) +#define ZFS_POOL_DEV_BCOUNT (UInt64)(2<<29) +#endif + +/* + * Returns handle to ZFS IOService, with a retain count. 
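+ * The service is located by class name ("net_lundman_zfs_zvol") via
+ * IOService::serviceMatching()/copyMatchingService(), so the caller is
+ * responsible for dropping the retain (e.g. with OSSafeReleaseNULL())
+ * once done with the handle.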
+ */ +static IOService * +copy_zfs_handle() +{ + /* Get the ZFS service handle the 'hard way' */ + OSDictionary *matching; + IOService *service = 0; + + matching = IOService::serviceMatching("net_lundman_zfs_zvol"); + if (matching) { + service = IOService::copyMatchingService(matching); + OSSafeReleaseNULL(matching); + } + + if (!service) { + dprintf("couldn't get zfs IOService"); + return (NULL); + } + + return (service); +#if 0 + /* Got service, make sure it casts */ + zfs_hl = OSDynamicCast(net_lundman_zfs_zvol, service); + if (zfs_hl == NULL) { + dprintf("couldn't get zfs_hl"); + /* Drop retain from copyMatchingService */ + OSSafeReleaseNULL(service); + return (NULL); + } + + return (zfs_hl); +#endif +} + +OSDefineMetaClassAndStructors(ZFSPool, IOService); + +#if 0 +bool +ZFSPool::open(IOService *client, IOOptionBits options, void *arg) +{ + bool ret; + + IOLog("ZFSPool %s\n", __func__); + + ret = IOService::open(client, options, arg); + + IOLog("ZFSPool %s ret %d\n", __func__, ret); + + return (ret); +} + +bool +ZFSPool::isOpen(const IOService *forClient) const +{ + IOLog("ZFSPool %s\n", __func__); + return (false); +} + +void +ZFSPool::close(IOService *client, IOOptionBits options) +{ + IOLog("ZFSPool %s\n", __func__); + IOService::close(client, options); +} +#endif + +bool +ZFSPool::handleOpen(IOService *client, + IOOptionBits options, void *arg) +{ + bool ret = true; + + dprintf(""); + // IOLog("ZFSPool %s\n", __func__); + + /* XXX IOService open() locks for arbitration around handleOpen */ + // lockForArbitration(); + _openClients->setObject(client); + ret = _openClients->containsObject(client); + // unlockForArbitration(); + + return (ret); +// return (IOService::handleOpen(client, options, NULL)); +} + +bool +ZFSPool::handleIsOpen(const IOService *client) const +{ + bool ret; + + dprintf(""); + // IOLog("ZFSPool %s\n", __func__); + + /* XXX IOService isOpen() locks for arbitration around handleIsOpen */ + // lockForArbitration(); + ret = _openClients->containsObject(client); + // unlockForArbitration(); + + return (ret); +// return (IOService::handleIsOpen(client)); +} + +void +ZFSPool::handleClose(IOService *client, + IOOptionBits options) +{ + dprintf(""); + // IOLog("ZFSPool %s\n", __func__); + + /* XXX IOService close() locks for arbitration around handleClose */ + // lockForArbitration(); + if (_openClients->containsObject(client) == false) { + dprintf("not open"); + } + /* Remove client from set */ + _openClients->removeObject(client); + // unlockForArbitration(); + +// IOService::handleClose(client, options); +} + +#if 0 +/* XXX IOBlockStorageDevice */ +void +ZFSPool::read(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion) +{ + IOLog("ZFSPool %s\n", __func__); + IOStorage::complete(completion, kIOReturnError, 0); +} + +void +ZFSPool::write(IOService *client, UInt64 byteStart, + IOMemoryDescriptor *buffer, IOStorageAttributes *attr, + IOStorageCompletion *completion) +{ + IOLog("ZFSPool %s\n", __func__); + IOStorage::complete(completion, kIOReturnError, 0); +} +#endif + +bool +ZFSPool::setPoolName(const char *name) +{ +/* Assign dataset name from null-terminated string */ + OSString *dsstr; + // const OSSymbol *dsstr; +#if 0 + OSDictionary *dict; + char *newname, *oldname; +#else + char *newname; +#endif + size_t len; + + DPRINTF_FUNC(); + + /* Validate arguments */ + if (!name || (len = strnlen(name, + ZFS_MAX_DATASET_NAME_LEN)) == 0) { + dprintf("missing argument"); + return (false); + } + + /* 
Truncate too-long names (shouldn't happen) */ + if (len == ZFS_MAX_DATASET_NAME_LEN && + name[ZFS_MAX_DATASET_NAME_LEN] != '\0') { + dprintf("name too long [%s]", name); + /* XXX Just truncate the name */ + len--; + } + + /* Allocate room for name plus null char */ + newname = (char *)kmem_alloc(len+1, KM_SLEEP); + if (!newname) { + dprintf("string alloc failed"); + return (false); + } + snprintf(newname, len+1, "%s", name); + newname[len] = '\0'; /* just in case */ + + /* Save an OSString copy for IORegistry */ + dsstr = OSString::withCString(newname); + // dsstr = OSSymbol::withCString(newname); + + kmem_free(newname, len+1); + + if (!dsstr) { + dprintf("OSString failed"); + return (false); + } + +#if 0 + /* Swap into class private var */ + oldname = (char *)productString; + productString = newname; + newname = 0; + if (oldname) { + kmem_free(oldname, strlen(oldname)+1); + oldname = 0; + } + + /* Get and clone device characteristics prop dict */ + if ((dict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey))) == NULL || + (dict = OSDictionary::withDictionary(dict)) == NULL) { + dprintf("couldn't clone prop dict"); + /* Should only happen during initialization */ + } + + if (dict) { + /* Copy string, add to dictionary, and replace prop dict */ + if (dict->setObject(kIOPropertyProductNameKey, + dsstr) == false || + setProperty(kIOPropertyDeviceCharacteristicsKey, + dict) == false) { + dprintf("couldn't set name"); + OSSafeReleaseNULL(dsstr); + OSSafeReleaseNULL(dict); + return (false); + } + OSSafeReleaseNULL(dict); + } +#endif + + /* Set Pool name IOregistry property */ + setProperty(kZFSPoolNameKey, dsstr); + + /* Finally, set the IORegistryEntry/IOService name */ + setName(dsstr->getCStringNoCopy()); + OSSafeReleaseNULL(dsstr); + + return (true); +} + +bool +ZFSPool::init(OSDictionary *properties, spa_t *spa) +{ +#if 0 + /* Allocate dictionaries and symbols */ + OSDictionary *pdict = OSDictionary::withCapacity(2); + OSDictionary *ddict = OSDictionary::withCapacity(4); + const OSSymbol *virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + const OSSymbol *locationSymbol = OSSymbol::withCString( + kIOPropertyInternalExternalKey); + const OSSymbol *ssdSymbol = OSSymbol::withCString( + kIOPropertyMediumTypeSolidStateKey); + OSNumber *physSize = NULL, *logSize = NULL; + const OSSymbol *vendorSymbol = 0; + const OSSymbol *revisionSymbol = 0; + const OSSymbol *blankSymbol = 0; + OSBoolean *rdonly = 0; + UInt64 phys_bsize, log_bsize; + OSString *str = 0; + const char *cstr = 0; +#endif + uint64_t space; + bool ret = false; + + DPRINTF_FUNC(); + +#if 0 + physSize = OSNumber::withNumber((uint32_t)ZFS_POOL_DEV_BSIZE, 32); + logSize = OSNumber::withNumber((uint32_t)ZFS_POOL_DEV_BSIZE, 32); +#endif + if (!spa) { + dprintf("missing spa"); + goto error; + } + +#if 0 + /* Get physical and logical size from spa */ + phys_bsize = (1ULL<spa_max_ashift); + log_bsize = (1ULL<spa_min_ashift); +#endif + +#if 0 + /* Workaround glitchy behavior with large bsize in xnu */ + if (log_bsize > 8192) log_bsize = 8192; +#endif + +#if 0 + /* XXX Shouldn't be possible */ + if (log_bsize == 0) log_bsize = DEV_BSIZE; + + physSize = OSNumber::withNumber((uint32_t)phys_bsize, 32); + logSize = OSNumber::withNumber((uint32_t)log_bsize, 32); + + /* Validate allocations */ + if (!pdict || !ddict || !virtualSymbol || !locationSymbol || + !ssdSymbol || !physSize || !logSize) { + dprintf("allocation failed"); + goto error; + } +#endif + + /* Need an OSSet for open 
clients */ + _openClients = OSSet::withCapacity(1); + if (_openClients == NULL) { + dprintf("client OSSet failed"); + goto error; + } + + /* Set spa pointer and this Pool object's name to match */ + if (!spa) { + dprintf("missing spa"); + goto error; + } + _spa = spa; + // setName(spa_name(spa)); + +#if 0 + /* Init class statics every time an instance inits */ + /* Shared across instances, but doesn't hurt to reprint */ + if (vendorString == NULL) { + char *string; + int len = strlen("zpool")+1; + string = (char *)kmem_alloc(len, KM_SLEEP); + if (!string) goto error; + snprintf(string, len, "zpool"); + vendorString = string; + } + + if (revisionString == NULL) { + char *string; + int len = strlen("0.1")+1; + string = (char *)kmem_alloc(len, KM_SLEEP); + if (!string) goto error; + snprintf(string, len, "0.1"); + revisionString = string; + } + + if (revisionString == NULL) { + char *string; + int len = strlen("ZFS Pool")+1; + string = (char *)kmem_alloc(len, KM_SLEEP); + if (!string) goto error; + snprintf(string, len, "ZFS pool"); + infoString = string; + } + + /* For IORegistry keys, cache OSSymbols for class statics */ + /* Leverages OSSymbol cahce pool to reuse across instances */ + vendorSymbol = OSSymbol::withCString(vendorString); + revisionSymbol = OSSymbol::withCString(revisionString); + blankSymbol = OSSymbol::withCString(""); + if (!vendorSymbol || !revisionSymbol || !blankSymbol) { + dprintf("class symbols failed"); + goto error; + } +#endif + + /* Call super init */ + if (IOService::init(properties) == false) { + dprintf("device init failed"); + goto error; + } + +#if 0 + /* Set class private vars */ + productString = NULL; + isReadOnly = false; // XXX should really be true initially + + /* Set Protocol Characteristics */ + if (pdict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + locationSymbol) == false || + pdict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol) == false) { + dprintf("pdict set properties failed"); + goto error; + } + setProperty(kIOPropertyProtocolCharacteristicsKey, pdict); + + /* Set Device Characteristics */ + if (ddict->setObject(kIOPropertyVendorNameKey, + vendorSymbol) == false || + ddict->setObject(kIOPropertyProductRevisionLevelKey, + revisionSymbol) == false || + ddict->setObject(kIOPropertyProductSerialNumberKey, + blankSymbol) == false || + ddict->setObject(kIOPropertyPhysicalBlockSizeKey, + physSize) == false || + ddict->setObject(kIOPropertyLogicalBlockSizeKey, + logSize) == false || + ddict->setObject(kIOPropertyMediumTypeKey, + ssdSymbol) == false) { + dprintf("ddict set properties failed"); + goto error; + } + setProperty(kIOPropertyDeviceCharacteristicsKey, ddict); + + /* Check for passed in readonly status */ + if (properties && (rdonly = OSDynamicCast(OSBoolean, + properties->getObject(kZFSPoolReadOnlyKey))) != NULL) { + /* Got the boolean */ + isReadOnly = rdonly->getValue(); + dprintf("set %s", (isReadOnly ? 
"readonly" : "readwrite")); + } + + /* Check for passed in pool GUID */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(kZFSPoolGUIDKey))) != NULL) { + /* Got the string, try to set GUID */ + str->retain(); + if (ddict->setObject(kZFSPoolGUIDKey, str) == false) { + dprintf("couldn't set GUID"); + OSSafeReleaseNULL(str); + goto error; + } +#ifdef DEBUG + cstr = str->getCStringNoCopy(); + dprintf("set GUID"); + cstr = 0; +#endif + OSSafeReleaseNULL(str); + } +#endif + + if (setPoolName(spa_name(spa)) == false) { + dprintf("setPoolName failed"); + goto error; + } + + space = spa_get_dspace(spa); +dprintf("space %llu", space); + setProperty(kZFSPoolSizeKey, space, 64); + +#if 0 + /* Check for passed in pool name */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(kZFSPoolNameKey))) != NULL && + (cstr = str->getCStringNoCopy()) != NULL) { + /* Got the string, try to set name */ + str->retain(); + if (setPoolName(cstr) == false) { + /* Unlikely */ + dprintf("couldn't setup pool" + " name property [%s]", cstr); + OSSafeReleaseNULL(str); + goto error; + } + + dprintf("set pool name [%s]", cstr); + OSSafeReleaseNULL(str); + } else { + if (setPoolName("invalid") == false) { + dprintf("setPoolName failed"); + goto error; + } + dprintf("set name [invalid]"); + } +#endif + + /* Success */ + ret = true; + +error: +#if 0 + /* All of these will be released on error */ + OSSafeReleaseNULL(pdict); + OSSafeReleaseNULL(ddict); + OSSafeReleaseNULL(virtualSymbol); + OSSafeReleaseNULL(locationSymbol); + OSSafeReleaseNULL(ssdSymbol); + OSSafeReleaseNULL(physSize); + OSSafeReleaseNULL(logSize); + OSSafeReleaseNULL(vendorSymbol); + OSSafeReleaseNULL(revisionSymbol); + OSSafeReleaseNULL(blankSymbol); + OSSafeReleaseNULL(str); +#endif + return (ret); +} + +void +ZFSPool::free() +{ + OSSet *oldSet; +#if 0 + char *pstring; +#endif + + if (_openClients) { + oldSet = _openClients; + _openClients = 0; + OSSafeReleaseNULL(oldSet); + } + _spa = 0; + +#if 0 + pstring = (char *)productString; + productString = 0; + if (pstring) kmem_free(pstring, strlen(pstring) + 1); +#endif + + IOService::free(); +} + +extern "C" { + +void +spa_iokit_pool_proxy_destroy(spa_t *spa) +{ + ZFSPool *proxy; + spa_iokit_t *wrapper; + + if (!spa) { + printf("missing spa"); + return; + } + + /* Get pool proxy */ + wrapper = spa->spa_iokit_proxy; + spa->spa_iokit_proxy = NULL; + + if (wrapper == NULL) { + printf("missing spa_iokit_proxy"); + return; + } + + proxy = wrapper->proxy; + + /* Free the struct */ + kmem_free(wrapper, sizeof (spa_iokit_t)); + if (!proxy) { + printf("missing proxy"); + return; + } + + if (proxy->terminate(kIOServiceSynchronous| + kIOServiceRequired) == false) { + dprintf("terminate failed"); + } + proxy->release(); + + /* + * IOService *provider; + * provider = proxy->getProvider(); + * + * proxy->detach(provider); + * proxy->stop(provider); + * + * proxy->release(); + */ +} + +int +spa_iokit_pool_proxy_create(spa_t *spa) +{ + IOService *zfs_hl; + ZFSPool *proxy; + spa_iokit_t *wrapper; + + if (!spa) { + dprintf("missing spa"); + return (EINVAL); + } + + /* Allocate C struct */ + if ((wrapper = (spa_iokit_t *)kmem_alloc(sizeof (spa_iokit_t), + KM_SLEEP)) == NULL) { + dprintf("couldn't allocate wrapper"); + return (ENOMEM); + } + + /* Get ZFS IOService */ + if ((zfs_hl = copy_zfs_handle()) == NULL) { + dprintf("couldn't get ZFS handle"); + kmem_free(wrapper, sizeof (spa_iokit_t)); + return (ENODEV); + } + + /* Allocate and init ZFS pool proxy */ + proxy = 
ZFSPool::withProviderAndPool(zfs_hl, spa); + if (!proxy) { + dprintf("Pool proxy creation failed"); + kmem_free(wrapper, sizeof (spa_iokit_t)); + OSSafeReleaseNULL(zfs_hl); + return (ENOMEM); + } + /* Drop retain from copy_zfs_handle */ + OSSafeReleaseNULL(zfs_hl); + + /* Set pool proxy */ + wrapper->proxy = proxy; + spa->spa_iokit_proxy = wrapper; + + return (0); +} + +} /* extern "C" */ + +ZFSPool * +ZFSPool::withProviderAndPool(IOService *zfs_hl, spa_t *spa) +{ + ZFSPool *proxy = new ZFSPool; + + if (!proxy) { + printf("allocation failed"); + return (0); + } + + if (proxy->init(0, spa) == false || + proxy->attach(zfs_hl) == false) { + printf("init/attach failed"); + OSSafeReleaseNULL(proxy); + return (0); + } + + if (proxy->start(zfs_hl) == false) { + printf("start failed"); + proxy->detach(zfs_hl); + OSSafeReleaseNULL(proxy); + return (0); + } + + /* Open zfs_hl, adding proxy to its open clients */ + // if (proxy->open(zfs_hl) == false) { + if (zfs_hl->open(proxy) == false) { + printf("open failed"); + proxy->stop(zfs_hl); + proxy->detach(zfs_hl); + OSSafeReleaseNULL(proxy); + return (0); + } + proxy->registerService(kIOServiceAsynchronous); + + return (proxy); +} + +#if 0 +/* XXX IOBlockStorageDevice */ +IOReturn +ZFSPool::doSynchronizeCache(void) +{ + dprintf(""); + return (kIOReturnSuccess); +} + +IOReturn +ZFSPool::doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + char zero[ZFS_POOL_DEV_BSIZE]; + size_t len, cur, off = 0; + + DPRINTF_FUNC(); + + if (!buffer) { + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* Read vs. write */ + if (buffer->getDirection() == kIODirectionIn) { + /* Zero the read buffer */ + bzero(zero, ZFS_POOL_DEV_BSIZE); + len = buffer->getLength(); + while (len > 0) { + cur = (len > ZFS_POOL_DEV_BSIZE ? + ZFS_POOL_DEV_BSIZE : len); + buffer->writeBytes(/* offset */ off, + /* buf */ zero, /* length */ cur); + off += cur; + len -= cur; + } + // dprintf("read: %llu %llu", block, nblks); + IOStorage::complete(completion, kIOReturnSuccess, + buffer->getLength()); + return (kIOReturnSuccess); + } + + if (buffer->getDirection() != kIODirectionOut) { + dprintf("invalid direction %d", buffer->getDirection()); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* + * XXX For now this just returns error for all writes. + * If it turns out that mountroot/bdevvp try to + * verify writable status by reading a block and writing + * it back to disk, lie and say it succeeded. + */ + dprintf("write: %llu %llu", block, nblks); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); +} + +IOReturn +ZFSPool::doEjectMedia() +{ + DPRINTF_FUNC(); + /* XXX Called at shutdown, maybe return success? 
*/ + return (kIOReturnError); +} + +IOReturn +ZFSPool::doFormatMedia(UInt64 byteCapacity) +{ + DPRINTF_FUNC(); + /* XXX shouldn't need it */ + return (kIOReturnError); + // return (kIOReturnSuccess); +} + +UInt32 +ZFSPool::doGetFormatCapacities(UInt64 *capacities, + UInt32 capacitiesMaxCount) const +{ + DPRINTF_FUNC(); + if (capacities && capacitiesMaxCount > 0) { + capacities[0] = (ZFS_POOL_DEV_BSIZE * ZFS_POOL_DEV_BCOUNT); + dprintf("capacity %llu", capacities[0]); + } + + /* Always inform caller of capacity count */ + return (1); +} + +/* Returns full pool name from instance private var */ +char * +ZFSPool::getProductString() +{ + if (productString) dprintf("[%s]", productString); + /* Return class private string */ + return ((char *)productString); +} + +/* Returns readonly status from instance private var */ +IOReturn +ZFSPool::reportWriteProtection(bool *isWriteProtected) +{ + DPRINTF_FUNC(); + if (isWriteProtected) *isWriteProtected = isReadOnly; + return (kIOReturnSuccess); +} + +/* These return class static string for all instances */ +char * +ZFSPool::getVendorString() +{ + dprintf("[%s]", vendorString); + /* Return class static string */ + return ((char *)vendorString); +} +char * +ZFSPool::getRevisionString() +{ + dprintf("[%s]", revisionString); + /* Return class static string */ + return ((char *)revisionString); +} +char * +ZFSPool::getAdditionalDeviceInfoString() +{ + dprintf("[%s]", infoString); + /* Return class static string */ + return ((char *)infoString); +} + +/* Always return media present and unchanged */ +IOReturn +ZFSPool::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + DPRINTF_FUNC(); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +/* Always report nonremovable and nonejectable */ +IOReturn +ZFSPool::reportRemovability(bool *isRemoveable) +{ + DPRINTF_FUNC(); + if (isRemoveable) *isRemoveable = false; + return (kIOReturnSuccess); +} +IOReturn +ZFSPool::reportEjectability(bool *isEjectable) +{ + DPRINTF_FUNC(); + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* Always report 512b blocksize */ +IOReturn +ZFSPool::reportBlockSize(UInt64 *blockSize) +{ + DPRINTF_FUNC(); + if (!blockSize) + return (kIOReturnError); + + *blockSize = ZFS_POOL_DEV_BSIZE; + return (kIOReturnSuccess); +} + +/* XXX Calculate from dev_bcount, should get size from objset */ +/* XXX Can issue message kIOMessageMediaParametersHaveChanged to update */ +IOReturn +ZFSPool::reportMaxValidBlock(UInt64 *maxBlock) +{ + DPRINTF_FUNC(); + if (!maxBlock) + return (kIOReturnError); + + // *maxBlock = 0; + *maxBlock = ZFS_POOL_DEV_BCOUNT - 1; + dprintf("maxBlock %llu", *maxBlock); + + return (kIOReturnSuccess); +} +#endif diff --git a/module/os/macos/zfs/abd_os.c b/module/os/macos/zfs/abd_os.c new file mode 100644 index 0000000000..4a702b1ab4 --- /dev/null +++ b/module/os/macos/zfs/abd_os.c @@ -0,0 +1,482 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2020 by Jorgen Lundman. 
All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled. + */ + +#include +#include +#include +#include +#include + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 4096; + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are + * just a single zero'd sized zfs_abd_chunk_size buffer. This + * allows us to conserve memory by only using a single zero buffer + * for the scatter chunks. + */ +abd_t *abd_zero_scatter = NULL; +static char *abd_zero_buf = NULL; + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +static size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + ABD_SCATTER(abd).abd_offset + abd->abd_size)); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size <= zfs_abd_chunk_size ? 
B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + size_t n = abd_scatter_chunkcnt(abd); + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + /* + * There is no scatter linear pages in FreeBSD so there is an + * if an error if the ABD has been marked as a linear page. + */ + VERIFY(!abd_is_linear_page(abd)); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + } +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + size_t n = abd_chunkcnt_for_bytes(size); + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; +} + +void +abd_free_chunks(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } +} + +abd_t * +abd_alloc_struct(size_t size) +{ + size_t chunkcnt = abd_chunkcnt_for_bytes(size); + size_t abd_size = offsetof(abd_t, + abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(MAX(abd_size, sizeof (abd_t)), KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + abd->abd_orig_size = MAX(abd_size, sizeof (abd_t)); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_gang_link)); + kmem_free(abd, MAX(size, sizeof(abd_t))); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where + * each chunk in the scatterlist will be set to abd_zero_buf. 
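+ *
+ * For example, with the default zfs_abd_chunk_size of 4096 bytes and
+ * SPA_MAXBLOCKSIZE (16 MiB), the zero ABD carries 4096 chunk pointers
+ * that all reference the same single zeroed 4 KiB buffer.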
+ */ +static void +abd_alloc_zero_scatter(void) +{ + size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + zfs_refcount_create(&abd_zero_scatter->abd_children); + + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_chunk_size = + zfs_abd_chunk_size; + + for (int i = 0; i < n; i++) { + ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = + abd_zero_buf; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size); +} + +static void +abd_free_zero_scatter(void) +{ + zfs_refcount_destroy(&abd_zero_scatter->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size); + + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; + kmem_free(abd_zero_buf, zfs_abd_chunk_size); +} + +void +abd_init(void) +{ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } + + abd_alloc_zero_scatter(); +} + +void +abd_fini(void) +{ + abd_free_zero_scatter(); + + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* + * FreeBSD does not have have scatter linear pages + * so there is an error. + */ + VERIFY(0); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * This is just a helper function to abd_get_offset_scatter() to alloc a + * scatter ABD using the calculated chunkcnt based on the offset within the + * parent ABD. 
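+ *
+ * Only the chunk-pointer array is sized here; abd_get_offset_scatter()
+ * then fills it in by copying the tail of the parent's chunk list and
+ * records the remaining intra-chunk offset in abd_offset.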
+ */ +static abd_t * +abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt) +{ + size_t abd_size = offsetof(abd_t, + abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(MAX(abd_size, sizeof (abd_t)), KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + abd->abd_orig_size = MAX(abd_size, sizeof (abd_t)); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +abd_t * +abd_get_offset_scatter(abd_t *sabd, size_t off) +{ + abd_t *abd = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&ABD_SCATTER(abd).abd_chunks, + &ABD_SCATTER(sabd).abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + + return (abd); +} + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
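+ *
+ * For a linear ABD the whole remaining buffer is mapped at once; for a
+ * scatter ABD the mapping never extends past the end of the current
+ * chunk.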
+ */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + ABD_SCATTER(aiter->iter_abd).abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ + kmem_cache_reap_now(abd_chunk_cache); +} diff --git a/module/os/macos/zfs/arc_os.c b/module/os/macos/zfs/arc_os.c new file mode 100644 index 0000000000..2bac6ee73a --- /dev/null +++ b/module/os/macos/zfs/arc_os.c @@ -0,0 +1,845 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +int64_t last_free_memory; +free_memory_reason_t last_free_reason; + +extern arc_stats_t arc_stats; + +static kmutex_t arc_reclaim_lock; +static kcondvar_t arc_reclaim_thread_cv; +static boolean_t arc_reclaim_thread_exit; +static kcondvar_t arc_reclaim_waiters_cv; + +/* + * log2(fraction of ARC which must be free to allow growing). + * I.e. 
If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. + * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. + */ +extern int arc_no_grow_shift; + + +/* + * Return a default max arc size based on the amount of physical memory. + */ +uint64_t +arc_default_max(uint64_t min, uint64_t allmem) +{ + /* Default to 1/3 of all memory. */ + return (MAX(allmem / 3, min)); +} + +#ifdef _KERNEL + +/* Remove these uses of _Atomic */ +static _Atomic boolean_t arc_reclaim_in_loop = B_FALSE; +static _Atomic int64_t reclaim_shrink_target = 0; + +/* + * Return maximum amount of memory that we could possibly use. Reduced + * to half of all memory in user space which is primarily used for testing. + */ +uint64_t +arc_all_memory(void) +{ + return (kmem_size()); +} + +/* + * Return the amount of memory that is considered free. In user space + * which is primarily used for testing we pretend that free memory ranges + * from 0-20% of all memory. + */ +uint64_t +arc_free_memory(void) +{ + int64_t avail; + + avail = spl_free_wrapper(); + return (avail >= 0LL ? avail : 0LL); +} + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. + */ +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + free_memory_reason_t r = FMR_UNKNOWN; + + /* + * memory logic is in spl_free_wrapper(), including absorption + * of pressure terms + */ + lowest = spl_free_wrapper(); + r = FMR_NEEDFREE; + if (spl_free_fast_pressure_wrapper() != FALSE) { + /* wake up arc_reclaim_thread() if it is sleeping */ + cv_signal(&arc_reclaim_thread_cv); + } + + last_free_memory = lowest; + last_free_reason = r; + + return (lowest); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + int64_t available_memory = spl_free_wrapper(); + int64_t freemem = available_memory / PAGESIZE; + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + + if (freemem > physmem * arc_lotsfree_percent / 100) { + page_load = 0; + return (0); + } + + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. 
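+	 *
+	 * On this port that state is approximated with the SPL pressure
+	 * hooks below: manual pressure wakes the reclaim thread, and once
+	 * page_load has built up under low memory the caller is throttled
+	 * with EAGAIN (or ERESTART).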
+ */ + + if (spl_free_manual_pressure_wrapper() != 0 && + arc_reclaim_in_loop == B_FALSE) { + cv_signal(&arc_reclaim_thread_cv); + kpreempt(KPREEMPT_SYNC); + page_load = 0; + } + + if (!spl_minimal_physmem_p() && page_load > 0) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + printf("ZFS: %s: !spl_minimal_physmem_p(), available_memory " + "== %lld, page_load = %llu, txg = %llu, reserve = %llu\n", + __func__, available_memory, page_load, txg, reserve); + if (arc_reclaim_in_loop == B_FALSE) + cv_signal(&arc_reclaim_thread_cv); + kpreempt(KPREEMPT_SYNC); + page_load = 0; + return (SET_ERROR(EAGAIN)); + } + + if (arc_reclaim_needed() && page_load > 0) { + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + printf("ZFS: %s: arc_reclaim_needed(), available_memory " + "== %lld, page_load = %llu, txg = %llu, reserve = %lld\n", + __func__, available_memory, page_load, txg, reserve); + if (arc_reclaim_in_loop == B_FALSE) + cv_signal(&arc_reclaim_thread_cv); + kpreempt(KPREEMPT_SYNC); + page_load = 0; + return (SET_ERROR(EAGAIN)); + } + + /* as with sun, assume we are reclaiming */ + if (available_memory <= 0 || page_load > available_memory / 4) { + return (SET_ERROR(ERESTART)); + } + + if (!spl_minimal_physmem_p()) { + page_load += reserve/8; + return (0); + } + + page_load = 0; + + return (0); +} + +int64_t +arc_shrink(int64_t to_free) +{ + int64_t shrank = 0; + int64_t arc_c_before = arc_c; + int64_t arc_adjust_evicted = 0; + + uint64_t asize = aggsum_value(&arc_size); + if (arc_c > arc_c_min) { + + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (asize < arc_c) + arc_c = MAX(asize, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + shrank = arc_c_before - arc_c; + + if (aggsum_value(&arc_size) > arc_c) { + zthr_wakeup(arc_adjust_zthr); + arc_adjust_evicted = 0; + } + return (shrank + arc_adjust_evicted); +} + + +/* + * arc.c has a arc_reap_zthr we should probably use, instead of + * having our own legacy arc_reclaim_thread(). + */ +static void arc_kmem_reap_now(void) +{ + + /* arc.c will do the heavy lifting */ + arc_kmem_reap_soon(); + + /* Now some OsX additionals */ + extern kmem_cache_t *abd_chunk_cache; + extern kmem_cache_t *znode_cache; + + kmem_cache_reap_now(abd_chunk_cache); + if (znode_cache) kmem_cache_reap_now(znode_cache); + + if (zio_arena_parent != NULL) { + /* + * Ask the vmem arena to reclaim unused memory from its + * quantum caches. + */ + vmem_qcache_reap(zio_arena_parent); + } +} + + + +/* + * Threads can block in arc_get_data_impl() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_impl() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). 
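+ *
+ * In this port the actual eviction is driven by arc_adjust_zthr, which
+ * the reclaim thread wakes via zthr_wakeup(); the reclaim thread itself
+ * does not take header hash locks directly.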
+ */ +static void +arc_reclaim_thread(void *unused) +{ + hrtime_t growtime = 0; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_lock); + while (!arc_reclaim_thread_exit) { + arc_reclaim_in_loop = B_TRUE; + uint64_t evicted = 0; + + mutex_exit(&arc_reclaim_lock); + + if (reclaim_shrink_target > 0) { + int64_t t = reclaim_shrink_target; + reclaim_shrink_target = 0; + evicted = arc_shrink(t); + extern kmem_cache_t *abd_chunk_cache; + kmem_cache_reap_now(abd_chunk_cache); + IOSleep(1); + goto lock_and_sleep; + } + + int64_t pre_adjust_free_memory = MIN(spl_free_wrapper(), + arc_available_memory()); + + int64_t manual_pressure = spl_free_manual_pressure_wrapper(); + spl_free_set_pressure(0); // clears both spl pressure variables + + /* + * We call arc_adjust() before (possibly) calling + * arc_kmem_reap_now(), so that we can wake up + * arc_get_data_impl() sooner. + */ + zthr_wakeup(arc_adjust_zthr); + + int64_t free_memory = arc_available_memory(); + + int64_t post_adjust_manual_pressure = + spl_free_manual_pressure_wrapper(); + manual_pressure = MAX(manual_pressure, + post_adjust_manual_pressure); + spl_free_set_pressure(0); + + int64_t post_adjust_free_memory = + MIN(spl_free_wrapper(), arc_available_memory()); + + // if arc_adjust() evicted, we expect post_adjust_free_memory + // to be larger than pre_adjust_free_memory (as there should + // be more free memory). + int64_t d_adj = post_adjust_free_memory - + pre_adjust_free_memory; + + if (manual_pressure > 0 && post_adjust_manual_pressure == 0) { + // pressure did not get re-signalled during arc_adjust() + if (d_adj >= 0) { + manual_pressure -= MIN(evicted, d_adj); + } else { + manual_pressure -= evicted; + } + } else if (evicted > 0 && manual_pressure > 0 && + post_adjust_manual_pressure > 0) { + // otherwise use the most recent pressure value + manual_pressure = post_adjust_manual_pressure; + } + + free_memory = post_adjust_free_memory; + + if (free_memory >= 0 && manual_pressure <= 0 && evicted > 0) { + extern kmem_cache_t *abd_chunk_cache; + kmem_cache_reap_now(abd_chunk_cache); + } + + if (free_memory < 0 || manual_pressure > 0) { + + if (free_memory <= + (arc_c >> arc_no_grow_shift) + SPA_MAXBLOCKSIZE) { + arc_no_grow = B_TRUE; + + /* + * Absorb occasional low memory conditions, as they + * may be caused by a single sequentially writing thread + * pushing a lot of dirty data into the ARC. + * + * In particular, we want to quickly + * begin re-growing the ARC if we are + * not in chronic high pressure. + * However, if we're in chronic high + * pressure, we want to reduce reclaim + * thread work by keeping arc_no_grow set. + * + * If growtime is in the past, then set it to last + * half a second (which is the length of the + * cv_timedwait_hires() call below; if this works, + * that value should be a parameter, #defined or constified. + * + * If growtime is in the future, then make sure that it + * is no further than 60 seconds into the future. + * If it's in the nearer future, then grow growtime by + * an exponentially increasing value starting with 500msec. + * + */ + const hrtime_t curtime = gethrtime(); + const hrtime_t agr = SEC2NSEC(arc_grow_retry); + static int grow_pass = 0; + + if (growtime == 0) { + growtime = curtime + MSEC2NSEC(500); + grow_pass = 0; + } else { + // check for 500ms not being enough + ASSERT3U(growtime, >, curtime); + if (growtime <= curtime) + growtime = curtime + + MSEC2NSEC(500); + + // growtime is in the future! 
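+				// Back off exponentially: extend growtime by
+				// 500 ms, 1 s, 2 s, ... (grow_pass doubles the
+				// step), capping each step at half of
+				// arc_grow_retry and the total at roughly
+				// arc_grow_retry seconds ahead.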
+ const hrtime_t difference = + growtime - curtime; + + if (difference >= agr) { + // cap arc_grow_retry secs now + growtime = curtime + agr - 1LL; + grow_pass = 0; + } else { + hrtime_t grow_by = + MSEC2NSEC(500) * + (1LL << grow_pass); + + if (grow_by > (agr >> 1)) + grow_by = agr >> 1; + + growtime += grow_by; + + // add 512 seconds maximum + if (grow_pass < 10) + grow_pass++; + } + } + } + + arc_warm = B_TRUE; + + arc_kmem_reap_now(); + + /* + * If we are still low on memory, shrink the ARC + * so that we have arc_shrink_min free space. + */ + free_memory = arc_available_memory(); + + static int64_t old_to_free = 0; + + int64_t to_free = + (arc_c >> arc_shrink_shift) - free_memory; + + if (to_free > 0 || manual_pressure != 0) { + // 2 * SPA_MAXBLOCKSIZE + const int64_t large_amount = + 32LL * 1024LL * 1024LL; + const int64_t huge_amount = + 128LL * 1024LL * 1024LL; + + if (to_free > large_amount || + evicted > huge_amount) + printf("SPL: %s: post-reap %lld " + "post-evict %lld adjusted %lld " + "pre-adjust %lld to-free %lld" + " pressure %lld\n", + __func__, free_memory, d_adj, + evicted, pre_adjust_free_memory, + to_free, manual_pressure); + to_free = MAX(to_free, manual_pressure); + + int64_t old_arc_size = + (int64_t)aggsum_value(&arc_size); + (void) arc_shrink(to_free); + int64_t new_arc_size = + (int64_t)aggsum_value(&arc_size); + int64_t arc_shrink_freed = + old_arc_size - new_arc_size; + int64_t left_to_free = + to_free - arc_shrink_freed; + if (left_to_free <= 0) { + if (arc_shrink_freed > large_amount) { + printf("ZFS: %s, arc_shrink " + "freed %lld, zeroing " + "old_to_free from %lld\n", + __func__, arc_shrink_freed, + old_to_free); + } + old_to_free = 0; + } else if (arc_shrink_freed > 2LL * + (int64_t)SPA_MAXBLOCKSIZE) { + printf("ZFS: %s, arc_shrink freed " + "%lld, setting old_to_free to " + "%lld from %lld\n", + __func__, arc_shrink_freed, + left_to_free, old_to_free); + old_to_free = left_to_free; + } else { + old_to_free = left_to_free; + } + + // If we have reduced ARC by a lot before + // this point, try to give memory back to + // lower arenas (and possibly xnu). + + int64_t total_freed = + arc_shrink_freed + evicted; + if (total_freed >= huge_amount) { + if (zio_arena_parent != NULL) + vmem_qcache_reap( + zio_arena_parent); + } + if (arc_shrink_freed > 0) + evicted += arc_shrink_freed; + } else if (old_to_free > 0) { + printf("ZFS: %s, (old_)to_free has " + "returned to zero from %lld\n", + __func__, old_to_free); + old_to_free = 0; + } + + } else if (free_memory < (arc_c >> arc_no_grow_shift) && + aggsum_value(&arc_size) > + arc_c_min + SPA_MAXBLOCKSIZE) { + // relatively low memory and arc is above arc_c_min + arc_no_grow = B_TRUE; + growtime = gethrtime() + SEC2NSEC(1); + } + + if (growtime > 0 && gethrtime() >= growtime) { + if (arc_no_grow == B_TRUE) + dprintf("ZFS: arc growtime expired\n"); + growtime = 0; + arc_no_grow = B_FALSE; + } + +lock_and_sleep: + + mutex_enter(&arc_reclaim_lock); + + /* + * If evicted is zero, we couldn't evict anything via + * arc_adjust(). This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. + */ + if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) { + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. 
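+			 * Waiters are either all released (cv_broadcast) or
+			 * released one at a time (cv_signal) in the branches
+			 * below, depending on how much was actually evicted.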
+ */ + cv_broadcast(&arc_reclaim_waiters_cv); + + arc_reclaim_in_loop = B_FALSE; + /* + * Block until signaled, or after one second (we + * might need to perform arc_kmem_reap_now() + * even if we aren't being signalled) + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&arc_reclaim_thread_cv, + &arc_reclaim_lock, MSEC2NSEC(500), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); + + } else if (evicted >= SPA_MAXBLOCKSIZE * 3) { + // we evicted plenty of buffers, so let's wake up + // all the waiters rather than having them stall + cv_broadcast(&arc_reclaim_waiters_cv); + } else { + // we evicted some buffers but are still overflowing, + // so wake up only one waiter + cv_signal(&arc_reclaim_waiters_cv); + } + } + + arc_reclaim_thread_exit = B_FALSE; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + thread_exit(); +} + +/* This is called before arc is initialized, and threads are not running */ +void +arc_lowmem_init(void) +{ +} + +/* This is called after arc is initialized, and thread are running */ +void +arc_os_init(void) +{ + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + + arc_reclaim_thread_exit = B_FALSE; + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + + arc_warm = B_FALSE; + +} + +void +arc_lowmem_fini(void) +{ +} + +void +arc_os_fini(void) +{ + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = B_TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * B_FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); + + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); +} + +/* + * Uses ARC static variables in logic. + */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +/* max size for dnodes */ +#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ + +/* So close, they made arc_min_prefetch_ms be static, but no others */ + +int +arc_kstat_update_osx(kstat_t *ksp, int rw) +{ + osx_kstat_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + /* Did we change the value ? 
*/ + if (ks->arc_zfs_arc_max.value.ui64 != zfs_arc_max) { + + /* Assign new value */ + zfs_arc_max = ks->arc_zfs_arc_max.value.ui64; + + /* Update ARC with new value */ + if (zfs_arc_max > 64<<20 && zfs_arc_max < + physmem * PAGESIZE) + arc_c_max = zfs_arc_max; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* If meta_limit is not set, adjust it automatically */ + if (!zfs_arc_meta_limit) + arc_meta_limit = arc_c_max / 4; + } + + if (ks->arc_zfs_arc_min.value.ui64 != zfs_arc_min) { + zfs_arc_min = ks->arc_zfs_arc_min.value.ui64; + if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) { + arc_c_min = zfs_arc_min; + printf("ZFS: set arc_c_min %llu, arc_meta_min " + "%llu, zfs_arc_meta_min %llu\n", + arc_c_min, arc_meta_min, zfs_arc_meta_min); + if (arc_c < arc_c_min) { + printf("ZFS: raise arc_c %llu to " + "arc_c_min %llu\n", arc_c, + arc_c_min); + arc_c = arc_c_min; + if (arc_p < (arc_c >> 1)) { + printf("ZFS: raise arc_p %llu " + "to %llu\n", + arc_p, (arc_c >> 1)); + arc_p = (arc_c >> 1); + } + } + } + } + + if (ks->arc_zfs_arc_meta_limit.value.ui64 != + zfs_arc_meta_limit) { + zfs_arc_meta_limit = + ks->arc_zfs_arc_meta_limit.value.ui64; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && + zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && + zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + + printf("ZFS: set arc_meta_limit %llu, arc_c_min %llu," + "zfs_arc_meta_limit %lu\n", + arc_meta_limit, arc_c_min, zfs_arc_meta_limit); + } + + if (ks->arc_zfs_arc_meta_min.value.ui64 != zfs_arc_meta_min) { + zfs_arc_meta_min = ks->arc_zfs_arc_meta_min.value.ui64; + if (zfs_arc_meta_min >= arc_c_min) { + printf("ZFS: probable error, zfs_arc_meta_min " + "%llu >= arc_c_min %llu\n", + zfs_arc_meta_min, arc_c_min); + } + if (zfs_arc_meta_min > 0 && + zfs_arc_meta_min <= arc_meta_limit) + arc_meta_min = zfs_arc_meta_min; + printf("ZFS: set arc_meta_min %llu\n", arc_meta_min); + } + + zfs_arc_grow_retry = ks->arc_zfs_arc_grow_retry.value.ui64; + arc_grow_retry = zfs_arc_grow_retry; + zfs_arc_shrink_shift = ks->arc_zfs_arc_shrink_shift.value.ui64; + zfs_arc_p_min_shift = ks->arc_zfs_arc_p_min_shift.value.ui64; + zfs_arc_average_blocksize = + ks->arc_zfs_arc_average_blocksize.value.ui64; + + } else { + + ks->arc_zfs_arc_max.value.ui64 = zfs_arc_max; + ks->arc_zfs_arc_min.value.ui64 = zfs_arc_min; + + ks->arc_zfs_arc_meta_limit.value.ui64 = zfs_arc_meta_limit; + ks->arc_zfs_arc_meta_min.value.ui64 = zfs_arc_meta_min; + + ks->arc_zfs_arc_grow_retry.value.ui64 = + zfs_arc_grow_retry ? zfs_arc_grow_retry : arc_grow_retry; + ks->arc_zfs_arc_shrink_shift.value.ui64 = zfs_arc_shrink_shift; + ks->arc_zfs_arc_p_min_shift.value.ui64 = zfs_arc_p_min_shift; + ks->arc_zfs_arc_average_blocksize.value.ui64 = + zfs_arc_average_blocksize; + } + return (0); +} + +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffered they reference. This provides a mechanism to ensure the ARC can + * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. 
This + * is analogous to dnlc_reduce_cache() but more generic. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +void +arc_prune_async(int64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + +#else /* KERNEL */ + +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + free_memory_reason_t r = FMR_UNKNOWN; + + /* Every 100 calls, free a small amount */ + if (spa_get_random(100) == 0) + lowest = -1024; + + last_free_memory = lowest; + last_free_reason = r; + + return (lowest); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + return (0); +} + +uint64_t +arc_all_memory(void) +{ + return (ptob(physmem) / 2); +} + +uint64_t +arc_free_memory(void) +{ + return (spa_get_random(arc_all_memory() * 20 / 100)); +} + +#endif /* KERNEL */ diff --git a/module/os/macos/zfs/ldi_iokit.cpp b/module/os/macos/zfs/ldi_iokit.cpp new file mode 100644 index 0000000000..bae0060efb --- /dev/null +++ b/module/os/macos/zfs/ldi_iokit.cpp @@ -0,0 +1,2000 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +/* + * Apple IOKit (c++) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS internal + */ +#include + +/* + * LDI Includes + */ +#include + +/* Debug prints */ +#if defined(DEBUG) || defined(ZFS_DEBUG) + +#ifdef dprintf +#undef dprintf +#endif + +#define dprintf(fmt, ...) do { \ + IOLog(fmt, __VA_ARGS__); \ +_NOTE(CONSTCOND) } while (0) +#endif + +/* Attach created IOService objects to the IORegistry under ZFS. 
*/ +// #define LDI_IOREGISTRY_ATTACH + +/* + * Globals + */ +static IOService *ldi_zfs_handle; + +/* Exposed to c callers */ +extern "C" { + +struct _handle_iokit { + IOMedia *media; + IOService *client; +}; /* 16b */ + +struct _handle_notifier { + IONotifier *obj; +}; /* 8b */ + +#define LH_MEDIA(lhp) lhp->lh_tsd.iokit_tsd->media +#define LH_CLIENT(lhp) lhp->lh_tsd.iokit_tsd->client +#define LH_NOTIFIER(lhp) lhp->lh_notifier->obj + +void +handle_free_iokit(struct ldi_handle *lhp) { + if (!lhp) { + dprintf("%s missing lhp\n", __func__); + return; + } + + if (!lhp->lh_tsd.iokit_tsd) { + dprintf("%s missing iokit_tsd\n", __func__); + return; + } + + /* Free IOService client */ + if (handle_free_ioservice(lhp) != 0) { + dprintf("%s lhp %p client %s\n", + __func__, lhp, "couldn't be removed"); + } + + kmem_free(lhp->lh_tsd.iokit_tsd, sizeof (struct _handle_iokit)); + lhp->lh_tsd.iokit_tsd = 0; +} + +/* Returns handle with lock still held */ +struct ldi_handle * +handle_alloc_iokit(dev_t device, int fmode) +{ + struct ldi_handle *lhp, *retlhp; + + /* Search for existing handle */ + if ((retlhp = handle_find(device, fmode, B_TRUE)) != NULL) { + dprintf("%s found handle before alloc\n", __func__); + return (retlhp); + } + + /* Allocate an LDI IOKit handle */ + if ((lhp = handle_alloc_common(LDI_TYPE_IOKIT, device, + fmode)) == NULL) { + dprintf("%s couldn't allocate handle\n", __func__); + return (NULL); + } + + /* Allocate and clear type-specific device data */ + lhp->lh_tsd.iokit_tsd = (struct _handle_iokit *)kmem_alloc( + sizeof (struct _handle_iokit), KM_SLEEP); + LH_MEDIA(lhp) = 0; + LH_CLIENT(lhp) = 0; + + /* Allocate an IOService client for open/close */ + if (handle_alloc_ioservice(lhp) != 0) { + dprintf("%s couldn't allocate IOService client\n", __func__); + handle_release(lhp); + return (NULL); + } + + /* Add the handle to the list, or return match */ + if ((retlhp = handle_add(lhp)) == NULL) { + dprintf("%s handle_add failed\n", __func__); + handle_release(lhp); + return (NULL); + } + + /* Check if new or found handle was returned */ + if (retlhp != lhp) { + dprintf("%s found handle after alloc\n", __func__); + handle_release(lhp); + lhp = 0; + } + + return (retlhp); +} + +int +handle_free_ioservice(struct ldi_handle *lhp) +{ + /* Validate handle pointer */ + ASSERT3U(lhp, !=, NULL); +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + if (!LH_CLIENT(lhp)) { + dprintf("%s missing client\n", __func__); + return (ENODEV); + } +#endif + +#ifdef LDI_IOREGISTRY_ATTACH + /* Detach client from ZFS in IORegistry */ + LH_CLIENT(lhp)->detach(ldi_zfs_handle); +#endif + + LH_CLIENT(lhp)->stop(ldi_zfs_handle); + LH_CLIENT(lhp)->release(); + LH_CLIENT(lhp) = 0; + + return (0); +} + +int +handle_alloc_ioservice(struct ldi_handle *lhp) +{ + IOService *client; + + /* Validate handle pointer */ + ASSERT3U(lhp, !=, NULL); + if (lhp == NULL) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + + /* Allocate and init an IOService client for open/close */ + if ((client = new IOService) == NULL) { + dprintf("%s couldn't allocate new IOService\n", __func__); + return (ENOMEM); + } + if (client->init(0) != true) { + dprintf("%s IOService init failed\n", __func__); + client->release(); + return (ENOMEM); + } + +#ifdef LDI_IOREGISTRY_ATTACH + /* Attach client to ZFS in IORegistry */ + if (client->attach(ldi_zfs_handle) != true) { + dprintf("%s IOService attach failed\n", __func__); + client->release(); + return (ENOMEM); + } +#endif + + /* Start 
service */ + if (client->start(ldi_zfs_handle) != true) { + dprintf("%s IOService attach failed\n", __func__); + /* Detach client from ZFS in IORegistry */ +#ifdef LDI_IOREGISTRY_ATTACH + client->detach(ldi_zfs_handle); +#endif + client->release(); + return (ENOMEM); + } + + LH_CLIENT(lhp) = client; + return (0); +} + +/* Set status to Offline and post event */ +static bool +handle_media_terminate_cb(void* target, void* refCon, + IOService* newService, IONotifier* notifier) +{ + struct ldi_handle *lhp = (struct ldi_handle *)refCon; + +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing refCon ldi_handle\n", __func__); + return (false); + } +#endif + + /* Take hold on handle to prevent removal */ + handle_hold(lhp); + + dprintf("%s setting lhp %p to Offline status\n", __func__, lhp); + if (handle_status_change(lhp, LDI_STATUS_OFFLINE) != 0) { + dprintf("%s handle_status_change failed\n", __func__); + handle_release(lhp); + return (false); + } + + handle_release(lhp); + return (true); +} + +int +handle_close_iokit(struct ldi_handle *lhp) +{ +#ifdef DEBUG + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_IOKIT); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_CLOSING); + + /* Validate IOMedia and IOService client */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp)) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif /* DEBUG */ + + LH_MEDIA(lhp)->close(LH_CLIENT(lhp)); + LH_MEDIA(lhp) = 0; + return (0); +} + +static int +handle_open_iokit(struct ldi_handle *lhp, IOMedia *media) +{ +#ifdef DEBUG + ASSERT3U(lhp, !=, NULL); + ASSERT3U(media, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_IOKIT); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_OPENING); + + /* Validate IOMedia and IOService client */ + if (!OSDynamicCast(IOMedia, media) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif /* DEBUG */ + /* Retain until open or error */ + media->retain(); + + /* + * If read/write mode is requested, check that the + * device is actually writeable. + */ + if (lhp->lh_fmode & FWRITE && media->isWritable() == false) { + dprintf("%s read-write requested on %s\n", + __func__, "read-only IOMedia"); + media->release(); + return (EPERM); + } + + /* Call open with the IOService client handle */ + if (media->IOMedia::open(LH_CLIENT(lhp), 0, + (lhp->lh_fmode & FWRITE ? 
kIOStorageAccessReaderWriter : + kIOStorageAccessReader)) == false) { + dprintf("%s IOMedia->open failed\n", __func__); + media->release(); + return (EIO); + } + media->release(); + + /* Assign IOMedia device */ + LH_MEDIA(lhp) = media; + return (0); +} + +int +handle_get_size_iokit(struct ldi_handle *lhp, uint64_t *dev_size) +{ + if (!lhp || !dev_size) { + dprintf("%s missing lhp or dev_size\n", __func__); + return (EINVAL); + } + +#ifdef DEBUG + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s no IOMedia\n", __func__); + return (ENODEV); + } +#endif + + *dev_size = LH_MEDIA(lhp)->getSize(); + if (*dev_size == 0) { + dprintf("%s %s\n", __func__, + "IOMedia getSize returned 0"); + return (EINVAL); + } + + return (0); +} + +int +handle_get_dev_path_iokit(struct ldi_handle *lhp, + char *path, int len) +{ + int retlen = len; + + if (!lhp || !path || len == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + +#ifdef DEBUG + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s no IOMedia\n", __func__); + return (ENODEV); + } +#endif + + if (LH_MEDIA(lhp)->getPath(path, &retlen, gIODTPlane) == false) { + dprintf("%s getPath failed\n", __func__); + return (EIO); + } + +dprintf("%s got path [%s]\n", __func__, path); + return (0); +} + +int handle_get_bootinfo_iokit(struct ldi_handle *lhp, + struct io_bootinfo *bootinfo) +{ + int error = 0; + + if (!lhp || !bootinfo) { + dprintf("%s missing argument\n", __func__); +printf("%s missing argument\n", __func__); + return (EINVAL); + } + + if ((error = handle_get_size_iokit(lhp, + &bootinfo->dev_size)) != 0 || + (error = handle_get_dev_path_iokit(lhp, bootinfo->dev_path, + sizeof (bootinfo->dev_path))) != 0) { + dprintf("%s get size or dev_path error %d\n", + __func__, error); + } + + return (error); +} + +int +handle_sync_iokit(struct ldi_handle *lhp) +{ +#ifdef DEBUG + /* Validate IOMedia and client */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp)) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif + +#if defined(MAC_OS_X_VERSION_10_11) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_11) + /* Issue device sync */ + if (LH_MEDIA(lhp)->synchronize(LH_CLIENT(lhp), 0, 0, 0) != + kIOReturnSuccess) { + dprintf("%s %s\n", __func__, + "IOMedia synchronizeCache failed"); + return (ENOTSUP); + } +#else + /* Issue device sync */ + if (LH_MEDIA(lhp)->synchronizeCache(LH_CLIENT(lhp)) != + kIOReturnSuccess) { + dprintf("%s %s\n", __func__, + "IOMedia synchronizeCache failed"); + return (ENOTSUP); + } +#endif + + /* Success */ + return (0); +} + +static dev_t +dev_from_media(IOMedia *media) +{ + OSObject *property; + OSNumber *number; + uint32_t major, minor; + dev_t device = 0; + + /* Validate media */ + if (!media || !OSDynamicCast(IOMedia, media)) { + dprintf("%s no device\n", __func__); + return (0); + } + media->retain(); + + /* Get device major */ + if (NULL == (property = media->getProperty(kIOBSDMajorKey, + gIOServicePlane, kIORegistryIterateRecursively)) || + NULL == (number = OSDynamicCast(OSNumber, property))) { + dprintf("%s couldn't get BSD major\n", __func__); + media->release(); + return (0); + } + major = number->unsigned32BitValue(); + number = NULL; + property = NULL; + + /* Get device minor */ + if (NULL == (property = media->getProperty(kIOBSDMinorKey, + gIOServicePlane, kIORegistryIterateRecursively)) || + NULL == (number = OSDynamicCast(OSNumber, property))) { 
+ dprintf("%s couldn't get BSD major\n", __func__); + media->release(); + return (0); + } + minor = number->unsigned32BitValue(); + number = NULL; + property = NULL; + + /* Cleanup */ + media->release(); + media = NULL; + + device = makedev(major, minor); + + /* Return 0 or valid dev_t */ + return (device); +} + +/* Returns NULL or dictionary with a retain count */ +static OSDictionary * +media_matchdict_from_dev(dev_t device) +{ + OSDictionary *matchDict; + OSNumber *majorNum, *minorNum; + + /* Validate dev_t */ + if (device == 0) { + dprintf("%s no dev_t provided\n", __func__); + return (NULL); + } + + /* Allocate OSNumbers for BSD major and minor (32-bit) */ + if (NULL == (majorNum = OSNumber::withNumber(major(device), 32)) || + NULL == (minorNum = OSNumber::withNumber(minor(device), 32))) { + dprintf("%s couldn't alloc major/minor as OSNumber\n", + __func__); + if (majorNum) { + majorNum->release(); + } + return (NULL); + } + + /* Match on IOMedia */ + if (NULL == (matchDict = IOService::serviceMatching("IOMedia")) || + !(matchDict->setObject(kIOBSDMajorKey, majorNum)) || + !(matchDict->setObject(kIOBSDMinorKey, minorNum))) { + dprintf("%s couldn't get matching dictionary\n", __func__); + if (matchDict) { + matchDict->release(); + } + majorNum->release(); + minorNum->release(); + return (NULL); + } + majorNum->release(); + minorNum->release(); + + /* Return NULL or valid OSDictionary with retain count */ + return (matchDict); +} + +/* Returns NULL or dictionary with a retain count */ +/* + * media_matchdict_from_path + * translate from paths of the form /dev/diskNsN + * or /private/var/run/disk/by-id/media- to a matching + * dictionary. + */ +static OSDictionary * +media_matchdict_from_path(const char *path) +{ + OSDictionary *matchDict = 0; + OSString *bsdName = NULL; + OSString *uuid = NULL; + const char *substr = 0; + bool ret; + + /* Validate path */ + if (path == 0 || strlen(path) <= 1) { + dprintf("%s no path provided\n", __func__); + return (NULL); + } + /* Translate /dev/diskN and InvariantDisks paths */ + if (strncmp(path, "/dev/", 5) != 0 && + strncmp(path, "/var/run/disk/by-id/", 20) != 0 && + strncmp(path, "/private/var/run/disk/by-id/", 28) != 0) { + dprintf("%s Unrecognized path %s\n", __func__, path); + return (NULL); + } + + /* Validate path and alloc bsdName */ + if (strncmp(path, "/dev/", 5) == 0) { + + /* substr starts after '/dev/' */ + substr = path + 5; + /* Get diskN from /dev/diskN or /dev/rdiskN */ + if (strncmp(substr, "disk", 4) == 0) { + bsdName = OSString::withCString(substr); + } else if (strncmp(substr, "rdisk", 5) == 0) { + bsdName = OSString::withCString(substr + 1); + } + } else if (strncmp(path, "/var/run/disk/by-id/", 20) == 0 || + strncmp(path, "/private/var/run/disk/by-id/", 28) == 0) { + /* InvariantDisks paths */ + + /* substr starts after '/by-id/' */ + substr = path + 20; + if (strncmp(path, "/private", 8) == 0) substr += 8; + + /* Handle media UUID, skip volume UUID or device GUID */ + if (strncmp(substr, "media-", 6) == 0) { + /* Lookup IOMedia with UUID */ + uuid = OSString::withCString(substr+strlen("media-")); + } else if (strncmp(substr, "volume-", 7) == 0) { + /* + * volume-UUID is specified by DiskArbitration + * when a Filesystem bundle is able to probe + * the media and retrieve/generate a UUID for + * it's contents. 
+ * So while we could use this and have zfs.util + * probe for vdev GUID (and pool GUID) and + * generate a UUID, we would need to do the same + * here to find the disk, possibly probing + * devices to get the vdev GUID in the process. + */ + dprintf("%s Unsupported volume-UUID path %s\n", + __func__, path); + } else if (strncmp(substr, "device-", 7) == 0) { + /* Lookup IOMedia with device GUID */ + /* + * XXX Not sure when this is used, no devices + * seem to be presented this way. + */ + dprintf("%s Unsupported device-GUID path %s\n", + __func__, path); + } else { + dprintf("%s unrecognized path %s\n", __func__, path); + } + /* by-path and by-serial are handled separately */ + } + + if (!bsdName && !uuid) { + dprintf("%s Invalid path %s\n", __func__, path); + return (NULL); + } + + /* Match on IOMedia by BSD disk name */ + matchDict = IOService::serviceMatching("IOMedia"); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + if (bsdName) bsdName->release(); + if (uuid) uuid->release(); + return (NULL); + } + if (bsdName) { + ret = matchDict->setObject(kIOBSDNameKey, bsdName); + bsdName->release(); + + if (!ret) { + dprintf("%s couldn't setup bsd name matching" + " dictionary\n", __func__); + matchDict->release(); + matchDict = 0; + } + if (uuid) uuid->release(); + } else if (uuid) { + if (matchDict->setObject(kIOMediaUUIDKey, uuid) == false) { + dprintf("%s couldn't setup UUID matching" + " dictionary\n", __func__); + uuid->release(); + matchDict->release(); + matchDict = 0; + } + } else { + dprintf("%s missing matching property\n", __func__); + matchDict->release(); + matchDict = 0; + } + + /* Return NULL or valid OSDictionary with retain count */ + return (matchDict); +} + +/* Returns NULL or matched IOMedia with a retain count */ +static IOMedia * +media_from_matchdict(OSDictionary *matchDict) +{ + OSIterator *iter = 0; + OSObject *obj = 0; + IOMedia *media = 0; + + if (!matchDict) { + dprintf("%s missing matching dictionary\n", __func__); + return (NULL); + } + + /* + * We could instead use copyMatchingService, since + * there should only be one match. + */ + iter = IOService::getMatchingServices(matchDict); + if (!iter) { + dprintf("%s No iterator from getMatchingServices\n", + __func__); + return (NULL); + } + + /* Get first object from iterator */ + while ((obj = iter->getNextObject()) != NULL) { + if ((media = OSDynamicCast(IOMedia, obj)) == NULL) { + obj = 0; + continue; + } + if (media->isFormatted() == false) { + obj = 0; + media = 0; + continue; + } + + media->retain(); + break; + } + + if (!media) { + dprintf("%s no match found\n", __func__); + iter->release(); + return (NULL); + } + +#ifdef DEBUG + /* Report if there were additional matches */ + if (iter->getNextObject() != NULL) { + dprintf("%s Had more potential matches\n", __func__); + } +#endif + iter->release(); + iter = 0; + + /* Return valid IOMedia with retain count */ + return (media); +} + +/* + * media_from_dev is intended to be called by ldi_open_by_name + * and ldi_open_by_dev with a dev_t, and returns NULL or an IOMedia + * device with a retain count that should be released on open. 
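The comment in media_from_matchdict() above notes that IOService::copyMatchingService() could be used instead of an iterator, since at most one IOMedia should match. A minimal sketch of that alternative is shown here for illustration only; it is not part of this change, and the helper name is hypothetical:

static IOMedia *
media_from_matchdict_single(OSDictionary *matchDict)
{
	IOService *svc;
	IOMedia *media;

	/* Returns the first matching service, already retained, or NULL */
	if ((svc = IOService::copyMatchingService(matchDict)) == NULL)
		return (NULL);

	if ((media = OSDynamicCast(IOMedia, svc)) == NULL ||
	    media->isFormatted() == false) {
		svc->release();
		return (NULL);
	}

	/* Keep the retain taken by copyMatchingService() for the caller */
	return (media);
}

As in media_from_matchdict(), the caller still owns (and releases) the matching dictionary.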
+ */ +static IOMedia * +media_from_dev(dev_t device = 0) +{ + IOMedia *media; + OSDictionary *matchDict; + + /* Get matchDict, will need to be released */ + matchDict = media_matchdict_from_dev(device); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (NULL); + } + + /* Get first matching IOMedia */ + media = media_from_matchdict(matchDict); + matchDict->release(); + matchDict = 0; + + if (!media) { + dprintf("%s no IOMedia found for dev_t %d\n", __func__, + device); + } + + /* Return NULL or valid media with retain count */ + return (media); +} + +/* + * media_from_device_path + * + * translate /private/var/run/disk/by-path/ to an IOMedia + * handle. The remainder of the path should be a valid + * path in the IORegistry IODTPlane device tree. + */ +static IOMedia * +media_from_device_path(const char *path = 0) +{ + IORegistryEntry *entry = 0; + IOMedia *media = 0; + OSString *osstr; + const char *string, *dash; + + /* Must be /var/run/disk/by-path/, but may have /private prefix */ + if (!path || path[0] == 0 || + (strncmp(path, "/var/run/disk/by-path/", 22) != 0 && + strncmp(path, "/private/var/run/disk/by-path/", 30) != 0)) { + dprintf("%s invalid path [%s]\n", __func__, + (path && path[0] != '\0' ? path : "")); + return (NULL); + } + + /* We need the leading slash in the string, so trim 21 or 29 */ + if (strncmp(path, "/private", 8) == 0) { + osstr = OSString::withCString(path+29); + } else { + osstr = OSString::withCString(path+21); + } + if (!osstr) { + dprintf("%s couldn't get string from path\n", __func__); + return (NULL); + } + + string = osstr->getCStringNoCopy(); + ASSERT(string); + + /* Convert dashes to slashes */ + while ((dash = strchr(string, '-')) != NULL) { + osstr->setChar('/', dash - string); + } + dprintf("%s string [%s]\n", __func__, string); + + entry = IORegistryEntry::fromPath(string, gIODTPlane); + string = 0; + osstr->release(); + osstr = 0; + + if (!entry) { + dprintf("%s IORegistryEntry::fromPath failed\n", __func__); + return (NULL); + } + + if ((media = OSDynamicCast(IOMedia, entry)) == NULL) { + entry->release(); + return (0); + } + + /* Leave a retain count on the media */ + return (media); +} + +/* + * media_from_serial + * + * translate /private/var/run/disk/by-serial/model-serial[:location] + * to an IOMedia handle. The path format is determined by + * InvariantDisks logic in IDSerialLinker.cpp. + */ +static IOMedia * +media_from_serial(const char *path = 0) +{ + IORegistryEntry *entry = 0; + IOMedia *media = 0; + OSDictionary *matching = 0; + OSDictionary *deviceCharacteristics = 0; + OSIterator *iter = 0; + OSString *osstr = 0; + OSString *model = 0; + OSString *serial = 0; + OSNumber *bsdUnit = 0; + OSObject *property = 0; + OSObject *propDict = 0; + OSObject *obj = 0; + const char *substr = 0; + const char *sep1 = 0, *sep2 = 0; + const char *string = 0, *space = 0; + const char *location = 0, *entryLocation = 0; + int newlen = 0, soff = 0; + bool matched = false; + + /* Must be /var/run/disk/by-serial/, but may have /private prefix */ + if (!path || path[0] == 0 || + (strncmp(path, "/var/run/disk/by-serial/", 24) != 0 && + strncmp(path, "/private/var/run/disk/by-serial/", 32) != 0)) { + dprintf("%s invalid path [%s]\n", __func__, + (path && path[0] != '\0' ? path : "")); + return (NULL); + } + + /* substr starts after '/by-serial/' */ + substr = path + 24; + if (strncmp(path, "/private", 8) == 0) substr += 8; + + /* + * For each whole-disk IOMedia: + * Search parents for deviceCharacteristics, or skip. 
+ * Check for Model and Serial Number properties, or skip. + * Trim trailing space and swap underscores within string. + * If "model-serial" matches path so far: + * Match whole-disk IOMedia if no slice specified. + * Or get child IOMedia with matching Location property. + */ + + sep1 = strchr(substr, '-'); + sep2 = strrchr(substr, ':'); + if (sep1 == 0) { + dprintf("%s invalid by-serial path [%s]\n", __func__, substr); + return (NULL); + } + if (sep2 == 0) { + dprintf("%s no slice, whole disk [%s]\n", __func__, substr); + sep2 = substr + (strlen(substr)); + } + + if ((matching = IOService::serviceMatching("IOMedia")) == NULL) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (NULL); + } + + if ((matching->setObject(kIOMediaWholeKey, kOSBooleanTrue) == false) || + (iter = IOService::getMatchingServices(matching)) == NULL) { + dprintf("%s couldn't get IOMedia iterator\n", __func__); + matching->release(); + return (NULL); + } + matching->release(); + matching = 0; + + while ((obj = iter->getNextObject()) != NULL) { + if ((entry = OSDynamicCast(IORegistryEntry, obj)) == NULL || + (media = OSDynamicCast(IOMedia, entry)) == NULL || + media->isFormatted() == false) { + // media->isWhole() == false) { + continue; + } + + propDict = media->getProperty( + kIOPropertyDeviceCharacteristicsKey, gIOServicePlane, + (kIORegistryIterateRecursively | + kIORegistryIterateParents)); + if ((deviceCharacteristics = OSDynamicCast(OSDictionary, + propDict)) == NULL) { + dprintf("%s no device characteristics, skipping\n", + __func__); + continue; + } + + /* + * Get each property, cast as OSString, then copy + * to a new OSString. + */ + if ((property = deviceCharacteristics->getObject( + kIOPropertyProductNameKey)) == NULL || + (osstr = OSDynamicCast(OSString, property)) == NULL || + (model = OSString::withString(osstr)) == NULL) { + dprintf("%s no product name, skipping\n", __func__); + continue; + } + if ((property = deviceCharacteristics->getObject( + kIOPropertyProductSerialNumberKey)) == NULL || + (osstr = OSDynamicCast(OSString, property)) == NULL || + (serial = OSString::withString(osstr)) == NULL) { + dprintf("%s no serial number, skipping\n", __func__); + model->release(); + model = 0; + continue; + } + + string = model->getCStringNoCopy(); + if (!string) { + model->release(); + model = 0; + serial->release(); + serial = 0; + continue; + } + /* Trim trailing whitespace */ + for (newlen = strlen(string); newlen > 0; newlen--) { + if (string[newlen-1] != ' ') { + model->setChar('\0', newlen); + break; + } + } + + /* + * sep1 is the location of the first '-' in the path. + * even if there is a '-' in the model name, we can skip + * media with model names shorter than that. 
+ */ + if (newlen == 0 || + (newlen < (sep1 - substr)) || + (substr[newlen] != '-')) { + model->release(); + model = 0; + serial->release(); + serial = 0; + continue; + } + + /* Convert spaces to underscores */ + while ((space = strchr(string, ' ')) != NULL) { + model->setChar('_', space - string); + } + + /* Compare the model string with the path */ + if (strncmp(substr, string, newlen) != 0) { + model->release(); + model = 0; + serial->release(); + serial = 0; + continue; + } + dprintf("%s model string matched [%s]\n", + __func__, model->getCStringNoCopy()); + model->release(); + model = 0; + + soff = newlen + 1; + + string = serial->getCStringNoCopy(); + if (!string) { + serial->release(); + serial = 0; + continue; + } + /* Trim trailing whitespace */ + for (newlen = strlen(string); newlen > 0; newlen--) { + if (string[newlen-1] != ' ') { + serial->setChar('\0', newlen); + break; + } + } + /* + * sep2 is the location of the last ':' in the path, or + * the end of the string if there is none. + * even if there is a ':' in the serial number, we can skip + * media with serial number strings shorter than that. + */ + if (newlen == 0 || + (newlen < (sep2 - sep1 - 1)) || + (substr[soff+newlen] != '\0' && + substr[soff+newlen] != ':')) { + serial->release(); + serial = 0; + continue; + } + + /* Convert spaces to underscores */ + while ((space = strchr(string, ' ')) != NULL) { + serial->setChar('_', space - string); + } + + /* Compare the serial string with the path */ + if (strncmp(substr+soff, string, newlen) != 0) { + serial->release(); + serial = 0; + continue; + } + dprintf("%s serial string matched [%s]\n", + __func__, serial->getCStringNoCopy()); + serial->release(); + serial = 0; + + /* + * Still need to get the slice - the component + * after an optional ':' at the end of the + * string, by searching for IOMedia with that + * location string below the whole-disk IOMedia. 
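To make the splitting above concrete (hypothetical device, for illustration only): a path such as /private/var/run/disk/by-serial/Samsung_SSD_860-S3Z9NB0K123456X:1 is compared against the model "Samsung SSD 860" (trailing blanks trimmed, spaces swapped for underscores) up to the first '-', against the serial "S3Z9NB0K123456X" up to the trailing ':', and the remaining "1" selects the child IOMedia whose location string matches that slice; without the ':' suffix the whole-disk IOMedia itself is returned.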
+ */ + /* Set new location of ':' */ + sep2 = substr + (soff + newlen); + /* Found match */ + matched = true; + media->retain(); + break; + } + iter->release(); + iter = 0; + + if (!matched || !media) { + dprintf("%s no matching devices found\n", __func__); + return (NULL); + } + + /* Whole disk path will not end with ':' */ + if (sep2[0] != ':') { + dprintf("%s Found whole disk [%s]\n", __func__, path); + /* Leave a retain count on the media */ + return (media); + } + + /* Remainder of string is location */ + location = sep2 + 1; + dprintf("%s location string [%s]\n", __func__, location); + + if ((bsdUnit = OSDynamicCast(OSNumber, + media->getProperty(kIOBSDUnitKey))) == NULL) { + dprintf("%s couldn't get BSD unit number\n", __func__); + media->release(); + return (NULL); + } + if ((matching = IOService::serviceMatching("IOMedia")) == NULL || + (matching->setObject(kIOMediaWholeKey, kOSBooleanFalse)) == false || + (matching->setObject(kIOBSDUnitKey, bsdUnit)) == false || + (iter = IOService::getMatchingServices(matching)) == NULL) { + dprintf("%s iterator for location failed\n", + __func__); + + if (matching) matching->release(); + /* We had a candidate, but couldn't get the location */ + media->release(); + return (NULL); + } + matching->release(); + matching = 0; + + /* Iterate over children checking for matching location */ + matched = false; + entry = 0; + while ((obj = iter->getNextObject()) != NULL) { + if ((entry = OSDynamicCast(IORegistryEntry, obj)) == NULL || + (OSDynamicCast(IOMedia, entry)) == NULL) { + entry = 0; + continue; + } + + if ((entryLocation = entry->getLocation()) == NULL || + (strlen(entryLocation) != strlen(location)) || + strcmp(entryLocation, location) != 0) { + entry = 0; + continue; + } + + dprintf("%s found match\n", __func__); + matched = true; + entry->retain(); + break; + } + iter->release(); + iter = 0; + + /* Drop the whole-disk media */ + media->release(); + media = 0; + + /* Cast the new entry, if there is one */ + if (!entry || (media = OSDynamicCast(IOMedia, entry)) == NULL) { +if (entry) dprintf("%s had entry but couldn't cast\n", __func__); + dprintf("%s no media found for path %s\n", + __func__, path); + if (entry) entry->release(); + return (NULL); + } + + dprintf("%s media from serial number succeeded\n", __func__); + + /* Leave a retain count on the media */ + return (matched ? media : NULL); +} + +/* + * media_from_path is intended to be called by ldi_open_by_name + * with a char* path, and returns NULL or an IOMedia device with a + * retain count that should be released on open. + */ +static IOMedia * +media_from_path(const char *path = 0) +{ + IOMedia *media; + OSDictionary *matchDict; + + /* Validate path */ + if (path == 0 || strlen(path) <= 1) { + dprintf("%s no path provided\n", __func__); + return (NULL); + } + + if (strncmp(path, "/var/run/disk/by-path/", 22) == 0 || + strncmp(path, "/private/var/run/disk/by-path/", 30) == 0) { + media = media_from_device_path(path); + dprintf("%s media_from_device_path %s\n", __func__, + (media ? "succeeded" : "failed")); + return (media); + } + + if (strncmp(path, "/var/run/disk/by-serial/", 24) == 0 || + strncmp(path, "/private/var/run/disk/by-serial/", 32) == 0) { + media = media_from_serial(path); + dprintf("%s media_from_serial %s\n", __func__, + (media ? 
"succeeded" : "failed")); + return (media); + } + + /* Try to get /dev/disk or /private/var/run/disk/by-id path */ + matchDict = media_matchdict_from_path(path); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (NULL); + } + + media = media_from_matchdict(matchDict); + matchDict->release(); + matchDict = 0; + + if (!media) { + dprintf("%s no IOMedia found for path %s\n", __func__, path); + } + + /* Return NULL or valid media with retain count */ + return (media); +} + +/* Define an IOKit buffer for buf_strategy_iokit */ +typedef struct ldi_iokit_buf { + IOMemoryDescriptor *iomem; + IOStorageCompletion iocompletion; + IOStorageAttributes ioattr; +} ldi_iokit_buf_t; /* XXX Currently 64b */ + +/* Completion handler for IOKit strategy */ +static void +ldi_iokit_io_intr(void *target, void *parameter, + IOReturn status, UInt64 actualByteCount) +{ + ldi_iokit_buf_t *iobp = (ldi_iokit_buf_t *)target; + ldi_buf_t *lbp = (ldi_buf_t *)parameter; + +#ifdef DEBUG + /* In debug builds, verify buffer pointers */ + ASSERT3U(lbp, !=, 0); + ASSERT3U(iobp, !=, 0); + + if (!iobp || !lbp) { + printf("%s missing a buffer\n", __func__); + return; + } + + ASSERT3U(iobp->iomem, !=, 0); + + if (!iobp->iomem) { + printf("%s missing iobp->iomem\n", __func__); + return; + } + + // this is very very very noisy in --enable-boot + // ASSERT3U(ldi_zfs_handle, !=, 0); + + if (actualByteCount == 0 || + actualByteCount != lbp->b_bcount || + status != kIOReturnSuccess) { + printf("%s %s %llx / %llx\n", __func__, + "actualByteCount != lbp->b_bcount", + actualByteCount, lbp->b_bcount); + if (ldi_zfs_handle) + printf("%s status %d %d %s\n", __func__, status, + ldi_zfs_handle->errnoFromReturn(status), + ldi_zfs_handle->stringFromReturn(status)); + else + printf("%s status %d ldi_zfs_handle is NULL\n", + __func__, status); + } +#endif + + /* Complete and release IOMemoryDescriptor */ + iobp->iomem->complete(); + iobp->iomem->release(); + iobp->iomem = 0; + + /* Compute resid */ + ASSERT3U(lbp->b_bcount, >=, actualByteCount); + lbp->b_resid = (lbp->b_bcount - actualByteCount); + + /* Set error status */ + if (status == kIOReturnSuccess && + actualByteCount != 0 && lbp->b_resid == 0) { + lbp->b_error = 0; + } else { + lbp->b_error = EIO; + } + + /* Free IOKit buffer */ + kmem_free(iobp, sizeof (ldi_iokit_buf_t)); + + /* Call original completion function */ + if (lbp->b_iodone) { + (void) lbp->b_iodone(lbp); + } +} + +/* Synchronous IO, called by buf_strategy_iokit */ +static int +buf_sync_strategy_iokit(ldi_buf_t *lbp, ldi_iokit_buf_t *iobp, + struct ldi_handle *lhp) +{ + UInt64 actualByteCount = 0; + IOReturn result; + + /* Read or write */ + if (lbp->b_flags & B_READ) { + result = LH_MEDIA(lhp)->IOStorage::read(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &actualByteCount); + } else { + result = LH_MEDIA(lhp)->IOStorage::write(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &actualByteCount); + } + + /* Call completion */ + ldi_iokit_io_intr((void *)iobp, (void *)lbp, + result, actualByteCount); + + /* Return success based on result */ + return (result == kIOReturnSuccess ? 0 : EIO); +} + +/* + * Uses IOMedia::read asynchronously or IOStorage::read synchronously. 
+ * virtual void read(IOService * client, + * UInt64 byteStart, + * IOMemoryDescriptor * buffer, + * IOStorageAttributes * attributes, + * IOStorageCompletion * completion); + * virtual IOReturn read(IOService * client, + * UInt64 byteStart, + * IOMemoryDescriptor * buffer, + * IOStorageAttributes * attributes = 0, + * UInt64 * actualByteCount = 0); + */ +int +buf_strategy_iokit(ldi_buf_t *lbp, struct ldi_handle *lhp) +{ + ldi_iokit_buf_t *iobp = 0; + + ASSERT3U(lbp, !=, NULL); + ASSERT3U(lhp, !=, NULL); + +#ifdef DEBUG + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp)) || + !OSDynamicCast(IOService, LH_CLIENT(lhp))) { + dprintf("%s invalid IOMedia or client\n", __func__); + return (ENODEV); + } +#endif /* DEBUG */ + + /* Allocate an IOKit buffer */ + iobp = (ldi_iokit_buf_t *)kmem_alloc(sizeof (ldi_iokit_buf_t), + KM_SLEEP); + if (!iobp) { + dprintf("%s couldn't allocate buf_iokit_t\n", __func__); + return (ENOMEM); + } +#ifdef LDI_ZERO + /* Zero the new buffer struct */ + bzero(iobp, sizeof (ldi_iokit_buf_t)); +#endif + + /* Set completion and attributes for async IO */ + if (lbp->b_iodone != NULL) { + iobp->iocompletion.target = iobp; + iobp->iocompletion.parameter = lbp; + iobp->iocompletion.action = &ldi_iokit_io_intr; + } + +/* XXX Zeroed above if LDI_ZERO, otherwise here */ +#ifndef LDI_ZERO + /* XXX Zero the ioattr struct */ + bzero(&iobp->ioattr, sizeof (IOStorageAttributes)); +#endif + + /* Allocate a memory descriptor pointing to the data address */ + iobp->iomem = IOMemoryDescriptor::withAddress( + lbp->b_un.b_addr, lbp->b_bcount, + (lbp->b_flags & B_READ ? kIODirectionIn : kIODirectionOut)); + + /* Verify the buffer */ + if (!iobp->iomem || iobp->iomem->getLength() != lbp->b_bcount || + iobp->iomem->prepare() != kIOReturnSuccess) { + dprintf("%s couldn't allocate IO buffer\n", + __func__); + if (iobp->iomem) { + iobp->iomem->release(); + } + kmem_free(iobp, sizeof (ldi_iokit_buf_t)); + return (ENOMEM); + } + + /* Recheck instantaneous value of handle status */ + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + iobp->iomem->complete(); + iobp->iomem->release(); + kmem_free(iobp, sizeof (ldi_iokit_buf_t)); + return (ENODEV); + } + + /* Synchronous or async */ + if (lbp->b_iodone == NULL) { + return (buf_sync_strategy_iokit(lbp, iobp, lhp)); + } + + /* Read or write */ + if (lbp->b_flags & B_READ) { + LH_MEDIA(lhp)->IOMedia::read(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &iobp->iocompletion); + } else { + LH_MEDIA(lhp)->IOMedia::write(LH_CLIENT(lhp), + dbtolb(lbp->b_lblkno), iobp->iomem, + &iobp->ioattr, &iobp->iocompletion); + } + + /* Return success, will call io_intr when done */ + return (0); +} + +/* Client interface, alloc and open IOKit handle */ +int +ldi_open_by_media(IOMedia *media = 0, dev_t device = 0, + int fmode = 0, ldi_handle_t *lhp = 0) +{ + struct ldi_handle *retlhp; + ldi_status_t status; + int error; + + /* Validate IOMedia */ + if (!media || !lhp) { + dprintf("%s invalid argument %p or %p\n", + __func__, media, lhp); + return (EINVAL); + } + + /* Retain for duration of open */ + media->retain(); + + /* Get dev_t if not supplied */ + if (device == 0 && (device = dev_from_media(media)) == 0) { + dprintf("%s dev_from_media failed: %p %d\n", __func__, + media, device); + media->release(); + return (ENODEV); + } + + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*(struct ldi_handle **)lhp, ==, NULL); + + /* Allocate IOKit handle */ + retlhp = 
handle_alloc_iokit(device, fmode); + if (retlhp == NULL) { + dprintf("%s couldn't allocate IOKit handle\n", __func__); + media->release(); + return (ENOMEM); + } + + /* Try to open device with IOMedia */ + status = handle_open_start(retlhp); + if (status == LDI_STATUS_ONLINE) { + dprintf("%s already online, refs %d, openrefs %d\n", __func__, + retlhp->lh_ref, retlhp->lh_openref); + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + media->release(); + /* Successfully incremented open ref */ + return (0); + } + if (status != LDI_STATUS_OPENING) { + dprintf("%s invalid status %d\n", __func__, status); + handle_release(retlhp); + retlhp = 0; + media->release(); + return (ENODEV); + } + + error = handle_open_iokit(retlhp, media); + media->release(); + + if (error) { + dprintf("%s Couldn't open handle\n", __func__); + handle_open_done(retlhp, LDI_STATUS_CLOSED); + handle_release(retlhp); + retlhp = 0; + return (EIO); + } + handle_open_done(retlhp, LDI_STATUS_ONLINE); + + /* Register for disk notifications */ + handle_register_notifier(retlhp); + + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + /* Pass error from open */ + return (error); +} + +/* Client interface, find IOMedia from dev_t, alloc and open handle */ +int +ldi_open_media_by_dev(dev_t device = 0, int fmode = 0, + ldi_handle_t *lhp = 0) +{ + IOMedia *media = 0; + int error = EINVAL; + + /* Validate arguments */ + if (!lhp || device == 0) { + dprintf("%s missing argument %p %d\n", + __func__, lhp, device); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* Get IOMedia from major/minor */ + if ((media = media_from_dev(device)) == NULL) { + dprintf("%s media_from_dev error %d\n", + __func__, error); + return (ENODEV); + } + + /* Try to open by media */ + error = ldi_open_by_media(media, device, fmode, lhp); + + /* Release IOMedia and clear */ + media->release(); + media = 0; + + /* Pass error from open */ + return (error); +} + +/* Client interface, find dev_t and IOMedia/vnode, alloc and open handle */ +int +ldi_open_media_by_path(char *path = 0, int fmode = 0, + ldi_handle_t *lhp = 0) +{ + IOMedia *media = 0; + dev_t device = 0; + int error = EINVAL; + + /* Validate arguments */ + if (!lhp || !path) { + dprintf("%s %s %p %s %d\n", __func__, + "missing lhp or path", lhp, path, fmode); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* For /dev/disk*, and InvariantDisk paths */ + if ((media = media_from_path(path)) == NULL) { + dprintf("%s media_from_path failed\n", __func__); + return (ENODEV); + } + + error = ldi_open_by_media(media, device, fmode, lhp); + + /* Release IOMedia and clear */ + media->release(); + media = 0; + + /* Error check open */ + if (error) { + dprintf("%s ldi_open_by_media failed %d\n", + __func__, error); + } + + return (error); +} + +int +handle_remove_notifier(struct ldi_handle *lhp) +{ + handle_notifier_t notifier; + +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } +#endif + + if (lhp->lh_notifier == 0) { + dprintf("%s no notifier installed\n", __func__); + return (0); + } + + /* First clear notifier pointer */ + notifier = lhp->lh_notifier; + lhp->lh_notifier = 0; + +#ifdef DEBUG + /* Validate IONotifier object */ + if (!OSDynamicCast(IONotifier, notifier->obj)) { + dprintf("%s %p is not an IONotifier\n", __func__, 
+ notifier->obj); + return (EINVAL); + } +#endif + + notifier->obj->remove(); + kmem_free(notifier, sizeof (handle_notifier_t)); + return (0); +} + +int +handle_register_notifier(struct ldi_handle *lhp) +{ + OSDictionary *matchDict; + handle_notifier_t notifier; + + /* Make sure we have a handle and dev_t */ + if (!lhp || lhp->lh_dev == 0) { + dprintf("%s no handle or missing dev_t\n", __func__); + return (EINVAL); + } + + notifier = (handle_notifier_t)kmem_alloc( + sizeof (struct _handle_notifier), KM_SLEEP); + if (!notifier) { + dprintf("%s couldn't alloc notifier struct\n", __func__); + return (ENOMEM); + } + + /* Get matchDict, will need to be released */ + matchDict = media_matchdict_from_dev(lhp->lh_dev); + if (!matchDict) { + dprintf("%s couldn't get matching dictionary\n", __func__); + kmem_free(notifier, sizeof (handle_notifier_t)); + return (EINVAL); + } + + /* Register IOMedia termination notification */ + notifier->obj = IOService::addMatchingNotification( + gIOTerminatedNotification, matchDict, + handle_media_terminate_cb, /* target */ 0, + /* refCon */ (void *)lhp, /* priority */ 0); + matchDict->release(); + + /* Error check notifier */ + if (!notifier->obj) { + dprintf("%s addMatchingNotification failed\n", + __func__); + kmem_free(notifier, sizeof (handle_notifier_t)); + return (ENOMEM); + } + + /* Assign notifier to handle */ + lhp->lh_notifier = notifier; + return (0); +} + +/* Supports both IOKit and vnode handles by finding IOMedia from dev_t */ +int +handle_set_wce_iokit(struct ldi_handle *lhp, int *wce) +{ + IOMedia *media; + IORegistryEntry *parent; + IOBlockStorageDevice *device; + IOReturn result; + bool value; + + if (!lhp || !wce) { + return (EINVAL); + } + + switch (lhp->lh_type) { + case LDI_TYPE_IOKIT: + if ((media = LH_MEDIA(lhp)) == NULL) { + dprintf("%s couldn't get IOMedia\n", __func__); + return (ENODEV); + } + /* Add a retain count */ + media->retain(); + break; + case LDI_TYPE_VNODE: + if (lhp->lh_dev == 0 || + (media = media_from_dev(lhp->lh_dev)) == 0) { + dprintf("%s couldn't find IOMedia for dev_t %d\n", + __func__, lhp->lh_dev); + return (ENODEV); + } + /* Returned media has a retain count */ + break; + default: + dprintf("%s invalid handle\n", __func__); + return (EINVAL); + } + + /* Walk the parents of this media */ + for (parent = media->getParentEntry(gIOServicePlane); + parent != NULL; + parent = parent->getParentEntry(gIOServicePlane)) { + /* Until a valid device is found */ + device = OSDynamicCast(IOBlockStorageDevice, parent); + if (device != NULL) { + device->retain(); + break; + } + /* Next parent */ + } + media->release(); + media = 0; + + /* If no matching device was found */ + if (!device) { + dprintf("%s no IOBlockStorageDevice found\n", __func__); + return (ENODEV); + } + + result = device->getWriteCacheState(&value); + if (result != kIOReturnSuccess) { + // dprintf("%s couldn't get current write cache state %d\n", + // __func__, ldi_zfs_handle->errnoFromReturn(result)); + return (ENXIO); + } + + /* If requested value does not match current */ + if (value != *wce) { + value = (*wce == 1); + /* Attempt to change the value */ + result = device->setWriteCacheState(value); + } + + /* Set error and wce to return */ + if (result != kIOReturnSuccess) { + // dprintf("%s couldn't set write cache %d\n", + // __func__, ldi_zfs_handle->errnoFromReturn(result)); + /* Flip wce to indicate current status */ + *wce = !(*wce); + return (ENXIO); + } + + return (0); +} + +int +handle_get_media_info_iokit(struct ldi_handle *lhp, + struct dk_minfo 
*dkm) +{ + uint32_t blksize; + uint64_t blkcount; + + if (!lhp || !dkm) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + if ((blksize = LH_MEDIA(lhp)->getPreferredBlockSize()) == 0) { + dprintf("%s invalid blocksize\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + if ((blkcount = LH_MEDIA(lhp)->getSize() / blksize) == 0) { + dprintf("%s invalid block count\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + LH_MEDIA(lhp)->release(); + + /* Set the return values */ + dkm->dki_capacity = blkcount; + dkm->dki_lbsize = blksize; + + return (0); +} + +int +handle_get_media_info_ext_iokit(struct ldi_handle *lhp, + struct dk_minfo_ext *dkmext) +{ + OSObject *prop; + OSNumber *number; + uint32_t blksize, pblksize; + uint64_t blkcount; + + if (!lhp || !dkmext) { + dprintf("%s missing lhp or dkmext\n", __func__); + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + prop = LH_MEDIA(lhp)->getProperty(kIOPropertyPhysicalBlockSizeKey, + gIOServicePlane, kIORegistryIterateRecursively | + kIORegistryIterateParents); + + number = OSDynamicCast(OSNumber, prop); + if (!prop || !number) { + dprintf("%s couldn't get physical blocksize\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + pblksize = number->unsigned32BitValue(); + number = 0; + prop = 0; + + if ((blksize = LH_MEDIA(lhp)->getPreferredBlockSize()) == 0) { + dprintf("%s invalid blocksize\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + if ((blkcount = LH_MEDIA(lhp)->getSize() / blksize) == 0) { + dprintf("%s invalid block count\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + LH_MEDIA(lhp)->release(); + +#ifdef DEBUG + dprintf("%s phys blksize %u, logical blksize %u, blockcount %llu\n", + __func__, pblksize, blksize, blkcount); +#endif + + /* Set the return values */ + dkmext->dki_capacity = blkcount; + dkmext->dki_lbsize = blksize; + dkmext->dki_pbsize = pblksize; + + return (0); +} + +int +handle_check_media_iokit(struct ldi_handle *lhp, int *status) +{ + /* Validate arguments */ + if (!lhp || !status) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + /* Validate device size */ + if (LH_MEDIA(lhp)->getSize() == 0) { + dprintf("%s media reported 0 size\n", __func__); + LH_MEDIA(lhp)->release(); + return (ENXIO); + } + + /* Validate write status if handle fmode is read-write */ + if ((lhp->lh_fmode & FWRITE) && + LH_MEDIA(lhp)->isWritable() == false) { + dprintf("%s media is not writeable\n", __func__); + LH_MEDIA(lhp)->release(); + return (EPERM); + } + + LH_MEDIA(lhp)->release(); + + /* Success */ + *status = 0; + return (0); +} + +int +handle_is_solidstate_iokit(struct ldi_handle *lhp, int *isssd) +{ + OSDictionary *propDict = 0; + OSString *property = 0; + + /* Validate arguments */ + if (!lhp || !isssd) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + propDict = OSDynamicCast(OSDictionary, LH_MEDIA(lhp)->getProperty( + 
kIOPropertyDeviceCharacteristicsKey, gIOServicePlane)); + + if (propDict != 0) { + property = OSDynamicCast(OSString, + propDict->getObject(kIOPropertyMediumTypeKey)); + propDict = 0; + } + + if (property != 0 && + property->isEqualTo(kIOPropertyMediumTypeSolidStateKey)) { + *isssd = 1; + } + property = 0; + + LH_MEDIA(lhp)->release(); + + return (0); +} + +int +handle_features_iokit(struct ldi_handle *lhp, + uint32_t *data) +{ + if (!lhp || !data) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + OSDictionary *dictionary = OSDynamicCast( + /* class */ OSDictionary, + /* object */ LH_MEDIA(lhp)->getProperty( + /* key */ kIOStorageFeaturesKey, + /* plane */ gIOServicePlane)); + + *data = 0; + + if (dictionary) { + OSBoolean *boolean; + +#ifdef DK_FEATURE_BARRIER + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeatureBarrier)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_BARRIER; +#endif + + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeatureForceUnitAccess)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_FORCE_UNIT_ACCESS; + +#ifdef DK_FEATURE_PRIORITY + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeaturePriority)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_PRIORITY; +#endif + + boolean = OSDynamicCast( + /* class */ OSBoolean, + /* object */ dictionary->getObject( + /* key */ kIOStorageFeatureUnmap)); + + if (boolean == kOSBooleanTrue) + *(uint32_t *)data |= DK_FEATURE_UNMAP; + } + + LH_MEDIA(lhp)->release(); + return (0); +} + +int +handle_unmap_iokit(struct ldi_handle *lhp, + dkioc_free_list_ext_t *dkm) +{ + int error = 0; + + if (!lhp || !dkm) { + return (EINVAL); + } + + /* Validate IOMedia */ + if (!OSDynamicCast(IOMedia, LH_MEDIA(lhp))) { + dprintf("%s invalid IOKit handle\n", __func__); + return (ENODEV); + } + + LH_MEDIA(lhp)->retain(); + + /* We need to convert illumos' dkioc_free_list_t to dk_unmap_t */ + IOStorageExtent *extents; + extents = IONew(IOStorageExtent, 1); + extents[0].byteStart = dkm->dfle_start; + extents[0].byteCount = dkm->dfle_length; + + /* + * dkm->dfl_flags vs IOStorageUnmapOptions + * #define DF_WAIT_SYNC 0x00000001 + * Wait for full write-out of free. + * IOStorageUnmapOptions is only 0 + */ + + /* issue unmap */ + error = LH_MEDIA(lhp)->unmap(LH_CLIENT(lhp), + extents, 1, 0); + + if (error != 0) { + dprintf("%s unmap: 0x%x\n", __func__, error); + // Convert IOReturn to errno + error = LH_MEDIA(lhp)->errnoFromReturn(error); + } + + IODelete(extents, IOStorageExtent, 1); + LH_MEDIA(lhp)->release(); + + return (error); +} + + +} /* extern "C" */ diff --git a/module/os/macos/zfs/ldi_osx.c b/module/os/macos/zfs/ldi_osx.c new file mode 100644 index 0000000000..7f0902d258 --- /dev/null +++ b/module/os/macos/zfs/ldi_osx.c @@ -0,0 +1,2440 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +/* + * LDI Subsystem on OS X: + * + * Designed as a drop-in replacement for sunldi.h and driver_lyr.c, + * LDI abstracts away platform-specific device handling. This allows + * vdev_disk.c to more closely match 'upstream' illumos/OpenZFS. + * + * LDI handles may use IOKit or vnode ops to locate and use devices. + * - This reduces the call stack and work needed for almost all IO. + * - Allows for vdev discovery and use during early boot, before the + * root device is mounted. + * - Having both types allows use of non-standard kexts which publish + * bdevsw block devices (without IOMedia). + * + * XXX Review correct call stack using dtrace, annotate stack size. + * Previously, vnode_open and VNOP_STRATEGY were used, which required + * allocating buf_t for IO. This meant translating byte offsets to + * block numbers for every IO. Once issued, dtrace showed that a very + * large stack was required: + * VNOP_STRATEGY macro performs work then calls + * spec_strategy (vop->vop_strategy) which performs work then calls + * dkiostrategy (syscall) which passes the IO to an IOMediaBSDClient + * IOMediaBSDClient performed work and passes to its IOMedia provider + * + * Beyond that is a common path shared by vnode and IOMedia: + * IOMedia performs work, then does prepareRequest, breakUpRequest, + * deBlockRequest, and executeRequest. + * Potentially passed down the provider stack through IOPartitionMap + * then to the whole-disk IOMedia, with more work + * Passed down through IOBlockStorageDriver, with more work + * Passed down through IOBlockStorageDevice, with more work + * Finally passed to Family-specific driver (AHCI, diskimage, etc.) + * + * By directly accessing IOMedia, the stack is reduced, and byte + * offsets are passed to read()/write() via ldi_strategy. + * We still need to allocate an IOMemoryDescriptor for the data buf, + * however only an IOMemoryDescriptor::withAddress() reference is + * required, similar to buf_setdataptr. + */ + +/* + * LDI Handle hash lists: + * + * During ldi_init, LH_HASH_SZ lists and locks are allocated. New handles + * will be added to the list indexed by the hash of the dev_t number. + * + * The hash function simply performs a modulus on the dev_t number based on + * the LH_HASH_SZ, as opposed to illumos which hashes based on the vnode + * pointer. + * This has been tested by hashing disk0, disk0s1, disk0s2, disk1, disk1s1, + * etc. to verify results were distributed across hash range. 
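As a rough illustration of the hash just described (not the actual implementation; the real LH_HASH_SZ value is defined later in ldi_osx.c and may differ):

#define	LH_HASH_SZ	32	/* hypothetical size, for illustration */

/* Index a handle's hash list directly from its dev_t number */
static inline int
ldi_handle_hash_sketch(dev_t device)
{
	return ((int)(device % LH_HASH_SZ));
}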
+ * + * OS X dev_t numbers should be unique unless a new device claims the same + * dev_t as a removed/failed device. This would only be a collision if we + * still have a handle for the failed device (notification/event handlers + * should remove these before that occurs). + * Since Offline status is a dead-end and the handle cannot be dereferenced + * or freed while iterating the hash list, it is safe to check the status + * and skip a handle if the status is Offline (without taking handle lock). + * + * XXX On illumos the hash function uses the vnode's pointer address as the + * unique key. Since vnode addresses are aligned to the size of the vnode + * struct, the hash function shifts the pointer address to the right in order + * to hash the unique bits of the address. OS X dev_t use all the bits of + * an unsigned 32-bit int. + */ + +/* + * LDI Handle locks: + * + * Handle references and list membership are protected by the hash list + * locks. + * Handle status and other fields are protected by a per-handle mutex. + * + * To prevent deadlocks and artificial delays, the hash list locks should + * be held only for handle hold/release and handle_add/remove (list + * iterate/insert/remove). Those functions avoid blocking. + * Use the handle mutex to change state, and avoid blocking there, too. + * + * XXX Right now handle_status_change does allocate for taskq_dispatch + * with the handle lock held, but uses TQ_NOSLEEP and verifies result. + * + * Non-locking ops such as ldi_strategy, ldi_get_size, and ldi_sync will + * check the instantaneous status/refs before attempting to proceed, and + * can only perform IO while the device is Online. + */ + +/* + * LDI Handle allocation: + * + * ldi_open_by_name and ldi_open_by_dev locate the device and call + * ldi_open_media_by_path, ldi_open_media_by_dev, or ldi_open_vnode_by_path. + * + * From ldi_open_by_media and _by_vnode, we call handle_alloc_{type}. Both + * call handle_alloc_common to allocate and configure the handle. + * + * A handle is allocated in the Closed state with 1 reference. The handle + * is added to the hash list on allocation, unless a duplicate handle exists + * (same dev_t as well as fmode, not in Offline status). If an existing + * handle is found, the newly allocated handle is freed. + * + * handle_open_start is called, which takes the handle lock to check current + * status. Each of these states is possible: + * Offline: device has disappeared between allocation and now (unlikely). + * Closed: new or recently closed handle, changes status to Opening. + * Closing: already in progress. Sleeps on lock and rechecks the status. + * Opening: already in progress. Sleeps on lock and rechecks the status. + * Online: no need to open device, just increment openref count. + * + * If handle_open_start changes the status to Opening, the device is opened + * by calling handle_open_iokit or handle_open_vnode. + * + * This differs from illumos driver_lyr.c where handle_alloc first opens a + * vnode for the device, allocates a handle by vnode, and finally checks for + * a duplicate handle in the list (open, alloc, find vs. alloc, open, find). + * To do so, illumos has a VOP_OPEN that is aware of layered-driver opens. + */ + +/* + * LDI Handle list membership: + * + * Allocate with one reference, to be used or released by the caller. + * Call handle_hold if additional references are needed. + * + * Call handle_release to drop reference. On last release, this calls + * handle_free (but does not remove the handle from the list, see below). 
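A condensed sketch of the handle_open_start() decision described above (illustrative only; the lh_lock and lh_cv field names are assumptions of this sketch, and the real routine appears later in ldi_osx.c):

static ldi_status_t
handle_open_start_sketch(struct ldi_handle *lhp)
{
	mutex_enter(&lhp->lh_lock);
	for (;;) {
		switch (lhp->lh_status) {
		case LDI_STATUS_ONLINE:
			/* Already open: just share it */
			lhp->lh_openref++;
			mutex_exit(&lhp->lh_lock);
			return (LDI_STATUS_ONLINE);
		case LDI_STATUS_CLOSED:
			/* Claim the open; caller performs the device open */
			lhp->lh_status = LDI_STATUS_OPENING;
			mutex_exit(&lhp->lh_lock);
			return (LDI_STATUS_OPENING);
		case LDI_STATUS_OPENING:
		case LDI_STATUS_CLOSING:
			/* Another caller is mid-transition; wait and recheck */
			cv_wait(&lhp->lh_cv, &lhp->lh_lock);
			continue;
		default:
			/* LDI_STATUS_OFFLINE: device is gone, dead end */
			mutex_exit(&lhp->lh_lock);
			return (LDI_STATUS_OFFLINE);
		}
	}
}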
+ * + * Call handle_add to determine if this handle is a duplicate, inserting + * handle into list or returning an existing handle with a hold. + * Check the result and call handle_release on the new handle if another + * handle was returned (new handle is not added to list). + * + * Each call to handle_find will take optionally take a hold, which should + * be released when no longer needed (used by handle_add). + * + * Calling handle_open increments lh_openref but does not change lh_ref. + * Caller should already have called handle_hold to get a reference. + * + * If lh_ref is 1, call handle_remove_locked (with list lock) to remove the + * handle from the list, then call handle_release_locked to remove last ref + * and free. + * A handle shouldn't remain in the list in Closed status with no refs. + * + * Calling handle_close with the last openref will automatically take list + * lock, call handle_remove_locked, and then handle_release_locked. + */ + +/* + * LDI Handle device objects: + * + * Multiple read-only opens share one read-only handle. + * Multiple read-write opens share one read-write handle. + * + * IOKit handles are allocated with the dev_t number and fmode. + * handle_open_iokit is passed an IOMedia object (which should have a + * retain held). + * Once handle_open returns, the IOMedia can be released by the caller. + * + * Vnode handles are allocated with the dev_t number and fmode. + * handle_open_vnode is passed a path (null-terminated C string). + * vnode_open increments both iocount and refcount, vnode_ref increments + * usecount, vnode_put drops iocount between ops. + * vnode_getwithref takes an iocount, and vnode_rele drops usecount + * before vnode_close decrements iocount and refcount. + */ + +/* + * LDI Handle status: + * + * #define LDI_STATUS_OFFLINE 0x0 + * #define LDI_STATUS_CLOSED 0x1 + * #define LDI_STATUS_CLOSING 0x2 + * #define LDI_STATUS_OPENING 0x3 + * #define LDI_STATUS_ONLINE 0x4 + * + * The handle lock will be taken to change status. + * + * Handle state can only progress from Closed to Opening status, and must + * have a reference held to do so. The lock is dropped for open and close + * ops while the handle is in Opening or Closing status. + * + * If the open is successful, the state is set to Online (with handle lock + * held). This state is required for IO operations to be started. The state + * may have changed by the time an IO completes. + * + * For IOKit devices, and vnode devices that have an IOMedia, a callback is + * registered for IOMedia termination which changes the state to Offline and + * posts event callbacks. + * + * Closing a handle, by the user or as a result of an event, sets the state + * to Closing. Once device close is issued, the state changes from Closing + * to Closed (even if close returned failure). + * + * A handle that still has refs and openrefs will remain in the Online + * state, dropping refs and openrefs each time ldi_close is called. + * + * If there are refs but no openrefs, it remains in the Closed state, and + * drops refs each time handle_release is called. + * This allows clients to call ldi_open_by_* to reopen the handle, in the + * case where one client is opening the handle at the same time another is + * closing it. + * + * If the device has gone missing (IOMedia terminated), the handle will + * change to Offline status. This is a dead-end which issues Offline Notify + * and Finalize events, then cleans up the handle once all clients have + * called ldi_close. 
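+ *
+ * As a sketch, a single-client lifetime (ref/openref counts after each
+ * step, per the functions below) looks like:
+ *
+ *	handle_alloc_common()		ref 1, openref 0, Closed
+ *	handle_open_start()/..._done()	ref 1, openref 1, Online
+ *	...ldi_strategy, ldi_ioctl, ldi_get_size...
+ *	ldi_close()			ref 0, openref 0, removed and freed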
+ * + * Once all references have been dropped, the handle is removed from the + * hash list with the hash list lock held, then freed. + */ + +/* + * LDI Events: + * + * XXX Degrade event is not implemented, doubt it will be useful. Intended + * to be set when a vdev that is backed by RAID becomes degraded. This is + * not a recommended use case for ZFS, and on OS X we only have AppleRAID + * or custom hardware or software RAID. Also per the comments, the vdev + * would be marked Degraded only to inform the user via zpool status. + * + * XXX Tested in VirtualBox by hotplugging a SATA device, have yet to + * test with USB removal, etc. + * + * ldi_register_ev_callback can be used to add a struct to the event + * callback list containing the handle pointer, a notify callback, and + * a finalize callback. + * + * Supported events are Offline Notify/Finalize, which will be + * posted when the device enters the Offline state (IOMedia terminated). + * + * The event callback functions should be non-blocking. It is recommended + * to update a flag that can be checked prior to calling ldi_strategy. + */ + +/* + * LDI client interfaces: + * + * ldi_open_by_name + * ldi_open_by_dev + * ldi_close + * + * ldi_register_ev_callback + * ldi_unregister_ev_callback + * + * ldi_get_size + * ldi_sync + * ldi_ioctl + * ldi_strategy + * + * ldi_bioinit + * ldi_biofini + */ + +/* + * LDI Buffers: + * + * ldi_strategy uses an abstract buffer for IO, so clients do not need to + * be concerned with type-specific buf_t and IOMemoryDescriptor handling. + * + * Allocate and free ldi_buf_t manually, calling ldi_bioinit after alloc + * and ldi_biofini prior to free. + * + * Synchronous IO can be performed by setting b_iodone to NULL. + * + * Allocate and use a buffer like this: + * + * ldi_buf_t *bp = (ldi_buf_t *)kmem_alloc(sizeof (ldi_buf_t), KM_SLEEP); + * // Verify allocation before proceeding + * error = ldi_bioinit(bp); + * bp->b_bcount = size; + * bp->b_bufsize = size; + * bp->b_offset = offset; + * bp->b_data = data_ptr; + * bp->b_flags = B_BUSY | B_NOCACHE | B_READ; // For example + * bp->b_iodone = &io_intr_func; // For async IO, omit for sync IO + * ldi_strategy(handle, bp); // Issue IO + * + * With an async callback function such as: + * void io_intr_func(ldi_buf_t bp, void *param) + * { + * // Check/copyout bp->b_error and bp->b_resid + * ldi_biofini(bp); + * kmem_free(bp, sizeof (ldi_buf_t)); + * } + */ + +/* + * XXX LDI TO DO + * + * LDI handle stats. In debug builds, we have IO counters - number of IOs, + * number of bytes in/out. + * kstats for handle counts and sysctls for vnode/IOKit modes also implemented. + * + * To implement events, both vnode and IOKit handles register for matching + * notifications from the IOMedia object (if found). + * Using subclassed IOService can also receive IOMessage events, which + * would be issued earlier. + * + * Vnode handles with no IOMedia could post events on (multiple) IO failures. + */ + +/* + * ZFS internal + */ +#include +#include +#include +#include +#include + +/* + * LDI Includes + */ +#include + +/* Debug prints */ +#ifdef DEBUG +#define LDI_EVDBG(args) cmn_err args +#define LDI_EVTRC(args) cmn_err args +#else +#define LDI_EVDBG(args) do {} while (0) +#define LDI_EVTRC(args) do {} while (0) +#endif + +#ifdef DEBUG +#ifdef dprintf +#undef dprintf +#endif + +#define dprintf ldi_log + +#define ldi_log(fmt, ...) 
do { \
+	printf(fmt, __VA_ARGS__); \
+	/* delay(hz>>1); */ \
+_NOTE(CONSTCOND) } while (0)
+#endif
+
+/*
+ * Defines
+ * Comment out defines to alter behavior.
+ */
+// #define LDI_ZERO /* For debugging, zero allocations */
+
+/* Find IOMedia by matching on the BSD disk name. */
+static boolean_t ldi_use_iokit_from_path = 1;
+
+/* Find IOMedia by matching on the BSD major/minor (dev_t) number. */
+static boolean_t ldi_use_iokit_from_dev = 1;
+
+/*
+ * Find dev_t by vnode_lookup.
+ * Resolves symlinks to block devices, including InvariantDisk links.
+ */
+static boolean_t ldi_use_dev_from_path = 1;
+
+/*
+ * Open device by vnode if all else fails.
+ * Not intended to be a fallback for unsuccessful IOMedia open, but rather
+ * for bdev devices that do not have an IOMedia (published by other KEXTs).
+ */
+static boolean_t ldi_use_vnode_from_path = 1;
+
+/*
+ * Sysctls
+ */
+#include
+SYSCTL_DECL(_ldi);
+SYSCTL_NODE(, OID_AUTO, ldi, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "");
+SYSCTL_NODE(_ldi, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "");
+SYSCTL_UINT(_ldi_debug, OID_AUTO, use_iokit_from_dev,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_iokit_from_dev, 0,
+    "ZFS LDI use iokit_from_dev");
+SYSCTL_UINT(_ldi_debug, OID_AUTO, use_iokit_from_path,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_iokit_from_path, 0,
+    "ZFS LDI use iokit_from_path");
+SYSCTL_UINT(_ldi_debug, OID_AUTO, use_dev_from_path,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_dev_from_path, 0,
+    "ZFS LDI use dev_from_path");
+SYSCTL_UINT(_ldi_debug, OID_AUTO, use_vnode_from_path,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &ldi_use_vnode_from_path, 0,
+    "ZFS LDI use vnode_from_path");
+
+/*
+ * Globals
+ */
+static volatile int64_t ldi_handle_hash_count;
+
+static list_t ldi_handle_hash_list[LH_HASH_SZ];
+static kmutex_t ldi_handle_hash_lock[LH_HASH_SZ];
+
+/*
+ * Use of "ldi_ev_callback_list" must be protected by ldi_ev_lock()
+ * and ldi_ev_unlock().
+ */ +static struct ldi_ev_callback_list ldi_ev_callback_list; + +static uint32_t ldi_ev_id_pool = 0; + +struct ldi_ev_cookie { + char *ck_evname; + uint_t ck_sync; + uint_t ck_ctype; +}; + +#define CT_DEV_EV_OFFLINE 0x1 +#define CT_DEV_EV_DEGRADED 0x2 +static struct ldi_ev_cookie ldi_ev_cookies[] = { + {LDI_EV_OFFLINE, 1, CT_DEV_EV_OFFLINE}, + {LDI_EV_DEGRADE, 0, CT_DEV_EV_DEGRADED}, + {LDI_EV_DEVICE_REMOVE, 0, 0}, + {NULL} /* must terminate list */ +}; + +/* + * kstats + */ +static kstat_t *ldi_ksp; + +typedef struct ldi_stats { + kstat_named_t handle_count; + kstat_named_t handle_count_iokit; + kstat_named_t handle_count_vnode; + kstat_named_t handle_refs; + kstat_named_t handle_open_rw; + kstat_named_t handle_open_ro; +} ldi_stats_t; + +static ldi_stats_t ldi_stats = { + { "handle_count", KSTAT_DATA_UINT64 }, + { "handle_count_iokit", KSTAT_DATA_UINT64 }, + { "handle_count_vnode", KSTAT_DATA_UINT64 }, + { "handle_refs", KSTAT_DATA_UINT64 }, + { "handle_open_rw", KSTAT_DATA_UINT64 }, + { "handle_open_ro", KSTAT_DATA_UINT64 } +}; + +#define LDISTAT(stat) (ldi_stats.stat.value.ui64) +#define LDISTAT_INCR(stat, val) \ +atomic_add_64(&ldi_stats.stat.value.ui64, (val)) +#define LDISTAT_BUMP(stat) LDISTAT_INCR(stat, 1) +#define LDISTAT_BUMPDOWN(stat) LDISTAT_INCR(stat, -1) + +/* + * Define macros for accessing layered driver hash structures + */ +#define LH_HASH(dev) handle_hash_func(dev) + +static inline uint_t +handle_hash_func(dev_t device) +{ + /* Just cast, macro does modulus to hash value */ + return ((uint_t)device % LH_HASH_SZ); +} + +typedef struct status_change_args { + struct ldi_handle *lhp; + int new_status; +} status_change_args_t; + +static void +handle_status_change_callback(void *arg) +{ + status_change_args_t *sc = (status_change_args_t *)arg; + + /* Validate arg struct */ + if (!sc || !sc->lhp) { + dprintf("%s missing callback struct %p or lh\n", + __func__, sc); + return; + } + if (sc->new_status > LDI_STATUS_ONLINE) { + dprintf("%s invalid status %d\n", + __func__, sc->new_status); + return; + } + + dprintf("%s Invoking notify for handle %p status %d\n", + __func__, sc->lhp, sc->new_status); + ldi_invoke_notify(0 /* dip */, sc->lhp->lh_dev, S_IFBLK, + LDI_EV_OFFLINE, sc->lhp); + + dprintf("%s Invoking finalize for handle %p status %d\n", + __func__, sc->lhp, sc->new_status); + ldi_invoke_finalize(0 /* dip */, sc->lhp->lh_dev, S_IFBLK, + LDI_EV_OFFLINE, LDI_EV_SUCCESS, sc->lhp); + + /* Free callback struct */ + kmem_free(sc, sizeof (status_change_args_t)); +} + +/* Protected by handle lock */ +static int +handle_status_change_locked(struct ldi_handle *lhp, int new_status) +{ + status_change_args_t *sc = 0; + + /* Validate lhp */ + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + if (new_status > LDI_STATUS_ONLINE) { + dprintf("%s invalid status %d\n", __func__, new_status); + return (EINVAL); + } + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + ASSERT(MUTEX_HELD(&lhp->lh_lock)); + + /* Set the status first */ + lhp->lh_status = new_status; + + /* Only Offline needs an event */ + if (new_status != LDI_STATUS_OFFLINE) { + dprintf("%s skipping status %d\n", __func__, new_status); + return (0); + } + + dprintf("%s new_status is Offline %d\n", __func__, new_status); + + /* Allocate struct to pass to event callback */ + /* Allocating with lock held, use KM_NOSLEEP */ + sc = (status_change_args_t *)kmem_alloc(sizeof (status_change_args_t), + KM_NOSLEEP); + if (!sc) { + dprintf("%s couldn't allocate callback struct\n", + __func__); + 
return (ENOMEM); + } + sc->lhp = lhp; + sc->new_status = new_status; + + mutex_exit(&lhp->lh_lock); /* Currently needs to drop lock */ + handle_status_change_callback((void *)sc); + mutex_enter(&lhp->lh_lock); /* Retake before return */ + + return (0); +} + +/* Protected by handle lock */ +int +handle_status_change(struct ldi_handle *lhp, int new_status) +{ + int error; + + /* Validate lh and new_status */ + if (!lhp) { + dprintf("%s missing handle\n", __func__); + return (EINVAL); + } + if (new_status > LDI_STATUS_ONLINE) { + dprintf("%s invalid state %d\n", __func__, new_status); + return (EINVAL); + } + + mutex_enter(&lhp->lh_lock); + error = handle_status_change_locked(lhp, new_status); + mutex_exit(&lhp->lh_lock); + + return (error); +} + +/* Protected by hash list lock */ +void +handle_hold_locked(struct ldi_handle *lhp) +{ +#ifdef DEBUG + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); +#endif + + /* Increment ref count and kstat */ + lhp->lh_ref++; + LDISTAT_BUMP(handle_refs); +} + +/* Protected by hash list lock */ +void +handle_hold(struct ldi_handle *lhp) +{ + int index; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + + index = LH_HASH(lhp->lh_dev); + mutex_enter(&ldi_handle_hash_lock[index]); + handle_hold_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); +} + +/* + * Locate existing handle in linked list, may return NULL. Optionally places a + * hold on found handle. + */ +static struct ldi_handle * +handle_find_locked(dev_t device, int fmode, boolean_t hold) +{ + struct ldi_handle *retlhp = NULL, *lhp; + int index = LH_HASH(device); + + /* Validate device */ + if (device == 0) { + dprintf("%s invalid device\n", __func__); + return (NULL); + } + /* If fmode is 0, find any handle with matching dev_t */ + + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); + + /* Iterate over handle hash list */ + for (lhp = list_head(&ldi_handle_hash_list[index]); + lhp != NULL; + lhp = list_next(&ldi_handle_hash_list[index], lhp)) { + /* Check for matching dev_t and fmode (if set) */ + if (lhp->lh_dev != device) { + continue; + } + + /* Special case for find any */ + if (fmode == 0) { + /* Found a match */ + retlhp = lhp; + break; + } + + /* fmode must match write level */ + if (((lhp->lh_fmode & FWRITE) && !(fmode & FWRITE)) || + (!(lhp->lh_fmode & FWRITE) && (fmode & FWRITE))) { + continue; + } + + /* Found a match */ + retlhp = lhp; + break; + } + + /* Take hold, if requested */ + if (hold && retlhp) { + /* Caller asked for hold on found handle */ + handle_hold_locked(retlhp); + } + + return (retlhp); +} + +/* + * Call without lock held to find a handle by dev_t, + * optionally placing a hold on the found handle. 
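+ *
+ * For example (a sketch; fmode 0 matches any handle for the dev_t):
+ *
+ *	lhp = handle_find(device, 0, B_TRUE);
+ *	if (lhp != NULL) {
+ *		...inspect lhp...
+ *		handle_release(lhp);	// drop the hold taken above
+ *	}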
+ */ +struct ldi_handle * +handle_find(dev_t device, int fmode, boolean_t hold) +{ + struct ldi_handle *lhp; + int index = LH_HASH(device); + + if (device == 0) { + dprintf("%s invalid device\n", __func__); + return (NULL); + } + + /* Lock for duration of find */ + mutex_enter(&ldi_handle_hash_lock[index]); + + /* Find handle by dev_t (with hold) */ + lhp = handle_find_locked(device, fmode, hold); + + /* Unlock and return handle (could be NULL) */ + mutex_exit(&ldi_handle_hash_lock[index]); + return (lhp); +} + +static void +handle_free(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + + /* Validate lhp, references, and status */ + if (lhp->lh_ref != 0 || + lhp->lh_status != LDI_STATUS_CLOSED) { + dprintf("%s ref %d status %d\n", __func__, lhp->lh_ref, + lhp->lh_status); + } + + /* Remove notification handler */ + if (handle_remove_notifier(lhp) != 0) { + dprintf("%s lhp %p notifier %s\n", + __func__, lhp, "couldn't be removed"); + } + + /* Destroy condvar and mutex */ + cv_destroy(&lhp->lh_cv); + mutex_destroy(&lhp->lh_lock); + + /* Decrement kstat handle count */ + LDISTAT_BUMPDOWN(handle_count); + /* IOKit or vnode */ + switch (lhp->lh_type) { + case LDI_TYPE_IOKIT: + /* Decrement kstat handle count and free iokit_tsd */ + LDISTAT_BUMPDOWN(handle_count_iokit); + handle_free_iokit(lhp); + break; + + case LDI_TYPE_VNODE: + /* Decrement kstat handle count and free vnode_tsd */ + LDISTAT_BUMPDOWN(handle_count_vnode); + handle_free_vnode(lhp); + break; + default: + dprintf("%s invalid handle type\n", __func__); + break; + } + + /* Deallocate handle */ + dprintf("%s freeing %p\n", __func__, lhp); + kmem_free(lhp, sizeof (struct ldi_handle)); + lhp = 0; +} + +/* + * Remove handle from list, decrementing counters + */ +static void +handle_remove_locked(struct ldi_handle *lhp) +{ + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); + + /* Remove from list, update handle count */ + list_remove(&ldi_handle_hash_list[index], lhp); + OSDecrementAtomic(&ldi_handle_hash_count); +} + +void +handle_remove(struct ldi_handle *lhp) +{ + int index = LH_HASH(lhp->lh_dev); + + mutex_enter(&ldi_handle_hash_lock[index]); + handle_remove_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); +} + +/* Protected by hash list lock */ +static void +handle_release_locked(struct ldi_handle *lhp) +{ + boolean_t lastrelease = B_FALSE; + +#ifdef DEBUG + ASSERT3U(lhp, !=, NULL); + int index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); +#endif + + if (lhp->lh_ref != 0) { + lhp->lh_ref--; + LDISTAT_BUMPDOWN(handle_refs); + } else { + dprintf("%s with 0 refs\n", __func__); + } + + dprintf("%s %x remaining holds\n", __func__, lhp->lh_ref); + + /* If last open ref was dropped */ + lastrelease = (lhp->lh_ref == 0); + + if (lastrelease) { + dprintf("%s removing handle %p from list\n", __func__, lhp); + handle_remove_locked(lhp); + dprintf("%s freeing handle %p\n", __func__, lhp); + handle_free(lhp); + } +} + +/* Protected by hash list lock */ +void +handle_release(struct ldi_handle *lhp) +{ + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + + mutex_enter(&ldi_handle_hash_lock[index]); + handle_release_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); +} + +/* + * Add new handle to list. 
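+ *
+ * The expected caller pattern is sketched below (handle_alloc_vnode in
+ * ldi_vnode.c follows the same shape):
+ *
+ *	lhp = handle_alloc_common(type, device, fmode);
+ *	retlhp = handle_add(lhp);
+ *	if (retlhp != lhp) {
+ *		handle_release(lhp);	// duplicate exists; drop new handle
+ *		lhp = retlhp;		// use the existing (held) handle
+ *	}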
+ */ +static struct ldi_handle * +handle_add_locked(struct ldi_handle *lhp) +{ + struct ldi_handle *retlhp; + int index = 0; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + + /* Lock should be held */ + index = LH_HASH(lhp->lh_dev); + ASSERT(MUTEX_HELD(&ldi_handle_hash_lock[index])); + + /* Search for existing handle */ + if ((retlhp = handle_find_locked(lhp->lh_dev, lhp->lh_fmode, + B_TRUE)) != NULL) { + dprintf("%s found handle %p\n", __func__, retlhp); + return (retlhp); + } + + /* Insert into list */ + list_insert_head(&ldi_handle_hash_list[index], lhp); + + /* Update handle count */ + OSIncrementAtomic(&ldi_handle_hash_count); + + /* Return success */ + return (lhp); +} + +/* + * Caller should check if returned handle is the same and free new + * handle if an existing handle was returned + */ +struct ldi_handle * +handle_add(struct ldi_handle *lhp) +{ + struct ldi_handle *retlhp; + int index; + + ASSERT3U(lhp, !=, NULL); + index = LH_HASH(lhp->lh_dev); + + mutex_enter(&ldi_handle_hash_lock[index]); + retlhp = handle_add_locked(lhp); + mutex_exit(&ldi_handle_hash_lock[index]); + + return (retlhp); +} + +/* + * Returns a handle with 1 reference and status Closed + */ +#ifdef illumos +static struct ldi_handle * +handle_alloc(vnode_t *vp, struct ldi_ident_t *li) +#else /* illumos */ +struct ldi_handle * +handle_alloc_common(uint_t type, dev_t device, int fmode) +#endif /* !illumos */ +{ + struct ldi_handle *new_lh; + size_t len; + + /* Validate arguments */ + if ((type != LDI_TYPE_IOKIT && type != LDI_TYPE_VNODE) || + device == 0 || fmode == 0) { + dprintf("%s Invalid type %d, device %d, or fmode %d\n", + __func__, type, device, fmode); + return (NULL); + } + + /* Allocate and verify */ + len = sizeof (struct ldi_handle); + if (NULL == (new_lh = (struct ldi_handle *)kmem_alloc(len, + KM_SLEEP))) { + dprintf("%s couldn't allocate ldi_handle\n", __func__); + return (NULL); + } +#ifdef LDI_ZERO + /* Clear the struct for safety */ + bzero(new_lh, len); +#endif + + /* Create handle lock */ + mutex_init(&new_lh->lh_lock, NULL, MUTEX_DEFAULT, NULL); + /* And condvar */ + cv_init(&new_lh->lh_cv, NULL, CV_DEFAULT, NULL); + + /* + * Set the handle type, which dictates the type of device pointer + * and buffers used for the lifetime of the ldi_handle + */ + new_lh->lh_type = type; + /* Set dev_t (major/minor) device number */ + new_lh->lh_dev = device; + + /* Clear list head */ + new_lh->lh_node.list_next = NULL; + new_lh->lh_node.list_prev = NULL; + + /* Initialize with 1 handle ref and 0 open refs */ + new_lh->lh_ref = 1; + new_lh->lh_openref = 0; + + /* Clear type-specific device data */ + new_lh->lh_tsd.iokit_tsd = 0; + /* No need to clear vnode_tsd in union */ + new_lh->lh_notifier = 0; + + /* Assign fmode */ + new_lh->lh_fmode = fmode; + + /* Alloc in status Closed */ + new_lh->lh_status = LDI_STATUS_CLOSED; + + /* Increment kstats */ + LDISTAT_BUMP(handle_count); + LDISTAT_BUMP(handle_refs); + if (type == LDI_TYPE_IOKIT) { + LDISTAT_BUMP(handle_count_iokit); + } else if (type == LDI_TYPE_VNODE) { + LDISTAT_BUMP(handle_count_vnode); + } + + return (new_lh); +} + +static void +handle_set_open_locked(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT(MUTEX_HELD(&lhp->lh_lock)); + + /* Increment number of open clients */ + lhp->lh_openref++; + + /* Increment kstats */ + if (lhp->lh_fmode & FWRITE) { + LDISTAT_BUMP(handle_open_rw); + } else { + LDISTAT_BUMP(handle_open_ro); + } +} + +#if 0 +static void +handle_set_open(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, 
NULL); + + mutex_enter(&lhp->lh_lock); + handle_set_open_locked(lhp); + mutex_exit(&lhp->lh_lock); +} +#endif + +static void +handle_clear_open_locked(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT(MUTEX_HELD(&lhp->lh_lock)); + + /* Decrement number of open clients */ + if (lhp->lh_openref == 0) { + dprintf("%s with 0 open refs\n", __func__); + return; + } + + /* Decrement kstats */ + lhp->lh_openref--; + if (lhp->lh_fmode & FWRITE) { + LDISTAT_BUMPDOWN(handle_open_rw); + } else { + LDISTAT_BUMPDOWN(handle_open_ro); + } +} + +#if 0 +static inline void +handle_clear_open(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_dev, !=, 0); + ASSERT3U(lhp->lh_openref, !=, 0); + + mutex_enter(&lhp->lh_lock); + handle_clear_open_locked(lhp, lhp->lh_fmode); + mutex_exit(&lhp->lh_lock); +} +#endif + +static int +handle_close(struct ldi_handle *lhp) +{ +#ifdef DEBUG + int openrefs; +#endif + int error = EINVAL; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_ref, !=, 0); + ASSERT3U(lhp->lh_openref, !=, 0); + ASSERT(lhp->lh_type == LDI_TYPE_IOKIT || + lhp->lh_type == LDI_TYPE_VNODE); + + /* Take lock */ + mutex_enter(&lhp->lh_lock); + + /* + * Possible statuses: + * Online with one or more openref + * Offline due to IOMedia termination, one or more openref remain + * Impossible or programming error: + * Closing and Closed should only be set with 0 openref + * Opening should have 0 openref so far, and clients should not be + * calling ldi_close + */ + switch (lhp->lh_status) { + case LDI_STATUS_ONLINE: + if (lhp->lh_openref == 0) { + /* Unlock and return error */ + mutex_exit(&lhp->lh_lock); + /* Shouldn't happen */ + dprintf("%s status Online with 0 openrefs\n", + __func__); + return (ENXIO); + } + + /* If multiple open refs are held */ + if (lhp->lh_openref > 1) { + goto drop_openref; + } + + /* Otherwise open with last open ref */ + /* change status to closing and proceed */ + handle_status_change_locked(lhp, LDI_STATUS_CLOSING); + /* Unlock and exit loop */ + mutex_exit(&lhp->lh_lock); + goto do_close; + + case LDI_STATUS_OFFLINE: + if (lhp->lh_openref == 0) { + /* Unlock and return error */ + mutex_exit(&lhp->lh_lock); + /* Shouldn't happen */ + dprintf("%s status Offline with 0 openrefs\n", + __func__); + return (ENXIO); + } + + /* + * Otherwise the device was marked missing and clients need + * to drop openrefs until it can be released. 
+ */ + goto drop_openref; + + default: + mutex_exit(&lhp->lh_lock); + dprintf("%s invalid handle status %d\n", + __func__, lhp->lh_status); + return (ENXIO); + } + +drop_openref: + /* Just decrement open refs/stats */ + handle_clear_open_locked(lhp); +#ifdef DEBUG + /* Save openrefs to report after unlock */ + openrefs = lhp->lh_openref; +#endif + mutex_exit(&lhp->lh_lock); + +#ifdef DEBUG + dprintf("%s has %d remaining openrefs\n", __func__, openrefs); +#endif + return (0); + +do_close: + /* Remove notification handler */ + if (lhp->lh_notifier) { + error = handle_remove_notifier(lhp); + if (error) { + dprintf("%s lhp %p notifier %p error %d %s\n", + __func__, lhp, lhp->lh_notifier, error, + "couldn't be removed"); + /* Proceeds with close */ + } + } + + /* IOMedia or vnode */ + switch (lhp->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_close_iokit(lhp); + /* Preserve error for return */ + break; + case LDI_TYPE_VNODE: + error = handle_close_vnode(lhp); + /* Preserve error for return */ + break; + } + +#ifdef DEBUG + if (error != 0) { + /* We will still set the handle to Closed status */ + dprintf("%s error %d from handle_close_{type}\n", + __func__, error); + } +#endif + + /* Take lock to drop openref and set status */ + mutex_enter(&lhp->lh_lock); + handle_clear_open_locked(lhp); + handle_status_change_locked(lhp, LDI_STATUS_CLOSED); + + /* Wake any waiting opens and unlock */ + cv_signal(&lhp->lh_cv); + mutex_exit(&lhp->lh_lock); + +dprintf("%s returning %d\n", __func__, error); + return (error); +} + +ldi_status_t +handle_open_start(struct ldi_handle *lhp) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_ref, !=, 0); + + /* Take lock */ + mutex_enter(&lhp->lh_lock); + /* Loop if the handle is in opening or closing status */ + do { + /* XXX Needs sleep timeout */ + switch (lhp->lh_status) { + case LDI_STATUS_ONLINE: + /* Increment readonly / readwrite count */ + handle_set_open_locked(lhp); + mutex_exit(&lhp->lh_lock); + + /* Success */ + return (LDI_STATUS_ONLINE); + + case LDI_STATUS_CLOSED: + /* Not yet open, change status to opening and proceed */ + handle_status_change_locked(lhp, LDI_STATUS_OPENING); + + /* Unlock and exit loop */ + mutex_exit(&lhp->lh_lock); + /* Return success */ + return (LDI_STATUS_OPENING); + + case LDI_STATUS_OPENING: + case LDI_STATUS_CLOSING: + /* Open or close in progress, sleep until signaled */ + dprintf("%s sleeping on lock\n", __func__); + cv_wait(&lhp->lh_cv, &lhp->lh_lock); + continue; + default: + mutex_exit(&lhp->lh_lock); + dprintf("%s invalid handle status %d\n", + __func__, lhp->lh_status); + return (LDI_STATUS_OFFLINE); + } + } while (1); + + /* Shouldn't reach this */ + return (LDI_STATUS_CLOSED); +} + +void +handle_open_done(struct ldi_handle *lhp, ldi_status_t new_status) +{ + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_OPENING); + + /* Lock to change status */ + mutex_enter(&lhp->lh_lock); + + if (new_status != LDI_STATUS_ONLINE) { + /* Set status, issues event */ + handle_status_change_locked(lhp, LDI_STATUS_CLOSED); + } else { + /* Increment open count and fmode */ + handle_set_open_locked(lhp); + /* Set status, issues event */ + handle_status_change_locked(lhp, LDI_STATUS_ONLINE); + } + + /* Wake any waiting opens and unlock */ + cv_signal(&lhp->lh_cv); + mutex_exit(&lhp->lh_lock); + + /* + * Flush out any old buffers remaining from + * a previous use, only if opening read-write. 
+ */ + if (new_status == LDI_STATUS_ONLINE && + (lhp->lh_fmode & FWRITE) && + ldi_sync((ldi_handle_t)lhp) != 0) { + dprintf("%s ldi_sync failed\n", __func__); + } +} + +/* + * Release all remaining handles (during ldi_fini) + * Unless something went wrong, all handles should + * be closed and have zero references. + */ +static void +handle_hash_release() +{ + struct ldi_handle *lhp; + int index, refs, j; + + for (index = 0; index < LH_HASH_SZ; index++) { + mutex_enter(&ldi_handle_hash_lock[index]); + if (!list_empty(&ldi_handle_hash_list[index])) { + dprintf("%s still have LDI handle(s) in list %d\n", + __func__, index); + } + + /* Iterate over the list */ + while ((lhp = list_head(&ldi_handle_hash_list[index]))) { + /* remove from list to deallocate */ + list_remove(&ldi_handle_hash_list[index], lhp); + + /* Update handle count */ + OSDecrementAtomic(&ldi_handle_hash_count); + + dprintf("%s releasing %p with %u refs and status %d\n", + __func__, lhp, lhp->lh_ref, lhp->lh_status); + /* release holds */ + refs = lhp->lh_ref; + for (j = 0; j < refs; j++) { + handle_release_locked(lhp); + } + lhp = 0; + } + + list_destroy(&ldi_handle_hash_list[index]); + mutex_exit(&ldi_handle_hash_lock[index]); + mutex_destroy(&ldi_handle_hash_lock[index]); + } +} + +/* + * LDI Event functions + */ +char * +ldi_ev_get_type(ldi_ev_cookie_t cookie) +{ + int i; + struct ldi_ev_cookie *cookie_impl = (struct ldi_ev_cookie *)cookie; + + for (i = 0; ldi_ev_cookies[i].ck_evname != NULL; i++) { + if (&ldi_ev_cookies[i] == cookie_impl) { + LDI_EVTRC((CE_NOTE, "ldi_ev_get_type: LDI: %s", + ldi_ev_cookies[i].ck_evname)); + return (ldi_ev_cookies[i].ck_evname); + } + } + + return ("UNKNOWN EVENT"); +} + +static int +ldi_native_cookie(ldi_ev_cookie_t cookie) +{ + int i; + struct ldi_ev_cookie *cookie_impl = (struct ldi_ev_cookie *)cookie; + + for (i = 0; ldi_ev_cookies[i].ck_evname != NULL; i++) { + if (&ldi_ev_cookies[i] == cookie_impl) { + LDI_EVTRC((CE_NOTE, "ldi_native_cookie: native LDI")); + return (1); + } + } + + LDI_EVTRC((CE_NOTE, "ldi_native_cookie: is NDI")); + return (0); +} + +static ldi_ev_cookie_t +ldi_get_native_cookie(const char *evname) +{ + int i; + + for (i = 0; ldi_ev_cookies[i].ck_evname != NULL; i++) { + if (strcmp(ldi_ev_cookies[i].ck_evname, evname) == 0) { + LDI_EVTRC((CE_NOTE, "ldi_get_native_cookie: found")); + return ((ldi_ev_cookie_t)&ldi_ev_cookies[i]); + } + } + + LDI_EVTRC((CE_NOTE, "ldi_get_native_cookie: NOT found")); + return (NULL); +} + +/* + * ldi_ev_lock() needs to be recursive, since layered drivers may call + * other LDI interfaces (such as ldi_close() from within the context of + * a notify callback. Since the notify callback is called with the + * ldi_ev_lock() held and ldi_close() also grabs ldi_ev_lock, the lock needs + * to be recursive. 
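+ *
+ * For reference, a consumer registration might look like this sketch
+ * (my_notify and going_away are placeholder names; the notify callback
+ * only flags the device, as recommended in the LDI Events comment above,
+ * and may also legally call ldi_close(), which is why this lock must be
+ * recursive):
+ *
+ *	static int
+ *	my_notify(ldi_handle_t lh, ldi_ev_cookie_t cookie, void *arg,
+ *	    void *ev_data)
+ *	{
+ *		*(boolean_t *)arg = B_TRUE;	// stop issuing new IO
+ *		return (LDI_EV_SUCCESS);
+ *	}
+ *
+ *	boolean_t going_away = B_FALSE;	// consumer-owned flag
+ *	ldi_ev_cookie_t cookie;
+ *	ldi_callback_id_t id;
+ *	ldi_ev_callback_t cb;
+ *
+ *	cb.cb_vers = LDI_EV_CB_VERS;
+ *	cb.cb_notify = my_notify;
+ *	cb.cb_finalize = NULL;		// notify-only is accepted
+ *	if (ldi_ev_get_cookie(lh, LDI_EV_OFFLINE, &cookie) == LDI_EV_SUCCESS)
+ *		(void) ldi_ev_register_callbacks(lh, cookie, &cb,
+ *		    &going_away, &id);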
+ */ +static void +ldi_ev_lock(void) +{ + LDI_EVTRC((CE_NOTE, "ldi_ev_lock: entered")); + + mutex_enter(&ldi_ev_callback_list.le_lock); + if (ldi_ev_callback_list.le_thread == curthread) { + ASSERT(ldi_ev_callback_list.le_busy >= 1); + ldi_ev_callback_list.le_busy++; + } else { + while (ldi_ev_callback_list.le_busy) + cv_wait(&ldi_ev_callback_list.le_cv, + &ldi_ev_callback_list.le_lock); + ASSERT(ldi_ev_callback_list.le_thread == NULL); + ldi_ev_callback_list.le_busy = 1; + ldi_ev_callback_list.le_thread = curthread; + } + mutex_exit(&ldi_ev_callback_list.le_lock); + + LDI_EVTRC((CE_NOTE, "ldi_ev_lock: exit")); +} + +static void +ldi_ev_unlock(void) +{ + LDI_EVTRC((CE_NOTE, "ldi_ev_unlock: entered")); + mutex_enter(&ldi_ev_callback_list.le_lock); + ASSERT(ldi_ev_callback_list.le_thread == curthread); + ASSERT(ldi_ev_callback_list.le_busy >= 1); + + ldi_ev_callback_list.le_busy--; + if (ldi_ev_callback_list.le_busy == 0) { + ldi_ev_callback_list.le_thread = NULL; + cv_signal(&ldi_ev_callback_list.le_cv); + } + mutex_exit(&ldi_ev_callback_list.le_lock); + LDI_EVTRC((CE_NOTE, "ldi_ev_unlock: exit")); +} + +int +ldi_ev_get_cookie(ldi_handle_t lh, char *evname, ldi_ev_cookie_t *cookiep) +{ + ldi_ev_cookie_t tcookie; + + LDI_EVDBG((CE_NOTE, "ldi_ev_get_cookie: entered: evname=%s", + evname ? evname : "")); + + if (lh == NULL || evname == NULL || + strlen(evname) == 0 || cookiep == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_ev_get_cookie: invalid args")); + return (LDI_EV_FAILURE); + } + + *cookiep = NULL; + + /* + * First check if it is a LDI native event + */ + tcookie = ldi_get_native_cookie(evname); + if (tcookie) { + LDI_EVDBG((CE_NOTE, "ldi_ev_get_cookie: got native cookie")); + *cookiep = tcookie; + return (LDI_EV_SUCCESS); + } + + return (LDI_EV_FAILURE); +} + +int +ldi_ev_register_callbacks(ldi_handle_t lh, ldi_ev_cookie_t cookie, + ldi_ev_callback_t *callb, void *arg, ldi_callback_id_t *id) +{ + struct ldi_handle *lhp = (struct ldi_handle *)lh; + ldi_ev_callback_impl_t *lecp; + + if (lh == NULL || cookie == NULL || callb == NULL || id == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: Invalid args")); + return (LDI_EV_FAILURE); + } + + if (callb->cb_vers != LDI_EV_CB_VERS) { + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: Invalid vers")); + return (LDI_EV_FAILURE); + } + + if (callb->cb_notify == NULL && callb->cb_finalize == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: NULL callb")); + return (LDI_EV_FAILURE); + } + + *id = 0; + + lecp = kmem_zalloc(sizeof (ldi_ev_callback_impl_t), KM_SLEEP); + + ldi_ev_lock(); + + /* + * Add the notify/finalize callback to the LDI's list of callbacks. 
+ */ + lecp->lec_lhp = lhp; + + lecp->lec_dev = lhp->lh_dev; + lecp->lec_spec = S_IFBLK; + + lecp->lec_notify = callb->cb_notify; + lecp->lec_finalize = callb->cb_finalize; + lecp->lec_arg = arg; + lecp->lec_cookie = cookie; + + lecp->lec_id = (void *)(uintptr_t)(++ldi_ev_id_pool); + + list_insert_tail(&ldi_ev_callback_list.le_head, lecp); + + *id = (ldi_callback_id_t)lecp->lec_id; + + ldi_ev_unlock(); + + LDI_EVDBG((CE_NOTE, "ldi_ev_register_callbacks: registered " + "notify/finalize")); + + return (LDI_EV_SUCCESS); +} + +static int +ldi_ev_device_match(ldi_ev_callback_impl_t *lecp, __unused dev_info_t *dip, + dev_t dev, int spec_type) +{ + ASSERT(lecp); + ASSERT(dev != DDI_DEV_T_NONE); + ASSERT(dev != NODEV); + ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) || + (spec_type == S_IFCHR || spec_type == S_IFBLK)); + ASSERT(lecp->lec_spec == S_IFCHR || lecp->lec_spec == S_IFBLK); + ASSERT(lecp->lec_dev != DDI_DEV_T_ANY); + ASSERT(lecp->lec_dev != DDI_DEV_T_NONE); + ASSERT(lecp->lec_dev != NODEV); + + if (dev != DDI_DEV_T_ANY) { + if (dev != lecp->lec_dev || spec_type != lecp->lec_spec) + return (0); + } + + LDI_EVTRC((CE_NOTE, "ldi_ev_device_match: MATCH dev=%d", + (uint32_t)dev)); + + return (1); +} + +/* + * LDI framework function to post a "notify" event to all layered drivers + * that have registered for that event + * + * Returns: + * LDI_EV_SUCCESS - registered callbacks allow event + * LDI_EV_FAILURE - registered callbacks block event + * LDI_EV_NONE - No matching LDI callbacks + * + * This function is *not* to be called by layered drivers. It is for I/O + * framework code in Solaris, such as the I/O retire code and DR code + * to call while servicing a device event such as offline or degraded. + */ +int +ldi_invoke_notify(__unused dev_info_t *dip, dev_t dev, int spec_type, + char *event, void *ev_data) +{ + ldi_ev_callback_impl_t *lecp; + list_t *listp; + int ret; + char *lec_event; + + ASSERT(dev != DDI_DEV_T_NONE); + ASSERT(dev != NODEV); + ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) || + (spec_type == S_IFCHR || spec_type == S_IFBLK)); + ASSERT(event); + + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): entered: dip=%p, ev=%s", + (void *)dip, event)); + + ret = LDI_EV_NONE; + ldi_ev_lock(); + + VERIFY(ldi_ev_callback_list.le_walker_next == NULL); + listp = &ldi_ev_callback_list.le_head; + for (lecp = list_head(listp); lecp; lecp = + ldi_ev_callback_list.le_walker_next) { + ldi_ev_callback_list.le_walker_next = list_next(listp, lecp); + + /* Check if matching device */ + if (!ldi_ev_device_match(lecp, dip, dev, spec_type)) + continue; + + if (lecp->lec_lhp == NULL) { + /* + * Consumer has unregistered the handle and so + * is no longer interested in notify events. + */ + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): No LDI " + "handle, skipping")); + continue; + } + + if (lecp->lec_notify == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): No notify " + "callback. skipping")); + continue; /* not interested in notify */ + } + + /* + * Check if matching event + */ + lec_event = ldi_ev_get_type(lecp->lec_cookie); + if (strcmp(event, lec_event) != 0) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): Not matching" + " event {%s,%s}. 
skipping", event, lec_event)); + continue; + } + + lecp->lec_lhp->lh_flags |= LH_FLAGS_NOTIFY; + if (lecp->lec_notify((ldi_handle_t)lecp->lec_lhp, + lecp->lec_cookie, lecp->lec_arg, ev_data) != + LDI_EV_SUCCESS) { + ret = LDI_EV_FAILURE; + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): notify" + " FAILURE")); + break; + } + + /* We have a matching callback that allows the event to occur */ + ret = LDI_EV_SUCCESS; + + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): 1 consumer success")); + } + + if (ret != LDI_EV_FAILURE) + goto out; + +#ifdef __APPLE__ + dprintf("%s offline notify failed, shouldn't happen\n", __func__); + goto out; +#endif +#ifdef illumos + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): undoing notify")); + + /* + * Undo notifies already sent + */ + lecp = list_prev(listp, lecp); + VERIFY(ldi_ev_callback_list.le_walker_prev == NULL); + for (; lecp; lecp = ldi_ev_callback_list.le_walker_prev) { + ldi_ev_callback_list.le_walker_prev = list_prev(listp, lecp); + + /* + * Check if matching device + */ + if (!ldi_ev_device_match(lecp, dip, dev, spec_type)) + continue; + + if (lecp->lec_finalize == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): no finalize, " + "skipping")); + continue; /* not interested in finalize */ + } + + /* + * it is possible that in response to a notify event a + * layered driver closed its LDI handle so it is ok + * to have a NULL LDI handle for finalize. The layered + * driver is expected to maintain state in its "arg" + * parameter to keep track of the closed device. + */ + + /* Check if matching event */ + lec_event = ldi_ev_get_type(lecp->lec_cookie); + if (strcmp(event, lec_event) != 0) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): not matching " + "event: %s,%s, skipping", event, lec_event)); + continue; + } + + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): calling finalize")); + + lecp->lec_finalize(lecp->lec_lhp, lecp->lec_cookie, + LDI_EV_FAILURE, lecp->lec_arg, ev_data); + + /* + * If LDI native event and LDI handle closed in context + * of notify, NULL out the finalize callback as we have + * already called the 1 finalize above allowed in this situation + */ + if (lecp->lec_lhp == NULL && + ldi_native_cookie(lecp->lec_cookie)) { + LDI_EVDBG((CE_NOTE, + "ldi_invoke_notify(): NULL-ing finalize after " + "calling 1 finalize following ldi_close")); + lecp->lec_finalize = NULL; + } + } +#endif /* illumos */ + +out: + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_callback_list.le_walker_prev = NULL; + ldi_ev_unlock(); + + if (ret == LDI_EV_NONE) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_notify(): no matching " + "LDI callbacks")); + } + + return (ret); +} + +/* + * LDI framework function to invoke "finalize" callbacks for all layered + * drivers that have registered callbacks for that event. + * + * This function is *not* to be called by layered drivers. It is for I/O + * framework code in Solaris, such as the I/O retire code and DR code + * to call while servicing a device event such as offline or degraded. 
+ */ +void +ldi_invoke_finalize(__unused dev_info_t *dip, dev_t dev, int spec_type, + char *event, int ldi_result, void *ev_data) +{ + ldi_ev_callback_impl_t *lecp; + list_t *listp; + char *lec_event; + int found = 0; + + ASSERT(dev != DDI_DEV_T_NONE); + ASSERT(dev != NODEV); + ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) || + (spec_type == S_IFCHR || spec_type == S_IFBLK)); + ASSERT(event); + ASSERT(ldi_result == LDI_EV_SUCCESS || ldi_result == LDI_EV_FAILURE); + + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): entered: dip=%p, result=%d" + " event=%s", (void *)dip, ldi_result, event)); + + ldi_ev_lock(); + VERIFY(ldi_ev_callback_list.le_walker_next == NULL); + listp = &ldi_ev_callback_list.le_head; + for (lecp = list_head(listp); lecp; lecp = + ldi_ev_callback_list.le_walker_next) { + ldi_ev_callback_list.le_walker_next = list_next(listp, lecp); + + if (lecp->lec_finalize == NULL) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): No " + "finalize. Skipping")); + continue; /* Not interested in finalize */ + } + + /* + * Check if matching device + */ + if (!ldi_ev_device_match(lecp, dip, dev, spec_type)) + continue; + + /* + * It is valid for the LDI handle to be NULL during finalize. + * The layered driver may have done an LDI close in the notify + * callback. + */ + + /* + * Check if matching event + */ + lec_event = ldi_ev_get_type(lecp->lec_cookie); + if (strcmp(event, lec_event) != 0) { + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): Not " + "matching event {%s,%s}. Skipping", + event, lec_event)); + continue; + } + + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): calling finalize")); + + found = 1; + + lecp->lec_finalize((ldi_handle_t)lecp->lec_lhp, + lecp->lec_cookie, ldi_result, lecp->lec_arg, + ev_data); + + /* + * If LDI native event and LDI handle closed in context + * of notify, NULL out the finalize callback as we have + * already called the 1 finalize above allowed in this situation + */ + if (lecp->lec_lhp == NULL && + ldi_native_cookie(lecp->lec_cookie)) { + LDI_EVDBG((CE_NOTE, + "ldi_invoke_finalize(): NULLing finalize after " + "calling 1 finalize following ldi_close")); + lecp->lec_finalize = NULL; + } + } + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_unlock(); + + if (found) + return; + + LDI_EVDBG((CE_NOTE, "ldi_invoke_finalize(): no matching callbacks")); +} + +int +ldi_ev_remove_callbacks(ldi_callback_id_t id) +{ + ldi_ev_callback_impl_t *lecp; + ldi_ev_callback_impl_t *next; + ldi_ev_callback_impl_t *found; + list_t *listp; + + if (id == 0) { + cmn_err(CE_WARN, "ldi_ev_remove_callbacks: Invalid ID 0"); + return (LDI_EV_FAILURE); + } + + LDI_EVDBG((CE_NOTE, "ldi_ev_remove_callbacks: entered: id=%p", + (void *)id)); + + ldi_ev_lock(); + + listp = &ldi_ev_callback_list.le_head; + next = found = NULL; + for (lecp = list_head(listp); lecp; lecp = next) { + next = list_next(listp, lecp); + if (lecp->lec_id == id) { + VERIFY(found == NULL); + + /* + * If there is a walk in progress, shift that walk + * along to the next element so that we can remove + * this one. This allows us to unregister an arbitrary + * number of callbacks from within a callback. + * + * See the struct definition (in sunldi_impl.h) for + * more information. 
+ */ + if (ldi_ev_callback_list.le_walker_next == lecp) + ldi_ev_callback_list.le_walker_next = next; + if (ldi_ev_callback_list.le_walker_prev == lecp) + ldi_ev_callback_list.le_walker_prev = list_prev( + listp, ldi_ev_callback_list.le_walker_prev); + + list_remove(listp, lecp); + found = lecp; + } + } + ldi_ev_unlock(); + + if (found == NULL) { + cmn_err(CE_WARN, "No LDI event handler for id (%p)", + (void *)id); + return (LDI_EV_SUCCESS); + } + + LDI_EVDBG((CE_NOTE, "ldi_ev_remove_callbacks: removed " + "LDI native callbacks")); + kmem_free(found, sizeof (ldi_ev_callback_impl_t)); + + return (LDI_EV_SUCCESS); +} +/* + * XXX End LDI Events + */ + +/* Client interface, find IOMedia from dev_t, alloc and open handle */ +int +ldi_open_by_dev(dev_t device, __unused int otyp, int fmode, + __unused cred_t *cred, ldi_handle_t *lhp, + __unused ldi_ident_t ident) +{ + int error = EINVAL; + + dprintf("%s dev_t %d fmode %d\n", __func__, device, fmode); + + /* Validate arguments */ + if (!lhp || device == 0) { + dprintf("%s missing argument %p %d\n", + __func__, lhp, device); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* Try to open by media */ + error = ldi_open_media_by_dev(device, fmode, lhp); + + /* Pass error from open */ + return (error); +} + +/* Client interface, find dev_t and IOMedia/vnode, alloc and open handle */ +int +ldi_open_by_name(char *path, int fmode, __unused cred_t *cred, + ldi_handle_t *lhp, __unused ldi_ident_t li) +{ + dev_t device = 0; + int error = EINVAL; + + dprintf("%s dev_t %d fmode %d\n", __func__, device, fmode); + + /* Validate arguments */ + if (!lhp || !path) { + dprintf("%s %s %p %s %d\n", __func__, + "missing lhp or path", lhp, path, fmode); + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*((struct ldi_handle **)lhp), ==, NULL); + + /* Validate active open modes */ + if (!ldi_use_iokit_from_path && !ldi_use_dev_from_path && + !ldi_use_vnode_from_path) { + dprintf("%s no valid modes to open device\n", __func__); + return (EINVAL); + } + + /* Try to open IOMedia by path */ + if (ldi_use_iokit_from_path) { + error = ldi_open_media_by_path(path, fmode, lhp); + + /* Error check open */ + if (!error) { + return (0); + } else { + dprintf("%s ldi_open_media_by_path failed\n", + __func__); + /* Not fatal, retry by dev_t or vnode */ + } + } + + /* Get dev_t from path, try to open IOMedia by dev */ + if (ldi_use_dev_from_path) { + /* Uses vnode_lookup */ + device = dev_from_path(path); + if (device == 0) { + dprintf("%s dev_from_path failed %s\n", + __func__, path); + /* + * Both media_from_dev and vnode_from_path will fail + * if dev_from_path fails, since it uses vnode_lookup. 
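+ *
+ * In other words, the overall resolution order attempted here is,
+ * sketched (each step gated by its ldi_use_* tunable):
+ *
+ *	ldi_open_media_by_path(path, ...)
+ *	device = dev_from_path(path);
+ *	ldi_open_media_by_dev(device, ...)
+ *	ldi_open_vnode_by_path(path, device, ...)
+ *
+ * so once dev_from_path has failed there is nothing left to try.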
+ */ + return (ENODEV); + } + + if (ldi_use_iokit_from_dev) { + /* Searches for matching IOMedia */ + error = ldi_open_media_by_dev(device, fmode, lhp); + if (!error) { + return (0); + } else { + dprintf("%s ldi_open_media_by_dev failed %d\n", + __func__, device); + /* Not fatal, retry as vnode */ + } + } + } + + if (!ldi_use_vnode_from_path) { + return (EINVAL); + } + + /* Try to open vnode by path */ + error = ldi_open_vnode_by_path(path, device, fmode, lhp); + if (error) { + dprintf("%s ldi_open_vnode_by_path failed %d\n", __func__, + error); + } + + return (error); +} + +/* Client interface, wrapper for handle_close */ +int +ldi_close(ldi_handle_t lh, int fmode, __unused cred_t *cred) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error = EINVAL; + + ASSERT3U(handlep, !=, NULL); + ASSERT3U(handlep->lh_ref, !=, 0); + ASSERT3U(handlep->lh_fmode, ==, fmode); + + dprintf("%s dev_t %d fmode %d\n", __func__, handlep->lh_dev, fmode); + + /* Remove event callbacks */ + boolean_t notify = B_FALSE; + list_t *listp; + ldi_ev_callback_impl_t *lecp; + + /* + * Search the event callback list for callbacks with this + * handle. There are 2 cases + * 1. Called in the context of a notify. The handle consumer + * is releasing its hold on the device to allow a reconfiguration + * of the device. Simply NULL out the handle and the notify callback. + * The finalize callback is still available so that the consumer + * knows of the final disposition of the device. + * 2. Not called in the context of notify. NULL out the handle as well + * as the notify and finalize callbacks. Since the consumer has + * closed the handle, we assume it is not interested in the + * notify and finalize callbacks. + */ + ldi_ev_lock(); + + if (handlep->lh_flags & LH_FLAGS_NOTIFY) + notify = B_TRUE; + listp = &ldi_ev_callback_list.le_head; + for (lecp = list_head(listp); lecp; lecp = list_next(listp, lecp)) { + if (lecp->lec_lhp != handlep) + continue; + lecp->lec_lhp = NULL; + lecp->lec_notify = NULL; + LDI_EVDBG((CE_NOTE, "ldi_close: NULLed lh and notify")); + if (!notify) { + LDI_EVDBG((CE_NOTE, "ldi_close: NULLed finalize")); + lecp->lec_finalize = NULL; + } + } + + if (notify) + handlep->lh_flags &= ~LH_FLAGS_NOTIFY; + ldi_ev_unlock(); + + /* Close device if only one openref, or just decrement openrefs */ + if ((error = handle_close(handlep)) != 0) { + dprintf("%s error from handle_close: %d\n", + __func__, error); + } + + /* Decrement lh_ref, if last ref then remove and free */ + handle_release(handlep); + handlep = 0; + + /* XXX clear pointer arg, and return success? */ + lh = (ldi_handle_t)0; + return (0); + // return (error); +} + +/* + * Client interface, must be in LDI_STATUS_ONLINE + */ +int +ldi_get_size(ldi_handle_t lh, uint64_t *dev_size) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error; + + /* + * Ensure we have an LDI handle, and a valid dev_size and/or + * blocksize pointer. Caller must pass at least one of these. + */ + if (!handlep || !dev_size) { + dprintf("%s handle %p\n", __func__, handlep); + dprintf("%s dev_size %p\n", __func__, dev_size); + return (EINVAL); + } + + /* + * Must be in LDI_STATUS_ONLINE + * IOMedia can return getSize without being opened, but vnode + * devices must be opened first. + * Rather than have support differing behaviors, require that + * handle is open to retrieve the size. 
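+ *
+ * A caller therefore opens the handle first, e.g. (sketch):
+ *
+ *	uint64_t size = 0;
+ *	if (ldi_get_size(lh, &size) != 0)
+ *		...handle not open/online, or arguments invalid...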
+ */ + if (handlep->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + return (ENODEV); + } + + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_get_size_iokit(handlep, dev_size); + return (error); + + case LDI_TYPE_VNODE: + error = handle_get_size_vnode(handlep, dev_size); + return (error); + } + + /* Default case, shouldn't reach this */ + dprintf("%s invalid lh_type %d\n", __func__, + handlep->lh_type); + return (EINVAL); +} + +/* + * Must be in LDI_STATUS_ONLINE + * XXX Needs async callback + */ +int +ldi_sync(ldi_handle_t lh) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error; + + /* Ensure we have an LDI handle */ + if (!handlep) { + dprintf("%s no handle\n", __func__); + return (EINVAL); + } + + /* Must be in LDI_STATUS_ONLINE */ + if (handlep->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + return (ENODEV); + } + + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_sync_iokit(handlep); + return (error); + + case LDI_TYPE_VNODE: + error = handle_sync_vnode(handlep); + return (error); + } + + /* Default case, shouldn't reach this */ + dprintf("%s invalid lh_type %d\n", __func__, + handlep->lh_type); + return (EINVAL); +} + +int +ldi_ioctl(ldi_handle_t lh, int cmd, intptr_t arg, + __unused int mode, __unused cred_t *cr, __unused int *rvalp) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error = EINVAL; + struct dk_callback *dkc; + + switch (cmd) { + /* Flush write cache */ + case DKIOCFLUSHWRITECACHE: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = handle_sync_iokit(handlep); + break; + + case LDI_TYPE_VNODE: + error = handle_sync_vnode(handlep); + break; + + default: + error = ENOTSUP; + } + + if (!arg) { + return (error); + } + + dkc = (struct dk_callback *)arg; + /* Issue completion callback if set */ + if (dkc->dkc_callback) { + (*dkc->dkc_callback)(dkc->dkc_cookie, error); + } + + return (error); + + /* Set or clear write cache enabled */ + case DKIOCSETWCE: + /* + * There doesn't seem to be a way to do this by vnode, + * so we need to be able to locate an IOMedia and an + * IOBlockStorageDevice provider. 
+ */ + return (handle_set_wce_iokit(handlep, (int *)arg)); + + /* Get media blocksize and block count */ + case DKIOCGMEDIAINFO: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_get_media_info_iokit(handlep, + (struct dk_minfo *)arg)); + + case LDI_TYPE_VNODE: + return (handle_get_media_info_vnode(handlep, + (struct dk_minfo *)arg)); + + default: + return (ENOTSUP); + } + + /* Get media logical/physical blocksize and block count */ + case DKIOCGMEDIAINFOEXT: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_get_media_info_ext_iokit(handlep, + (struct dk_minfo_ext *)arg)); + + case LDI_TYPE_VNODE: + return (handle_get_media_info_ext_vnode(handlep, + (struct dk_minfo_ext *)arg)); + + default: + return (ENOTSUP); + } + + /* Check device status */ + case DKIOCSTATE: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_check_media_iokit(handlep, + (int *)arg)); + + case LDI_TYPE_VNODE: + return (handle_check_media_vnode(handlep, + (int *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCISSOLIDSTATE: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_is_solidstate_iokit(handlep, + (int *)arg)); + + case LDI_TYPE_VNODE: + return (handle_is_solidstate_vnode(handlep, + (int *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCGETBOOTINFO: + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_get_bootinfo_iokit(handlep, + (struct io_bootinfo *)arg)); + + case LDI_TYPE_VNODE: + return (handle_get_bootinfo_vnode(handlep, + (struct io_bootinfo *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCGETFEATURES: /* UNMAP? */ + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_features_iokit(handlep, + (uint32_t *)arg)); + + case LDI_TYPE_VNODE: + return (handle_features_vnode(handlep, + (uint32_t *)arg)); + + default: + return (ENOTSUP); + } + + case DKIOCFREE: /* UNMAP */ + /* IOMedia or vnode */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + return (handle_unmap_iokit(handlep, + (dkioc_free_list_ext_t *)arg)); + + case LDI_TYPE_VNODE: + return (handle_unmap_vnode(handlep, + (dkioc_free_list_ext_t *)arg)); + + default: + return (ENOTSUP); + } + + default: + return (ENOTSUP); + } +} + +/* + * Must already have handle_open called on lh. 
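+ *
+ * For example, a synchronous read can be sketched as follows (leaving
+ * b_iodone NULL selects synchronous IO, per the LDI Buffers comment at
+ * the top of this file; size, offset and buf are the caller's values):
+ *
+ *	ldi_buf_t *bp = ldi_getrbuf(KM_SLEEP);
+ *	if (bp != NULL) {
+ *		bp->b_bcount = size;
+ *		bp->b_bufsize = size;
+ *		bp->b_offset = offset;
+ *		bp->b_data = buf;
+ *		bp->b_flags = B_BUSY | B_NOCACHE | B_READ;
+ *		bp->b_iodone = NULL;		// synchronous
+ *		error = ldi_strategy(lh, bp);
+ *		// on return, check error, bp->b_error and bp->b_resid
+ *		ldi_freerbuf(bp);
+ *	}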
+ */ +int +ldi_strategy(ldi_handle_t lh, ldi_buf_t *lbp) +{ + struct ldi_handle *handlep = (struct ldi_handle *)lh; + int error = EINVAL; + + /* Verify arguments */ + if (!handlep || !lbp || lbp->b_bcount == 0) { + dprintf("%s missing something...\n", __func__); + dprintf("handlep [%p]\n", handlep); + dprintf("lbp [%p]\n", lbp); + if (lbp) { + dprintf("lbp->b_bcount %llu\n", + lbp->b_bcount); + } + return (EINVAL); + } + + /* Check instantaneous value of handle status */ + if (handlep->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + return (ENODEV); + } + + /* IOMedia or vnode */ + /* Issue type-specific buf_strategy, preserve error */ + switch (handlep->lh_type) { + case LDI_TYPE_IOKIT: + error = buf_strategy_iokit(lbp, handlep); + break; + case LDI_TYPE_VNODE: + error = buf_strategy_vnode(lbp, handlep); + break; + default: + dprintf("%s invalid lh_type %d\n", __func__, handlep->lh_type); + return (EINVAL); + } + + return (error); +} + +/* Client interface to get an LDI buffer */ +ldi_buf_t * +ldi_getrbuf(int flags) +{ +/* Example: bp = getrbuf(KM_SLEEP); */ + ldi_buf_t *lbp; + + /* Allocate with requested flags */ + lbp = kmem_alloc(sizeof (ldi_buf_t), flags); + /* Verify allocation */ + if (!lbp) { + return (NULL); + } + + ldi_bioinit(lbp); + + return (lbp); +} + +/* Client interface to release an LDI buffer */ +void +ldi_freerbuf(ldi_buf_t *lbp) +{ + if (!lbp) { + return; + } + + /* Deallocate */ + kmem_free(lbp, sizeof (ldi_buf_t)); +} + +void +ldi_bioinit(ldi_buf_t *lbp) +{ +#ifdef LDI_ZERO + /* Zero the new buffer struct */ + bzero(lbp, sizeof (ldi_buf_t)); +#endif + + /* Initialize defaults */ + lbp->b_un.b_addr = 0; + lbp->b_flags = 0; + lbp->b_bcount = 0; + lbp->b_bufsize = 0; + lbp->b_lblkno = 0; + lbp->b_resid = 0; + lbp->b_error = 0; +} + +/* + * IOKit C++ functions + */ +int +ldi_init(void *provider) +{ + int index; + + /* Allocate kstat pointer */ + ldi_ksp = kstat_create("zfs", 0, "ldi", "darwin", KSTAT_TYPE_NAMED, + sizeof (ldi_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (ldi_ksp == NULL) { + dprintf("%s couldn't register kstats\n", __func__); + return (ENOMEM); + } + + /* Register kstats */ + ldi_ksp->ks_data = &ldi_stats; + kstat_install(ldi_ksp); + + /* Register sysctls */ + sysctl_register_oid(&sysctl__ldi); + sysctl_register_oid(&sysctl__ldi_debug); + sysctl_register_oid(&sysctl__ldi_debug_use_iokit_from_path); + sysctl_register_oid(&sysctl__ldi_debug_use_iokit_from_dev); + sysctl_register_oid(&sysctl__ldi_debug_use_dev_from_path); + sysctl_register_oid(&sysctl__ldi_debug_use_vnode_from_path); + + /* Create handle hash lists and locks */ + ldi_handle_hash_count = 0; + for (index = 0; index < LH_HASH_SZ; index++) { + mutex_init(&ldi_handle_hash_lock[index], NULL, + MUTEX_DEFAULT, NULL); + list_create(&ldi_handle_hash_list[index], + sizeof (struct ldi_handle), + offsetof(struct ldi_handle, lh_node)); + } + + /* + * Initialize the LDI event subsystem + */ + mutex_init(&ldi_ev_callback_list.le_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ldi_ev_callback_list.le_cv, NULL, CV_DEFAULT, NULL); + ldi_ev_callback_list.le_busy = 0; + ldi_ev_callback_list.le_thread = NULL; + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_callback_list.le_walker_prev = NULL; + list_create(&ldi_ev_callback_list.le_head, + sizeof (ldi_ev_callback_impl_t), + offsetof(ldi_ev_callback_impl_t, lec_list)); + + return (0); +} + +void +ldi_fini() +{ + /* + * Teardown the LDI event subsystem + */ + ldi_ev_lock(); +#ifdef DEBUG + if 
(ldi_ev_callback_list.le_busy != 1 || + ldi_ev_callback_list.le_thread != curthread || + ldi_ev_callback_list.le_walker_next != NULL || + ldi_ev_callback_list.le_walker_prev != NULL) { + dprintf("%s still has %s %llu %s %p %s %p %s %p\n", __func__, + "le_busy", ldi_ev_callback_list.le_busy, + "le_thread", ldi_ev_callback_list.le_thread, + "le_walker_next", ldi_ev_callback_list.le_walker_next, + "le_walker_prev", ldi_ev_callback_list.le_walker_prev); + } +#endif + list_destroy(&ldi_ev_callback_list.le_head); + ldi_ev_unlock(); +#ifdef DEBUG + ldi_ev_callback_list.le_busy = 0; + ldi_ev_callback_list.le_thread = NULL; + ldi_ev_callback_list.le_walker_next = NULL; + ldi_ev_callback_list.le_walker_prev = NULL; +#endif + + cv_destroy(&ldi_ev_callback_list.le_cv); + mutex_destroy(&ldi_ev_callback_list.le_lock); + + if (ldi_handle_hash_count != 0) { + dprintf("%s ldi_handle_hash_count %llu\n", __func__, + ldi_handle_hash_count); + } + + /* Destroy handle hash lists and locks */ + handle_hash_release(); + + /* Unregister sysctls */ + sysctl_unregister_oid(&sysctl__ldi_debug_use_iokit_from_path); + sysctl_unregister_oid(&sysctl__ldi_debug_use_iokit_from_dev); + sysctl_unregister_oid(&sysctl__ldi_debug_use_dev_from_path); + sysctl_unregister_oid(&sysctl__ldi_debug_use_vnode_from_path); + sysctl_unregister_oid(&sysctl__ldi_debug); + sysctl_unregister_oid(&sysctl__ldi); + + /* Unregister kstats */ + if (ldi_ksp != NULL) { + kstat_delete(ldi_ksp); + ldi_ksp = NULL; + } + + if (ldi_handle_hash_count != 0) { + dprintf("%s handle_hash_count still %llu\n", __func__, + ldi_handle_hash_count); + } +} diff --git a/module/os/macos/zfs/ldi_vnode.c b/module/os/macos/zfs/ldi_vnode.c new file mode 100644 index 0000000000..5a6ff4efc0 --- /dev/null +++ b/module/os/macos/zfs/ldi_vnode.c @@ -0,0 +1,1028 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved. + */ +/* + * Portions of this document are copyright Oracle and Joyent. + * OS X implementation of ldi_ named functions for ZFS written by + * Evan Susarret in 2015. + */ + +/* + * ZFS internal + */ +#include + +/* + * LDI Includes + */ +#include + +/* Debug prints */ +#ifdef DEBUG + +#ifdef dprintf +#undef dprintf +#endif + +#define dprintf ldi_log + +#define ldi_log(fmt, ...) 
do { \ + printf(fmt, __VA_ARGS__); \ + /* delay(hz>>1); */ \ +_NOTE(CONSTCOND) } while (0) +#endif + +struct _handle_vnode { + vnode_t *devvp; + char *vd_readlinkname; +}; /* 16b */ + +#define LH_VNODE(lhp) lhp->lh_tsd.vnode_tsd->devvp + +void +handle_free_vnode(struct ldi_handle *lhp) +{ + if (!lhp) { + dprintf("%s missing lhp\n", __func__); + return; + } + + if (!lhp->lh_tsd.vnode_tsd) { + dprintf("%s missing vnode_tsd\n", __func__); + return; + } + + kmem_free(lhp->lh_tsd.vnode_tsd, sizeof (struct _handle_vnode)); + lhp->lh_tsd.vnode_tsd = 0; +} + + +/* Returns handle with lock still held */ +struct ldi_handle * +handle_alloc_vnode(dev_t device, int fmode) +{ + struct ldi_handle *lhp, *retlhp; + + /* Search for existing handle */ + if ((retlhp = handle_find(device, fmode, B_TRUE)) != NULL) { + dprintf("%s found handle before alloc\n", __func__); + return (retlhp); + } + + /* Validate arguments */ + if (device == 0 || fmode == 0) { + dprintf("%s missing dev_t %d or fmode %d\n", + __func__, device, fmode); + return (NULL); + } + + /* Allocate LDI vnode handle */ + if ((lhp = handle_alloc_common(LDI_TYPE_VNODE, device, + fmode)) == NULL) { + dprintf("%s couldn't allocate lhp\n", __func__); + return (NULL); + } + + /* Allocate and clear type-specific device data */ + lhp->lh_tsd.vnode_tsd = (struct _handle_vnode *)kmem_alloc( + sizeof (struct _handle_vnode), KM_SLEEP); + LH_VNODE(lhp) = 0; + + /* Add the handle to the list, or return match */ + if ((retlhp = handle_add(lhp)) == NULL) { + dprintf("%s handle_add failed\n", __func__); + handle_release(lhp); + return (NULL); + } + + /* Check if new or found handle was returned */ + if (retlhp != lhp) { + dprintf("%s found handle after alloc\n", __func__); + handle_release(lhp); + lhp = 0; + } + + return (retlhp); +} + +int +handle_close_vnode(struct ldi_handle *lhp) +{ + vfs_context_t context; + int error = EINVAL; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_VNODE); + ASSERT3U(LH_VNODE(lhp), !=, NULL); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_CLOSING); + +#ifdef DEBUG + /* Validate vnode and context */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + /* If getwithref failed, we can't call vnode_close. */ + LH_VNODE(lhp) = NULLVP; + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* For read-write, clear mountedon flag and wait for writes */ + if (lhp->lh_fmode & FWRITE) { + /* Wait for writes to complete */ + error = vnode_waitforwrites(LH_VNODE(lhp), 0, 0, 0, + "ldi::handle_close_vnode"); + if (error != 0) { + dprintf("%s waitforwrites returned %d\n", + __func__, error); + } + } + + /* Drop usecount */ + vnode_rele(LH_VNODE(lhp)); + + /* Drop iocount and refcount */ + error = vnode_close(LH_VNODE(lhp), + (lhp->lh_fmode & FWRITE ? 
FWASWRITTEN : 0), + context); + /* Preserve error from vnode_close */ + + /* Clear handle devvp vnode pointer */ + LH_VNODE(lhp) = NULLVP; + /* Drop VFS context */ + vfs_context_rele(context); + + if (error) { + dprintf("%s vnode_close error %d\n", + __func__, error); + } + /* Return error from close */ + return (error); +} + +static int +handle_open_vnode(struct ldi_handle *lhp, char *path) +{ + vfs_context_t context; + int error = EINVAL; + + ASSERT3U(lhp, !=, NULL); + ASSERT3U(path, !=, NULL); + ASSERT3U(lhp->lh_type, ==, LDI_TYPE_VNODE); + ASSERT3U(lhp->lh_status, ==, LDI_STATUS_OPENING); + + /* Validate path string */ + if (!path || strlen(path) <= 1) { + dprintf("%s missing path\n", __func__); + return (EINVAL); + } + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Try to open the device by path (takes iocount) */ + error = vnode_open(path, lhp->lh_fmode, 0, 0, + &(LH_VNODE(lhp)), context); + + if (error) { + dprintf("%s vnode_open error %d\n", __func__, error); + /* Return error from vnode_open */ + return (error); + } + + /* Increase usecount, saving error. */ + error = vnode_ref(LH_VNODE(lhp)); + if (error != 0) { + dprintf("%s couldn't vnode_ref\n", __func__); + vnode_close(LH_VNODE(lhp), lhp->lh_fmode, context); + /* Return error from vnode_ref */ + return (error); + } + + /* Verify vnode refers to a block device */ + if (!vnode_isblk(LH_VNODE(lhp))) { + dprintf("%s %s is not a block device\n", + __func__, path); + vnode_rele(LH_VNODE(lhp)); + vnode_close(LH_VNODE(lhp), lhp->lh_fmode, context); + return (ENOTBLK); + } + + /* Drop iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + + return (0); +} + +int +handle_get_size_vnode(struct ldi_handle *lhp, uint64_t *dev_size) +{ + vfs_context_t context; + uint64_t blkcnt = 0; + uint32_t blksize = 0; + int error = EINVAL; + +#ifdef DEBUG + if (!lhp || !dev_size) { + dprintf("%s missing lhp or dev_size\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* Fetch the blocksize */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, context); + error = (blksize == 0 ? ENODEV : error); + + /* Fetch the block count */ + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, + 0, context)); + error = (blkcnt == 0 ? 
ENODEV : error); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + + /* Cast both to 64-bit then multiply */ + *dev_size = ((uint64_t)blksize * (uint64_t)blkcnt); + if (*dev_size == 0) { + dprintf("%s invalid blksize %u or blkcnt %llu\n", + __func__, blksize, blkcnt); + return (ENODEV); + } + return (0); +} + +int +handle_get_dev_path_vnode(struct ldi_handle *lhp, char *path, int len) +{ + vfs_context_t context; + int error; + + if (!lhp || !path || len == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + if ((error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETFIRMWAREPATH, + (caddr_t)path, len, context)) != 0) { + dprintf("%s VNOP_IOCTL error %d\n", __func__, error); + /* Preserve error to return */ + } + + /* Drop iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + +if (error == 0) dprintf("%s got device path [%s]\n", __func__, path); + return (error); +} + +int +handle_get_bootinfo_vnode(struct ldi_handle *lhp, + struct io_bootinfo *bootinfo) +{ + int error; + + if (!lhp || !bootinfo) { + dprintf("%s missing argument\n", __func__); +printf("%s missing argument\n", __func__); + return (EINVAL); + } + + if ((error = handle_get_size_vnode(lhp, + &bootinfo->dev_size)) != 0 || + (error = handle_get_dev_path_vnode(lhp, bootinfo->dev_path, + sizeof (bootinfo->dev_path))) != 0) { + dprintf("%s get size or dev_path error %d\n", + __func__, error); + } + + return (error); +} + +int +handle_sync_vnode(struct ldi_handle *lhp) +{ + vfs_context_t context; + int error = EINVAL; + +#ifdef DEBUG + if (!lhp) { + dprintf("%s missing lhp\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* + * Flush out any old buffers remaining from a previous use. + * buf_invalidateblks flushes UPL buffers, VNOP_FSYNC informs + * the disk device to flush write buffers to disk. + */ + error = buf_invalidateblks(LH_VNODE(lhp), BUF_WRITE_DATA, 0, 0); + + error = (error ? 
error : VNOP_FSYNC(LH_VNODE(lhp), + MNT_WAIT, context)); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop VFS context */ + vfs_context_rele(context); + + if (error) { + dprintf("%s buf_invalidateblks or VNOP_FSYNC error %d\n", + __func__, error); + return (ENOTSUP); + } + return (0); +} + +/* vnode_lookup, find dev_t info */ +dev_t +dev_from_path(char *path) +{ + vfs_context_t context; + vnode_t *devvp = NULLVP; + dev_t device; + int error = EINVAL; + +#ifdef DEBUG + /* Validate path */ + if (path == 0 || strlen(path) <= 1 || path[0] != '/') { + dprintf("%s invalid path provided\n", __func__); + return (0); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Try to lookup the vnode by path */ + error = vnode_lookup(path, 0, &devvp, context); + if (error || devvp == NULLVP) { + dprintf("%s vnode_lookup failed %d\n", __func__, error); + vfs_context_rele(context); + return (0); + } + + /* Get the rdev of this vnode */ + device = vnode_specrdev(devvp); + + /* Drop iocount on devvp */ + vnode_put(devvp); + /* Drop vfs_context */ + vfs_context_rele(context); + +#ifdef DEBUG + /* Validate dev_t */ + if (device == 0) { + dprintf("%s invalid device\n", __func__); + } +#endif + + /* Return 0 or valid dev_t */ + return (device); +} + +/* Completion handler for vnode strategy */ +static void +ldi_vnode_io_intr(buf_t bp, void *arg) +{ + ldi_buf_t *lbp = (ldi_buf_t *)arg; + + ASSERT3U(bp, !=, NULL); + ASSERT3U(lbp, !=, NULL); + + /* Copyout error and resid */ + lbp->b_error = buf_error(bp); + lbp->b_resid = buf_resid(bp); + +#ifdef DEBUG + if (lbp->b_error || lbp->b_resid != 0) { + dprintf("%s io error %d resid %llu\n", __func__, + lbp->b_error, lbp->b_resid); + } +#endif + + /* Teardown */ + buf_free(bp); + + /* Call original completion function */ + if (lbp->b_iodone) { + lbp->b_iodone(lbp); + } +} + +int +buf_strategy_vnode(ldi_buf_t *lbp, struct ldi_handle *lhp) +{ + buf_t bp = 0; + int error = EINVAL; + + ASSERT3U(lbp, !=, NULL); + ASSERT3U(lhp, !=, NULL); + +#ifdef DEBUG + if (!lbp || !lhp) { + dprintf("%s missing lbp or lhp\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and verify buf_t */ + if (NULL == (bp = buf_alloc(LH_VNODE(lhp)))) { + dprintf("%s couldn't allocate buf_t\n", __func__); + return (ENOMEM); + } + + /* Setup buffer */ + buf_setflags(bp, B_NOCACHE | (lbp->b_flags & B_READ ? + B_READ : B_WRITE)); + buf_setcount(bp, lbp->b_bcount); + buf_setdataptr(bp, (uintptr_t)lbp->b_un.b_addr); + buf_setblkno(bp, lbp->b_lblkno); + buf_setlblkno(bp, lbp->b_lblkno); + buf_setsize(bp, lbp->b_bufsize); + + /* For asynchronous IO */ + if (lbp->b_iodone != NULL) { + buf_setcallback(bp, &ldi_vnode_io_intr, lbp); + } + + /* Recheck instantaneous value of handle status */ + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s device not online\n", __func__); + buf_free(bp); + return (ENODEV); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + buf_free(bp); + return (ENODEV); + } + /* All code paths from here must vnode_put. 
*/ + + if (!(lbp->b_flags & B_READ)) { + /* Does not return an error status */ + vnode_startwrite(LH_VNODE(lhp)); + } + + + + /* Issue the IO, preserving error */ + error = VNOP_STRATEGY(bp); + + if (error) { + dprintf("%s VNOP_STRATEGY error %d\n", + __func__, error); + /* Reclaim write count on vnode */ + if (!(lbp->b_flags & B_READ)) { + vnode_writedone(LH_VNODE(lhp)); + } + vnode_put(LH_VNODE(lhp)); + buf_free(bp); + return (EIO); + } + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + + /* For synchronous IO, call completion */ + if (lbp->b_iodone == NULL) { + ldi_vnode_io_intr(bp, (void*)lbp); + } + + /* Pass error from VNOP_STRATEGY */ + return (error); +} + +/* Client interface, alloc and open vnode handle by pathname */ +int +ldi_open_vnode_by_path(char *path, dev_t device, + int fmode, ldi_handle_t *lhp) +{ + struct ldi_handle *retlhp; + ldi_status_t status; + int error = EIO; + + /* Validate arguments */ + if (!path || strlen(path) <= 1 || device == 0 || !lhp) { + dprintf("%s invalid argument %p %d %p\n", __func__, + path, device, lhp); + if (path) { + dprintf("*path string is %s\n", path); + } + return (EINVAL); + } + /* In debug build, be loud if we potentially leak a handle */ + ASSERT3U(*(struct ldi_handle **)lhp, ==, NULL); + + /* Allocate handle with path */ + retlhp = handle_alloc_vnode(device, fmode); + if (retlhp == NULL) { + dprintf("%s couldn't allocate vnode handle\n", __func__); + return (ENOMEM); + } + + /* Mark the handle as Opening, or increment openref */ + status = handle_open_start(retlhp); + if (status == LDI_STATUS_ONLINE) { + dprintf("%s already online, refs %d, openrefs %d\n", __func__, + retlhp->lh_ref, retlhp->lh_openref); + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + /* Successfully incremented open ref in open_start */ + return (0); + } + + /* If state is now Opening, try to open device by vnode */ + if (status != LDI_STATUS_OPENING || + (error = handle_open_vnode(retlhp, path)) != 0) { + dprintf("%s Couldn't open handle\n", __func__); + handle_open_done(retlhp, LDI_STATUS_CLOSED); + handle_release(retlhp); + retlhp = 0; + return ((error == EACCES) ? EROFS:EIO); + } + handle_open_done(retlhp, LDI_STATUS_ONLINE); + + /* Register for disk notifications */ + handle_register_notifier(retlhp); + + /* Cast retlhp and assign to lhp (may be 0) */ + *lhp = (ldi_handle_t)retlhp; + /* Pass error from open */ + return (error); +} + +int +handle_get_media_info_vnode(struct ldi_handle *lhp, + struct dk_minfo *dkm) +{ + vfs_context_t context; + uint32_t blksize; + uint64_t blkcount; + int error; + +#ifdef DEBUG + if (!lhp || !dkm) { + dprintf("%s missing lhp or dkm\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. 
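buf_strategy_vnode() wraps the caller's ldi_buf_t in a Darwin buf_t, brackets the I/O with an iocount, and completes through ldi_vnode_io_intr() either from the buf callback (asynchronous, b_iodone set) or inline (synchronous). A hedged sketch of a caller using the ldi_getrbuf()/ldi_strategy() interface defined earlier in this patch; the two example_* functions are illustrative only:

```c
/*
 * Hypothetical caller, not part of the patch: an asynchronous read
 * issued through ldi_strategy().  Setting b_iodone requests the
 * asynchronous path; leaving it NULL takes the synchronous one.
 */
static void
example_read_done(ldi_buf_t *lbp)
{
	if (lbp->b_error != 0 || lbp->b_resid != 0)
		dprintf("read failed: error %d resid %llu\n",
		    lbp->b_error, lbp->b_resid);
	/* Buffer was allocated by the submitter with ldi_getrbuf(). */
	ldi_freerbuf(lbp);
}

static int
example_async_read(ldi_handle_t lh, void *buf, uint64_t lblkno, uint64_t size)
{
	ldi_buf_t *lbp;

	if ((lbp = ldi_getrbuf(KM_SLEEP)) == NULL)
		return (ENOMEM);

	lbp->b_flags = B_READ;
	lbp->b_un.b_addr = buf;
	lbp->b_bcount = size;		/* bytes requested */
	lbp->b_bufsize = size;
	lbp->b_lblkno = lblkno;		/* device logical block */
	lbp->b_iodone = example_read_done;

	return (ldi_strategy(lh, lbp));
}
```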
*/ + + /* Get the blocksize and block count */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, context); + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETBLOCKCOUNT, (caddr_t)&blkcount, + 0, context)); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + if (error) { + dkm->dki_capacity = 0; + dkm->dki_lbsize = 0; + return (error); + } + + /* If successful, set return values */ + dkm->dki_capacity = blkcount; + dkm->dki_lbsize = blksize; + return (0); +} + +int +handle_get_media_info_ext_vnode(struct ldi_handle *lhp, + struct dk_minfo_ext *dkmext) +{ + vfs_context_t context; + uint32_t blksize, pblksize; + uint64_t blkcount; + int error; + +#ifdef DEBUG + if (!lhp || !dkmext) { + dprintf("%s missing lhp or dkmext\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode and context */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode or context\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* Get the blocksize, physical blocksize, and block count */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETBLOCKSIZE, + (caddr_t)&blksize, 0, context); + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&pblksize, + 0, context)); + error = (error ? error : VNOP_IOCTL(LH_VNODE(lhp), + DKIOCGETBLOCKCOUNT, (caddr_t)&blkcount, + 0, context)); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + if (error) { + dkmext->dki_capacity = 0; + dkmext->dki_lbsize = 0; + dkmext->dki_pbsize = 0; + return (error); + } + + /* If successful, set return values */ + dkmext->dki_capacity = blkcount; + dkmext->dki_lbsize = blksize; + dkmext->dki_pbsize = pblksize; + return (0); +} + +int +handle_check_media_vnode(struct ldi_handle *lhp, int *status) +{ + if (!lhp || !status) { + dprintf("%s missing lhp or invalid status\n", __func__); + return (EINVAL); + } + + /* Validate vnode and context */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } + + /* XXX As yet unsupported */ + return (ENOTSUP); + + /* Check if the device is available and responding */ + return (0); +} + +int +handle_is_solidstate_vnode(struct ldi_handle *lhp, int *isssd) +{ + vfs_context_t context; + int error; + + if (!lhp || !isssd) { + dprintf("%s missing lhp or invalid status\n", __func__); + return (EINVAL); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (ENOMEM); + } + + /* Take an iocount on devvp vnode. 
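The same lifecycle comment appears in every helper here, so it is worth spelling out once: vnode_getwithref() takes a transient iocount that must be dropped with vnode_put() on every path, while the usecount taken with vnode_ref() at open time is only released in handle_close_vnode(). A minimal sketch of that bracket (the function is illustrative, not part of the patch):

```c
/*
 * Illustrative restatement of the idiom used throughout this file:
 * take an iocount, do the work, drop the iocount on every path.
 */
static int
example_iocount_bracket(struct ldi_handle *lhp)
{
	int error;

	if ((error = vnode_getwithref(LH_VNODE(lhp))) != 0)
		return (ENODEV);

	/* ... VNOP_IOCTL() or VNOP_STRATEGY() work goes here ... */

	vnode_put(LH_VNODE(lhp));
	return (error);
}
```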
*/ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCISSOLIDSTATE, + (caddr_t)isssd, 0, context); + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + return (error); +} + +int +handle_features_vnode(struct ldi_handle *lhp, + uint32_t *features) +{ + vfs_context_t context; + int error; + +#ifdef DEBUG + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + + /* All code paths from here must vnode_put. */ + + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCGETFEATURES, + (caddr_t)features, 0, context); + + if (error) { + printf("%s: 0x%x\n", __func__, error); + } + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + return (error); +} + +int +handle_unmap_vnode(struct ldi_handle *lhp, + dkioc_free_list_ext_t *dkm) +{ + vfs_context_t context; + int error; + +#ifdef DEBUG + if (!lhp || !dkm) { + dprintf("%s missing lhp or dkm\n", __func__); + return (EINVAL); + } + if (lhp->lh_status != LDI_STATUS_ONLINE) { + dprintf("%s handle is not Online\n", __func__); + return (ENODEV); + } + + /* Validate vnode */ + if (LH_VNODE(lhp) == NULLVP) { + dprintf("%s missing vnode\n", __func__); + return (ENODEV); + } +#endif + + /* Allocate and validate context */ + context = vfs_context_create(spl_vfs_context_kernel()); + if (!context) { + dprintf("%s couldn't create VFS context\n", __func__); + return (0); + } + + /* Take an iocount on devvp vnode. */ + error = vnode_getwithref(LH_VNODE(lhp)); + if (error) { + dprintf("%s vnode_getwithref error %d\n", + __func__, error); + vfs_context_rele(context); + return (ENODEV); + } + /* All code paths from here must vnode_put. */ + + /* We need to convert illumos' dkioc_free_list_t to dk_unmap_t */ + /* We only support 1 entry now */ + dk_unmap_t dkun = { 0 }; + dk_extent_t ext; + dkun.extentsCount = 1; + dkun.extents = &ext; + ext.offset = dkm->dfle_start; + ext.length = dkm->dfle_length; + + /* + * dkm->dfl_flags vs dkun.options + * #define DF_WAIT_SYNC 0x00000001 Wait for full write-out of free. 
+ * #define _DK_UNMAP_INITIALIZE 0x00000100 + */ + + /* issue unmap */ + error = VNOP_IOCTL(LH_VNODE(lhp), DKIOCUNMAP, + (caddr_t)&dkun, 0, context); + + if (error) { + dprintf("%s unmap: 0x%x for off %llx size %llx\n", __func__, + error, ext.offset, ext.length); + } + + /* Release iocount on vnode (still has usecount) */ + vnode_put(LH_VNODE(lhp)); + /* Drop vfs_context */ + vfs_context_rele(context); + + return (error); +} diff --git a/module/os/macos/zfs/policy.c b/module/os/macos/zfs/policy.c new file mode 100644 index 0000000000..5525302266 --- /dev/null +++ b/module/os/macos/zfs/policy.c @@ -0,0 +1,354 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright (C) 2016 Lawrence Livermore National Security, LLC. + * + * For Linux the vast majority of this enforcement is already handled via + * the standard Linux VFS permission checks. However certain administrative + * commands which bypass the standard mechanisms may need to make use of + * this functionality. + */ + +#include +#include +#include + +/* + * The passed credentials cannot be directly verified because Linux only + * provides an interface to check the *current* process credentials. In + * order to handle this the capable() test is only run when the passed + * credentials match the current process credentials or the kcred. In + * all other cases this function must fail and return the passed err. + */ +static int +priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err, + struct user_namespace *ns) +{ + ASSERT3S(all, ==, B_FALSE); + + if (cr != CRED() && (cr != kcred)) + return (err); + +#if defined(CONFIG_USER_NS) + if (!(ns ? ns_capable(ns, capability) : capable(capability))) +#else + if (!capable(capability)) +#endif + return (err); + + return (0); +} + +static int +priv_policy(const cred_t *cr, int capability, boolean_t all, int err) +{ + return (priv_policy_ns(cr, capability, all, err, NULL)); +} + +static int +priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err) +{ + /* + * All priv_policy_user checks are preceded by kuid/kgid_has_mapping() + * checks. If we cannot do them, we shouldn't be using ns_capable() + * since we don't know whether the affected files are valid in our + * namespace. + */ +#if defined(CONFIG_USER_NS) + return (priv_policy_ns(cr, capability, all, err, cr->user_ns)); +#else + return (priv_policy_ns(cr, capability, all, err, NULL)); +#endif +} + +/* + * Checks for operations that are either client-only or are used by + * both clients and servers.
+ */ +int +secpolicy_nfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); +} + +/* + * Catch all system configuration. + */ +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); +} + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + return (0); +} + +/* + * This is a special routine for ZFS; it is used to determine whether + * any of the privileges in effect allow any form of access to the + * file. There's no reason to audit this or any reason to record + * this. More work is needed to do the "KPLD" stuff. + */ +int +secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + + if (inode_owner_or_capable(ip)) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0) + return (0); + + if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0) + return (0); + + return (EPERM); +} + +/* + * Determine if subject can chown owner of a file. + */ +int +secpolicy_vnode_chown(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Determine if subject can change group ownership of a file. + */ +int +secpolicy_vnode_create_gid(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM)); +} + +/* + * Policy determines whether we can remove an entry from a directory, + * regardless of permission bits. + */ +int +secpolicy_vnode_remove(const cred_t *cr) +{ + return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Determine that subject can modify the mode of a file. allzone privilege + * needed when modifying root owned object. + */ +int +secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Are we allowed to retain the set-uid/set-gid bits when + * changing ownership or when writing to a file? + * "issuid" should be true when set-uid; only in that case + * root ownership is checked (setgid is assumed). + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +{ + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); +} + +/* + * Determine that subject can set the file setgid flag. + */ +int +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) +{ +#if defined(CONFIG_USER_NS) + if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) + return (EPERM); +#endif + if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); + + return (0); +} + +/* + * Determine if the subject can inject faults in the ZFS fault injection + * framework. Requires all privileges. 
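The vnode policy helpers above all follow one shape: allow the owning fsuid outright, check that the id maps into the caller's user namespace, and otherwise fall back to a capability test. A hypothetical sketch of how a chmod-style caller might compose them (the function and its arguments are illustrative, not part of the patch):

```c
/*
 * Hypothetical illustration: only the owner (or a caller holding the
 * FOWNER capability) may change mode bits, and setting the setgid bit
 * additionally requires membership in the file's group.
 */
static int
example_can_chmod(uid_t owner, gid_t gid, mode_t newmode, cred_t *cr)
{
	int error;

	if ((error = secpolicy_vnode_setdac(cr, owner)) != 0)
		return (error);

	if ((newmode & S_ISGID) != 0)
		return (secpolicy_vnode_setids_setgids(cr, gid));

	return (0);
}
```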
+ */ +int +secpolicy_zinject(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); +} + +/* + * Determine if the subject has permission to manipulate ZFS datasets + * (not pools). Equivalent to the SYS_MOUNT privilege. + */ +int +secpolicy_zfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); +} + +void +secpolicy_setid_clear(vattr_t *vap, cred_t *cr) +{ + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (vap->va_mode & S_ISUID) != 0 && + (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } +} + +/* + * Determine that subject can set the file setid flags. + */ +static int +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); +} + +/* + * Determine that subject can make a file a "sticky". + * + * Enforced in the Linux VFS. + */ +static int +secpolicy_vnode_stky_modify(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, + const vattr_t *ovap, cred_t *cr) +{ + int error; + + if ((vap->va_mode & S_ISUID) != 0 && + (error = secpolicy_vnode_setid_modify(cr, + ovap->va_uid)) != 0) { + return (error); + } + + /* + * Check privilege if attempting to set the + * sticky bit on a non-directory. + */ + if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && + secpolicy_vnode_stky_modify(cr) != 0) { + vap->va_mode &= ~S_ISVTX; + } + + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0 && + secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type) +{ + return (secpolicy_vnode_chown(cr, owner)); +} + +/* + * Check privileges for setattr attributes. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + return (0); +} + +/* + * Check privileges for links. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_basic_link(const cred_t *cr) +{ + return (0); +} diff --git a/module/os/macos/zfs/qat.c b/module/os/macos/zfs/qat.c new file mode 100644 index 0000000000..08613b3a20 --- /dev/null +++ b/module/os/macos/zfs/qat.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include + +qat_stats_t qat_stats = { + { "comp_requests", KSTAT_DATA_UINT64 }, + { "comp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "comp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decomp_requests", KSTAT_DATA_UINT64 }, + { "decomp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decomp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "dc_fails", KSTAT_DATA_UINT64 }, + { "encrypt_requests", KSTAT_DATA_UINT64 }, + { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_requests", KSTAT_DATA_UINT64 }, + { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "crypt_fails", KSTAT_DATA_UINT64 }, + { "cksum_requests", KSTAT_DATA_UINT64 }, + { "cksum_total_in_bytes", KSTAT_DATA_UINT64 }, + { "cksum_fails", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *qat_ksp = NULL; + +CpaStatus +qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes) +{ + *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL); + if (*pp_mem_addr == NULL) + return (CPA_STATUS_RESOURCE); + return (CPA_STATUS_SUCCESS); +} + +void +qat_mem_free_contig(void **pp_mem_addr) +{ + if (*pp_mem_addr != NULL) { + kfree(*pp_mem_addr); + *pp_mem_addr = NULL; + } +} + +int +qat_init(void) +{ + qat_ksp = kstat_create("zfs", 0, "qat", "misc", + KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (qat_ksp != NULL) { + qat_ksp->ks_data = &qat_stats; + kstat_install(qat_ksp); + } + + /* + * Just set the disable flag when qat init failed, qat can be + * turned on again in post-process after zfs module is loaded, e.g.: + * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable + */ + if (qat_dc_init() != 0) + zfs_qat_compress_disable = 1; + + if (qat_cy_init() != 0) { + zfs_qat_checksum_disable = 1; + zfs_qat_encrypt_disable = 1; + } + + return (0); +} + +void +qat_fini(void) +{ + if (qat_ksp != NULL) { + kstat_delete(qat_ksp); + qat_ksp = NULL; + } + + qat_cy_fini(); + qat_dc_fini(); +} + +#endif diff --git a/module/os/macos/zfs/qat_compress.c b/module/os/macos/zfs/qat_compress.c new file mode 100644 index 0000000000..ad3ead3b16 --- /dev/null +++ b/module/os/macos/zfs/qat_compress.c @@ -0,0 +1,569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instance and + * session arrays; the actual number of instances are defined in the + * QAT driver's configuration file. + */ +#define QAT_DC_MAX_INSTANCES 48 + +/* + * ZLIB head and foot size + */ +#define ZLIB_HEAD_SZ 2 +#define ZLIB_FOOT_SZ 4 + +static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES]; +static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES]; +static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES]; +static Cpa16U num_inst = 0; +static Cpa32U inst_num = 0; +static boolean_t qat_dc_init_done = B_FALSE; +int zfs_qat_compress_disable = 0; + +boolean_t +qat_dc_use_accel(size_t s_len) +{ + return (!zfs_qat_compress_disable && + qat_dc_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +static void +qat_dc_callback(void *p_callback, CpaStatus status) +{ + if (p_callback != NULL) + complete((struct completion *)p_callback); +} + +static void +qat_dc_clean(void) +{ + Cpa16U buff_num = 0; + Cpa16U num_inter_buff_lists = 0; + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcStopInstance(dc_inst_handles[i]); + QAT_PHYS_CONTIG_FREE(session_handles[i]); + /* free intermediate buffers */ + if (buffer_array[i] != NULL) { + cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + CpaBufferList *buffer_inter = + buffer_array[i][buff_num]; + if (buffer_inter->pBuffers) { + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers->pData); + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers); + } + QAT_PHYS_CONTIG_FREE( + buffer_inter->pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(buffer_inter); + } + } + } + + num_inst = 0; + qat_dc_init_done = B_FALSE; +} + +int +qat_dc_init(void) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U sess_size = 0; + Cpa32U ctx_size = 0; + Cpa16U num_inter_buff_lists = 0; + Cpa16U buff_num = 0; + Cpa32U buff_meta_size = 0; + CpaDcSessionSetupData sd = {0}; + + if (qat_dc_init_done) + return (0); + + status = cpaDcGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT compression units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_DC_MAX_INSTANCES) + num_inst = QAT_DC_MAX_INSTANCES; + + status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcSetAddressTranslation(dc_inst_handles[i], + (void*)virt_to_phys); + + status = cpaDcBufferListGetMetaSize(dc_inst_handles[i], + 1, &buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + + if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0) + status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i], + num_inter_buff_lists * + sizeof (CpaBufferList *)); + + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num], + 
sizeof (CpaBufferList)); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]-> + pPrivateMetaData, + buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers, + sizeof (CpaFlatBuffer)); + + if (status == CPA_STATUS_SUCCESS) { + /* + * implementation requires an intermediate + * buffer approximately twice the size of + * output buffer, which is 2x max buffer + * size here. + */ + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers-> + pData, 2 * QAT_MAX_BUF_SIZE); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + buffer_array[i][buff_num]->numBuffers = 1; + buffer_array[i][buff_num]->pBuffers-> + dataLenInBytes = 2 * QAT_MAX_BUF_SIZE; + } + } + + status = cpaDcStartInstance(dc_inst_handles[i], + num_inter_buff_lists, buffer_array[i]); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + sd.compLevel = CPA_DC_L1; + sd.compType = CPA_DC_DEFLATE; + sd.huffType = CPA_DC_HT_FULL_DYNAMIC; + sd.sessDirection = CPA_DC_DIR_COMBINED; + sd.sessState = CPA_DC_STATELESS; + sd.deflateWindowSize = 7; + sd.checksum = CPA_DC_ADLER32; + status = cpaDcGetSessionSize(dc_inst_handles[i], + &sd, &sess_size, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size); + if (session_handles[i] == NULL) + goto fail; + + status = cpaDcInitSession(dc_inst_handles[i], + session_handles[i], + &sd, NULL, qat_dc_callback); + if (status != CPA_STATUS_SUCCESS) + goto fail; + } + + qat_dc_init_done = B_TRUE; + return (0); +fail: + qat_dc_clean(); + return (-1); +} + +void +qat_dc_fini(void) +{ + if (!qat_dc_init_done) + return; + + qat_dc_clean(); +} + +/* + * The "add" parameter is an additional buffer which is passed + * to QAT as a scratch buffer alongside the destination buffer + * in case the "compressed" data ends up being larger than the + * original source data. This is necessary to prevent QAT from + * generating buffer overflow warnings for incompressible data. + */ +static int +qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, char *add, int add_len, size_t *c_len) +{ + CpaInstanceHandle dc_inst_handle; + CpaDcSessionHandle session_handle; + CpaBufferList *buf_list_src = NULL; + CpaBufferList *buf_list_dst = NULL; + CpaFlatBuffer *flat_buf_src = NULL; + CpaFlatBuffer *flat_buf_dst = NULL; + Cpa8U *buffer_meta_src = NULL; + Cpa8U *buffer_meta_dst = NULL; + Cpa32U buffer_meta_size = 0; + CpaDcRqResults dc_results; + CpaStatus status = CPA_STATUS_FAIL; + Cpa32U hdr_sz = 0; + Cpa32U compressed_sz; + Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2; + Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2; + Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left; + Cpa32U dst_pages = 0; + Cpa32U adler32 = 0; + char *data; + struct page *page; + struct page **in_pages = NULL; + struct page **out_pages = NULL; + struct page **add_pages = NULL; + Cpa32U page_off = 0; + struct completion complete; + Cpa32U page_num = 0; + Cpa16U i; + + /* + * We increment num_src_buf and num_dst_buf by 2 to allow + * us to handle non page-aligned buffer addresses and buffers + * whose sizes are not divisible by PAGE_SIZE. 
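To make the bound explicit, the sketch below (a hypothetical helper using the same PAGE_SHIFT/PAGE_MASK conventions as this file, and assuming len > 0) counts the pages a buffer actually spans; the worst case adds one partial page at each end, hence the "+ 2".

```c
/*
 * Illustrative only: a buffer of len bytes starting at an arbitrary
 * offset within a page spans at most (len >> PAGE_SHIFT) + 2 pages.
 */
static inline uint32_t
example_pages_spanned(uintptr_t addr, uint32_t len)
{
	uintptr_t first = addr & PAGE_MASK;		/* page holding the start */
	uintptr_t last = (addr + len - 1) & PAGE_MASK;	/* page holding the end */

	return ((uint32_t)((last - first) >> PAGE_SHIFT) + 1);
}
```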
+ */ + Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) + + (num_src_buf * sizeof (CpaFlatBuffer)); + Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) + + ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer)); + + status = QAT_PHYS_CONTIG_ALLOC(&in_pages, + num_src_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&out_pages, + num_dst_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&add_pages, + num_add_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + dc_inst_handle = dc_inst_handles[i]; + session_handle = session_handles[i]; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf, + &buffer_meta_size); + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf, + &buffer_meta_size); + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* build source buffer list */ + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1); + + buf_list_src->pBuffers = flat_buf_src; /* always point to first one */ + + /* build destination buffer list */ + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + + buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */ + + buf_list_src->numBuffers = 0; + buf_list_src->pPrivateMetaData = buffer_meta_src; + bytes_left = src_len; + data = src; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + in_pages[page_num] = page; + flat_buf_src->pData = kmap(page) + page_off; + flat_buf_src->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_src->dataLenInBytes; + data += flat_buf_src->dataLenInBytes; + flat_buf_src++; + buf_list_src->numBuffers++; + page_num++; + } + + buf_list_dst->numBuffers = 0; + buf_list_dst->pPrivateMetaData = buffer_meta_dst; + bytes_left = dst_len; + data = dst; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + out_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + dst_pages++; + } + + /* map additional scratch pages into the destination buffer list */ + bytes_left = add_len; + data = add; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + add_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + } + + init_completion(&complete); + + if (dir == QAT_COMPRESS) { + QAT_STAT_BUMP(comp_requests); + 
QAT_STAT_INCR(comp_total_in_bytes, src_len); + + cpaDcGenerateHeader(session_handle, + buf_list_dst->pBuffers, &hdr_sz); + buf_list_dst->pBuffers->pData += hdr_sz; + buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz; + status = cpaDcCompressData( + dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, + &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + if (status != CPA_STATUS_SUCCESS) { + goto fail; + } + + /* we now wait until the completion of the operation. */ + wait_for_completion(&complete); + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + compressed_sz = dc_results.produced; + if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + /* move to the last page */ + flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT; + + /* no space for gzip footer in the last page */ + if (((compressed_sz + hdr_sz) % PAGE_SIZE) + + ZLIB_FOOT_SZ > PAGE_SIZE) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + /* jump to the end of the buffer and append footer */ + flat_buf_dst->pData = + (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK) + + ((compressed_sz + hdr_sz) % PAGE_SIZE); + flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ; + + dc_results.produced = 0; + status = cpaDcGenerateFooter(session_handle, + flat_buf_dst, &dc_results); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + *c_len = compressed_sz + dc_results.produced + hdr_sz; + QAT_STAT_INCR(comp_total_out_bytes, *c_len); + } else { + ASSERT3U(dir, ==, QAT_DECOMPRESS); + QAT_STAT_BUMP(decomp_requests); + QAT_STAT_INCR(decomp_total_in_bytes, src_len); + + buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ; + buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ; + status = cpaDcDecompressData(dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + + if (CPA_STATUS_SUCCESS != status) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* we now wait until the completion of the operation. */ + wait_for_completion(&complete); + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* verify adler checksum */ + adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ); + if (adler32 != BSWAP_32(dc_results.checksum)) { + status = CPA_STATUS_FAIL; + goto fail; + } + *c_len = dc_results.produced; + QAT_STAT_INCR(decomp_total_out_bytes, *c_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE) + QAT_STAT_BUMP(dc_fails); + + if (in_pages) { + for (page_num = 0; + page_num < buf_list_src->numBuffers; + page_num++) { + kunmap(in_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(in_pages); + } + + if (out_pages) { + for (page_num = 0; page_num < dst_pages; page_num++) { + kunmap(out_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(out_pages); + } + + if (add_pages) { + for (page_num = 0; + page_num < buf_list_dst->numBuffers - dst_pages; + page_num++) { + kunmap(add_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(add_pages); + } + + QAT_PHYS_CONTIG_FREE(buffer_meta_src); + QAT_PHYS_CONTIG_FREE(buffer_meta_dst); + QAT_PHYS_CONTIG_FREE(buf_list_src); + QAT_PHYS_CONTIG_FREE(buf_list_dst); + + return (status); +} + +/* + * Entry point for QAT accelerated compression / decompression. 
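qat_compress(), defined just below, is the public entry point; the expected caller pattern is to gate on qat_dc_use_accel() and fall back to software when the hardware path declines or fails. A hedged sketch of that pattern (the software fallback is named purely for illustration):

```c
/* Hypothetical software fallback, declared only to keep the sketch self-contained. */
extern size_t software_gzip_compress(char *src, char *dst, size_t s_len, size_t d_len);

/*
 * Hypothetical caller, not part of the patch: try QAT first for
 * eligible buffer sizes, otherwise compress in software.
 */
static size_t
example_gzip_compress(char *src, char *dst, size_t s_len, size_t d_len)
{
	size_t c_len;

	if (qat_dc_use_accel(s_len) &&
	    qat_compress(QAT_COMPRESS, src, s_len, dst, d_len, &c_len) ==
	    CPA_STATUS_SUCCESS)
		return (c_len);

	return (software_gzip_compress(src, dst, s_len, d_len));
}
```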
+ */ +int +qat_compress(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, size_t *c_len) +{ + int ret; + size_t add_len = 0; + void *add = NULL; + + if (dir == QAT_COMPRESS) { + add_len = dst_len; + add = zio_data_buf_alloc(add_len); + } + + ret = qat_compress_impl(dir, src, src_len, dst, + dst_len, add, add_len, c_len); + + if (dir == QAT_COMPRESS) + zio_data_buf_free(add, add_len); + + return (ret); +} + +static int +param_set_qat_compress(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_compress_disable = 0: enable qat compress + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_dc_init_done) { + ret = qat_dc_init(); + if (ret != 0) { + zfs_qat_compress_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_compress_disable, param_set_qat_compress, + param_get_int, &zfs_qat_compress_disable, 0644); +MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression"); + +#endif diff --git a/module/os/macos/zfs/qat_crypt.c b/module/os/macos/zfs/qat_crypt.c new file mode 100644 index 0000000000..4771b2f3be --- /dev/null +++ b/module/os/macos/zfs/qat_crypt.c @@ -0,0 +1,630 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This file represents the QAT implementation of checksums and encryption. + * Internally, QAT shares the same cryptographic instances for both of these + * operations, so the code has been combined here. QAT data compression uses + * compression instances, so that code is separated into qat_compress.c + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include +#include +#include +#include +#include +#include "lac/cpa_cy_im.h" +#include "lac/cpa_cy_common.h" +#include + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instances + * and session arrays; the actual number of instances are defined in + * the QAT driver's configure file. 
+ */ +#define QAT_CRYPT_MAX_INSTANCES 48 + +#define MAX_PAGE_NUM 1024 + +static Cpa32U inst_num = 0; +static Cpa16U num_inst = 0; +static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES]; +static boolean_t qat_cy_init_done = B_FALSE; +int zfs_qat_encrypt_disable = 0; +int zfs_qat_checksum_disable = 0; + +typedef struct cy_callback { + CpaBoolean verify_result; + struct completion complete; +} cy_callback_t; + +static void +symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation, + void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify) +{ + cy_callback_t *cb = p_callback; + + if (cb != NULL) { + /* indicate that the function has been called */ + cb->verify_result = verify; + complete(&cb->complete); + } +} + +boolean_t +qat_crypt_use_accel(size_t s_len) +{ + return (!zfs_qat_encrypt_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +boolean_t +qat_checksum_use_accel(size_t s_len) +{ + return (!zfs_qat_checksum_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +void +qat_cy_clean(void) +{ + for (Cpa16U i = 0; i < num_inst; i++) + cpaCyStopInstance(cy_inst_handles[i]); + + num_inst = 0; + qat_cy_init_done = B_FALSE; +} + +int +qat_cy_init(void) +{ + CpaStatus status = CPA_STATUS_FAIL; + + if (qat_cy_init_done) + return (0); + + status = cpaCyGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT encryption units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_CRYPT_MAX_INSTANCES) + num_inst = QAT_CRYPT_MAX_INSTANCES; + + status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + status = cpaCySetAddressTranslation(cy_inst_handles[i], + (void *)virt_to_phys); + if (status != CPA_STATUS_SUCCESS) + goto error; + + status = cpaCyStartInstance(cy_inst_handles[i]); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + qat_cy_init_done = B_TRUE; + return (0); + +error: + qat_cy_clean(); + return (-1); +} + +void +qat_cy_fini(void) +{ + if (!qat_cy_init_done) + return; + + qat_cy_clean(); +} + +static CpaStatus +qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key, + Cpa64U crypt, Cpa32U aad_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U ciper_algorithm; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) { + return (CPA_STATUS_FAIL); + } else { + ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM; + hash_algorithm = CPA_CY_SYM_HASH_AES_GCM; + } + + sd.cipherSetupData.cipherAlgorithm = ciper_algorithm; + sd.cipherSetupData.pCipherKey = key->ck_data; + sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH; + sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN; + sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len; + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING; + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + if (dir == QAT_ENCRYPT) { + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER; + } else { + ASSERT3U(dir, 
==, QAT_DECRYPT); + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH; + } + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + /* + * ZFS's SHA512 checksum is actually SHA512/256, which uses + * a different IV from standard SHA512. QAT does not support + * SHA512/256, so we can only support SHA256. + */ + if (cksum == ZIO_CHECKSUM_SHA256) + hash_algorithm = CPA_CY_SYM_HASH_SHA256; + else + return (CPA_STATUS_FAIL); + + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_HASH; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN; + sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t); + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs, + CpaBufferList *src, CpaBufferList *dst) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U meta_size = 0; + + status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + + if (src != dst) { + status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData, + meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + return (CPA_STATUS_SUCCESS); + +error: + QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData); + if (src != dst) + QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData); + + return (status); +} + +int +qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, + uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, + crypto_key_t *key, uint64_t crypt, uint32_t enc_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaBufferList dst_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + CpaFlatBuffer *flat_dst_buf_array = NULL; + CpaFlatBuffer *flat_dst_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + struct 
page *out_pages[MAX_PAGE_NUM]; + Cpa32U in_page_num = 0; + Cpa32U out_page_num = 0; + Cpa32U in_page_off = 0; + Cpa32U out_page_off = 0; + + if (dir == QAT_ENCRYPT) { + QAT_STAT_BUMP(encrypt_requests); + QAT_STAT_INCR(encrypt_total_in_bytes, enc_len); + } else { + QAT_STAT_BUMP(decrypt_requests); + QAT_STAT_INCR(decrypt_total_in_bytes, enc_len); + } + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_crypt_session_ctx(dir, cy_inst_handle, + &cy_session_ctx, key, crypt, aad_len); + if (status != CPA_STATUS_SUCCESS) { + /* don't count CCM as a failure since it's not supported */ + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM) + QAT_STAT_BUMP(crypt_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. + */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &dst_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult, + ZIO_DATA_MAC_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv, + ZIO_DATA_IV_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + if (aad_len > 0) { + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData, + aad_len); + if (status != CPA_STATUS_SUCCESS) + goto fail; + bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len); + } + + bytes_left = enc_len; + data = src_buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + in_page_off = ((long)data & ~PAGE_MASK); + in_pages[in_page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - in_page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + in_page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = in_page_num; + + bytes_left = enc_len; + data = dst_buf; + flat_dst_buf = flat_dst_buf_array; + while (bytes_left > 0) { + out_page_off = ((long)data & ~PAGE_MASK); + out_pages[out_page_num] = qat_mem_to_page(data); + flat_dst_buf->pData = kmap(out_pages[out_page_num]) + + out_page_off; + flat_dst_buf->dataLenInBytes = + min((long)PAGE_SIZE - out_page_off, (long)bytes_left); + data += flat_dst_buf->dataLenInBytes; + bytes_left -= flat_dst_buf->dataLenInBytes; + flat_dst_buf++; + out_page_num++; + } + dst_buffer_list.pBuffers = flat_dst_buf_array; + dst_buffer_list.numBuffers = out_page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.cryptoStartSrcOffsetInBytes = 0; + op_data.messageLenToCipherInBytes = 0; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = 0; + op_data.messageLenToCipherInBytes = enc_len; + op_data.ivLenInBytes = ZIO_DATA_IV_LEN; + bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); + /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */ + if (dir == QAT_DECRYPT) + bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN); + + cb.verify_result = CPA_FALSE; + 
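/* symcallback() records the verify result and signals this completion */ +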
init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &dst_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + if (dir == QAT_ENCRYPT) { + /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */ + bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); + QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); + } else { + QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(crypt_fails); + + for (i = 0; i < in_page_num; i++) + kunmap(in_pages[i]); + for (i = 0; i < out_page_num; i++) + kunmap(out_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + if (aad_len > 0) + QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData); + QAT_PHYS_CONTIG_FREE(op_data.pIv); + QAT_PHYS_CONTIG_FREE(op_data.pDigestResult); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + QAT_PHYS_CONTIG_FREE(flat_dst_buf_array); + + return (status); +} + +int +qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) +{ + CpaStatus status; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + Cpa8U *digest_buffer = NULL; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + Cpa32U page_num = 0; + Cpa32U page_off = 0; + + QAT_STAT_BUMP(cksum_requests); + QAT_STAT_INCR(cksum_total_in_bytes, size); + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_checksum_session_ctx(cy_inst_handle, + &cy_session_ctx, cksum); + if (status != CPA_STATUS_SUCCESS) { + /* don't count unsupported checksums as a failure */ + if (cksum == ZIO_CHECKSUM_SHA256 || + cksum == ZIO_CHECKSUM_SHA512) + QAT_STAT_BUMP(cksum_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. 
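+ * For example, with 4 KiB pages, a 16 KiB buffer that starts mid-page
+ * spans five pages, and nr_bufs = (16384 >> PAGE_SHIFT) + 2 = 6 covers it.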
+ */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &src_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer, + sizeof (zio_cksum_t)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + bytes_left = size; + data = buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + in_pages[page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[page_num]) + page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = size; + op_data.pDigestResult = digest_buffer; + + cb.verify_result = CPA_FALSE; + init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &src_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + bcopy(digest_buffer, zcp, sizeof (zio_cksum_t)); + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(cksum_fails); + + for (i = 0; i < page_num; i++) + kunmap(in_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + QAT_PHYS_CONTIG_FREE(digest_buffer); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + + return (status); +} + +static int +param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_encrypt_disable = 0: enable qat encrypt + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_encrypt_disable = 1; + return (ret); + } + } + return (ret); +} + +static int +param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * set_checksum_param_ops = 0: enable qat checksum + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_checksum_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt, + param_get_int, &zfs_qat_encrypt_disable, 0644); +MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption"); + +module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum, + param_get_int, &zfs_qat_checksum_disable, 0644); +MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming"); + +#endif diff --git a/module/os/macos/zfs/spa_misc_os.c b/module/os/macos/zfs/spa_misc_os.c new file mode 100644 index 0000000000..aa546edd1b --- /dev/null +++ b/module/os/macos/zfs/spa_misc_os.c @@ -0,0 
+1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" + +const char * +spa_history_zone(void) +{ + return ("macos"); +} + +void +spa_create_os(void *arg) +{ + spa_t *spa = (spa_t *)arg; + int haslock = 0; + int error; + + haslock = mutex_owned(&spa_namespace_lock); + + /* Increase open refcount */ + spa_open_ref(spa, FTAG); + + if (haslock) { + mutex_exit(&spa_namespace_lock); + } + + /* Create IOKit pool proxy */ + if ((error = spa_iokit_pool_proxy_create(spa)) != 0) { + printf("%s spa_iokit_pool_proxy_create error %d\n", + __func__, error); + /* spa_create succeeded, ignore proxy error */ + } + + /* Cache vdev info, needs open ref above, and pool proxy */ + + if (error == 0 && (error = zfs_boot_update_bootinfo(spa)) != 0) { + printf("%s update_bootinfo error %d\n", __func__, error); + /* create succeeded, ignore error from bootinfo */ + } + + /* Drop open refcount */ + if (haslock) { + mutex_enter(&spa_namespace_lock); + } + + spa_close(spa, FTAG); +} + +void +spa_export_os(void *arg) +{ + spa_t *spa = (spa_t *)arg; + + /* Remove IOKit pool proxy */ + spa_iokit_pool_proxy_destroy(spa); +} + +void +spa_activate_os(void *arg) +{ + /* spa_t *spa = (spa_t *)arg; */ + /* Lock kext in kernel while mounted */ + OSKextRetainKextWithLoadTag(OSKextGetCurrentLoadTag()); +} + +void +spa_deactivate_os(void *arg) +{ + /* spa_t *spa = (spa_t *)arg; */ + OSKextReleaseKextWithLoadTag(OSKextGetCurrentLoadTag()); +} diff --git a/module/os/macos/zfs/spa_stats.c b/module/os/macos/zfs/spa_stats.c new file mode 100644 index 0000000000..cb7c88f441 --- /dev/null +++ b/module/os/macos/zfs/spa_stats.c @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +void +spa_stats_init(spa_t *spa) +{ + +} + +void +spa_stats_destroy(spa_t *spa) +{ + +} + +void +spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed) +{ +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) +{ +} + +void +spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) +{ + +} +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + return (0); +} + +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + return (NULL); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + +} + +void +spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, + int error) +{ + +} + +int +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, + hrtime_t duration) +{ + return (0); +} + +int +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) +{ + return (0); +} diff --git a/module/os/macos/zfs/trace.c b/module/os/macos/zfs/trace.c new file mode 100644 index 0000000000..0c9990e854 --- /dev/null +++ b/module/os/macos/zfs/trace.c @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Each Linux tracepoints subsystem must define CREATE_TRACE_POINTS in one + * (and only one) C file, so this dummy file exists for that purpose. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/module/os/macos/zfs/vdev_disk.c b/module/os/macos/zfs/vdev_disk.c new file mode 100644 index 0000000000..ef24f9ab6a --- /dev/null +++ b/module/os/macos/zfs/vdev_disk.c @@ -0,0 +1,786 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Based on Apple MacZFS source code + * Copyright (c) 2014,2016 by Jorgen Lundman. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for disks. + */ + +/* XXX leave extern if declared elsewhere - originally was in zfs_ioctl.c */ +ldi_ident_t zfs_li; + +static void vdev_disk_close(vdev_t *); + +typedef struct vdev_disk_ldi_cb { + list_node_t lcb_next; + ldi_callback_id_t lcb_id; +} vdev_disk_ldi_cb_t; + +static void +vdev_disk_alloc(vdev_t *vd) +{ + vdev_disk_t *dvd; + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + + /* + * Create the LDI event callback list. + */ + list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), + offsetof(vdev_disk_ldi_cb_t, lcb_next)); +} + +static void +vdev_disk_free(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; + + if (dvd == NULL) + return; + + /* + * We have already closed the LDI handle. Clean up the LDI event + * callbacks and free vd->vdev_tsd. + */ + while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { + list_remove(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_remove_callbacks(lcb->lcb_id); + kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); + } + list_destroy(&dvd->vd_ldi_cbs); + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +static int +vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, + void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * Ignore events other than offline. + */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) + return (LDI_EV_SUCCESS); + + /* + * All LDI handles must be closed for the state change to succeed, so + * call on vdev_disk_close() to do this. + * + * We inform vdev_disk_close that it is being called from offline + * notify context so it will defer cleanup of LDI event callbacks and + * freeing of vd->vdev_tsd to the offline finalize or a reopen. + */ + dvd->vd_ldi_offline = B_TRUE; + vdev_disk_close(vd); + + /* + * Now that the device is closed, request that the spa_async_thread + * mark the device as REMOVED and notify FMA of the removal. + */ + zfs_post_remove(vd->vdev_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + + return (LDI_EV_SUCCESS); +} + +/* ARGSUSED */ +static void +vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, + int ldi_result, void *arg, void *ev_data) +{ + vdev_t *vd = (vdev_t *)arg; + + /* + * Ignore events other than offline. 
+ */ + if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) + return; + + /* + * We have already closed the LDI handle in notify. + * Clean up the LDI event callbacks and free vd->vdev_tsd. + */ + vdev_disk_free(vd); + /* + * Request that the vdev be reopened if the offline state change was + * unsuccessful. + */ + if (ldi_result != LDI_EV_SUCCESS) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); + } +} + +static ldi_ev_callback_t vdev_disk_off_callb = { + .cb_vers = LDI_EV_CB_VERS, + .cb_notify = vdev_disk_off_notify, + .cb_finalize = vdev_disk_off_finalize +}; + +/* + * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when + * even a fallback to DKIOCGMEDIAINFO fails. + */ +#ifdef DEBUG +#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) +#else +#define VDEV_DEBUG(...) /* Nothing... */ +#endif + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + spa_t *spa = vd->vdev_spa; + vdev_disk_t *dvd = vd->vdev_tsd; + ldi_ev_cookie_t ecookie; + vdev_disk_ldi_cb_t *lcb; + union { + struct dk_minfo_ext ude; + struct dk_minfo ud; + } dks; + struct dk_minfo_ext *dkmext = &dks.ude; + struct dk_minfo *dkm = &dks.ud; + int error; + uint64_t capacity = 0, blksz = 0, pbsize; + int isssd; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (dvd != NULL) { + if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { + /* + * If we are opening a device in its offline notify + * context, the LDI handle was just closed. Clean + * up the LDI event callbacks and free vd->vdev_tsd. + */ + vdev_disk_free(vd); + } else { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + } + + /* + * Create vd->vdev_tsd. + */ + vdev_disk_alloc(vd); + dvd = vd->vdev_tsd; + + /* + * When opening a disk device, we want to preserve the user's original + * intent. We always want to open the device by the path the user gave + * us, even if it is one of multiple paths to the same device. But we + * also want to be able to survive disks being removed/recabled. + * Therefore the sequence of opening devices is: + * + * 1. Try opening the device by path. For legacy pools without the + * 'whole_disk' property, attempt to fix the path by appending 's0'. + * + * 2. If the devid of the device matches the stored value, return + * success. + * + * 3. Otherwise, the device may have moved. Try opening the device + * by the devid instead. + */ + + error = EINVAL; /* presume failure */ + + if (vd->vdev_path != NULL) { + + /* + * If we have not yet opened the device, try to open it by the + * specified path. + */ + if (error != 0) { + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); + } + + /* + * If we succeeded in opening the device, but 'vdev_wholedisk' + * is not yet set, then this must be a slice. + */ + if (error == 0 && vd->vdev_wholedisk == -1ULL) + vd->vdev_wholedisk = 0; + } + + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. 
+ */ + if (error) { + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); + } + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", + error); + return (error); + } + + /* + * Register callbacks for the LDI offline event. + */ + if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == + LDI_EV_SUCCESS) { + lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); + list_insert_tail(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, + &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); + } + +skip_open: + /* + * Determine the actual size of the device. + */ + if (ldi_get_size(dvd->vd_lh, psize) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); + return (SET_ERROR(EINVAL)); + } + + *max_psize = *psize; + + /* + * Determine the device's minimum transfer size. + * If the ioctl isn't supported, assume DEV_BSIZE. + */ + if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, + (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { + capacity = dkmext->dki_capacity - 1; + blksz = dkmext->dki_lbsize; + pbsize = dkmext->dki_pbsize; + } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, + (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { + VDEV_DEBUG( + "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", + vd->vdev_path); + capacity = dkm->dki_capacity - 1; + blksz = dkm->dki_lbsize; + pbsize = blksz; + } else { + VDEV_DEBUG("vdev_disk_open(\"%s\"): " + "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", + vd->vdev_path, error); + pbsize = DEV_BSIZE; + } + + *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; + + if (vd->vdev_wholedisk == 1) { + int wce = 1; + + /* + * Since we own the whole disk, try to enable disk write + * caching. We ignore errors because it's OK if we can't do it. + */ + (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, + FKIOCTL, kcred, NULL); + } + + /* + * Clear the nowritecache bit, so that on a vdev_reopen() we will + * try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Inform the ZIO pipeline that we are non-rotational */ + vd->vdev_nonrot = B_FALSE; + if (ldi_ioctl(dvd->vd_lh, DKIOCISSOLIDSTATE, (intptr_t)&isssd, + FKIOCTL, kcred, NULL) == 0) { + vd->vdev_nonrot = (isssd ? B_TRUE : B_FALSE); + } + + // Assume no TRIM + vd->vdev_has_trim = B_FALSE; + uint32_t features; + if (ldi_ioctl(dvd->vd_lh, DKIOCGETFEATURES, (intptr_t)&features, + FKIOCTL, kcred, NULL) == 0) { + if (features & DK_FEATURE_UNMAP) + vd->vdev_has_trim = B_TRUE; + } + + /* Set when device reports it supports secure TRIM. */ + // No secure trim in Apple yet. + vd->vdev_has_securetrim = B_FALSE; + + return (0); +} + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (vd->vdev_reopening || dvd == NULL) + return; + + if (dvd->vd_lh != NULL) { + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); + dvd->vd_lh = NULL; + } + + vd->vdev_delayed_close = B_FALSE; + /* + * If we closed the LDI handle due to an offline notify from LDI, + * don't free vd->vdev_tsd or unregister the callbacks here; + * the offline finalize callback or a reopen will take care of it. 
+ */ + if (dvd->vd_ldi_offline) + return; + + vdev_disk_free(vd); +} + +int +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags, boolean_t isdump) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +} + +int +vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, + size_t size, uint64_t offset, int flags) +{ + ldi_buf_t *bp; + int error = 0; + + if (vd_lh == NULL) + return (SET_ERROR(EINVAL)); + + ASSERT(flags & B_READ || flags & B_WRITE); + + bp = getrbuf(KM_SLEEP); + bp->b_flags = flags | B_BUSY | B_NOCACHE; + bp->b_bcount = size; + bp->b_un.b_addr = (void *)data; + bp->b_lblkno = lbtodb(offset); + bp->b_bufsize = size; + + error = ldi_strategy(vd_lh, bp); + ASSERT(error == 0); + + if ((error = biowait(bp)) == 0 && bp->b_resid != 0) + error = SET_ERROR(EIO); + freerbuf(bp); + + return (error); +} + +static void +vdev_disk_io_intr(ldi_buf_t *bp) +{ + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; + + /* + * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. + * Rather than teach the rest of the stack about other error + * possibilities (EFAULT, etc), we normalize the error value here. + */ + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = SET_ERROR(EIO); + + if (zio->io_type == ZIO_TYPE_READ) { + abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, + zio->io_size); + } else { + abd_return_buf(zio->io_abd, bp->b_un.b_addr, + zio->io_size); + } + + kmem_free(vb, sizeof (vdev_buf_t)); + + zio_delay_interrupt(zio); +} + +static void +vdev_disk_ioctl_free(zio_t *zio) +{ + kmem_free(zio->io_vsd, sizeof (struct dk_callback)); +} + +static const zio_vsd_ops_t vdev_disk_vsd_ops = { + vdev_disk_ioctl_free, + zio_vsd_default_cksum_report +}; + +static void +vdev_disk_ioctl_done(void *zio_arg, int error) +{ + zio_t *zio = zio_arg; + + zio->io_error = error; + + zio_interrupt(zio); +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_buf_t *vb; + struct dk_callback *dkc; + ldi_buf_t *bp = 0; + int flags, error = 0; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); + zio->io_vsd_ops = &vdev_disk_vsd_ops; + + dkc->dkc_callback = vdev_disk_ioctl_done; + dkc->dkc_flag = FLUSH_VOLATILE; + dkc->dkc_cookie = zio; + + error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, + (uintptr_t)dkc, FKIOCTL, kcred, NULL); + + if (error == 0) { + /* + * The ioctl will be done asychronously, + * and will call vdev_disk_ioctl_done() + * upon completion. 
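+ * vdev_disk_ioctl_done() then records the error and finishes the
+ * zio via zio_interrupt().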
+ */ + return; + } + + zio->io_error = error; + + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + } /* io_cmd */ + + zio_execute(zio); + return; + + case ZIO_TYPE_WRITE: + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) + flags = B_WRITE; + else + flags = B_WRITE | B_ASYNC; + break; + + case ZIO_TYPE_READ: + if (zio->io_priority == ZIO_PRIORITY_SYNC_READ) + flags = B_READ; + else + flags = B_READ | B_ASYNC; + break; + + case ZIO_TYPE_TRIM: + { + dkioc_free_list_ext_t dfle; + dfle.dfle_start = zio->io_offset; + dfle.dfle_length = zio->io_size; + zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE, + (uintptr_t)&dfle, FKIOCTL, kcred, NULL); + zio_interrupt(zio); + return; + } + + default: + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } /* io_type */ + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + /* Stop OSX from also caching our data */ + flags |= B_NOCACHE | B_PASSIVE; + + zio->io_target_timestamp = zio_handle_io_delay(zio); + + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); + + vb->vb_io = zio; + bp = &vb->vb_buf; + + ASSERT(bp != NULL); + ASSERT(zio->io_abd != NULL); + ASSERT(zio->io_size != 0); + + bioinit(bp); + bp->b_flags = B_BUSY | flags; + bp->b_bcount = zio->io_size; + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT3S(zio->io_abd->abd_size, >=, zio->io_size); + bp->b_un.b_addr = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + ASSERT3S(zio->io_abd->abd_size, >=, zio->io_size); + bp->b_un.b_addr = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_iodone = (int (*)(struct ldi_buf *))vdev_disk_io_intr; + + error = ldi_strategy(dvd->vd_lh, bp); + if (error != 0) { + dprintf("%s error from ldi_strategy %d\n", __func__, error); + zio->io_error = EIO; + kmem_free(vb, sizeof (vdev_buf_t)); + zio_execute(zio); + // zio_interrupt(zio); + } +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { + vdev_disk_t *dvd = vd->vdev_tsd; + int state = DKIO_NONE; + + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. + */ + zfs_post_remove(zio->io_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } +} + +static void +vdev_disk_hold(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* We must have a pathname, and it must be absolute. */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. 
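+ * No prefetching is currently implemented here, so beyond the early
+ * return below this function is a no-op.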
+ */ + if (vd->vdev_tsd != NULL) + return; + +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* XXX: Implement me as a vnode rele for the device */ +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + NULL, + vdev_disk_hold, + vdev_disk_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + ldi_handle_t vd_lh; + vdev_label_t *label; + uint64_t s, size; + int l; + int error = -1; + + /* + * Read the device label and build the nvlist. + */ + + /* Apple: Error will be -1 at this point, allowing open_by_name */ + error = -1; + vd_lh = 0; /* Dismiss compiler warning */ + + if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, + zfs_li))) + return (error); + + if (ldi_get_size(vd_lh, &s)) { + (void) ldi_close(vd_lh, FREAD, kcred); + return (SET_ERROR(EIO)); + } + + size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); + label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + + *config = NULL; + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, l, 0); + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + kmem_free(label, sizeof (vdev_label_t)); + (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = SET_ERROR(EIDRM); + + return (error); +} diff --git a/module/os/macos/zfs/vdev_file.c b/module/os/macos/zfs/vdev_file.c new file mode 100644 index 0000000000..ca04765f1b --- /dev/null +++ b/module/os/macos/zfs/vdev_file.c @@ -0,0 +1,322 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int error = 0; + + dprintf("vdev_file_open %p\n", vd->vdev_tsd); + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ +#ifdef _KERNEL + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } +#endif + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + + error = zfs_file_open(vd->vdev_path + 1, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + + /* + * Make sure it's a regular file. + */ + if (zfs_file_getattr(fp, &zfa)) { + return (SET_ERROR(ENODEV)); + } + +skip_open: + /* + * Determine the physical size of the file. 
+ */ + error = zfs_file_getattr(vf->vf_file, &zfa); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = zfa.zfa_size; + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_file != NULL) { + zfs_file_close(vf->vf_file); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + loff_t off; + void *data; + ssize_t size; + int err; + + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + if (zio->io_type == ZIO_TYPE_READ) { + data = + abd_borrow_buf(zio->io_abd, size); + err = zfs_file_pread(vf->vf_file, data, size, off, &resid); + abd_return_buf_copy(zio->io_abd, data, size); + } else { + data = + abd_borrow_buf_copy(zio->io_abd, size); + err = zfs_file_pwrite(vf->vf_file, data, size, off, &resid); + abd_return_buf(zio->io_abd, data, size); + } + + zio->io_error = (err != 0 ? EIO : 0); + + if (zio->io_error == 0 && resid != 0) + zio->io_error = SET_ERROR(ENOSPC); + + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC|O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); + + /* XXX FreeBSD has no fallocate routine in file ops */ + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("vdev_file_taskq", 100, minclsyspri, + max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +/* + * From userland we access disks just like files. 
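+ * The userland vdev_disk_ops defined below therefore simply reuse the
+ * file vdev callbacks.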
+ */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/macos/zfs/zfs_acl.c b/module/os/macos/zfs/zfs_acl.c new file mode 100644 index 0000000000..0757a8e26a --- /dev/null +++ b/module/os/macos/zfs/zfs_acl.c @@ -0,0 +1,2983 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS 
(V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return 
(sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. 
+ */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (S_ISDIR(obj_mode) && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) + return (B_FALSE); + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if 
(aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +#if 0 // unused function +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} +#endif + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +#if 0 // unused function +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) + continue; + + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} +#endif + +static int +zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). 
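+ * (The flat zfs_oldace_t array built here is only a staging buffer;
+ * the replacement node is sized for the worst case of one
+ * zfs_object_ace_t per entry before zfs_copy_ace_2_fuid() re-encodes
+ * the ACEs into the FUID layout.)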
+ * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, zp->z_mode, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
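+ *
+ * Rough example, assuming a trivial ACL:
+ *   owner@:read_data/write_data:allow
+ *   group@:read_data:allow
+ *   everyone@:read_data:allow
+ * yields permission bits 0644; only the first allow or deny seen for
+ * a given bit is honoured (tracked in the "seen" mask below).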
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + + /* + * Apple has unusual expectations to emulate hfs in that the + * mode is not updated: + * -rw-r--r-- 1 root wheel 0 Nov 12 12:39 file.txt + * chmod +a "root allow execute" file.txt + * ZFS: -rwxr--r--+ 1 root wheel 0 Nov 12 12:39 file.txt + * HFS: -rw-r--r--+ 1 root wheel 0 Nov 12 12:39 file.txt + * 0: user:root allow execute + */ + if (entry_type == ACE_OWNER) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. 
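+ *
+ * E.g. if execute was granted only to owner@ and group@ and never
+ * mentioned for everyone@, (seen & ALL_MODE_EXECS) != ALL_MODE_EXECS,
+ * so an_exec_denied is set and ZFS_NO_EXECS_DENIED is cleared below.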
+ */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. + */ +int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize = 0; + int acl_count = 0; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + boolean_t drop_lock = B_FALSE; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + /* + * close race where znode could be upgrade while trying to + * read the znode attributes. + * + * But this could only happen if the file isn't already an SA + * znode + */ + if (!zp->z_is_sa && !have_lock) { + mutex_enter(&zp->z_lock); + drop_lock = B_TRUE; + } + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + if (drop_lock) + mutex_exit(&zp->z_lock); + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACLTYPE_POSIXACL) + return (0); + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error == 0 && aclp->z_acl_count > 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + + /* + * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL + * nor a DACL_ACES SA in which case ENOENT is returned from + * zfs_acl_node_read() when the SA can't be located. + * Allow chown/chgrp to succeed in these cases rather than + * returning an error that makes no sense in the context of + * the caller. + */ + if (error == ENOENT) + return (0); + + return (error); +} + +/* + * common code for setting ACLs. 
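+ *
+ * Roughly: recompute the mode from the ACL, then push MODE, FLAGS and
+ * CTIME plus either DACL_ACES/DACL_COUNT (SA layout) or ZNODE_ACL
+ * (legacy layout) through a single sa_bulk_update() call.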
+ * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. 
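+ *
+ * I.e. for ZFS_ACL_VERSION_INITIAL the two fields are stored swapped:
+ *   z_acl_size  <- number of ACEs   (z_acl_count in memory)
+ *   z_acl_count <- size in bytes    (z_acl_bytes in memory)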
+ */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(umode_t umode, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = S_ISDIR(umode); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". 
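+ *
+ * Sketch: after chmod 0750 with aclmode=groupmask, an explicit user
+ * ACE granting read/write/execute would be trimmed here to the rights
+ * implied by the group bits (read/execute for 0750).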
+ */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_mode, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(umode_t umode, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if (S_ISDIR(umode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!S_ISDIR((umode) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, zfs_acl_t *paclp, + uint64_t umode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = S_ISDIR(umode); + boolean_t islnk = S_ISLNK(umode); + boolean_t isreg = S_ISREG(umode); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || islnk) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(umode, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
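+ *
+ * (With aclinherit=passthrough or passthrough-x, an inheritable
+ * owner@/group@/everyone@ entry already describes the new object's
+ * permissions, so the caller can skip re-deriving trivial ACEs from
+ * the requested mode via zfs_acl_chmod().)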
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((umode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. 
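+ *
+ * Illustrative summary of the cases handled below:
+ *   - replay, the root node, or an xattr directory: take va_uid and
+ *     va_gid as supplied
+ *   - otherwise: uid comes from the credential; gid from ATTR_GID if
+ *     the caller is allowed to set it, else from the parent directory
+ *     when its set-GID bit is set, else from the credential's gid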
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & ATTR_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); +#ifdef __FreeBSD__ + gid = acl_ids->z_fgid = dzp->z_gid; +#else + gid = crgetgid(cr); +#endif + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, 
zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclcheck, + cred_t *cr) +{ + struct kauth_acl **aclpp = (struct kauth_acl **)vsecp; + zfs_acl_t *aclp; + kauth_acl_t k_acl; + u_int32_t ace_flags = 0; + kauth_ace_rights_t rights = 0; + guid_t *guidp; + uint64_t who; + uint32_t access_mask; + uint16_t flags; + uint16_t type; + int i; + int error; + void *zacep = NULL; + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_TRUE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + if ((k_acl = kauth_acl_alloc(aclp->z_acl_count)) == NULL) { + mutex_exit(&zp->z_acl_lock); + *aclpp = (kauth_acl_t)KAUTH_FILESEC_NONE; + return (ENOMEM); + } + + dprintf("acl_count %d\n", aclp->z_acl_count); + + k_acl->acl_entrycount = aclp->z_acl_count; + k_acl->acl_flags = 0; + *aclpp = k_acl; + + /* + * Translate Open Solaris ACEs to Mac OS X ACLs + */ + i = 0; + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &flags, &type))) { + rights = 0; + ace_flags = 0; + + guidp = &k_acl->acl_ace[i].ace_applicable; + + if (flags & ACE_OWNER) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + nfsacl_set_wellknown(KAUTH_WKG_OWNER, guidp); + } else if ((flags & OWNING_GROUP) == OWNING_GROUP) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + nfsacl_set_wellknown(KAUTH_WKG_GROUP, guidp); + } else if (flags & ACE_EVERYONE) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + nfsacl_set_wellknown(KAUTH_WKG_EVERYBODY, guidp); + /* Try to get a guid from our uid */ + } else { + + dprintf("ZFS: trying to map uid %d flags %x type %x\n", + who, flags, type); + + if (flags & OWNING_GROUP) { + if (kauth_cred_gid2guid(who, guidp) == 0) { + dprintf("ZFS: appears to be a group\n"); + } + } else if (kauth_cred_uid2guid(who, guidp) == 0) { + dprintf("ZFS: appears to be a user\n"); + } else { + dprintf("ZFS: Unable to map\n"); + bzero(guidp, sizeof (guid_t)); + } + } + + // access_mask = aclp->z_acl[i].a_access_mask; + if (access_mask & ACE_READ_DATA) + rights |= KAUTH_VNODE_READ_DATA; + if (access_mask & ACE_WRITE_DATA) + rights |= KAUTH_VNODE_WRITE_DATA; + if (access_mask & ACE_APPEND_DATA) + rights |= KAUTH_VNODE_APPEND_DATA; + if (access_mask & ACE_READ_NAMED_ATTRS) + rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; + if (access_mask & ACE_WRITE_NAMED_ATTRS) + rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + if (access_mask & ACE_EXECUTE) + rights |= KAUTH_VNODE_EXECUTE; + if (access_mask & ACE_DELETE_CHILD) + rights |= KAUTH_VNODE_DELETE_CHILD; + if (access_mask & ACE_READ_ATTRIBUTES) + rights |= KAUTH_VNODE_READ_ATTRIBUTES; + if (access_mask & ACE_WRITE_ATTRIBUTES) + rights |= KAUTH_VNODE_WRITE_ATTRIBUTES; + if (access_mask & ACE_DELETE) + rights |= KAUTH_VNODE_DELETE; + if (access_mask & ACE_READ_ACL) + rights |= KAUTH_VNODE_READ_SECURITY; + if (access_mask & ACE_WRITE_ACL) + rights |= KAUTH_VNODE_WRITE_SECURITY; + if (access_mask & ACE_WRITE_OWNER) + rights |= KAUTH_VNODE_TAKE_OWNERSHIP; + if (access_mask & ACE_SYNCHRONIZE) + rights |= KAUTH_VNODE_SYNCHRONIZE; + k_acl->acl_ace[i].ace_rights = rights; + + // flags = aclp->z_acl[i].a_flags; + if (flags & ACE_FILE_INHERIT_ACE) + ace_flags |= 
KAUTH_ACE_FILE_INHERIT; + if (flags & ACE_DIRECTORY_INHERIT_ACE) + ace_flags |= KAUTH_ACE_DIRECTORY_INHERIT; + if (flags & ACE_NO_PROPAGATE_INHERIT_ACE) + ace_flags |= KAUTH_ACE_LIMIT_INHERIT; + if (flags & ACE_INHERIT_ONLY_ACE) + ace_flags |= KAUTH_ACE_ONLY_INHERIT; + + // type = aclp->z_acl[i].a_type; + switch (type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + ace_flags |= KAUTH_ACE_PERMIT; + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + ace_flags |= KAUTH_ACE_DENY; + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + ace_flags |= KAUTH_ACE_AUDIT; + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + ace_flags |= KAUTH_ACE_ALARM; + break; + } + k_acl->acl_ace[i].ace_flags = ace_flags; + i++; + } + k_acl->acl_entrycount = i; + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + +int +zfs_addacl_trivial(znode_t *zp, ace_t *aces, int *nentries, int seen_type) +{ + zfs_acl_t *aclp; + uint64_t who; + uint32_t access_mask; + uint16_t flags; + uint16_t type; + int i; + int error; + void *zacep = NULL; + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_TRUE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + dprintf("ondisk acl_count %d\n", aclp->z_acl_count); + + // Start at the end + i = *nentries; + + /* + * Translate Open Solaris ACEs to Mac OS X ACLs + */ + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &flags, &type))) { + + if (flags & ACE_OWNER) { + if (seen_type & ACE_OWNER) continue; + seen_type |= ACE_OWNER; + who = -1; + } else if ((flags & OWNING_GROUP) == OWNING_GROUP) { + if (seen_type & ACE_GROUP) continue; + seen_type |= ACE_GROUP; + who = -1; + } else if (flags & ACE_EVERYONE) { + if (seen_type & ACE_EVERYONE) continue; + seen_type |= ACE_EVERYONE; + who = -1; + /* Try to get a guid from our uid */ + } else { + // Only deal with the trivials + continue; + } + + aces[i].a_who = who; + aces[i].a_access_mask = access_mask; + aces[i].a_flags = flags; + aces[i].a_type = type; + + dprintf("zfs: adding entry %d for type %x sizeof %d\n", i, type, + sizeof (aces[i])); + i++; + } + + *nentries = i; + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & 
ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + + + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, vnode_vtype(ZTOV(zp)), vsecp, cr, + &fuidp, &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + +top: + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && + (vfs_isrdonly(zp->z_zfsvfs->z_vfs)) && + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). + */ + if ((v4_mode & WRITE_MASK_DATA) && + (zp->z_pflags & ZFS_IMMUTABLE)) { + return (EPERM); + } +#ifdef sun + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (EPERM); +#else + /* + * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK + * (sunlnk) is set. We just don't allow directory removal, which is + * handled in zfs_zaccess_delete(). 
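+ *
+ * (I.e. only ACE_DELETE of the object itself is refused here when
+ * ZFS_NOUNLINK is set; ACE_DELETE_CHILD on the containing directory
+ * is still evaluated by the delete-permission logic.)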
+ */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (SET_ERROR(EPERM)); + } +#endif + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCES if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (vnode_isdir(ZTOV(zp)) && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, 
mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. 
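+ *
+ * Sketch of the intended behaviour:
+ *   open(file, O_RDWR)          -> succeeds, writes succeed
+ *   set the DOS R/O attribute   -> writes on the open fd still succeed
+ *   open(file, O_RDWR) again    -> fails with EPERM in the check below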
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + !vnode_isdir(ZTOV(zp)) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (vnode_isdir(ZTOV(zdp)))); + if (is_attr) + goto slow; + + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + if (uid == zdp->z_uid) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(zdp->z_gid, cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsytem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp = NULL; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (vnode_isdir(ZTOV(zp)))); + +#ifdef __APPLE__ + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. + */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + /* + * Cache the lookup on the parent file znode as + * zp->z_xattr_parent and hold a reference. This + * effectively pins the parent in memory until all + * child xattr znodes have been destroyed and + * release their references in zfs_inode_destroy(). 
+ */ + error = zfs_zget(zp->z_zfsvfs, parent, &check_zp); + if (error) + return (error); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_parent == NULL) + zp->z_xattr_parent = check_zp; + rw_exit(&zp->z_xattr_lock); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, + owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, + owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, + owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +/* See zfs_zaccess_delete() */ +uint64_t zfs_write_implies_delete_child = 1; + +/* + * Determine whether delete access should be granted. + * + * The following chart outlines how we handle delete permissions which is + * how recent versions of windows (Windows 2008) handles it. The efficiency + * comes from not having to check the parent ACL where the object itself grants + * delete: + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Deny * | Permit | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Deny * | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * Re. execute permission on the directory: if that's missing, + * the vnode lookup of the target will fail before we get here. + * + * Re [*] in the table above: NFSv4 would normally Permit delete for + * these two cells of the matrix. + * See acl.h for notes on which ACE_... flags should be checked for which + * operations. Specifically, the NFSv4 committee recommendation is in + * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs + * should take precedence ahead of ALLOW ACEs. + * + * This implementation always consults the target object's ACL first. + * If a DENY ACE is present on the target object that specifies ACE_DELETE, + * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on + * the target object, access is allowed. If and only if no entries with + * ACE_DELETE are present in the object's ACL, check the container's ACL + * for entries with ACE_DELETE_CHILD. + * + * A summary of the logic implemented from the table above is as follows: + * + * First check for DENY ACEs that apply. + * If either target or container has a deny, EACCES. + * + * Delete access can then be summarized as follows: + * 1: The object to be deleted grants ACE_DELETE, or + * 2: The containing directory grants ACE_DELETE_CHILD. + * In a Windows system, that would be the end of the story. + * In this system, (2) has some complications... 
+ * 2a: "sticky" bit on a directory adds restrictions, and + * 2b: existing ACEs from previous versions of ZFS may + * not carry ACE_DELETE_CHILD where they should, so we + * also allow delete when ACE_WRITE_DATA is granted. + * + * Note: 2b is technically a work-around for a prior bug, + * which hopefully can go away some day. For those who + * no longer need the work around, and for testing, this + * work-around is made conditional via the tunable: + * zfs_write_implies_delete_child + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t wanted_dirperms; + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + boolean_t dzpcheck_privs; + boolean_t zpcheck_privs; + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * Case 1: + * If target object grants ACE_DELETE then we are done. This is + * indicated by a return value of 0. For this case we don't worry + * about the sticky bit because sticky only applies to the parent + * directory and this is the child access result. + * + * If we encounter a DENY ACE here, we're also done (EACCES). + * Note that if we hit a DENY ACE here (on the target) it should + * take precedence over a DENY ACE on the container, so that when + * we have more complete auditing support we will be able to + * report an access failure against the specific target. + * (This is part of why we're checking the target first.) + */ + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr); + if (zp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!zpcheck_privs) + return (SET_ERROR(zp_error)); + + return (secpolicy_vnode_remove(ZTOV(zp), cr)); + } + if (zp_error == 0) + return (0); + + /* + * Case 2: + * If the containing directory grants ACE_DELETE_CHILD, + * or we're in backward compatibility mode and the + * containing directory has ACE_WRITE_DATA, allow. + * Case 2b is handled with wanted_dirperms. + */ + wanted_dirperms = ACE_DELETE_CHILD; + if (zfs_write_implies_delete_child) + wanted_dirperms |= ACE_WRITE_DATA; + dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + if (dzp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!dzpcheck_privs) + return (SET_ERROR(dzp_error)); + return (secpolicy_vnode_remove(ZTOV(zp), cr)); + } + + /* + * Cases 2a, 2b (continued) + * + * Note: dzp_working_mode now contains any permissions + * that were NOT granted. Therefore, if any of the + * wanted_dirperms WERE granted, we will have: + * dzp_working_mode != wanted_dirperms + * We're really asking if ANY of those permissions + * were granted, and if so, grant delete access. + */ + if (dzp_working_mode != wanted_dirperms) + dzp_error = 0; + + /* + * dzp_error is 0 if the container granted us permissions to "modify". + * If we do not have permission via one or more ACEs, our current + * privileges may still permit us to modify the container. + * + * dzpcheck_privs is false when i.e. the FS is read-only. + * Otherwise, do privilege checks for the container. + */ + if (dzp_error != 0 && dzpcheck_privs) { + uid_t owner; + /* + * The secpolicy call needs the requested access and + * the current access mode of the container, but it + * only knows about Unix-style modes (VEXEC, VWRITE), + * so this must condense the fine-grained ACE bits into + * Unix modes. 
+ * + * The VEXEC flag is easy, because we know that it has + * always been checked before we get here (during the + * lookup of the target vnode). The container has not + * granted us permissions to "modify", so we do not set + * the VWRITE flag in the current access mode. + */ + owner = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, + ZFS_OWNER); + dzp_error = secpolicy_vnode_access2(cr, ZTOV(dzp), + owner, VEXEC, VWRITE|VEXEC); + } + if (dzp_error != 0) { + /* + * Note: We may have dzp_error = -1 here (from + * zfs_zaccess_common). Don't return that. + */ + return (SET_ERROR(EACCES)); + } + + /* + * At this point, we know that the directory permissions allow + * us to modify, but we still need to check for the additional + * restrictions that apply when the "sticky bit" is set. + * + * Yes, zfs_sticky_remove_access() also checks this bit, but + * checking it here and skipping the call below is nice when + * you're watching all of this with dtrace. + */ + if ((dzp->z_mode & S_ISVTX) == 0) + return (0); + /* + * zfs_sticky_remove_access will succeed if: + * 1. The sticky bit is absent. + * 2. We pass the sticky bit restrictions. + * 3. We have privileges that always allow file removal. + */ + return (zfs_sticky_remove_access(dzp, zp, cr)); +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (vnode_isdir(ZTOV(szp))) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are a combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. + */ + if (vnode_isdir(ZTOV(szp)) && ZTOV(sdzp) != ZTOV(tdzp)) { + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + } + + /* + * First make sure we can do the delete portion. + * + * If that succeeds, then check for add_file/add_subdir permissions. + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it. + */ + if (tzp) { + if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + } + + /* + * Now check for add permissions. + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/macos/zfs/zfs_boot.cpp b/module/os/macos/zfs/zfs_boot.cpp new file mode 100644 index 0000000000..7493792ed0 --- /dev/null +++ b/module/os/macos/zfs/zfs_boot.cpp @@ -0,0 +1,2972 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2015, Evan Susarret. All rights reserved.
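The delete-permission chart and the two-step check in zfs_zaccess_delete() above can be condensed into a small decision function. The stand-alone sketch below is illustrative only (type and function names are invented; privilege, sticky-bit and read-only handling are omitted): the target's own ACE_DELETE answer, when present, is final, and only a silent target defers to the parent's ACE_DELETE_CHILD (or ACE_WRITE_DATA when zfs_write_implies_delete_child is set).

#include <stdbool.h>

/* Tri-state outcome of looking up one permission in an ACL. */
typedef enum { PERM_UNSPECIFIED, PERM_ALLOWED, PERM_DENIED } perm_t;

bool
delete_allowed(perm_t target_delete, perm_t parent_delete_child)
{
	if (target_delete == PERM_DENIED)	/* deny on the target is final */
		return (false);
	if (target_delete == PERM_ALLOWED)	/* allow on the target is final */
		return (true);
	/* Target is silent: the containing directory decides. */
	return (parent_delete_child == PERM_ALLOWED);
}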
+ */ +/* + * ZFS boot utils + * + * While loading the kext, check if early boot and zfs-boot + * kernel flag. + * Allocate pool_list (and lock). + * Register matching notification zfs_boot_probe_disk to check + * IOMediaBSDClient devices as they are published (or matched?), + * passing pool_list (automatically calls handler for all + * existing devices). + * Dispatch zfs_boot_import_thread on system_taskq. + * + * In notification handler zfs_boot_probe_disk: + * Check provider IOMedia for: + * 1 Leaf node and whole disk. + * 2 Leaf node and type ZFS. + * 3 Leaf node and type FreeBSD-ZFS. + * Check IOMedia meets minimum size or bail. + * Allocate char* buffer. + * Call vdev_disk_read_rootlabel. + * XXX Alternately: + * Alloc and prep IOMemoryDescriptor. + * Open IOMedia device (read-only). + * Try to read vdev label from device. + * Close IOMedia device. + * Release IOMemoryDescriptor (data is in buffer). + * XXX + * If label was read, try to generate a config from label. + * Check pool name matches zfs-boot or bail. + * Check pool status. + * Update this vdev's path and set status. + * Set other vdevs to missing status. + * Check-in config in thread-safe manner: + * Take pool_list lock. + * If config not found, insert new config, or update existing. + * Unlock pool_list. + * If found config is complete, wake import thread. + * + * In vdev_disk_read_rootlabel: + * Use vdev_disk_physio to read label. + * If label was read, try to unpack. + * Return label or failure. + * + * In vdev_disk_physio: + * Open device (read-only) using vnop/VOP. + * Try to read vdev label from device. + * Close device using vnop/VOP. + * + * In zfs_boot_import_thread: + * Loop checking for work and sleeping on lock between loops. + * Take pool_list lock and check for work. + * Attempt to import root pool using spa_import_rootpool. + * If successful, remove notification handler (waits for + * all tasks). + * Empty and deallocate pool_list (and lock). + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +} /* extern "C" */ + +#include +#include +#include +#include + +#if defined(DEBUG) || defined(ZFS_DEBUG) +#define DSTATIC +#else +#define DSTATIC static +#endif + +#ifndef verify +#define verify(EX) (void)((EX) || \ + (printf("%s, %s, %d, %s\n", #EX, __FILE__, __LINE__, __func__), 0)) +#endif /* verify */ + +#ifndef dprintf +#if defined(DEBUG) || defined(ZFS_DEBUG) +#define dprintf(fmt, ...) do { \ + printf("%s " fmt, __func__, __VA_ARGS__); \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf(fmt, ...) do { } while (0); +#endif /* if DEBUG or ZFS_DEBUG */ +#endif /* ifndef dprintf */ + +/* Most of this is only built when configured with --enable-boot */ + +/* block size is 512 B, count is 512 M blocks */ +#define ZFS_BOOT_DEV_BSIZE (UInt64)(1<<9) +#define ZFS_BOOT_DEV_BCOUNT (UInt64)(2<<29) +#define ZFS_BOOT_DATASET_NAME_KEY "zfs_dataset_name" +#define ZFS_BOOT_DATASET_UUID_KEY "zfs_dataset_uuid" +#define ZFS_BOOT_DATASET_RDONLY_KEY "zfs_dataset_rdonly" +#define ZFS_MOUNTROOT_RETRIES 50 +#define ZFS_BOOTLOG_DELAY 100 + +/* + * C functions for boot-time vdev discovery + */ + +/* + * Intermediate structures used to gather configuration information. 
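The flow sketched at the top of this file is a classic producer/consumer handshake: the IOKit notification handler queues newly published media under a mutex and signals a condition variable, and the import thread drains the queue and sleeps when it is empty. Below is a minimal user-space sketch of that pattern, with plain pthreads standing in for kmutex_t/kcondvar_t and an int standing in for the OSSet of disks; every name in it is invented for illustration.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int pending;        /* stands in for the pools->disks OSSet */
static int terminating;    /* stands in for pools->terminating */

/* Notification-handler side: queue one disk and wake the worker. */
static void
probe_media(void)
{
	pthread_mutex_lock(&lock);
	pending++;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&cv);
}

/* Import-thread side: drain the queue, then sleep until signalled again. */
static void *
import_thread(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (pending > 0) {
			pending--;
			pthread_mutex_unlock(&lock);
			printf("probe one disk, read labels\n");
			pthread_mutex_lock(&lock);
		}
		if (terminating)
			break;
		pthread_cond_wait(&cv, &lock);
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, import_thread, NULL);
	probe_media();
	probe_media();

	pthread_mutex_lock(&lock);
	terminating = 1;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&cv);

	pthread_join(tid, NULL);
	return (0);
}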
+ */ +typedef struct config_entry { + uint64_t ce_txg; + nvlist_t *ce_config; + struct config_entry *ce_next; +} config_entry_t; + +typedef struct vdev_entry { + uint64_t ve_guid; + config_entry_t *ve_configs; + struct vdev_entry *ve_next; +} vdev_entry_t; + +typedef struct pool_entry { + uint64_t pe_guid; + vdev_entry_t *pe_vdevs; + struct pool_entry *pe_next; + uint64_t complete; +} pool_entry_t; + +typedef struct name_entry { + char *ne_name; + uint64_t ne_guid; + uint64_t ne_order; + uint64_t ne_num_labels; + struct name_entry *ne_next; +} name_entry_t; + +typedef struct pool_list { + pool_entry_t *pools; + name_entry_t *names; + uint64_t pool_guid; + char *pool_name; + OSSet *new_disks; + OSSet *disks; + kmutex_t lock; + kcondvar_t cv; + IOService *zfs_hl; + IONotifier *notifier; + volatile UInt64 terminating; +} pool_list_t; + +#define ZFS_BOOT_ACTIVE 0x1 +#define ZFS_BOOT_TERMINATING 0x2 +#define ZFS_BOOT_INVALID 0x99 + +#define ZFS_BOOT_PREALLOC_SET 5 + +#if 0 +static ZFSBootDevice *bootdev = 0; +#endif +static pool_list_t *zfs_boot_pool_list = 0; + +DSTATIC char * +zfs_boot_get_devid(const char *path) +{ + /* + * XXX Unavailable interface + * + * If we implement one in spl, it could + * simplify import when device paths + * have changed (e.g. USB pools). + * + * Could use ldi DeviceTree path, or + * IOService path if not in DTPlane. + */ + return (NULL); +} + +/* + * Go through and fix up any path and/or devid information for the given vdev + * configuration. + * + * Copied from libzfs_import.c + */ +DSTATIC int +zfs_boot_fix_paths(nvlist_t *nv, name_entry_t *names) +{ + nvlist_t **child; + uint_t c, children; + uint64_t guid; + name_entry_t *ne, *best; + char *path, *devid; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (zfs_boot_fix_paths(child[c], names) != 0) + return (-1); + return (0); + } + + /* + * This is a leaf (file or disk) vdev. In either case, go through + * the name list and see if we find a matching guid. If so, replace + * the path and see if we can calculate a new devid. + * + * There may be multiple names associated with a particular guid, in + * which case we have overlapping partitions or multiple paths to the + * same disk. In this case we prefer to use the path name which + * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we + * use the lowest order device which corresponds to the first match + * while traversing the ZPOOL_IMPORT_PATH search path. + */ + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + path = NULL; + + best = NULL; + for (ne = names; ne != NULL; ne = ne->ne_next) { + if (ne->ne_guid == guid) { + + if (path == NULL) { + best = ne; + break; + } + + if ((strlen(path) == strlen(ne->ne_name)) && + strncmp(path, ne->ne_name, strlen(path)) == 0) { + best = ne; + break; + } + + if (best == NULL) { + best = ne; + continue; + } + + /* Prefer paths with more vdev labels. */ + if (ne->ne_num_labels > best->ne_num_labels) { + best = ne; + continue; + } + + /* Prefer paths earlier in the search order. 
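The preference rules spelled out above (an exact match on the config's ZPOOL_CONFIG_PATH wins outright, then the copy with more readable labels, then the earlier search order) amount to a three-key comparison. A stand-alone sketch follows; the struct mirrors name_entry_t, but all names are invented for illustration.

#include <stdint.h>
#include <string.h>

struct candidate {
	const char *name;       /* device path for this label copy */
	uint64_t num_labels;    /* how many valid labels were read */
	uint64_t order;         /* position in the search order */
};

/*
 * Preference between two candidates carrying the same vdev guid:
 * exact ZPOOL_CONFIG_PATH match first, then more labels, then lower order.
 */
const struct candidate *
better_candidate(const struct candidate *a, const struct candidate *b,
    const char *config_path)
{
	if (config_path != NULL) {
		if (strcmp(a->name, config_path) == 0)
			return (a);
		if (strcmp(b->name, config_path) == 0)
			return (b);
	}
	if (a->num_labels != b->num_labels)
		return (a->num_labels > b->num_labels ? a : b);
	return (a->order <= b->order ? a : b);
}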
*/ + if (ne->ne_num_labels == best->ne_num_labels && + ne->ne_order < best->ne_order) { + best = ne; + continue; + } + } + } + + if (best == NULL) + return (0); + + if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) + return (-1); + + if ((devid = zfs_boot_get_devid(best->ne_name)) == NULL) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + } else { + if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) { + spa_strfree(devid); + return (-1); + } + spa_strfree(devid); + } + + return (0); +} + +/* + * Add the given configuration to the list of known devices. + * + * Copied from libzfs_import.c + * diffs: kmem_alloc, kmem_free with size + */ +DSTATIC int +zfs_boot_add_config(pool_list_t *pl, const char *path, + int order, int num_labels, nvlist_t *config) +{ + uint64_t pool_guid, vdev_guid, top_guid, txg, state; + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + name_entry_t *ne; + + dprintf("%s %p [%s] %d %d %p\n", __func__, + pl, path, order, num_labels, config); + + /* + * If this is a hot spare not currently in use or level 2 cache + * device, add it to the list of names to translate, but don't do + * anything else. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) == 0 && + (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { + if ((ne = (name_entry_t *) kmem_alloc( + sizeof (name_entry_t), KM_SLEEP)) == NULL) { + return (-1); + } + bzero(ne, sizeof (name_entry_t)); + + if ((ne->ne_name = spa_strdup(path)) == NULL) { + kmem_free(ne, sizeof (name_entry_t)); + return (-1); + } + ne->ne_guid = vdev_guid; + ne->ne_order = order; + ne->ne_num_labels = num_labels; + ne->ne_next = pl->names; + pl->names = ne; + return (0); + } + + /* + * If we have a valid config but cannot read any of these fields, then + * it means we have a half-initialized label. In vdev_label_init() + * we write a label with txg == 0 so that we can identify the device + * in case the user refers to the same disk later on. If we fail to + * create the pool, we'll be left with a label in this state + * which should not be considered part of a valid pool. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, + &vdev_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(config); + return (0); + } + + /* + * First, see if we know about this pool. If not, then add it to the + * list of known pools. + */ + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + if (pe->pe_guid == pool_guid) + break; + } + + if (pe == NULL) { + if ((pe = (pool_entry_t *) kmem_alloc( + sizeof (pool_entry_t), KM_SLEEP)) == NULL) { + nvlist_free(config); + return (-1); + } + bzero(pe, sizeof (pool_entry_t)); + pe->pe_guid = pool_guid; + pe->pe_next = pl->pools; + pl->pools = pe; + } + + /* + * Second, see if we know about this toplevel vdev. Add it if its + * missing. 
+ */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + if (ve->ve_guid == top_guid) + break; + } + + if (ve == NULL) { + if ((ve = (vdev_entry_t *) kmem_alloc( + sizeof (vdev_entry_t), KM_SLEEP)) == NULL) { + nvlist_free(config); + return (-1); + } + bzero(ve, sizeof (vdev_entry_t)); + ve->ve_guid = top_guid; + ve->ve_next = pe->pe_vdevs; + pe->pe_vdevs = ve; + } + + /* + * Third, see if we have a config with a matching transaction group. If + * so, then we do nothing. Otherwise, add it to the list of known + * configs. + */ + for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { + if (ce->ce_txg == txg) + break; + } + + if (ce == NULL) { + if ((ce = (config_entry_t *) kmem_alloc( + sizeof (config_entry_t), KM_SLEEP)) == NULL) { + nvlist_free(config); + return (-1); + } + bzero(ce, sizeof (config_entry_t)); + ce->ce_txg = txg; + ce->ce_config = config; + ce->ce_next = ve->ve_configs; + ve->ve_configs = ce; + } else { + nvlist_free(config); + } + + /* + * At this point we've successfully added our config to the list of + * known configs. The last thing to do is add the vdev guid -> path + * mappings so that we can fix up the configuration as necessary before + * doing the import. + */ + if ((ne = (name_entry_t *) kmem_alloc( + sizeof (name_entry_t), KM_SLEEP)) == NULL) { + return (-1); + } + bzero(ne, sizeof (name_entry_t)); + + if ((ne->ne_name = spa_strdup(path)) == NULL) { + kmem_free(ne, sizeof (name_entry_t)); + return (-1); + } + + ne->ne_guid = vdev_guid; + ne->ne_order = order; + ne->ne_num_labels = num_labels; + ne->ne_next = pl->names; + pl->names = ne; + + return (0); +} + +/* + * libzfs_import used the libzfs handle and a zfs + * command to issue tryimport in-kernel via ioctl. + * This should leave config as-is, and return nvl. + * Since zfs_boot is already in-kernel, duplicate + * config into nvl, and call spa_tryimport on it. + */ +DSTATIC nvlist_t * +zfs_boot_refresh_config(nvlist_t *config) +{ + nvlist_t *nvl = 0; + + /* tryimport does not free config, and returns new nvl or null */ + nvl = spa_tryimport(config); + return (nvl); +} + +/* + * Determine if the vdev id is a hole in the namespace. + */ +DSTATIC boolean_t +zfs_boot_vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) +{ + int c; + + for (c = 0; c < holes; c++) { + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Convert our list of pools into the definitive set of configurations. We + * start by picking the best config for each toplevel vdev. Once that's done, + * we assemble the toplevel vdevs into a full config for the pool. We make a + * pass to fix up any incorrect paths, and then add it to the main list to + * return to the user. 
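As described above, the first step is picking, for each top-level vdev, the label copy written at the highest transaction group, since that copy carries the most current view of the vdev namespace. A minimal sketch of that selection (types invented for illustration, mirroring config_entry_t):

#include <stddef.h>
#include <stdint.h>

struct cfg {
	uint64_t txg;
	struct cfg *next;
};

/*
 * Among all label copies collected for one top-level vdev, return the
 * copy written at the highest transaction group.
 */
const struct cfg *
best_config(const struct cfg *list)
{
	const struct cfg *best = NULL;
	const struct cfg *c;

	for (c = list; c != NULL; c = c->next)
		if (best == NULL || c->txg > best->txg)
			best = c;
	return (best);
}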
+ */ +DSTATIC nvlist_t * +zfs_boot_get_configs(pool_list_t *pl, boolean_t active_ok) +{ + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; + boolean_t config_seen; + uint64_t best_txg; + char *name, *hostname = NULL; + uint64_t guid; + uint_t children = 0; + nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; + uint_t c; +#if 0 + boolean_t isactive; +#endif + uint64_t hostid; + nvlist_t *nvl; + boolean_t valid_top_config = B_FALSE; + + if (nvlist_alloc(&ret, 0, 0) != 0) + goto nomem; + + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + uint64_t id, max_txg = 0; + + if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + config_seen = B_FALSE; + + /* + * Iterate over all toplevel vdevs. Grab the pool configuration + * from the first one we find, and then go through the rest and + * add them as necessary to the 'vdevs' member of the config. + */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + + /* + * Determine the best configuration for this vdev by + * selecting the config with the latest transaction + * group. + */ + best_txg = 0; + for (ce = ve->ve_configs; ce != NULL; + ce = ce->ce_next) { + + if (ce->ce_txg > best_txg) { + tmp = ce->ce_config; + best_txg = ce->ce_txg; + } + } + + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. + */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + + if (!config_seen) { + /* + * Copy the relevant pieces of data to the pool + * configuration: + * + * version + * pool guid + * name + * pool txg (if available) + * comment (if available) + * pool state + * hostid (if available) + * hostname (if available) + */ + uint64_t state, version, pool_txg; + char *comment = NULL; + + version = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_VERSION, version); + guid = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid); + name = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name); + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_TXG, &pool_txg) == 0) + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_TXG, pool_txg); + + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMMENT, &comment) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment); + + state = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state); + + hostid = 0; + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid); + hostname = 
fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, hostname); + } + + config_seen = B_TRUE; + } + + /* + * Add this top-level vdev to the child array. + */ + verify(nvlist_lookup_nvlist(tmp, + ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); + verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, + &id) == 0); + + if (id >= children) { + nvlist_t **newchild; + + newchild = (nvlist_t **) kmem_alloc((id + 1) * + sizeof (nvlist_t *), KM_SLEEP); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + kmem_free(child, children * + sizeof (nvlist_t *)); + child = newchild; + children = id + 1; + } + if (nvlist_dup(nvtop, &child[id], 0) != 0) + goto nomem; + + } + + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. + */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = (nvlist_t **) kmem_alloc((max_id) * + sizeof (nvlist_t *), KM_SLEEP); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + kmem_free(child, children * + sizeof (nvlist_t *)); + child = newchild; + children = max_id; + } + } + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !zfs_boot_vdev_is_hole(hole_array, holes, + c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. + */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(holey); + goto nomem; + } + child[c] = holey; + } + } + + /* + * Look for any missing top-level vdevs. If this is the case, + * create a faked up 'missing' vdev as a placeholder. We cannot + * simply compress the child array, because the kernel performs + * certain checks to make sure the vdev IDs match their location + * in the configuration. + */ + for (c = 0; c < children; c++) { + if (child[c] == NULL) { + nvlist_t *missing; + if (nvlist_alloc(&missing, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + if (nvlist_add_string(missing, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(missing); + goto nomem; + } + child[c] = missing; + } + } + + /* + * Put all of this pool's top-level vdevs into a root vdev. 
+ */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + child, children) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + kmem_free(child, children * sizeof (nvlist_t *)); + children = 0; + child = NULL; + + /* + * Go through and fix up any paths and/or devids based on our + * known list of vdev GUID -> path mappings. + */ + if (zfs_boot_fix_paths(nvroot, pl->names) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + /* + * Add the root vdev to this pool's configuration. + */ + if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + nvroot) != 0) { + nvlist_free(nvroot); + goto nomem; + } + nvlist_free(nvroot); + + /* + * zdb uses this path to report on active pools that were + * imported or created using -R. + */ + if (active_ok) + goto add_pool; + +#if 0 +/* + * For root-pool import, no pools are active yet. + * Pool name and guid were looked up from the config and only used here. + * (Later we lookup the pool name for a separate test). + */ + /* + * Determine if this pool is currently active, in which case we + * can't actually import it. + */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + if (zfs_boot_pool_active(name, guid, &isactive) != 0) + goto error; + + if (isactive) { + nvlist_free(config); + config = NULL; + continue; + } +#endif + + if ((nvl = zfs_boot_refresh_config(config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } + + nvlist_free(config); + config = nvl; + + /* + * Go through and update the paths for spares, now that we have + * them. + */ + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + if (zfs_boot_fix_paths(spares[i], pl->names) != + 0) + goto nomem; + } + } + + /* + * Update the paths for l2cache devices. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + for (i = 0; i < nl2cache; i++) { + if (zfs_boot_fix_paths(l2cache[i], pl->names) != + 0) + goto nomem; + } + } + + /* + * Restore the original information read from the actual label. + */ + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, + DATA_TYPE_STRING); + if (hostid != 0) { + verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, + hostname) == 0); + } + +add_pool: + /* + * Add this pool to the list of configs. 
+ */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + if (nvlist_add_nvlist(ret, name, config) != 0) + goto nomem; + + nvlist_free(config); + config = NULL; + } + + return (ret); + +nomem: +#ifdef DEBUG + printf("zfs_boot_get_configs failed to allocate memory\n"); +#endif + if (config) nvlist_free(config); + if (ret) nvlist_free(ret); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + if (children > 0) { + kmem_free(child, children * sizeof (nvlist_t *)); + } + /* + * libzfs_import simply calls free(child), we need to + * pass kmem_free the size of the array. Array is + * allocated above as (children * sizeof nvlist_t*). + */ + + return (NULL); +} + +/* + * Return the offset of the given label. + */ +DSTATIC uint64_t +zfs_boot_label_offset(uint64_t size, int l) +{ + ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Given an IOMedia, read the label information and return an nvlist + * describing the configuration, if there is one. The number of valid + * labels found will be returned in num_labels when non-NULL. + */ +DSTATIC int +zfs_boot_read_label(IOService *zfs_hl, IOMedia *media, + nvlist_t **config, int *num_labels) +{ + IOMemoryDescriptor *buffer = NULL; + uint64_t mediaSize; + uint64_t nread = 0; + vdev_label_t *label; + nvlist_t *expected_config = NULL; + uint64_t expected_guid = 0, size, labelsize; + int l, count = 0; + IOReturn ret; + + *config = NULL; + + /* Verify IOMedia pointer and device size */ + if (!media || (mediaSize = media->getSize()) == 0) { + dprintf("%s couldn't get media or size\n", __func__); + return (-1); + } + + /* Determine vdev label size and aligned vdev size */ + labelsize = sizeof (vdev_label_t); + size = P2ALIGN_TYPED(mediaSize, labelsize, uint64_t); + + /* Allocate a buffer to read labels into */ + label = (vdev_label_t *) kmem_alloc(labelsize, KM_SLEEP); + if (!label) { + dprintf("%s couldn't allocate label for read\n", __func__); + return (-1); + } + + /* Allocate a memory descriptor with the label pointer */ + buffer = IOMemoryDescriptor::withAddress((void*)label, labelsize, + kIODirectionIn); + + /* Verify buffer was allocated */ + if (!buffer || (buffer->getLength() != labelsize)) { + dprintf("%s couldn't allocate buffer for read\n", __func__); + goto error; + } + + /* Open the device for reads */ + if (false == media->IOMedia::open(zfs_hl, 0, + kIOStorageAccessReader)) { + dprintf("%s media open failed\n", __func__); + goto error; + } + + /* Read all four vdev labels */ + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, txg; + + /* Zero the label buffer */ + bzero(label, labelsize); + + /* Prepare the buffer for IO */ + buffer->prepare(kIODirectionIn); + + /* Read a label from the specified offset */ + ret = media->IOMedia::read(zfs_hl, + zfs_boot_label_offset(size, l), + buffer, 0, &nread); + + /* Call the buffer completion */ + buffer->complete(); + + /* Skip failed reads, try next label */ + if (ret != kIOReturnSuccess) { + dprintf("%s media->read failed\n", __func__); + continue; + } + + /* Skip incomplete reads, try next label */ + if (nread < labelsize) { + dprintf("%s nread %llu / %llu\n", + __func__, nread, labelsize); + continue; + } + + /* Skip invalid labels that can't be unpacked */ + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) + continue; + + /* Verify GUID */ + if 
(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + dprintf("%s nvlist_lookup guid failed %llu\n", + __func__, guid); + nvlist_free(*config); + continue; + } + + /* Verify vdev state */ + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + dprintf("%s nvlist_lookup state failed %llu\n", + __func__, state); + nvlist_free(*config); + continue; + } + + /* Verify txg number */ + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + dprintf("%s nvlist_lookup txg failed %llu\n", + __func__, txg); + nvlist_free(*config); + continue; + } + + /* Increment count for first match, or if guid matches */ + if (expected_guid) { + if (expected_guid == guid) + count++; + + nvlist_free(*config); + } else { + expected_config = *config; + expected_guid = guid; + count++; + } + } + + /* Close IOMedia */ + media->close(zfs_hl); + + /* Copy out the config and number of labels */ + if (num_labels != NULL) + *num_labels = count; + + kmem_free(label, labelsize); + buffer->release(); + *config = expected_config; + + return (0); + + +error: + /* Clean up */ + if (buffer) { + buffer->release(); + buffer = 0; + } + if (label) { + kmem_free(label, labelsize); + label = 0; + } + + return (-1); +} + +DSTATIC bool +zfs_boot_probe_media(void* target, void* refCon, + IOService* newService, __unused IONotifier* notifier) +{ + IOMedia *media = 0; + OSObject *isLeaf = 0; + OSString *ospath = 0; + uint64_t mediaSize = 0; + pool_list_t *pools = (pool_list_t *) refCon; + + /* Verify pool list can be cast */ + if (!pools) { + dprintf("%s invalid refCon\n", __func__); + return (false); + } + /* Should never happen */ + if (!newService) { + printf("%s %s\n", "zfs_boot_probe_media", + "called with null newService"); + return (false); + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 1\n", __func__); + return (false); + } + + /* Validate pool name */ + if (!pools->pool_name || strlen(pools->pool_name) == 0) { + dprintf("%s no pool name specified\n", __func__); + return (false); + } + + /* Get the parent IOMedia device */ + media = OSDynamicCast(IOMedia, newService->getProvider()); + + if (!media) { + dprintf("%s couldn't be cast as IOMedia\n", + __func__); + return (false); + } + + isLeaf = media->getProperty(kIOMediaLeafKey); + if (!isLeaf) { + dprintf("%s skipping non-leaf\n", __func__); + goto out; + } + + mediaSize = media->getSize(); + if (mediaSize < SPA_MINDEVSIZE) { + dprintf("%s skipping device with size %llu\n", + __func__, mediaSize); + goto out; + } + + ospath = OSDynamicCast(OSString, media->getProperty( + kIOBSDNameKey, gIOServicePlane, + kIORegistryIterateRecursively)); + if (!ospath || (ospath->getLength() == 0)) { + dprintf("%s skipping device with no bsd disk node\n", + __func__); + goto out; + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 2\n", __func__); + goto out; + } + + + /* Take pool_list lock */ + mutex_enter(&pools->lock); + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 3\n", __func__); + /* Unlock the pool list lock */ + mutex_exit(&pools->lock); + goto out; + } + + /* Add this IOMedia to the disk set */ + pools->disks->setObject(media); + + /* Unlock the pool list lock */ + mutex_exit(&pools->lock); + + /* Wakeup zfs_boot_import_thread */ + cv_signal(&pools->cv); + +out: + media = 
0; + return (true); +} + +DSTATIC bool +zfs_boot_probe_disk(pool_list_t *pools, IOMedia *media) +{ + OSString *ospath, *uuid; + char *path = 0, *pname; + const char prefix[] = "/private/var/run/disk/by-id/media-"; + uint64_t this_guid; + int num_labels, err, len = 0; + nvlist_t *config; + boolean_t matched = B_FALSE; + + dprintf("%s: with %s media\n", __func__, + (media ? "valid" : "missing")); + ASSERT3U(media, !=, NULL); + + /* Verify pool list can be cast */ + if (!pools) { + dprintf("%s missing pool_list\n", __func__); + return (false); + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 1\n", __func__); + return (false); + } + + /* Validate pool name */ + if (!pools->pool_name || strlen(pools->pool_name) == 0) { + dprintf("%s no pool name specified\n", __func__); + return (false); + } + + /* Try to get a UUID from the media */ + uuid = OSDynamicCast(OSString, media->getProperty(kIOMediaUUIDKey)); + if (uuid && uuid->getLength() != 0) { + /* Allocate room for prefix, UUID, and null terminator */ + len = (strlen(prefix) + uuid->getLength()) + 1; + + path = (char *) kmem_alloc(len, KM_SLEEP); + if (!path) { + dprintf("%s couldn't allocate path\n", __func__); + return (false); + } + + snprintf(path, len, "%s%s", prefix, uuid->getCStringNoCopy()); + uuid = 0; + } else { + /* Get the BSD name as a C string */ + ospath = OSDynamicCast(OSString, media->getProperty( + kIOBSDNameKey, gIOServicePlane, + kIORegistryIterateRecursively)); + if (!ospath || (ospath->getLength() == 0)) { + dprintf("%s skipping device with no bsd disk node\n", + __func__); + return (false); + } + + /* Allocate room for "/dev/" + "diskNsN" + '\0' */ + len = (strlen("/dev/") + ospath->getLength() + 1); + path = (char *) kmem_alloc(len, KM_SLEEP); + if (!path) { + dprintf("%s couldn't allocate path\n", __func__); + return (false); + } + + /* "/dev/" is 5 characters, plus null character */ + snprintf(path, len, "/dev/%s", ospath->getCStringNoCopy()); + ospath = 0; + } + dprintf("%s path [%s]\n", __func__, (path ? path : "")); + + /* Read vdev labels, if any */ + err = zfs_boot_read_label(pools->zfs_hl, media, + &config, &num_labels); + + /* Skip disks with no labels */ + if (err != 0 || num_labels == 0 || !config) { + goto out; + } + + /* Lookup pool name */ + if (pools->pool_name != NULL && + (nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0)) { + /* Compare with pool_name */ + if (strlen(pools->pool_name) == strlen(pname) && + strncmp(pools->pool_name, pname, + strlen(pname)) == 0) { + printf("%s matched pool %s\n", + __func__, pname); + matched = B_TRUE; + } + /* Compare with pool_guid */ + } else if (pools->pool_guid != 0) { + matched = nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, + &this_guid) == 0 && + pools->pool_guid == this_guid; + } + + /* Skip non-matches */ + if (!matched) { + nvlist_free(config); + config = NULL; + goto out; + } + + /* + * Add this config to the pool list. + * Always assigns order 1 since all disks are + * referenced by /private/var/run/disk/by-id/ paths. 
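For reference, the label placement used by zfs_boot_read_label() and zfs_boot_label_offset() above puts labels 0 and 1 at the front of the device and labels 2 and 3 at the end of the label-aligned size. The stand-alone sketch below repeats the same arithmetic, assuming the usual 256 KiB on-disk label size; the 100 MiB device size is a made-up example.

#include <stdint.h>
#include <stdio.h>

#define	N_LABELS	4		/* VDEV_LABELS */
#define	LABEL_SIZE	(256ULL << 10)	/* sizeof (vdev_label_t), 256 KiB */

/* Same arithmetic as zfs_boot_label_offset() above. */
static uint64_t
label_offset(uint64_t aligned_size, int l)
{
	return (l * LABEL_SIZE + (l < N_LABELS / 2 ?
	    0 : aligned_size - N_LABELS * LABEL_SIZE));
}

int
main(void)
{
	/* A hypothetical 100 MiB vdev, already label-aligned. */
	uint64_t size = 100ULL << 20;
	int l;

	for (l = 0; l < N_LABELS; l++)
		printf("label %d at byte %llu\n", l,
		    (unsigned long long)label_offset(size, l));
	return (0);
}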
+ */ + dprintf("%s: add_config %s\n", __func__, path); + if (zfs_boot_add_config(pools, path, 1, + num_labels, config) != 0) { + printf("%s couldn't add config to pool list\n", + __func__); + } + +out: + /* Clean up */ + if (path && len > 0) { + kmem_free(path, len); + } + return (true); +} + +DSTATIC void +zfs_boot_free() +{ + pool_entry_t *pe, *penext; + vdev_entry_t *ve, *venext; + config_entry_t *ce, *cenext; + name_entry_t *ne, *nenext; + pool_list_t *pools = zfs_boot_pool_list; + + /* Verify pool list can be cast */ + if (!pools) { + dprintf("%s: no pool_list to clear\n", __func__); + return; + } + + /* Clear global ptr */ + zfs_boot_pool_list = 0; + + pools->terminating = ZFS_BOOT_TERMINATING; + + /* Remove IONotifier (waits for tasks to complete) */ + if (pools->notifier) { + pools->notifier->remove(); + pools->notifier = 0; + } + + /* Release the lock */ + mutex_destroy(&pools->lock); + + /* Release the disk set */ + if (pools->disks) { + pools->disks->flushCollection(); + pools->disks->release(); + pools->disks = 0; + } + + /* Clear the zfs IOService handle */ + if (pools->zfs_hl) { + pools->zfs_hl = 0; + } + + /* Free the pool_name string */ + if (pools->pool_name) { + kmem_free(pools->pool_name, strlen(pools->pool_name) + 1); + pools->pool_name = 0; + } + + /* Clear the pool config list */ + for (pe = pools->pools; pe != NULL; pe = penext) { + /* Clear the vdev list */ + penext = pe->pe_next; + for (ve = pe->pe_vdevs; ve != NULL; ve = venext) { + /* Clear the vdev config list */ + venext = ve->ve_next; + for (ce = ve->ve_configs; ce != NULL; ce = cenext) { + cenext = ce->ce_next; + if (ce->ce_config) + nvlist_free(ce->ce_config); + kmem_free(ce, sizeof (config_entry_t)); + } + kmem_free(ve, sizeof (vdev_entry_t)); + } + kmem_free(pe, sizeof (pool_entry_t)); + } + pools->pools = 0; + + /* Clear the vdev name list */ + for (ne = pools->names; ne != NULL; ne = nenext) { + nenext = ne->ne_next; + if (ne->ne_name) + spa_strfree(ne->ne_name); + kmem_free(ne, sizeof (name_entry_t)); + } + pools->names = 0; + + /* Finally, free the pool list struct */ + kmem_free(pools, sizeof (pool_list_t)); + pools = 0; +} + +void +zfs_boot_fini() +{ + pool_list_t *pools = zfs_boot_pool_list; + + if (!pools) { + printf("%s no pool_list to clear\n", __func__); + return; + } + + /* Set terminating flag */ + if (false == OSCompareAndSwap64(ZFS_BOOT_ACTIVE, + ZFS_BOOT_TERMINATING, &(pools->terminating))) { + printf("%s already terminating? 
%llu\n", + __func__, pools->terminating); + } + + /* Wakeup zfs_boot_import_thread */ + cv_signal(&pools->cv); + + /* Clean up */ + pools = 0; +} + +#define kBootUUIDKey "boot-uuid" +#define kBootUUIDMediaKey "boot-uuid-media" + +DSTATIC int +zfs_boot_publish_bootfs(IOService *zfs_hl, pool_list_t *pools) +{ + ZFSDataset *dataset = NULL; + IOMedia *media; + IOService *resourceService = NULL; + OSDictionary *properties = NULL; + spa_t *spa = NULL; + char *zfs_bootfs = NULL; + uint64_t bootfs = 0ULL; + int error, len = ZFS_MAX_DATASET_NAME_LEN; + + dprintf("%s\n", __func__); + if (!zfs_hl || !pools) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + +#if 0 + ZFSPool *pool_proxy = NULL; + if (bootdev) { + dprintf("%s bootdev already set\n", __func__); + return (EBUSY); + } +#endif + + zfs_bootfs = (char *)kmem_alloc(len, KM_SLEEP); + if (!zfs_bootfs) { + printf("%s string alloc failed\n", __func__); + return (ENOMEM); + } + zfs_bootfs[0] = '\0'; + + mutex_enter(&spa_namespace_lock); + spa = spa_next(NULL); + if (spa) { + bootfs = spa_bootfs(spa); + } + if (bootfs == 0) { + mutex_exit(&spa_namespace_lock); + dprintf("%s no bootfs, nothing to do\n", __func__); + kmem_free(zfs_bootfs, len); + return (0); + } + +#if 0 + /* Get pool proxy */ + if (!spa->spa_iokit_proxy || + (pool_proxy = spa->spa_iokit_proxy->proxy) == NULL) { + mutex_exit(&spa_namespace_lock); + dprintf("%s no spa_pool_proxy\n", __func__); + kmem_free(zfs_bootfs, len); + return (0); + } +#endif + + error = dsl_dsobj_to_dsname(spa_name(spa), + spa_bootfs(spa), zfs_bootfs); + mutex_exit(&spa_namespace_lock); + + if (error != 0) { + dprintf("%s bootfs to name failed\n", __func__); + kmem_free(zfs_bootfs, len); + return (ENODEV); + } + + printf("%s: publishing bootfs [%s]\n", __func__, zfs_bootfs); + + /* Create prop dict for the proxy, with 6 or more keys */ + if ((properties = OSDictionary::withCapacity(6)) == NULL) { + dprintf("%s prop dict allocation failed\n", __func__); + kmem_free(zfs_bootfs, len); + return (ENOMEM); + } + + /* Set Content Hint and Content */ + do { + const OSSymbol *partUUID; + + /* ZFS (BF01) partition type */ + if ((partUUID = OSSymbol::withCString( + "6A898CC3-1DD2-11B2-99A6-080020736631")) == NULL) { + dprintf("%s couldn't make partUUID\n", __func__); + break; + // kmem_free(zfs_bootfs, len); + // return (ENOMEM); + } + + /* Assign ZFS partiton UUID to both */ + if (properties->setObject(kIOMediaContentKey, + partUUID) == false || + properties->setObject(kIOMediaContentHintKey, + partUUID) == false) { + dprintf("%s content hint failed\n", __func__); + // kmem_free(zfs_bootfs, len); + // return (ENOMEM); + } + partUUID->release(); + } while (0); + + /* XXX Set dataset name, rdonly, and UUID */ + do { + OSString *nameStr; + OSString *uuidStr; + char uuid_cstr[UUID_PRINTABLE_STRING_LENGTH]; + uuid_t uuid; + + bzero(uuid, sizeof (uuid_t)); + bzero(uuid_cstr, UUID_PRINTABLE_STRING_LENGTH); + + zfs_vfs_uuid_gen(zfs_bootfs, uuid); + zfs_vfs_uuid_unparse(uuid, uuid_cstr); + + nameStr = OSString::withCString(zfs_bootfs); + uuidStr = OSString::withCString(uuid_cstr); + + if (properties->setObject(ZFS_BOOT_DATASET_NAME_KEY, + nameStr) == false || + properties->setObject(ZFS_BOOT_DATASET_UUID_KEY, + uuidStr) == false || + properties->setObject(ZFS_BOOT_DATASET_RDONLY_KEY, + kOSBooleanFalse) == false) { + dprintf("ZFSBootDevice::%s couldn't setup" + "property dict\n", __func__); + nameStr->release(); + uuidStr->release(); + kmem_free(zfs_bootfs, len); + return (ENOMEM); + } + nameStr->release(); + 
uuidStr->release(); + } while (0); + + /* Create proxy device */ + error = zfs_osx_proxy_create(zfs_bootfs); + if (error == 0) { + dataset = zfs_osx_proxy_get(zfs_bootfs); + } + /* Done with this string */ + kmem_free(zfs_bootfs, len); + zfs_bootfs = 0; + + if (!dataset) { + printf("%s: couldn't create proxy device\n", + __func__); + return (ENXIO); + } + + media = OSDynamicCast(IOMedia, dataset); + if (!media) { + printf("%s: couldn't cast proxy media\n", + __func__); + dataset->release(); + return (ENXIO); + } + +#if 0 + bootdev = new ZFSBootDevice; + + if (!bootdev) { + printf("%s: couldn't create boot device\n", __func__); + return (ENOMEM); + } + + if (bootdev->init(properties) == false) { + printf("%s init failed\n", __func__); + properties->release(); + bootdev->release(); + bootdev = 0; + return (ENXIO); + } + properties->release(); + properties = 0; + + if (bootdev->attach(pool_proxy) == false) { + printf("%s attach failed\n", __func__); + bootdev->release(); + bootdev = 0; + return (ENXIO); + } + + /* Technically should start but this doesn't do much */ + if (bootdev->start(pool_proxy) == false) { + printf("%s start failed\n", __func__); + bootdev->detach(pool_proxy); + bootdev->release(); + bootdev = 0; + return (ENXIO); + } + + /* Get matching started */ + bootdev->registerService(kIOServiceAsynchronous); + // bootdev->registerService(kIOServiceSynchronous); + + do { + if (bootdev->getClient() != 0) { + media = OSDynamicCast(IOMedia, + bootdev->getClient()->getClient()); + if (media) { + media->retain(); + break; + } + } + + /* Sleep until media is available */ + /* + * XXX Should use waitForServiceMatching or IONotifier + */ + IOSleep(200); + } while (!media); + + if (!media) { + /* XXX currently unreachable */ + printf("%s couldn't get bootdev media\n", __func__); + return (ENXIO); + } +#endif + + resourceService = IOService::getResourceService(); + if (!resourceService) { + dprintf("%s missing resource service\n", __func__); + /* Handle error */ + media->release(); + return (ENXIO); + } + +#if 1 + /* XXX publish an IOMedia as the BootUUIDMedia resource */ + /* uses same method as AppleFileSystemDriver */ + + /* Set IOMedia UUID */ + /* XXX skip (moved get uuid below) */ + // media->setProperty(kIOMediaUUIDKey, uuid); + /* Publish this IOMedia as the boot-uuid-media */ + IOService::publishResource(kBootUUIDMediaKey, media); + + /* Drop retain from earlier */ + media->release(); + /* Remove boot-uuid key so AppleFileSystem stops matching */ + resourceService->removeProperty(kBootUUIDKey); +#else + OSString *uuid = 0; + /* Get the current boot-uuid string */ + uuid = OSDynamicCast(OSString, + resourceService->getProperty(kBootUUIDKey, gIOServicePlane)); + if (!uuid) { + dprintf("%s missing boot-uuid IOResource\n", __func__); + /* Handle error */ + return (ENXIO); + } + printf("%s: got boot-uuid %s\n", __func__, uuid->getCStringNoCopy()); + + /* XXX Or use below and let AppleFileSystemDriver match it */ + /* Leaves the Apple_Boot content hint (at least for now) */ + media->setProperty(kIOMediaContentHintKey, "Apple_Boot"); + media->setProperty(kIOMediaUUIDKey, uuid); + /* Register for notifications (not matching) */ + media->registerService(kIOServiceAsynchronous); + /* Drop retain from earlier */ + media->release(); +#endif + + printf("%s done\n", __func__); + return (0); +} + +DSTATIC void +zfs_boot_import_thread(void *arg) +{ + nvlist_t *configs, *nv, *newnv; + nvpair_t *elem; + IOService *zfs_hl = 0; + OSSet *disks, *new_set = 0; + OSCollectionIterator *iter = 0; + OSObject 
*next; + IOMedia *media; + pool_list_t *pools = (pool_list_t *)arg; + uint64_t pool_state; + boolean_t pool_imported = B_FALSE; + int error = EINVAL; + + /* Verify pool list coult be cast */ + ASSERT3U(pools, !=, 0); + if (!pools) { + printf("%s %p %s\n", "zfs_boot_import_thread", + arg, "couldn't be cast as pool_list_t*"); + return; + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 1\n", __func__); + goto out_unlocked; + } + + new_set = OSSet::withCapacity(1); + /* To swap with pools->disks while locked */ + if (!new_set) { + dprintf("%s couldn't allocate new_set\n", __func__); + goto out_unlocked; + } + + /* Take pool list lock */ + mutex_enter(&pools->lock); + + zfs_hl = pools->zfs_hl; + + /* Check for work, then sleep on the lock */ + do { + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 2\n", __func__); + goto out_locked; + } + + /* Check for work */ + if (pools->disks->getCount() == 0) { + dprintf("%s no disks to check\n", __func__); + goto next_locked; + } + + /* Swap full set with a new empty one */ + ASSERT3U(new_set, !=, 0); + disks = pools->disks; + pools->disks = new_set; + new_set = 0; + + /* Release pool list lock */ + mutex_exit(&pools->lock); + + /* Create an iterator over the objects in the set */ + iter = OSCollectionIterator::withCollection(disks); + + /* couldn't be initialized */ + if (!iter) { + dprintf("%s %s %d %s\n", "zfs_boot_import_thread", + "couldn't get iterator from collection", + disks->getCount(), "disks skipped"); + + /* Merge disks back into pools->disks */ + mutex_enter(&pools->lock); + pools->disks->merge(disks); + mutex_exit(&pools->lock); + + /* Swap 'disks' back to new_set */ + disks->flushCollection(); + new_set = disks; + disks = 0; + + continue; + } + + /* Iterate over all disks */ + while ((next = iter->getNextObject()) != NULL) { + /* Cast each IOMedia object */ + media = OSDynamicCast(IOMedia, next); + + if (!iter->isValid()) { + /* Oh gosh, need to start over */ + iter->reset(); + continue; + } + + if (!media) { + dprintf("%s couldn't cast IOMedia\n", + __func__); + continue; + } + + /* Check this IOMedia device for a vdev label */ + if (!zfs_boot_probe_disk(pools, media)) { + dprintf("%s couldn't probe disk\n", + __func__); + continue; + } + } + + /* Clean up */ + media = 0; + iter->release(); + iter = 0; + + /* Swap 'disks' back to new_set */ + disks->flushCollection(); + new_set = disks; + disks = 0; + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 3\n", __func__); + goto out_unlocked; + } + + mutex_enter(&pools->lock); + /* Check for work */ + if (pools->disks->getCount() != 0) { + dprintf("%s more disks available, looping\n", __func__); + continue; + } + /* Release pool list lock */ + mutex_exit(&pools->lock); + + /* Generate a list of pool configs to import */ + configs = zfs_boot_get_configs(pools, B_TRUE); + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 4\n", __func__); + goto out_unlocked; + } + + /* Iterate over the nvlists (stored as nvpairs in nvlist) */ + elem = NULL; + while ((elem = nvlist_next_nvpair(configs, + elem)) != NULL) { + /* Cast the nvpair back to nvlist */ + nv = NULL; + verify(nvpair_value_nvlist(elem, &nv) == 0); + + /* Check vdev state */ + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + if (pool_state == POOL_STATE_DESTROYED) { + dprintf("%s skipping destroyed pool\n", + __func__); + continue; + 
} + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 5\n", __func__); + goto out_unlocked; + } + + /* Try import */ + newnv = spa_tryimport(nv); + nvlist_free(nv); + nv = 0; + if (newnv) { + dprintf("%s newnv: %p\n", __func__, newnv); + + /* Stop probing disks */ + if (pools->notifier) + pools->notifier->disable(); + + /* Do import */ + pool_imported = (spa_import(pools->pool_name, + newnv, 0, 0) == 0); + nvlist_free(newnv); + newnv = 0; + // pool_imported = spa_import_rootpool(nv); + } else { + dprintf("%s no newnv returned\n", __func__); + } + + dprintf("%s spa_import returned %d\n", __func__, + pool_imported); + + if (pool_imported) { + /* Get bootfs and publish IOMedia */ + error = zfs_boot_publish_bootfs(zfs_hl, pools); + if (error != 0) { + dprintf("%s publish bootfs error %d\n", + __func__, error); + } + + goto out_unlocked; + } else { + /* Resume notifications */ + if (pools->notifier) + pools->notifier->enable(true); + } + } + + /* Retake pool list lock */ + mutex_enter(&pools->lock); + +next_locked: + /* Check for work */ + if (pools->disks->getCount() != 0) { + continue; + } + + /* Abort early */ + if (pools->terminating != ZFS_BOOT_ACTIVE) { + dprintf("%s terminating 6\n", __func__); + goto out_locked; + } + + dprintf("%s sleeping on lock\n", __func__); + /* Sleep on lock, thread is resumed with lock held */ + cv_timedwait_sig(&pools->cv, &pools->lock, + ddi_get_lbolt() + hz); + + /* Loop forever */ + } while (true); + +out_locked: + /* Unlock pool list lock */ + mutex_exit(&pools->lock); + +out_unlocked: + /* Cleanup new_set */ + if (new_set) { + new_set->flushCollection(); + new_set->release(); + new_set = 0; + } + + /* Teardown pool list, lock, etc */ + zfs_boot_free(); + + return; /* taskq_dispatch */ +#if 0 + thread_exit(); /* thread_create */ +#endif +} + +DSTATIC bool +zfs_boot_check_mountroot(char **pool_name, uint64_t *pool_guid) +{ + /* + * Check if the kext is loading during early boot + * and/or check if root is mounted (IORegistry?) + * Use PE Boot Args to determine the root pool name. + */ + char *zfs_boot; + char *split; + uint64_t len; + bool result = false; + uint64_t uptime = 0; + + + if (!pool_name || !pool_guid) { + dprintf("%s %s\n", __func__, + "invalid pool_name or pool_guid ptr"); + return (false); + } + + /* XXX Ugly hack to determine if this is early boot */ + /* + * Could just check if boot-uuid (or rd= or rootdev=) + * are set, and abort otherwise + * IOResource "boot-uuid" only published before root is + * mounted, or "boot-uuid-media" once discovered + */ + clock_get_uptime(&uptime); /* uptime since boot in nanoseconds */ + dprintf("%s uptime: %llu\n", __func__, uptime); + + /* 3 billion nanoseconds ~= 3 seconds */ + // if (uptime >= 3LLU<<30) { + /* 60 billion nanoseconds ~= 60 seconds */ + if (uptime >= 7LLU<<33) { + dprintf("%s %s\n", __func__, "Already booted"); + /* + * Start the getrootdir() from working, the vfs_start() call + * isn't called until first mount, which is too late for + * spa_async_dispatch(). 
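The shifted constants used for the uptime cut-off above are easier to sanity-check in isolation: 3ULL<<30 is 3,221,225,472 (about 3.2 seconds in nanoseconds) and 7ULL<<33 is 60,129,542,144 (about 60 seconds). A few purely illustrative lines of C:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* The cut-offs used above, spelled out in nanoseconds. */
	uint64_t three_seconds = 3ULL << 30;	/* 3,221,225,472 ns ~ 3.2 s */
	uint64_t sixty_seconds = 7ULL << 33;	/* 60,129,542,144 ns ~ 60.1 s */

	printf("%llu %llu\n", (unsigned long long)three_seconds,
	    (unsigned long long)sixty_seconds);
	return (0);
}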
+ */ + return (false); + } else { + dprintf("%s %s\n", __func__, "Boot time"); + } + + zfs_boot = (char *) kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (!zfs_boot) { + dprintf("%s couldn't allocate zfs_boot\n", __func__); + return (false); + } + + result = PE_parse_boot_argn("zfs_boot", zfs_boot, + ZFS_MAX_DATASET_NAME_LEN); + // dprintf( "Raw zfs_boot: [%llu] {%s}\n", + // (uint64_t)strlen(zfs_boot), zfs_boot); + + result = (result && (zfs_boot != 0) && strlen(zfs_boot) > 0); + + if (!result) { + result = PE_parse_boot_argn("rd", zfs_boot, + MAXPATHLEN); + result = (result && (zfs_boot != 0) && + strlen(zfs_boot) > 0 && + strncmp(zfs_boot, "zfs:", 4)); + // dprintf("Raw rd: [%llu] {%s}\n", + // (uint64_t)strlen(zfs_boot), zfs_boot ); + } + if (!result) { + result = PE_parse_boot_argn("rootdev", zfs_boot, + MAXPATHLEN); + result = (result && (zfs_boot != 0) && + strlen(zfs_boot) > 0 && + strncmp(zfs_boot, "zfs:", 4)); + // dprintf("Raw rootdev: [%llu] {%s}\n", + // (uint64_t)strlen(zfs_boot), zfs_boot ); + } + + /* + * XXX To Do - parse zpool_guid boot arg + */ + *pool_guid = 0; + + if (result) { + /* Check for first slash in zfs_boot */ + split = strchr(zfs_boot, '/'); + if (split) { + /* copy pool name up to first slash */ + len = (split - zfs_boot); + } else { + /* or copy whole string */ + len = strlen(zfs_boot); + } + + *pool_name = (char *) kmem_alloc(len+1, KM_SLEEP); + snprintf(*pool_name, len+1, "%s", zfs_boot); + + dprintf("Got zfs_boot: [%llu] {%s}->{%s}\n", + *pool_guid, zfs_boot, *pool_name); + } else { + dprintf("%s\n", "No zfs_boot\n"); + pool_name = 0; + } + + kmem_free(zfs_boot, ZFS_MAX_DATASET_NAME_LEN); + zfs_boot = 0; + return (result); +} + +bool +zfs_boot_init(IOService *zfs_hl) +{ + IONotifier *notifier = 0; + pool_list_t *pools = 0; + char *pool_name = 0; + uint64_t pool_guid = 0; + + zfs_boot_pool_list = 0; + + if (!zfs_hl) { + dprintf("%s: No zfs_hl provided\n", __func__); + return (false); + } + + if (!zfs_boot_check_mountroot(&pool_name, &pool_guid) || + (!pool_name && pool_guid == 0)) { + /* + * kext is not being loaded during early-boot, + * or no pool is specified for import. 
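/*
 * Illustrative sketch (not part of the patch): the boot-argument handling
 * above copies everything up to the first '/' as the pool name, so a
 * zfs_boot value of "rpool/ROOT/default" yields "rpool".  The same string
 * handling as a small userspace function, assuming only the standard C
 * library; the function name is illustrative.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
pool_name_from_bootarg(const char *zfs_boot)
{
	const char *split = strchr(zfs_boot, '/');
	size_t len = split ? (size_t)(split - zfs_boot) : strlen(zfs_boot);
	char *pool = malloc(len + 1);

	if (pool == NULL)
		return (NULL);
	memcpy(pool, zfs_boot, len);
	pool[len] = '\0';
	return (pool);
}

int
main(void)
{
	char *name = pool_name_from_bootarg("rpool/ROOT/default");

	printf("pool: %s\n", name);	/* prints "pool: rpool" */
	free(name);
	return (0);
}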
+ */ + dprintf("%s: check failed\n", __func__); + return (true); + } + + pools = (pool_list_t *) kmem_alloc(sizeof (pool_list_t), + KM_SLEEP); + if (!pools) { + goto error; + } + bzero(pools, sizeof (pool_list_t)); + + if ((pools->disks = OSSet::withCapacity( + ZFS_BOOT_PREALLOC_SET)) == NULL) { + /* Fail if memory couldn't be allocated */ + goto error; + } + pools->terminating = ZFS_BOOT_ACTIVE; + pools->pools = 0; + pools->names = 0; + pools->pool_guid = pool_guid; + pools->pool_name = pool_name; + pools->zfs_hl = zfs_hl; + + notifier = IOService::addMatchingNotification( + gIOFirstPublishNotification, IOService::serviceMatching( + "IOMediaBSDClient"), zfs_boot_probe_media, + zfs_hl, pools, 0); + + if (!notifier) { + /* Fail if memory couldn't be allocated */ + goto error; + } + pools->notifier = notifier; + + mutex_init(&pools->lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pools->cv, NULL, CV_DEFAULT, NULL); + + /* Finally, start the import thread */ + taskq_dispatch(system_taskq, zfs_boot_import_thread, + (void*)pools, TQ_SLEEP); +#if 0 +/* Alternate method of scheduling the import thread */ + (void) thread_create(NULL, 0, zfs_boot_import_thread, + pools, 0, &p0, + TS_RUN, minclsyspri); +#endif + + zfs_boot_pool_list = pools; + + return (true); + +error: + if (pools) { + if (pools->disks) { + pools->disks->flushCollection(); + pools->disks->release(); + pools->disks = 0; + } + kmem_free(pools, sizeof (pool_list_t)); + pools = 0; + } + return (false); +} + +/* Include these functions in all builds */ + +/* + * zfs_boot_update_bootinfo_vdev_leaf + * Inputs: spa: valid pool spa pointer. vd: valid vdev pointer. + * Return: 0 on success, positive integer errno on failure. + * Callers: zfs_boot_update_bootinfo_vdev + * + * called by bootinfo_vdev with each leaf vdev. 
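/*
 * Illustrative sketch (not part of the patch): zfs_boot_init() above uses
 * the common goto-error cleanup idiom, where each allocation is checked and
 * the error label releases only what was successfully set up.  A compact
 * standalone example of the same idiom; all names are illustrative.
 */
#include <stdlib.h>
#include <string.h>

struct pool_list_model {
	char *name;
	void *disks;
};

static struct pool_list_model *
pool_list_create(const char *name)
{
	struct pool_list_model *p = calloc(1, sizeof (*p));

	if (p == NULL)
		goto error;
	p->name = strdup(name);
	if (p->name == NULL)
		goto error;
	p->disks = calloc(16, sizeof (void *));
	if (p->disks == NULL)
		goto error;
	return (p);

error:
	if (p != NULL) {
		free(p->name);		/* free(NULL) is a no-op */
		free(p);
	}
	return (NULL);
}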
+ */ +DSTATIC int +zfs_boot_update_bootinfo_vdev_leaf(OSArray *array, vdev_t *vd) +{ + OSDictionary *dict; + OSString *dev_str; + OSNumber *dev_size; + vdev_disk_t *dvd; + struct io_bootinfo *info; + int error; + + /* Validate inputs */ + if (!array || !vd) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* Should be called with leaf vdev */ + if (!vd->vdev_ops->vdev_op_leaf) { + dprintf("%s not a leaf vdev\n", __func__); + return (EINVAL); + } + + /* Skip hole vdevs */ + if (vd->vdev_ishole) { + dprintf("%s skipping hole in namespace\n", __func__); + return (0); + } + + /* No info available if missing */ + if (strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_MISSING) == 0) { + dprintf("%s skipping missing vdev\n", __func__); + return (0); + } + + /* Must be a disk, not a file */ + if (strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { + dprintf("%s skipping non-disk vdev\n", __func__); + return (0); + } + + /* Skip obviously non-bootable vdevs */ + if (vd->vdev_islog || + vd->vdev_isl2cache || vd->vdev_isspare) { + dprintf("%s skipping non-bootable\n", __func__); + return (0); + } + + /* Get vdev type-specific data */ + dvd = (vdev_disk_t *)vd->vdev_tsd; + if (!dvd || !dvd->vd_lh) { + dprintf("%s missing dvd or ldi handle\n", __func__); + return (0); + } + + /* Allocate an ldi io_bootinfo struct */ + info = (struct io_bootinfo *)kmem_alloc( + sizeof (struct io_bootinfo), KM_SLEEP); + if (!info) { + dprintf("%s info alloc failed\n", __func__); + return (ENOMEM); + } + bzero(info, sizeof (struct io_bootinfo)); + + /* Ask the vdev handle to fill in the info */ + error = ldi_ioctl(dvd->vd_lh, DKIOCGETBOOTINFO, + (intptr_t)info, 0, 0, 0); + if (error != 0) { + dprintf("%s ioctl error %d\n", __func__, error); + kmem_free(info, sizeof (struct io_bootinfo)); + return (EIO); + } + + /* Allocate dictionary to hold the keys */ + if ((dict = OSDictionary::withCapacity(2)) == NULL) { + dprintf("%s dictionary alloc failed\n", __func__); + kmem_free(info, sizeof (struct io_bootinfo)); + return (ENOMEM); + } + + /* Keys are path (string) and size (number) */ + dev_str = OSString::withCString(info->dev_path); + dev_size = OSNumber::withNumber(info->dev_size, + (8 * sizeof (info->dev_size))); + kmem_free(info, sizeof (struct io_bootinfo)); + info = 0; + + /* Add keys to dictionary or bail */ + if (!dev_str || !dev_size || + dict->setObject(kIOBootDevicePathKey, + dev_str) == false || + dict->setObject(kIOBootDeviceSizeKey, + dev_size) == false) { + dprintf("%s dictionary setup failed\n", __func__); + if (dev_str) dev_str->release(); + if (dev_size) dev_size->release(); + dict->release(); + dict = 0; + return (ENOMEM); + } + dev_str->release(); + dev_str = 0; + dev_size->release(); + dev_size = 0; + + /* Add dict to array */ + if (array->setObject(dict) == false) { + dprintf("%s couldn't set bootinfo\n", __func__); + dict->release(); + dict = 0; + return (ENOMEM); + } + dict->release(); + dict = 0; + + return (0); +} + +/* + * zfs_boot_update_bootinfo_vdev + * Inputs: spa: valid pool spa pointer. vd: valid vdev pointer. + * Return: 0 on success, positive integer errno on failure. + * Callers: zfs_boot_update_bootinfo + * + * called by bootinfo with root vdev, and recursively calls + * itself while iterating over children (vdevs only have a + * few levels of nesting at most). 
+ */ +DSTATIC int +zfs_boot_update_bootinfo_vdev(OSArray *array, vdev_t *vd) +{ + int c, error; + + /* Validate inputs */ + if (!array || !vd) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* Skip obviously non-bootable vdevs */ + if (vd->vdev_islog || + vd->vdev_isl2cache || vd->vdev_isspare) { + dprintf("%s skipping non-bootable\n", __func__); + return (0); + } + + /* Process leaf vdevs */ + if (vd->vdev_ops->vdev_op_leaf) { + error = zfs_boot_update_bootinfo_vdev_leaf(array, vd); + if (error) + dprintf("%s bootinfo_vdev_leaf error %d\n", + __func__, error); + return (error); + } + + /* Iterate over child vdevs */ + for (c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] == NULL) { + dprintf("%s hole in vdev namespace\n", __func__); + continue; + } + + /* Recursion */ + error = zfs_boot_update_bootinfo_vdev(array, + vd->vdev_child[c]); + if (error != 0) { + dprintf("%s bootinfo_vdev_leaf error %d\n", + __func__, error); + return (error); + } + } + + return (0); +} + +extern "C" { + +/* + * zfs_boot_update_bootinfo + * Inputs: spa: valid pool spa pointer. + * Return: 0 on success, positive integer errno on failure. + * Callers: spa_open_common, spa_vdev_add, spa_vdev_remove, + * spa_vdev_attach, spa_vdev_detach. + * + * Called from spa.c on changes to the vdev layout. This + * information is assigned to the pool proxy so all zvols + * and datasets will retrieve the property through IOKit + * since it is retrieved via recursion. + * (see bless-105/Misc/BLCreateBooterInformationDictionary.c). + * If IOBootDevice property is needed for each dataset and + * zvol, we can revisit this and assign/update on all of + * these (already implemented a prototype that worked fine). + * + * Note: bootinfo is only collected for data vdevs. + * XXX We only want boot helpers there, unless there is a + * compelling argument for log, cache, or spares having + * boot helpers. + */ +int +zfs_boot_update_bootinfo(spa_t *spa) +{ + ZFSPool *pool_proxy; + OSArray *array; + int error; + + if (!spa) { + dprintf("%s missing spa\n", __func__); + return (EINVAL); + } + + /* XXX Could count vdevs first? 
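/*
 * Illustrative sketch (not part of the patch): zfs_boot_update_bootinfo_vdev()
 * above is a depth-first walk that hands leaf vdevs to the leaf handler and
 * recurses over vdev_child[] otherwise, skipping holes and stopping on the
 * first error.  The same traversal shape over a generic tree node, assuming
 * only the standard C library; names are illustrative.
 */
#include <stddef.h>

struct tnode {
	int is_leaf;
	struct tnode **child;
	size_t children;
};

static int
walk_leaves(struct tnode *n, int (*leaf_cb)(struct tnode *))
{
	size_t c;
	int error;

	if (n == NULL)
		return (0);
	if (n->is_leaf)
		return (leaf_cb(n));
	for (c = 0; c < n->children; c++) {
		if (n->child[c] == NULL)
			continue;	/* hole in the namespace */
		if ((error = walk_leaves(n->child[c], leaf_cb)) != 0)
			return (error);	/* first failure stops the walk */
	}
	return (0);
}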
*/ + if ((array = OSArray::withCapacity(1)) == NULL) { + dprintf("%s allocation failed\n", __func__); + return (ENOMEM); + } + + /* Grab necessary locks */ + mutex_enter(&spa_namespace_lock); + spa_open_ref(spa, FTAG); + + /* Get pool proxy */ + if (!spa->spa_iokit_proxy || + (pool_proxy = spa->spa_iokit_proxy->proxy) == NULL) { + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + dprintf("%s no spa_pool_proxy\n", __func__); + return (0); + } + /* Avoid it disappearing from under us */ + pool_proxy->retain(); + + /* Don't need to hold this throughout */ + mutex_exit(&spa_namespace_lock); + + /* vdev state lock only requires an spa open ref */ + spa_vdev_state_enter(spa, SCL_NONE); + + /* Iterate over all vdevs */ + if ((error = zfs_boot_update_bootinfo_vdev(array, + spa->spa_root_vdev)) != 0) { + dprintf("%s bootinfo_vdev error %d\n", + __func__, error); + + /* Drop locks */ + (void) spa_vdev_state_exit(spa, NULL, 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + array->release(); + pool_proxy->release(); + return (error); + } + + /* Release locks, passing NULL vd (no change) */ + error = spa_vdev_state_exit(spa, NULL, 0); + if (error != 0) { + dprintf("%s spa_vdev_state_exit error %d\n", + __func__, error); + } + + /* setProperty adds a retain */ + pool_proxy->setProperty(kIOBootDeviceKey, array); + pool_proxy->release(); + array->release(); + + /* Drop locks */ + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + return (0); +} + +} /* extern "C" */ + +#if 0 +#ifdef ZFS_BOOT +/* Remainder only needed for boot */ + +#define DPRINTF_FUNC() dprintf("%s\n", __func__) + +#pragma mark - ZFSBootDevice + +OSDefineMetaClassAndStructors(ZFSBootDevice, IOBlockStorageDevice); +char ZFSBootDevice::vendorString[4] = "ZFS"; +char ZFSBootDevice::revisionString[4] = "0.1"; +char ZFSBootDevice::infoString[12] = "ZFS dataset"; + +#if 0 +int +zfs_boot_get_path(char *path, int len) +{ + OSString *disk = 0; + + if (!path || len == 0) { + dprintf("%s: invalid argument\n", __func__); + return (-1); + } + + if (bootdev) { + disk = OSDynamicCast(OSString, + bootdev->getProperty(kIOBSDNameKey, gIOServicePlane, + kIORegistryIterateRecursively)); + } + + if (disk) { + snprintf(path, len, "/dev/%s", disk->getCStringNoCopy()); + return (0); + } + + return (-1); +} +#endif + +bool +ZFSBootDevice::init(OSDictionary *properties) +{ + /* Allocate dictionaries and symbols */ + OSDictionary *pdict = OSDictionary::withCapacity(2); + OSDictionary *ddict = OSDictionary::withCapacity(4); + const OSSymbol *virtualSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + const OSSymbol *ramSymbol = OSSymbol::withCString( + kIOPropertyInterconnectRAMKey); + const OSSymbol *ssdSymbol = OSSymbol::withCString( + kIOPropertyMediumTypeSolidStateKey); + OSNumber *physSize = OSNumber::withNumber((uint32_t)4096, 32); + OSNumber *logSize = OSNumber::withNumber((uint32_t)512, 32); + const OSSymbol *vendorSymbol = 0; + const OSSymbol *revisionSymbol = 0; + const OSSymbol *blankSymbol = 0; + OSBoolean *rdonly = 0; + OSString *str = 0; + const char *cstr = 0; + bool ret = false; + + DPRINTF_FUNC(); + + /* Validate allocations */ + if (!pdict || !ddict || !virtualSymbol || !ramSymbol || + !ssdSymbol || !physSize || !logSize) { + dprintf("ZFSBootDevice::%s allocation failed\n", __func__); + goto error; + } + + /* Init class statics every time an instance inits */ + /* Shared across instances, but doesn't hurt to reprint 
*/ + snprintf(vendorString, strlen("ZFS")+1, "ZFS"); + snprintf(revisionString, strlen("0.1")+1, "0.1"); + snprintf(infoString, strlen("ZFS dataset")+1, "ZFS dataset"); + + /* For IORegistry keys, cache OSSymbols for class statics */ + /* Leverages OSSymbol cahce pool to reuse across instances */ + vendorSymbol = OSSymbol::withCString(vendorString); + revisionSymbol = OSSymbol::withCString(revisionString); + blankSymbol = OSSymbol::withCString(""); + if (!vendorSymbol || !revisionSymbol || !blankSymbol) { + dprintf("ZFSBootDevice::%s class symbols failed\n", __func__); + goto error; + } + + /* Call super init */ + if (IOBlockStorageDevice::init(properties) == false) { + dprintf("ZFSBootDevice::%s device init failed\n", __func__); + goto error; + } + + /* Set class private vars */ + productString = NULL; + isReadOnly = false; // XXX should really be true initially + + /* Set Protocol Characteristics */ + if (pdict->setObject(kIOPropertyPhysicalInterconnectLocationKey, + ramSymbol) == false || + pdict->setObject(kIOPropertyPhysicalInterconnectTypeKey, + virtualSymbol) == false) { + dprintf("%s pdict set properties failed\n", __func__); + goto error; + } + setProperty(kIOPropertyProtocolCharacteristicsKey, pdict); + + /* Set Device Characteristics */ + if (ddict->setObject(kIOPropertyVendorNameKey, + vendorSymbol) == false || + ddict->setObject(kIOPropertyProductRevisionLevelKey, + revisionSymbol) == false || + ddict->setObject(kIOPropertyProductSerialNumberKey, + blankSymbol) == false || + ddict->setObject(kIOPropertyPhysicalBlockSizeKey, + physSize) == false || + ddict->setObject(kIOPropertyLogicalBlockSizeKey, + logSize) == false || + ddict->setObject(kIOPropertyMediumTypeKey, + ssdSymbol) == false) { + dprintf("%s ddict set properties failed\n", __func__); + goto error; + } + setProperty(kIOPropertyDeviceCharacteristicsKey, ddict); + + /* Check for passed in readonly status */ + if (properties && (rdonly = OSDynamicCast(OSBoolean, + properties->getObject(ZFS_BOOT_DATASET_RDONLY_KEY))) != NULL) { + /* Got the boolean */ + isReadOnly = rdonly->getValue(); + dprintf("ZFSBootDevice %s set %s\n", __func__, + (isReadOnly ? 
"readonly" : "readwrite")); + } + + /* Check for passed in dataset UUID */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(ZFS_BOOT_DATASET_UUID_KEY))) != NULL && + (cstr = str->getCStringNoCopy()) != NULL) { + /* Got the string, try to set UUID */ + str->retain(); + if (ddict->setObject("Dataset UUID", str) == false) { + dprintf("ZFSBootDevice::%s failed UUID [%s]\n", + __func__, cstr); + str->release(); + goto error; + } + dprintf("ZFSBootDevice::%s set UUID [%s]\n", + __func__, cstr); + str->release(); + } + + /* Check for passed in dataset name */ + if (properties && (str = OSDynamicCast(OSString, + properties->getObject(ZFS_BOOT_DATASET_NAME_KEY))) != NULL && + (cstr = str->getCStringNoCopy()) != NULL) { + /* Got the string, try to set name */ + str->retain(); + if (setDatasetName(cstr) == false) { + /* Unlikely */ + dprintf("ZFSBootDevice %s couldn't setup dataset" + " name property [%s]\n", __func__, cstr); + str->release(); + goto error; + } + + dprintf("ZFSBootDevice %s set dataset name [%s]\n", + __func__, cstr); + str->release(); + } else { + if (setDatasetName("invalid") == false) { + dprintf("ZFSBootDevice::%s setDatasetName failed\n", + __func__); + goto error; + } + dprintf("ZFSBootDevice %s set name [invalid]\n", __func__); + } + + /* Success */ + ret = true; + +error: + if (pdict) pdict->release(); + if (ddict) ddict->release(); + if (virtualSymbol) virtualSymbol->release(); + if (ramSymbol) ramSymbol->release(); + if (ssdSymbol) ssdSymbol->release(); + if (physSize) physSize->release(); + if (logSize) logSize->release(); + if (vendorSymbol) vendorSymbol->release(); + if (revisionSymbol) revisionSymbol->release(); + if (blankSymbol) blankSymbol->release(); + return (ret); +} + +void +ZFSBootDevice::free() +{ + char *pstring = (char *)productString; + productString = 0; + + if (pstring) kmem_free(pstring, strlen(pstring) + 1); + + IOBlockStorageDevice::free(); +} + +#if 0 +bool +ZFSBootDevice::attach(IOService *provider) +{ + DPRINTF_FUNC(); + // return (IOMedia::attach(provider)); + return (IOBlockStorageDevice::attach(provider)); +} + +void +ZFSBootDevice::detach(IOService *provider) +{ + DPRINTF_FUNC(); + // IOMedia::detach(provider); + IOBlockStorageDevice::detach(provider); +} + +bool +ZFSBootDevice::start(IOService *provider) +{ + DPRINTF_FUNC(); + // return (IOMedia::start(provider)); + return (IOBlockStorageDevice::start(provider)); +} + +void +ZFSBootDevice::stop(IOService *provider) +{ + DPRINTF_FUNC(); + // IOMedia::stop(provider); + IOBlockStorageDevice::stop(provider); +} + +IOService* +ZFSBootDevice::probe(IOService *provider, SInt32 *score) +{ + DPRINTF_FUNC(); + // return (IOMedia::probe(provider, score)); + return (IOBlockStorageDevice::probe(provider, score)); +} +#endif + +IOReturn +ZFSBootDevice::doSynchronizeCache(void) +{ + dprintf("ZFSBootDevice %s\n", __func__); + return (kIOReturnSuccess); +} + +IOReturn +ZFSBootDevice::doAsyncReadWrite(IOMemoryDescriptor *buffer, + UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, + IOStorageCompletion *completion) +{ + char zero[ZFS_BOOT_DEV_BSIZE]; + size_t len, cur, off = 0; + + DPRINTF_FUNC(); + + if (!buffer) { + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* Read vs. write */ + if (buffer->getDirection() == kIODirectionIn) { + /* Zero the read buffer */ + bzero(zero, ZFS_BOOT_DEV_BSIZE); + len = buffer->getLength(); + while (len > 0) { + cur = (len > ZFS_BOOT_DEV_BSIZE ? 
+ ZFS_BOOT_DEV_BSIZE : len); + buffer->writeBytes(/* offset */ off, + /* buf */ zero, /* length */ cur); + off += cur; + len -= cur; + } + // dprintf("%s: read: %llu %llu\n", + // __func__, block, nblks); + IOStorage::complete(completion, kIOReturnSuccess, + buffer->getLength()); + return (kIOReturnSuccess); + } + + if (buffer->getDirection() != kIODirectionOut) { + dprintf("%s invalid direction %d\n", __func__, + buffer->getDirection()); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); + } + + /* + * XXX For now this just returns error for all writes. + * If it turns out that mountroot/bdevvp try to + * verify writable status by reading a block and writing + * it back to disk, lie and say it succeeded. + */ + dprintf("%s: write: %llu %llu\n", __func__, block, nblks); + IOStorage::complete(completion, kIOReturnError, 0); + return (kIOReturnSuccess); +} + +IOReturn +ZFSBootDevice::doEjectMedia() +{ + DPRINTF_FUNC(); + /* XXX Called at shutdown, maybe return success? */ + return (kIOReturnError); +} + +IOReturn +ZFSBootDevice::doFormatMedia(UInt64 byteCapacity) +{ + DPRINTF_FUNC(); + /* XXX shouldn't need it */ + return (kIOReturnError); + // return (kIOReturnSuccess); +} + +UInt32 +ZFSBootDevice::doGetFormatCapacities(UInt64 *capacities, + UInt32 capacitiesMaxCount) const +{ + DPRINTF_FUNC(); + if (capacities && capacitiesMaxCount > 0) { + capacities[0] = (ZFS_BOOT_DEV_BSIZE * ZFS_BOOT_DEV_BCOUNT); + dprintf("ZFSBootDevice %s: capacity %llu\n", + __func__, capacities[0]); + } + + /* Always inform caller of capacity count */ + return (1); +} + +/* Assign dataset name from null-terminated string */ +bool +ZFSBootDevice::setDatasetName(const char *dsname) +{ + OSDictionary *dict; + OSString *dsstr; + char *newname, *oldname; + size_t len; + + DPRINTF_FUNC(); + + /* Validate arguments */ + if (!dsname || (len = strnlen(dsname, + ZFS_MAX_DATASET_NAME_LEN)) == 0) { + dprintf("%s: missing argument\n", __func__); + return (false); + } + + /* Truncate too-long names (shouldn't happen) */ + if (len == ZFS_MAX_DATASET_NAME_LEN && + dsname[ZFS_MAX_DATASET_NAME_LEN] != '\0') { + dprintf("%s: dsname too long [%s]\n", + __func__, dsname); + /* XXX Just truncate the name */ + len--; + } + + /* Allocate room for name plus null char */ + newname = (char *)kmem_alloc(len+1, KM_SLEEP); + if (!newname) { + dprintf("ZFSBootDevice::%s string alloc failed\n", __func__); + return (false); + } + snprintf(newname, len+1, "%s", dsname); + newname[len] = '\0'; /* just in case */ + + /* Save an OSString copy for IORegistry */ + dsstr = OSString::withCString(newname); + if (!dsstr) { + dprintf("ZFSBootDevice::%s OSString failed\n", __func__); + kmem_free(newname, len+1); + return (false); + } + + /* Swap into class private var */ + oldname = productString; + productString = newname; + newname = 0; + if (oldname) { + kmem_free(oldname, strlen(oldname)+1); + oldname = 0; + } + + /* Get and clone device characteristics prop dict */ + if ((dict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey))) == NULL || + (dict = OSDictionary::withDictionary(dict)) == NULL) { + dprintf("%s couldn't clone prop dict\n", __func__); + /* Should only happen during initialization */ + } + + if (dict) { + /* Copy string, add to dictionary, and replace prop dict */ + if (dict->setObject(kIOPropertyProductNameKey, + dsstr) == false || + setProperty(kIOPropertyDeviceCharacteristicsKey, + dict) == false) { + dprintf("%s couldn't set name\n", __func__); + dsstr->release(); + 
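/*
 * Illustrative sketch (not part of the patch): the read path above satisfies
 * reads from a 512-byte zero buffer, copying at most one block per iteration
 * until the request length is exhausted.  The same loop against a plain
 * memory buffer, assuming the 512-byte block size used by the patch; the
 * function name is illustrative.
 */
#include <string.h>
#include <stddef.h>

#define	BOOT_DEV_BSIZE	512

static void
zero_fill(char *dst, size_t len)
{
	char zero[BOOT_DEV_BSIZE];
	size_t off = 0, cur;

	memset(zero, 0, sizeof (zero));
	while (len > 0) {
		cur = (len > BOOT_DEV_BSIZE) ? BOOT_DEV_BSIZE : len;
		memcpy(dst + off, zero, cur);	/* writeBytes() analogue */
		off += cur;
		len -= cur;
	}
}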
dict->release(); + return (false); + } + dict->release(); + dict = 0; + } + + /* Finally, set the IORegistryEntry/IOService name */ + setName(dsstr->getCStringNoCopy()); + dsstr->release(); + + return (true); +} + +/* Returns full dataset name from instance private var */ +char * +ZFSBootDevice::getProductString() +{ + dprintf("ZFSBootDevice %s [%s]\n", productString); + /* Return class private string */ + return (productString); +} + +/* Returns readonly status from instance private var */ +IOReturn +ZFSBootDevice::reportWriteProtection(bool *isWriteProtected) +{ + DPRINTF_FUNC(); + if (isWriteProtected) *isWriteProtected = isReadOnly; + return (kIOReturnSuccess); +} + +/* These return class static string for all instances */ +char * +ZFSBootDevice::getVendorString() +{ + dprintf("ZFSBootDevice %s [%s]\n", vendorString); + /* Return class static string */ + return (vendorString); +} +char * +ZFSBootDevice::getRevisionString() +{ + dprintf("ZFSBootDevice %s [%s]\n", revisionString); + /* Return class static string */ + return (revisionString); +} +char * +ZFSBootDevice::getAdditionalDeviceInfoString() +{ + dprintf("ZFSBootDevice %s [%s]\n", infoString); + /* Return class static string */ + return (infoString); +} + +/* Always return media present and unchanged */ +IOReturn +ZFSBootDevice::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + DPRINTF_FUNC(); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +/* Always report nonremovable and nonejectable */ +IOReturn +ZFSBootDevice::reportRemovability(bool *isRemoveable) +{ + DPRINTF_FUNC(); + if (isRemoveable) *isRemoveable = false; + return (kIOReturnSuccess); +} +IOReturn +ZFSBootDevice::reportEjectability(bool *isEjectable) +{ + DPRINTF_FUNC(); + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* Always report 512b blocksize */ +IOReturn +ZFSBootDevice::reportBlockSize(UInt64 *blockSize) +{ + DPRINTF_FUNC(); + if (!blockSize) + return (kIOReturnError); + + *blockSize = ZFS_BOOT_DEV_BSIZE; + return (kIOReturnSuccess); +} + +/* XXX Calculate from dev_bcount, should get size from objset */ +/* XXX Can issue message kIOMessageMediaParametersHaveChanged to update */ +IOReturn +ZFSBootDevice::reportMaxValidBlock(UInt64 *maxBlock) +{ + DPRINTF_FUNC(); + if (!maxBlock) + return (kIOReturnError); + + // *maxBlock = 0; + *maxBlock = ZFS_BOOT_DEV_BCOUNT - 1; + dprintf("ZFSBootDevice %s: maxBlock %llu\n", __func__, *maxBlock); + + return (kIOReturnSuccess); +} +#endif /* ZFS_BOOT */ +#endif /* 0 */ diff --git a/module/os/macos/zfs/zfs_ctldir.c b/module/os/macos/zfs/zfs_ctldir.c new file mode 100644 index 0000000000..57f8dc2e28 --- /dev/null +++ b/module/os/macos/zfs/zfs_ctldir.c @@ -0,0 +1,1519 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri + * Brian Behlendorf + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2018 George Melikov. All Rights Reserved. + * Copyright (c) 2019 Datto, Inc. All rights reserved. + * Copyright (c) 2020 Jorgen Lundman. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' and 'shares' directory, but this may + * expand in the future. The elements are built dynamically, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by an user mode helper which invokes + * the mount procedure. Unmounts are handled by allowing the mount + * point to expire so the kernel may automatically unmount it. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same + * zfsvfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the '.zfs/snapshot/' paths + * (ie: snapshots) are complete ZFS filesystems and have their own unique + * zfsvfs_t. However, the fsid reported by these mounts will be the same + * as that used by the parent zfsvfs_t to make NFS happy. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" + +extern kmem_cache_t *znode_cache; +extern uint64_t vnop_num_vnodes; + +/* + * Apple differences; + * + * We don't have 'shares' directory, so only 'snapshot' is relevant. + * + * We can not issue mount from kernel, so involve zed. + * - see zfs_ctldir_snapdir.c + * + * All vnodes point to znode_t, no special case nodes. 
+ */ + +/* List of zfsctl mounts waiting to be mounted */ +static kmutex_t zfsctl_mounts_lock; +static list_t zfsctl_mounts_list; +struct zfsctl_mounts_waiting { + kmutex_t zcm_lock; + kcondvar_t zcm_cv; + list_node_t zcm_node; + char zcm_name[ZFS_MAX_DATASET_NAME_LEN]; +}; +typedef struct zfsctl_mounts_waiting zfsctl_mounts_waiting_t; + + +/* + * Control Directory Tunables (.zfs) + */ +int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; +int zfs_admin_snapshot = 1; +int zfs_auto_snapshot = 1; + +static kmutex_t zfsctl_unmount_lock; +static kcondvar_t zfsctl_unmount_cv; +static boolean_t zfsctl_unmount_thread_exit; + +static kmutex_t zfsctl_unmount_list_lock; +static list_t zfsctl_unmount_list; + +struct zfsctl_unmount_delay { + char *se_name; /* full snapshot name */ + spa_t *se_spa; /* pool spa */ + uint64_t se_objsetid; /* snapshot objset id */ + time_t se_time; + list_node_t se_nodelink; +}; +typedef struct zfsctl_unmount_delay zfsctl_unmount_delay_t; + + +/* + * Check if the given vnode is a part of the virtual .zfs directory. + */ +boolean_t +zfsctl_is_node(struct vnode *ip) +{ + return (ITOZ(ip)->z_is_ctldir); +} + +typedef int (**vnode_operations)(void *); + + +/* + * Allocate a new vnode with the passed id and ops. + */ +static struct vnode * +zfsctl_vnode_alloc(zfsvfs_t *zfsvfs, uint64_t id, + char *name) +{ + timestruc_t now; + struct vnode *vp = NULL; + znode_t *zp = NULL; + struct vnode_fsparam vfsp; + + printf("%s\n", __func__); + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + gethrestime(&now); + ASSERT3P(zp->z_dirlocks, ==, NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); + zp->z_zfsvfs = zfsvfs; + zp->z_id = id; + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_zn_prefetch = B_FALSE; + zp->z_moved = B_FALSE; + zp->z_is_sa = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_TRUE; + zp->z_is_stale = B_FALSE; + zp->z_sa_hdl = NULL; + zp->z_blksz = 0; + zp->z_seq = 0; + zp->z_mapcnt = 0; + zp->z_size = 0; + zp->z_pflags = 0; + zp->z_mode = 0; + zp->z_sync_cnt = 0; + zp->z_gen = 0; + zp->z_mode = (S_IFDIR | (S_IRWXU|S_IRWXG|S_IRWXO)); + zp->z_uid = 0; + zp->z_gid = 0; + ZFS_TIME_ENCODE(&now, zp->z_atime); + + zp->z_snap_mount_time = 0; /* Allow automount attempt */ + + strlcpy(zp->z_name_cache, name, sizeof (zp->z_name_cache)); + + dprintf("%s zp %p with vp %p zfsvfs %p vfs %p\n", __func__, + zp, vp, zfsvfs, zfsvfs->z_vfs); + + bzero(&vfsp, sizeof (vfsp)); + vfsp.vnfs_str = "zfs"; + vfsp.vnfs_mp = zfsvfs->z_vfs; + vfsp.vnfs_vtype = IFTOVT((mode_t)zp->z_mode); + vfsp.vnfs_fsnode = zp; + vfsp.vnfs_flags = VNFS_ADDFSREF; + + /* Tag root directory */ + if (id == zfsvfs->z_root) + vfsp.vnfs_markroot = 1; + + /* + * This creates a vnode with VSYSTEM set, this is so that unmount's + * vflush() (called before our vfs_unmount) will pass (and not block + * waiting for the usercount ref to be released). We then release the + * VROOT vnode in zfsctl_destroy, and release the usercount ref. 
+ * Because of this, we need to call vnode_recycle() ourselves in destroy + */ + if (id == ZFSCTL_INO_ROOT) + vfsp.vnfs_marksystem = 1; + + vfsp.vnfs_vops = zfs_ctldirops; + + while (vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp) != 0) { + kpreempt(KPREEMPT_SYNC); + } + atomic_inc_64(&vnop_num_vnodes); + + printf("Assigned zp %p with vp %p zfsvfs %p\n", zp, vp, zp->z_zfsvfs); + + vnode_settag(vp, VT_ZFS); + + zp->z_vid = vnode_vid(vp); + zp->z_vnode = vp; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); + if (id < zfsvfs->z_ctldir_startid) + zfsvfs->z_ctldir_startid = id; + mutex_exit(&zfsvfs->z_znodes_lock); + + return (vp); +} + +/* + * Lookup the vnode with given id, it will be allocated if needed. + */ +static struct vnode * +zfsctl_vnode_lookup(zfsvfs_t *zfsvfs, uint64_t id, + char *name) +{ + struct vnode *ip = NULL; + int error = 0; + + printf("%s\n", __func__); + + while (ip == NULL) { + + error = zfs_vfs_vget(zfsvfs->z_vfs, id, &ip, NULL); + if (error == 0 && ip != NULL) + break; + + /* May fail due to concurrent zfsctl_vnode_alloc() */ + ip = zfsctl_vnode_alloc(zfsvfs, id, name); + } + + return (ip); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. All other entities + * under the '.zfs' directory are created dynamically as needed. + * + * Because the dynamically created '.zfs' directory entries assume the use + * of 64-bit vnode numbers this support must be disabled on 32-bit systems. + */ +int +zfsctl_create(zfsvfs_t *zfsvfs) +{ + ASSERT(zfsvfs->z_ctldir == NULL); + + printf("%s\n", __func__); + + /* Create root node, tagged with VSYSTEM - see above */ + zfsvfs->z_ctldir = zfsctl_vnode_alloc(zfsvfs, ZFSCTL_INO_ROOT, + ZFS_CTLDIR_NAME); + if (zfsvfs->z_ctldir == NULL) + return (SET_ERROR(ENOENT)); + + vnode_ref(zfsvfs->z_ctldir); + VN_RELE(zfsvfs->z_ctldir); + + printf("%s: done %p\n", __func__, zfsvfs->z_ctldir); + + return (0); +} + +/* + * Destroy the '.zfs' directory or remove a snapshot from + * zfs_snapshots_by_name. Only called when the filesystem is unmounted. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + if (zfsvfs->z_ctldir) { + if (VN_HOLD(zfsvfs->z_ctldir) == 0) { + vnode_rele(zfsvfs->z_ctldir); + /* Because tagged VSYSTEM, we manually call recycle */ + vnode_recycle(zfsvfs->z_ctldir); + VN_RELE(zfsvfs->z_ctldir); + } + zfsvfs->z_ctldir = NULL; + } +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +struct vnode * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + VN_HOLD(ZTOZSB(zp)->z_ctldir); + return (ZTOZSB(zp)->z_ctldir); +} + + +struct vnode * +zfs_root_dotdot(struct vnode *vp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *rootzp = NULL; + struct vnode *retvp = NULL; + + printf("%s: for id %llu\n", __func__, zp->z_id); + + if (zp->z_id == ZFSCTL_INO_ROOT) + zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + else if (zp->z_id == ZFSCTL_INO_SNAPDIR) + retvp = zfsctl_root(zp); + else + retvp = zfsctl_vnode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, + "snapshot"); + + if (rootzp != NULL) + retvp = ZTOV(rootzp); + + return (retvp); +} + +/* + * Special case the handling of "..". 
+ */ +int +zfsctl_root_lookup(struct vnode *dvp, char *name, struct vnode **vpp, + int flags, cred_t *cr, int *direntflags, struct componentname *realpnp) +{ + znode_t *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + int error = 0; + uint64_t id; + + printf("%s: '%s'\n", __func__, name); + + ZFS_ENTER(zfsvfs); + + if (strcmp(name, "..") == 0) { + *vpp = zfs_root_dotdot(dvp); + } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { + *vpp = zfsctl_vnode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, + name); + } else { + error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); + if (error != 0) + goto out; + *vpp = zfsctl_vnode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, + name); + } + + if (*vpp == NULL) { + error = SET_ERROR(ENOENT); + } + +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfsctl_vnop_lookup(struct vnop_lookup_args *ap) +#if 0 + struct vnop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ + int direntflags = 0; + int error; + struct componentname *cnp = ap->a_cnp; + char *filename = NULL; + int filename_num_bytes = 0; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + + /* + * Darwin uses namelen as an optimisation, for example it can be + * set to 5 for the string "alpha/beta" to look up "alpha". In this + * case we need to copy it out to null-terminate. + */ + if (cnp->cn_nameptr[cnp->cn_namelen] != 0) { + filename_num_bytes = cnp->cn_namelen + 1; + filename = (char *)kmem_alloc(filename_num_bytes, KM_SLEEP); + bcopy(cnp->cn_nameptr, filename, cnp->cn_namelen); + filename[cnp->cn_namelen] = '\0'; + } + + error = zfsctl_root_lookup(ap->a_dvp, + filename ? filename : cnp->cn_nameptr, + ap->a_vpp, /* flags */ 0, cr, &direntflags, NULL); + + /* If we are to create a directory, change error code for XNU */ + if ((error == ENOENT) && + (cnp->cn_flags & ISLASTCN)) { + if ((cnp->cn_nameiop == CREATE) || + (cnp->cn_nameiop == RENAME)) + error = EJUSTRETURN; + } + + if (filename != NULL) + kmem_free(filename, filename_num_bytes); + + return (error); +} + +/* Quick output function for readdir */ +#define DIRENT_RECLEN(namelen, ext) \ + ((ext) ? 
\ + ((sizeof (struct direntry) + (namelen) - (MAXPATHLEN-1) + 7) & ~7) \ + : \ + ((sizeof (struct dirent) - (NAME_MAX+1)) + (((namelen)+1 + 7) &~ 7))) + +static int zfsctl_dir_emit(const char *name, uint64_t id, enum vtype type, + struct vnop_readdir_args *ap, uint64_t **next) +{ + struct uio *uio = ap->a_uio; + boolean_t extended = (ap->a_flags & VNODE_READDIR_EXTENDED); + struct direntry *eodp; /* Extended */ + struct dirent *odp; /* Standard */ + int namelen; + void *buf; + int error = 0; + ushort_t reclen; + + printf("%s '%s'\n", __func__, name); + + namelen = strlen(name); + reclen = DIRENT_RECLEN(namelen, extended); + + if (reclen > uio_resid(uio)) + return (EINVAL); + + buf = kmem_zalloc(reclen, KM_SLEEP); + + if (extended) { + eodp = buf; + + /* + * NOTE: d_seekoff is the offset for the *next* entry - + * so poke in the previous struct with this id + */ + eodp->d_seekoff = uio_offset(uio) + 1; + + eodp->d_ino = id; + eodp->d_type = type; + + (void) bcopy(name, eodp->d_name, namelen + 1); + eodp->d_namlen = namelen; + eodp->d_reclen = reclen; + + } else { + odp = buf; + + odp->d_ino = id; + odp->d_type = type; + (void) bcopy(name, odp->d_name, namelen + 1); + odp->d_namlen = namelen; + odp->d_reclen = reclen; + + } + + /* Copyout this entry */ + error = uiomove(buf, (long)reclen, UIO_READ, uio); + + kmem_free(buf, reclen); + return (error); +} + +int +zfsctl_vnop_readdir_root(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error = 0; + uint64_t *next = NULL; + int entries = 0; + uint64_t offset; + struct uio *uio = ap->a_uio; + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + printf("%s\n", __func__); + + ZFS_ENTER(zfsvfs); + + *ap->a_numdirent = 0; + + offset = uio_offset(uio); + + while (offset < 3 && error == 0) { + + switch (offset) { + case 0: /* "." */ + error = zfsctl_dir_emit(".", ZFSCTL_INO_ROOT, + DT_DIR, ap, &next); + break; + + case 1: /* ".." */ + error = zfsctl_dir_emit("..", 2, + DT_DIR, ap, &next); + break; + + case 2: + error = zfsctl_dir_emit(ZFS_SNAPDIR_NAME, + ZFSCTL_INO_SNAPDIR, DT_DIR, ap, &next); + break; + } + + if (error == ENOENT) { + printf("end of snapshots reached\n"); + break; + } + + if (error != 0) { + printf("emit error\n"); + break; + } + + entries++; + offset++; + uio_setoffset(uio, offset); + } + + uio_setoffset(uio, offset); + + /* Finished without error? Set EOF */ + if (offset >= 3 && error == 0) { + *ap->a_eofflag = 1; + printf("Setting eof\n"); + } + + *ap->a_numdirent = entries; + printf("Returning %d entries\n", entries); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfsctl_vnop_readdir_snapdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error = 0; + uint64_t *next = NULL; + int entries = 0; + uint64_t offset; + struct uio *uio = ap->a_uio; + boolean_t case_conflict; + uint64_t id; + char snapname[MAXNAMELEN]; + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + printf("%s\n", __func__); + + ZFS_ENTER(zfsvfs); + + *ap->a_numdirent = 0; + + offset = uio_offset(uio); + + while (error == 0) { + + switch (offset) { + case 0: /* "." */ + error = zfsctl_dir_emit(".", ZFSCTL_INO_SNAPDIR, + DT_DIR, ap, &next); + break; + + case 1: /* ".." 
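/*
 * Illustrative check (not part of the patch): DIRENT_RECLEN above rounds the
 * per-entry record length up to an 8-byte boundary with the usual
 * (x + 7) & ~7 trick.  A standalone illustration of that rounding, using
 * only the standard C library; the header size in the example is arbitrary.
 */
#include <stdio.h>

static unsigned int
roundup8(unsigned int x)
{
	return ((x + 7) & ~7u);	/* next multiple of 8 (or x if aligned) */
}

int
main(void)
{
	/* e.g. an 11-byte fixed header plus a 6-character name and NUL */
	printf("%u\n", roundup8(11 + 6 + 1));	/* prints 24 */
	return (0);
}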
*/ + error = zfsctl_dir_emit("..", ZFSCTL_INO_ROOT, + DT_DIR, ap, &next); + break; + + default: + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), + FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, + MAXNAMELEN, snapname, &id, &offset, &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), + FTAG); + if (error) + break; + + error = zfsctl_dir_emit(snapname, + ZFSCTL_INO_SHARES - id, DT_DIR, ap, &next); + break; + } + + if (error != 0) { + printf("emit error\n"); + break; + } + + entries++; + offset++; + uio_setoffset(uio, offset); + } + + uio_setoffset(uio, offset); + + /* Finished without error? Set EOF */ + if (error == ENOENT) { + *ap->a_eofflag = 1; + printf("Setting eof\n"); + error = 0; + } + + *ap->a_numdirent = entries; + printf("Returning %d entries\n", entries); + + ZFS_EXIT(zfsvfs); + + return (error); +} + + +/* We need to spit out a valid "." ".." entries for mount to work */ +int +zfsctl_vnop_readdir_snapdirs(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error = 0; + uint64_t *next = NULL; + int entries = 0; + uint64_t offset; + struct uio *uio = ap->a_uio; + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + + *ap->a_numdirent = 0; + + offset = uio_offset(uio); + + printf("%s: for id %llu: offset %llu\n", __func__, + zp->z_id, offset); + + while (error == 0) { + + switch (offset) { + case 0: /* "." */ + error = zfsctl_dir_emit(".", ZFSCTL_INO_SNAPDIR, + DT_DIR, ap, &next); + break; + + case 1: /* ".." */ + error = zfsctl_dir_emit("..", ZFSCTL_INO_ROOT, + DT_DIR, ap, &next); + break; + + default: + error = ENOENT; + break; + } + + if (error != 0) { + printf("emit error\n"); + break; + } + + entries++; + offset++; + uio_setoffset(uio, offset); + } + + uio_setoffset(uio, offset); + + /* Finished without error? Set EOF */ + if (error == ENOENT) { + *ap->a_eofflag = 1; + printf("Setting eof\n"); + error = 0; + } + + *ap->a_numdirent = entries; + printf("Returning %d entries\n", entries); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfsctl_vnop_readdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + znode_t *zp = VTOZ(ap->a_vp); + + printf("%s\n", __func__); + + /* Which directory are we to output? */ + switch (zp->z_id) { + case ZFSCTL_INO_ROOT: + return (zfsctl_vnop_readdir_root(ap)); + case ZFSCTL_INO_SNAPDIR: + return (zfsctl_vnop_readdir_snapdir(ap)); + default: + return (zfsctl_vnop_readdir_snapdirs(ap)); + } + return (EINVAL); +} + +int +zfsctl_vnop_getattr(struct vnop_getattr_args *ap) +#if 0 + struct vnop_getattr_args { + struct vnode *a_vp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + vattr_t *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + timestruc_t now; + + printf("%s: active x%llx\n", __func__, vap->va_active); + + ZFS_ENTER(zfsvfs); + + gethrestime(&now); + + if (VATTR_IS_ACTIVE(vap, va_rdev)) + VATTR_RETURN(vap, va_rdev, zfsvfs->z_rdev); + if (VATTR_IS_ACTIVE(vap, va_nlink)) + VATTR_RETURN(vap, va_nlink, + vnode_isdir(vp) ? 
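/*
 * Illustrative sketch (not part of the patch): each readdir handler above is
 * an offset-driven loop; offset 0 emits ".", offset 1 emits "..", and higher
 * offsets walk the real entries until the emitter reports ENOENT or an error.
 * A simplified standalone version of that control flow; names and the sample
 * snapshot list are illustrative.
 */
#include <errno.h>
#include <stdio.h>

static const char *snaps[] = { "monday", "tuesday" };
#define	NSNAPS	(sizeof (snaps) / sizeof (snaps[0]))

static int
emit(const char *name, unsigned long long off)
{
	printf("%llu: %s\n", off, name);
	return (0);	/* 0 = copied out to the caller's buffer */
}

static int
readdir_loop(unsigned long long *offset)
{
	int error = 0;

	while (error == 0) {
		if (*offset == 0)
			error = emit(".", *offset);
		else if (*offset == 1)
			error = emit("..", *offset);
		else if (*offset - 2 < NSNAPS)
			error = emit(snaps[*offset - 2], *offset);
		else
			error = ENOENT;		/* end of listing */
		if (error != 0)
			break;
		(*offset)++;
	}
	return (error == ENOENT ? 0 : error);	/* ENOENT just means EOF */
}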
zp->z_size : zp->z_links); + if (VATTR_IS_ACTIVE(vap, va_total_size)) + VATTR_RETURN(vap, va_total_size, 512); + if (VATTR_IS_ACTIVE(vap, va_total_alloc)) + VATTR_RETURN(vap, va_total_alloc, 512); + if (VATTR_IS_ACTIVE(vap, va_data_size)) + VATTR_RETURN(vap, va_data_size, 0); + if (VATTR_IS_ACTIVE(vap, va_data_alloc)) + VATTR_RETURN(vap, va_data_alloc, 0); + if (VATTR_IS_ACTIVE(vap, va_iosize)) + VATTR_RETURN(vap, va_iosize, 512); + if (VATTR_IS_ACTIVE(vap, va_uid)) + VATTR_RETURN(vap, va_uid, 0); + if (VATTR_IS_ACTIVE(vap, va_gid)) + VATTR_RETURN(vap, va_gid, 0); + if (VATTR_IS_ACTIVE(vap, va_mode)) + VATTR_RETURN(vap, va_mode, S_IFDIR | + S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (VATTR_IS_ACTIVE(vap, va_flags)) + VATTR_RETURN(vap, va_flags, zfs_getbsdflags(zp)); + + if (VATTR_IS_ACTIVE(vap, va_acl)) { + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + VATTR_RETURN(vap, va_acl, NULL); + } + + // crtime, atime, mtime, ctime, btime + uint64_t timez[2]; + timez[0] = zfsvfs->z_mount_time; + timez[1] = 0; + + if (VATTR_IS_ACTIVE(vap, va_create_time)) { + ZFS_TIME_DECODE(&vap->va_create_time, timez); + VATTR_SET_SUPPORTED(vap, va_create_time); + } + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + ZFS_TIME_DECODE(&vap->va_access_time, timez); + VATTR_SET_SUPPORTED(vap, va_access_time); + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + ZFS_TIME_DECODE(&vap->va_modify_time, timez); + VATTR_SET_SUPPORTED(vap, va_modify_time); + } + if (VATTR_IS_ACTIVE(vap, va_change_time)) { + ZFS_TIME_DECODE(&vap->va_change_time, timez); + VATTR_SET_SUPPORTED(vap, va_change_time); + } + if (VATTR_IS_ACTIVE(vap, va_backup_time)) { + ZFS_TIME_DECODE(&vap->va_backup_time, timez); + VATTR_SET_SUPPORTED(vap, va_backup_time); + } + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + ZFS_TIME_DECODE(&vap->va_addedtime, timez); + VATTR_SET_SUPPORTED(vap, va_addedtime); + } + + if (VATTR_IS_ACTIVE(vap, va_fileid)) + VATTR_RETURN(vap, va_fileid, zp->z_id); + if (VATTR_IS_ACTIVE(vap, va_linkid)) + VATTR_RETURN(vap, va_linkid, zp->z_id); + if (VATTR_IS_ACTIVE(vap, va_parentid)) { + switch (zp->z_id) { + case ZFSCTL_INO_ROOT: + // ".zfs" parent is mount, 2 on osx + VATTR_RETURN(vap, va_linkid, 2); + break; + case ZFSCTL_INO_SNAPDIR: + // ".zfs/snapshot" parent is ".zfs" + VATTR_RETURN(vap, va_linkid, ZFSCTL_INO_ROOT); + break; + default: + // ".zfs/snapshot/$name" parent ".zfs/snapshot" + VATTR_RETURN(vap, va_linkid, + ZFSCTL_INO_SNAPDIR); + break; + } + } + if (VATTR_IS_ACTIVE(vap, va_fsid)) + VATTR_RETURN(vap, va_fsid, zfsvfs->z_rdev); + + if (VATTR_IS_ACTIVE(vap, va_filerev)) + VATTR_RETURN(vap, va_filerev, 0); + if (VATTR_IS_ACTIVE(vap, va_gen)) + VATTR_RETURN(vap, va_gen, zp->z_gen); + if (VATTR_IS_ACTIVE(vap, va_type)) + VATTR_RETURN(vap, va_type, vnode_vtype(ZTOV(zp))); + if (VATTR_IS_ACTIVE(vap, va_name)) { + strlcpy(vap->va_name, zp->z_name_cache, MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + } + + /* Don't include '.' and '..' in the number of entries */ + if (VATTR_IS_ACTIVE(vap, va_nchildren) && vnode_isdir(vp)) { + VATTR_RETURN(vap, va_nchildren, + zp->z_links > 3 ? 
zp->z_links-2 : 1); + } + if (VATTR_IS_ACTIVE(vap, va_dirlinkcount) && vnode_isdir(vp)) + VATTR_RETURN(vap, va_dirlinkcount, 1); + +#ifdef VNODE_ATTR_va_fsid64 + if (VATTR_IS_ACTIVE(vap, va_fsid64)) { + vap->va_fsid64.val[0] = + vfs_statfs(zfsvfs->z_vfs)->f_fsid.val[0]; + vap->va_fsid64.val[1] = vfs_typenum(zfsvfs->z_vfs); + VATTR_SET_SUPPORTED(vap, va_fsid64); + } +#endif + + ZFS_EXIT(zfsvfs); + + printf("%s: returned x%llx missed: x%llx\n", __func__, + vap->va_supported, vap->va_active &= ~vap->va_supported); + return (0); +} + +int +zfsctl_vnop_access(struct vnop_access_args *ap) +{ + int accmode = ap->a_action; + dprintf("zfsctl_access\n"); + + if (accmode & VWRITE) + return (EACCES); + return (0); +} + +int +zfsctl_vnop_open(struct vnop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (EACCES); + + return (zfsctl_snapshot_mount(ap->a_vp, 0)); +} + +int +zfsctl_vnop_close(struct vnop_close_args *ap) +{ + printf("%s\n", __func__); + return (0); +} + +int +zfsctl_vnop_inactive(struct vnop_inactive_args *ap) +{ + printf("%s\n", __func__); + return (0); +} + +int +zfsctl_vnop_reclaim(struct vnop_reclaim_args *ap) +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + printf("%s vp %p\n", __func__, vp); + vnode_removefsref(vp); /* ADDREF from vnode_create */ + vnode_clearfsnode(vp); /* vp->v_data = NULL */ + + mutex_enter(&zfsvfs->z_znodes_lock); + if (list_link_active(&zp->z_link_node)) { + list_remove(&zfsvfs->z_all_znodes, zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + zp->z_vnode = NULL; + kmem_cache_free(znode_cache, zp); + + return (0); +} + +/* + * Construct a full dataset name in full_name: "pool/dataset@snap_name" + */ +static int +zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, + char *full_name) +{ + objset_t *os = zfsvfs->z_os; + + if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) + return (SET_ERROR(EILSEQ)); + + dmu_objset_name(os, full_name); + if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) + return (SET_ERROR(ENAMETOOLONG)); + + (void) strcat(full_name, "@"); + (void) strcat(full_name, snap_name); + + return (0); +} + +int +zfsctl_snapshot_mount(struct vnode *vp, int flags) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int ret = 0; + /* + * If we are here for a snapdirs directory, attempt to get zed + * to mount the snapshot for the user. If successful, forward the + * vnop_open() to them (ourselves). + * Use a timeout in case zed is not running. + */ + + if (zfs_auto_snapshot == 0) + return (0); + + ZFS_ENTER(zfsvfs); + if (((zp->z_id >= zfsvfs->z_ctldir_startid) && + (zp->z_id <= ZFSCTL_INO_SNAPDIRS))) { + hrtime_t now; + now = gethrtime(); + + /* + * If z_snap_mount_time is set, check if it is old enough to + * retry, if so, set z_snap_mount_time to zero. + */ + if (now - zp->z_snap_mount_time > SEC2NSEC(60)) + atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, + (uint64_t)zp->z_snap_mount_time, + 0ULL); + + /* + * Attempt mount, make sure only to issue one request, by + * attempting to CAS in current time in place of zero. + */ + if (atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, 0ULL, + (uint64_t)now) == 0ULL) { + char full_name[ZFS_MAX_DATASET_NAME_LEN]; + + /* First! 
*/ + ret = zfsctl_snapshot_name(zfsvfs, zp->z_name_cache, + ZFS_MAX_DATASET_NAME_LEN, full_name); + + if (ret == 0) { + zfsctl_mounts_waiting_t *zcm; + + /* Create condvar to wait for mount to happen */ + + zcm = kmem_alloc( + sizeof (zfsctl_mounts_waiting_t), KM_SLEEP); + mutex_init(&zcm->zcm_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&zcm->zcm_cv, NULL, CV_DEFAULT, NULL); + strlcpy(zcm->zcm_name, full_name, + sizeof (zcm->zcm_name)); + + printf("%s: requesting mount for '%s'\n", + __func__, full_name); + + mutex_enter(&zfsctl_mounts_lock); + list_insert_tail(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + mutex_enter(&zcm->zcm_lock); + zfs_ereport_snapshot_post( + FM_EREPORT_ZFS_SNAPSHOT_MOUNT, + dmu_objset_spa(zfsvfs->z_os), full_name); + + /* Now we wait hoping zed comes back to us */ + ret = cv_timedwait(&zcm->zcm_cv, &zcm->zcm_lock, + ddi_get_lbolt() + (hz * 3)); + + printf("%s: finished waiting %d\n", + __func__, ret); + + mutex_exit(&zcm->zcm_lock); + + mutex_enter(&zfsctl_mounts_lock); + list_remove(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + mutex_destroy(&zcm->zcm_lock); + cv_destroy(&zcm->zcm_cv); + + kmem_free(zcm, + sizeof (zfsctl_mounts_waiting_t)); + + /* + * If we mounted, make it re-open it so + * the process that issued the access will + * see the mounted content + */ + if (ret >= 0) { + /* Remove the cache entry */ + cache_purge(vp); + cache_purge_negatives(vp); + ret = ERESTART; + } + } + } + } + + ZFS_EXIT(zfsvfs); + + return (ret); +} + +/* Called whenever zfs_vfs_mount() is called with a snapshot */ +void +zfsctl_mount_signal(char *osname, boolean_t mounting) +{ + zfsctl_mounts_waiting_t *zcm; + + printf("%s: looking for snapshot '%s'\n", __func__, osname); + + mutex_enter(&zfsctl_mounts_lock); + for (zcm = list_head(&zfsctl_mounts_list); + zcm; + zcm = list_next(&zfsctl_mounts_list, zcm)) { + if (strncmp(zcm->zcm_name, osname, sizeof (zcm->zcm_name)) == 0) + break; + } + mutex_exit(&zfsctl_mounts_lock); + + /* Is there someone to wake up? */ + if (zcm != NULL) { + mutex_enter(&zcm->zcm_lock); + cv_signal(&zcm->zcm_cv); + mutex_exit(&zcm->zcm_lock); + printf("%s: mount waiter found and signalled\n", __func__); + } + + zfsctl_unmount_delay_t *zcu; + + /* Add or remove mount to/from list of active mounts */ + + if (mounting) { + /* Add active mounts to the list */ + zcu = kmem_alloc(sizeof (zfsctl_unmount_delay_t), KM_SLEEP); + zcu->se_name = kmem_strdup(osname); + zcu->se_time = gethrestime_sec(); + list_link_init(&zcu->se_nodelink); + + mutex_enter(&zfsctl_unmount_list_lock); + list_insert_tail(&zfsctl_unmount_list, zcu); + mutex_exit(&zfsctl_unmount_list_lock); + + } else { + /* Unmounting */ + mutex_enter(&zfsctl_unmount_list_lock); + for (zcu = list_head(&zfsctl_unmount_list); + zcu != NULL; + zcu = list_next(&zfsctl_unmount_list, zcu)) { + if (strcmp(osname, zcu->se_name) == 0) { + list_remove(&zfsctl_unmount_list, zcu); + kmem_strfree(zcu->se_name); + kmem_free(zcu, sizeof (zfsctl_unmount_delay_t)); + break; + } + } + mutex_exit(&zfsctl_unmount_list_lock); + } +} + +int +zfsctl_snapshot_unmount_node(struct vnode *vp, const char *full_name, + int flags) +{ + znode_t *zp = VTOZ(vp); + + printf("%s\n", __func__); + + if (zp == NULL) + return (ENOENT); + + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int ret = ENOENT; + /* + * If we are here for a snapdirs directory, attempt to get zed + * to mount the snapshot for the user. If successful, forward the + * vnop_open() to them (ourselves). + * Use a timeout in case zed is not running. 
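/*
 * Illustrative sketch (not part of the patch): the atomic_cas_64() dance on
 * z_snap_mount_time above lets only one thread issue a mount (or unmount)
 * request at a time, and lets a stale request be retried after 60 seconds.
 * The same gate written with C11 atomics, assuming a monotonic nanosecond
 * clock; names are illustrative.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>

#define	RETRY_NSEC	(60ULL * 1000000000ULL)

static _Atomic uint64_t snap_mount_time;	/* 0 = no request in flight */

static bool
try_claim_mount_request(void)
{
	struct timespec ts;
	uint64_t now, seen, expected = 0;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	now = (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;

	/* If an old request has expired, allow the gate to reset to zero */
	seen = atomic_load(&snap_mount_time);
	if (seen != 0 && now - seen > RETRY_NSEC)
		atomic_compare_exchange_strong(&snap_mount_time, &seen, 0);

	/* Only the thread that swaps 0 -> now gets to issue the request */
	return (atomic_compare_exchange_strong(&snap_mount_time,
	    &expected, now));
}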
+ */ + + ZFS_ENTER(zfsvfs); + + if (zp->z_id == zfsvfs->z_root) { + hrtime_t now; + now = gethrtime(); + + /* + * If z_snap_mount_time is set, check if it is old enough to + * retry, if so, set z_snap_mount_time to zero. + */ + if (now - zp->z_snap_mount_time > SEC2NSEC(60)) + atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, + (uint64_t)zp->z_snap_mount_time, + 0ULL); + + /* + * Attempt unmount, make sure only to issue one request, by + * attempting to CAS in current time in place of zero. + */ + if (atomic_cas_64((uint64_t *)&zp->z_snap_mount_time, 0ULL, + (uint64_t)now) == 0ULL) { + + ret = 0; + + /* First! */ + + if (ret == 0) { + zfsctl_mounts_waiting_t *zcm; + + /* Create condvar to wait for mount to happen */ + + zcm = kmem_alloc( + sizeof (zfsctl_mounts_waiting_t), KM_SLEEP); + mutex_init(&zcm->zcm_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&zcm->zcm_cv, NULL, CV_DEFAULT, NULL); + strlcpy(zcm->zcm_name, full_name, + sizeof (zcm->zcm_name)); + + dprintf("%s: requesting unmount for '%s'\n", + __func__, full_name); + + mutex_enter(&zfsctl_mounts_lock); + list_insert_tail(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + mutex_enter(&zcm->zcm_lock); + zfs_ereport_snapshot_post( + FM_EREPORT_ZFS_SNAPSHOT_UNMOUNT, + dmu_objset_spa(zfsvfs->z_os), full_name); + + /* Now we wait hoping zed comes back to us */ + ret = cv_timedwait(&zcm->zcm_cv, &zcm->zcm_lock, + ddi_get_lbolt() + (hz * 3)); + + dprintf("%s: finished waiting %d\n", + __func__, ret); + + mutex_exit(&zcm->zcm_lock); + + mutex_enter(&zfsctl_mounts_lock); + list_remove(&zfsctl_mounts_list, zcm); + mutex_exit(&zfsctl_mounts_lock); + + kmem_free(zcm, + sizeof (zfsctl_mounts_waiting_t)); + + /* Allow mounts to happen immediately */ + zp->z_snap_mount_time = 0; + + /* + * If we unmounted, alert caller + */ + if (ret >= 0) + ret = 0; + + } + } + } + + ZFS_EXIT(zfsvfs); + + return (ret); +} + +int +zfsctl_snapshot_unmount(const char *snapname, int flags) +{ + znode_t *rootzp; + zfsvfs_t *zfsvfs; + + printf("%s\n", __func__); + + if (strchr(snapname, '@') == NULL) + return (0); + + int err = getzfsvfs(snapname, &zfsvfs); + if (err != 0) { + ASSERT3P(zfsvfs, ==, NULL); + return (0); + } + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); + + err = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (err == 0) { + zfsctl_snapshot_unmount_node(ZTOV(rootzp), snapname, flags); + VN_RELE(ZTOV(rootzp)); + } + + vfs_unbusy(zfsvfs->z_vfs); + return (0); +} + +int +zfsctl_vnop_mkdir(struct vnop_mkdir_args *ap) +#if 0 + struct vnop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + znode_t *dzp = VTOZ(ap->a_dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + char *dsname; + int error; + + if (zfs_admin_snapshot == 0) + return (SET_ERROR(EACCES)); + + dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfs_component_namecheck(ap->a_cnp->cn_nameptr, NULL, NULL) != 0) { + error = SET_ERROR(EILSEQ); + goto out; + } + + dmu_objset_name(zfsvfs->z_os, dsname); + + error = zfs_secpolicy_snapshot_perms(dsname, cr); + if (error != 0) + goto out; + + if (error == 0) { + error = dmu_objset_snapshot_one(dsname, ap->a_cnp->cn_nameptr); + if (error != 0) + goto out; + + error = zfsctl_root_lookup(ap->a_dvp, ap->a_cnp->cn_nameptr, + ap->a_vpp, 0, cr, NULL, NULL); + } + +out: + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); + + return (error); +} + +int 
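/*
 * Illustrative sketch (not part of the patch): the cv_timedwait()/cv_signal()
 * pairing above parks the requesting vnop on a condition variable for a few
 * seconds while zed performs the (un)mount, and zfsctl_mount_signal() wakes
 * the waiter that registered the matching snapshot name.  A minimal userspace
 * analogue of that handshake, assuming POSIX threads; structure and names
 * are illustrative.
 */
#include <pthread.h>
#include <string.h>
#include <time.h>

struct mount_waiter {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	char name[256];
	int done;
};

/* Requester side: wait up to 'secs' seconds for the helper to respond */
static int
wait_for_mount(struct mount_waiter *w, int secs)
{
	struct timespec deadline;
	int rc = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += secs;

	pthread_mutex_lock(&w->lock);
	while (!w->done && rc == 0)
		rc = pthread_cond_timedwait(&w->cv, &w->lock, &deadline);
	pthread_mutex_unlock(&w->lock);
	return (rc);	/* 0 on signal, ETIMEDOUT if the helper never came */
}

/* Helper side: signal the waiter whose registered name matches */
static void
signal_mount(struct mount_waiter *w, const char *osname)
{
	if (strcmp(w->name, osname) != 0)
		return;
	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->cv);
	pthread_mutex_unlock(&w->lock);
}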
+zfsctl_vnop_rmdir(struct vnop_rmdir_args *ap) +#if 0 + struct vnop_rmdir_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + znode_t *dzp = VTOZ(ap->a_dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + char *snapname, *real; + char *name = ap->a_cnp->cn_nameptr; + int error; + + printf("%s: '%s'\n", __func__, name); + + if (zfs_admin_snapshot == 0) + return (SET_ERROR(EACCES)); + + ZFS_ENTER(zfsvfs); + + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + error = dmu_snapshot_realname(zfsvfs->z_os, name, + real, ZFS_MAX_DATASET_NAME_LEN, NULL); + if (error == 0) { + name = real; + } else if (error != ENOTSUP) { + goto out; + } + } + + error = zfsctl_snapshot_name(zfsvfs, name, + ZFS_MAX_DATASET_NAME_LEN, snapname); + if (error == 0) + error = zfs_secpolicy_destroy_perms(snapname, cr); + if (error != 0) + goto out; + + error = zfsctl_snapshot_unmount_node(ap->a_vp, snapname, MNT_FORCE); + if ((error == 0) || (error == ENOENT)) { + error = dsl_destroy_snapshot(snapname, B_FALSE); + + /* Destroy the vnode */ + if (ap->a_vp != NULL) { + dprintf("%s: releasing vp\n", __func__); + vnode_recycle(ap->a_vp); + } + } + +out: + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static void +zfsctl_unmount_thread(void *notused) +{ + callb_cpr_t cpr; + zfsctl_unmount_delay_t *zcu; + time_t now; + CALLB_CPR_INIT(&cpr, &zfsctl_unmount_lock, callb_generic_cpr, FTAG); + + dprintf("%s is alive\n", __func__); + + mutex_enter(&zfsctl_unmount_lock); + while (!zfsctl_unmount_thread_exit) { + + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&zfsctl_unmount_cv, + &zfsctl_unmount_lock, ddi_get_lbolt() + (hz<<6)); + CALLB_CPR_SAFE_END(&cpr, &zfsctl_unmount_lock); + + if (!zfsctl_unmount_thread_exit) { + /* + * Loop all active mounts, if any are older + * than ZFSCTL_EXPIRE_SNAPSHOT, then we update + * their timestamp and attempt unmount. + */ + now = gethrestime_sec(); + mutex_enter(&zfsctl_unmount_list_lock); + for (zcu = list_head(&zfsctl_unmount_list); + zcu != NULL; + zcu = list_next(&zfsctl_unmount_list, zcu)) { + if ((now > zcu->se_time) && + ((now - zcu->se_time) > + zfs_expire_snapshot)) { + zcu->se_time = now; + zfsctl_snapshot_unmount(zcu->se_name, + 0); + } + } + mutex_exit(&zfsctl_unmount_list_lock); + } + } + + zfsctl_unmount_thread_exit = FALSE; + cv_broadcast(&zfsctl_unmount_cv); + CALLB_CPR_EXIT(&cpr); + dprintf("ZFS: zfsctl_unmount thread exit\n"); + thread_exit(); +} + +/* + * Initialize the various pieces we'll need to create and manipulate .zfs + * directories. Currently this is unused but available. 
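+ * zfsctl_init() also starts zfsctl_unmount_thread(), which periodically
+ * scans the snapshot mounts recorded by zfsctl_mount_signal() and requests
+ * an unmount once an entry is older than zfs_expire_snapshot seconds.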
+ */ +void +zfsctl_init(void) +{ + mutex_init(&zfsctl_mounts_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsctl_mounts_list, sizeof (zfsctl_mounts_waiting_t), + offsetof(zfsctl_mounts_waiting_t, zcm_node)); + + mutex_init(&zfsctl_unmount_list_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsctl_unmount_list, sizeof (zfsctl_unmount_delay_t), + offsetof(zfsctl_unmount_delay_t, se_nodelink)); + + mutex_init(&zfsctl_unmount_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfsctl_unmount_cv, NULL, CV_DEFAULT, NULL); + zfsctl_unmount_thread_exit = FALSE; + + (void) thread_create(NULL, 0, zfsctl_unmount_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +/* + * Cleanup the various pieces we needed for .zfs directories. In particular + * ensure the expiry timer is canceled safely. + */ +void +zfsctl_fini(void) +{ + mutex_destroy(&zfsctl_mounts_lock); + list_destroy(&zfsctl_mounts_list); + + mutex_destroy(&zfsctl_unmount_list_lock); + list_destroy(&zfsctl_unmount_list); + + mutex_enter(&zfsctl_unmount_lock); + zfsctl_unmount_thread_exit = TRUE; + while (zfsctl_unmount_thread_exit) { + cv_signal(&zfsctl_unmount_cv); + cv_wait(&zfsctl_unmount_cv, &zfsctl_unmount_lock); + } + mutex_exit(&zfsctl_unmount_lock); + + mutex_destroy(&zfsctl_unmount_lock); + cv_destroy(&zfsctl_unmount_cv); +} + +module_param(zfs_admin_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); + +module_param(zfs_expire_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/module/os/macos/zfs/zfs_debug.c b/module/os/macos/zfs/zfs_debug.c new file mode 100644 index 0000000000..68cd17eeef --- /dev/null +++ b/module/os/macos/zfs/zfs_debug.c @@ -0,0 +1,186 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include + +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ + +int zfs_dbgmsg_enable = 1; + +/* + * Debug logging is enabled by default for production kernel builds. + * The overhead for this is negligible and the logs can be valuable when + * debugging. For non-production user space builds all debugging except + * logging is enabled since performance is no longer a concern. 
+ */ +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT0(zfs_dbgmsg_size); +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' + */ + +/* + * MacOS X's dtrace doesn't handle the PROBEs, so + * we have a utility function that we can watch with + * sudo dtrace -qn '__zfs_dbgmsg:entry{printf("%s\n", stringof(arg0));}' + */ +noinline void +__zfs_dbgmsg(char *buf) +{ + int size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_timestamp = gethrestime_sec(); + strlcpy(zdm->zdm_msg, buf, size); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; + while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { + zdm = list_remove_head(&zfs_dbgmsgs); + size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + int size, i; + va_list adx; + char *buf, *nl; + char *prefix = (dprint) ? "dprintf: " : ""; + const char *newfile; + + /* + * Get rid of annoying prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + size += snprintf(NULL, 0, "%s%s:%d:%s(): ", prefix, newfile, line, + func); + + size++; /* null byte in the "buf" string */ + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. + */ + buf = kmem_alloc(size, KM_SLEEP); + int roger = 0; + + va_start(adx, fmt); + i = snprintf(buf, size + 1, "%s%s:%d:%s(): ", + prefix, newfile, line, func); + roger = vsnprintf(buf + i, size -i + 1, fmt, adx); + va_end(adx); + + /* + * Get rid of trailing newline for dprintf logs. 
+ */ + if (dprint && buf[0] != '\0') { + nl = &buf[strlen(buf) - 1]; + if (*nl == '\n') + *nl = '\0'; + } + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + + __zfs_dbgmsg(buf); + + kmem_free(buf, size); +} + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + (void) printf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + (void) printf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} diff --git a/module/os/macos/zfs/zfs_dir.c b/module/os/macos/zfs/zfs_dir.c new file mode 100644 index 0000000000..0de0590572 --- /dev/null +++ b/module/os/macos/zfs/zfs_dir.c @@ -0,0 +1,1213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, + boolean_t update, int *deflags, struct componentname *rpnp, uint64_t *zoid) +{ + boolean_t conflict = B_FALSE; + int error; + + if (zfsvfs->z_norm) { + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->cn_nameptr; + bufsz = rpnp->cn_namelen; + } + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (error == EOVERFLOW) + error = 0; + + if (zfsvfs->z_norm && !error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Lock a directory entry. A dirlock on protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. 
+ * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. + * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag, int *direntflags, struct componentname *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t *dl; + boolean_t update; + matchtype_t mt = 0; + uint64_t zoid; + int error = 0; + int cmpflags; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if ((name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || + (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. 
We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + (zfsvfs->z_case == ZFS_CASE_MIXED && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* + * Wait until there are no locks on this name. + * + * Don't grab the lock if it is already held. However, cannot + * have both ZSHARED and ZHAVELOCK together. + */ + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked && !(flag & ZXATTR)) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) + break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namelock = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + /* + * If the z_name_lock was NOT held for this dirlock record it. + */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. It'll either see the old value, + * which belongs to it, or the new shared copy; either is OK. + */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? 
SET_ERROR(ENOENT) : 0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, + update, direntflags, realpnp, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EEXIST)); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, + int *deflg, struct componentname *rpnp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + struct vnode *vp; + int error = 0; + uint64_t parent; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + zhold(*zpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + + /* + * If we are a snapshot mounted under .zfs, return + * the inode pointer for the snapshot directory. + */ + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", &vp, 0, kcred, NULL, NULL); + if (error == 0) + *zpp = VTOZ(vp); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + vp = zfsctl_root(dzp); + if (vp != NULL) + *zpp = VTOZ(vp); + else + error = ENOENT; + } else { + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + if (error == 0) { + *zpp = zp; + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + rpnp = NULL; + } + + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->cn_nameptr, name, rpnp->cn_namelen); + + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. 
has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + ASSERT(zp->z_unlinked); + ASSERT(ZTOI(zp)->i_nlink == 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +static void +zfs_unlinked_drain_task(void *arg) +{ + zfsvfs_t *zfsvfs = arg; + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); + + /* + * Iterate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0 && + zfsvfs->z_drain_state == ZFS_DRAIN_RUNNING; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + + /* + * zrele() decrements the znode's ref count and may cause + * it to be synchronously freed. We interrupt freeing + * of this znode by checking the return value of + * dmu_objset_zfs_unmounting() in dmu_free_long_range() + * when an unmount is requested. + */ + zrele(zp); + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + } + zap_cursor_fini(&zc); + + mutex_enter(&zfsvfs->z_drain_lock); + zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN; + cv_broadcast(&zfsvfs->z_drain_cv); + mutex_exit(&zfsvfs->z_drain_lock); +} + +/* + * Sets z_draining then tries to dispatch async unlinked drain. + * If that fails executes synchronous unlinked drain. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + mutex_enter(&zfsvfs->z_drain_lock); + ASSERT(zfsvfs->z_drain_state == ZFS_DRAIN_SHUTDOWN); + zfsvfs->z_drain_state = ZFS_DRAIN_RUNNING; + mutex_exit(&zfsvfs->z_drain_lock); + + if (taskq_dispatch( + dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), + zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP) == 0) { + zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); + zfs_unlinked_drain_task(zfsvfs); + } +} + +/* + * Wait for the unlinked drain taskq task to stop. This will interrupt the + * unlinked set processing if it is in progress. 
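+ * The stop is requested by setting z_drain_state to ZFS_DRAIN_SHUTDOWN_REQ
+ * and waiting on z_drain_cv until the drain task moves the state to
+ * ZFS_DRAIN_SHUTDOWN.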
+ */ +void +zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + mutex_enter(&zfsvfs->z_drain_lock); + while (zfsvfs->z_drain_state != ZFS_DRAIN_SHUTDOWN) { + zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN_REQ; + cv_wait(&zfsvfs->z_drain_cv, &zfsvfs->z_drain_lock); + } + mutex_exit(&zfsvfs->z_drain_lock); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || + S_ISLNK(ZTOI(xzp)->i_mode)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_zrele_async(xzp); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + zfs_zrele_async(xzp); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + int error; + + /* + * If this is an attribute directory, purge its contents. + */ + if (S_ISDIR(zp->z_mode) && (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + + return; + } + } + + /* + * Free up all the data in the file. We don't do this for directories + * because we need truncate and remove to be in the same tx, like in + * zfs_znode_delete(). Otherwise, if we crash here we'll end up with + * an inconsistent truncated zap object in the delete queue. Note a + * truncated file is harmless since it only contains user data. + */ + if (S_ISREG(zp->z_mode)) { + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. 
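+	 * The xattr directory znode is looked up here so it can be marked
+	 * unlinked and added to the unlinked set in the same transaction
+	 * below.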
+ */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT(error == 0); + } + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + goto out; + } + + if (xzp) { + ASSERT(error == 0); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + /* + * Remove this znode from the unlinked set. If a has rollback has + * occurred while a file is open and unlinked. Then when the file + * is closed post rollback it will not exist in the rolled back + * version of the unlinked object. + */ + error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + zp->z_id, tx); + VERIFY(error == 0 || error == ENOENT); + + uint64_t count; + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) + zfs_zrele_async(xzp); +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can fail in the following cases : + * - if zp has been unlinked. + * - if the number of entries with the same hash (aka. colliding entries) + * exceed the capacity of a leaf-block of fatzap and splitting of the + * leaf-block does not help. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t value; + int zp_is_dir = S_ISDIR(zp->z_mode); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOENT)); + } + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + } + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, + &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will rollback the SA updates done above. 
+ */ + if (error != 0) { + mutex_exit(&zp->z_lock); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + mutex_exit(&zp->z_lock); + + mutex_enter(&dzp->z_lock); + dzp->z_size++; + if (zp_is_dir) + dzp->z_links++; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (ZTOZSB(zp)->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && + !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, + dl->dl_name, mt, tx); + } else { + error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, + tx); + } + + return (error); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. Can + * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. 
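+ * When 'unlinkedp' is supplied, the caller is expected to call
+ * zfs_unlinked_add() itself if it comes back B_TRUE.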
+ */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + int zp_is_dir = S_ISDIR(zp->z_mode); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (!(flag & ZRENAMING)) { + mutex_enter(&zp->z_lock); + + if (zp_is_dir && !zfs_dirempty(zp)) { + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOTEMPTY)); + } + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) { + mutex_exit(&zp->z_lock); + return (error); + } + + if (zp->z_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %lu is %u, " + "should be at least %u", zp->z_id, + (int)zp->z_links, zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; + } + if (--zp->z_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_links = 0; + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT(error == 0); + mutex_exit(&zp->z_lock); + } else { + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) + return (error); + } + + mutex_enter(&dzp->z_lock); + dzp->z_size--; /* one dirent removed */ + if (zp_is_dir) + dzp->z_links--; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. + * + * The internal ZAP size, rather than zp->z_size, needs to be checked since + * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. 
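+ * An outstanding dirlock (dzp->z_dirlocks != NULL) also counts as work
+ * in progress, so the check below returns B_FALSE in that case.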
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + uint64_t count; + int error; + + if (dzp->z_dirlocks != NULL) + return (B_FALSE); + + error = zap_count(zfsvfs->z_os, dzp->z_id, &count); + if (error != 0 || count != 0) + return (B_FALSE); + + return (B_TRUE); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; +#ifdef DEBUG + uint64_t parent; +#endif + + *xzpp = NULL; + + if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) + return (error); + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + if (!zp->z_unlinked) + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + +#ifdef __APPLE__ + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(xzp, zfsvfs); +#endif + + *xzpp = xzp; + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xipp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + if (error) + return (error); + + if (xzp != NULL) { + *xzpp = xzp; + zfs_dirent_unlock(dl); + return (0); + } + + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(ENOENT)); + } + + if (zfs_is_readonly(zfsvfs)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. 
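+	 * The attribute directory itself is created below as a sticky
+	 * directory with mode 0777, owned by the base file's owner and group.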
+ */ + va.va_mask = ATTR_TYPE | ATTR_MODE | ATTR_UID | ATTR_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xzpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * you have write access to the entry, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + + if (zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (vnode_isreg(ZTOV(zp)) && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(ZTOV(zp), cr)); +} diff --git a/module/os/macos/zfs/zfs_file_os.c b/module/os/macos/zfs/zfs_file_os.c new file mode 100644 index 0000000000..46ddcf033e --- /dev/null +++ b/module/os/macos/zfs/zfs_file_os.c @@ -0,0 +1,405 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include + +#define FILE_FD_NOTUSED -1 + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. + */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + struct vnode *vp = NULL; + vfs_context_t vctx; + int error; + + if (!(flags & O_CREAT) && (flags & O_WRONLY)) + flags |= O_EXCL; + + vctx = vfs_context_create((vfs_context_t)0); + error = vnode_open(path, flags, mode, 0, &vp, vctx); + (void) vfs_context_rele(vctx); + if (error == 0 && + vp != NULL) { + zfs_file_t *zf; + zf = (zfs_file_t *)kmem_zalloc(sizeof (zfs_file_t), KM_SLEEP); + zf->f_vnode = vp; + zf->f_fd = FILE_FD_NOTUSED; + *fpp = zf; + } + + /* Optional, implemented O_APPEND: set offset to file size. */ + VERIFY0(flags & O_APPEND); + + return (error); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + vfs_context_t vctx; + vctx = vfs_context_create((vfs_context_t)0); + vnode_close(fp->f_vnode, fp->f_writes ? 
FWRITE : 0, vctx); + (void) vfs_context_rele(vctx); + + kmem_free(fp, sizeof (zfs_file_t)); +} + +static int +zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, + loff_t *off, ssize_t *resid) +{ + int error; + ssize_t local_resid = count; + + /* If we came with a 'fd' use it, as it can handle pipes. */ + if (fp->f_fd == FILE_FD_NOTUSED) + error = zfs_vn_rdwr(UIO_WRITE, fp->f_vnode, (caddr_t)buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + else + error = spl_vn_rdwr(UIO_WRITE, fp, (caddr_t)buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + + if (error != 0) + return (SET_ERROR(error)); + + fp->f_writes = 1; + + if (resid != NULL) + *resid = local_resid; + else if (local_resid != 0) + return (SET_ERROR(EIO)); + + *off += count - local_resid; + + return (0); +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_offset; + ssize_t rc; + + rc = zfs_file_write_impl(fp, buf, count, &off, resid); + if (rc == 0) + fp->f_offset = off; + + return (SET_ERROR(rc)); +} + +/* + * Stateless write - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + return (zfs_file_write_impl(fp, buf, count, &off, resid)); +} + +static ssize_t +zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off, + ssize_t *resid) +{ + int error; + ssize_t local_resid = count; + + /* If we have realvp, it's faster to call its spl_vn_rdwr */ + if (fp->f_fd == FILE_FD_NOTUSED) + error = zfs_vn_rdwr(UIO_READ, fp->f_vnode, buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + else + error = spl_vn_rdwr(UIO_READ, fp, buf, count, + *off, UIO_SYSSPACE, 0, RLIM64_INFINITY, + kcred, &local_resid); + + if (error) + return (SET_ERROR(error)); + + *off += count - local_resid; + if (resid != NULL) + *resid = local_resid; + + return (SET_ERROR(0)); +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + loff_t off = fp->f_offset; + int rc; + + rc = zfs_file_read_impl(fp, buf, count, &off, resid); + if (rc == 0) + fp->f_offset = off; + return (rc); +} + +/* + * Stateless read - os internal file pointer is not updated. 
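+ * The read position comes solely from the 'off' argument; fp->f_offset
+ * is left untouched, unlike zfs_file_read() above.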
+ * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to write + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unwritten bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + return (zfs_file_read_impl(fp, buf, count, &off, resid)); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success errno on failure (ESPIPE for non seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + if (*offp < 0 || *offp > MAXOFFSET_T) + return (EINVAL); + + switch (whence) { + case SEEK_SET: + fp->f_offset = *offp; + break; + case SEEK_CUR: + fp->f_offset += *offp; + *offp = fp->f_offset; + break; + case SEEK_END: + /* Implement this if eventually needed: get filesize */ + VERIFY0(whence == SEEK_END); + break; + } + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode. + * + * Returns 0 on success or error code of underlying getattr call on failure. + */ +int +zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr) +{ + vfs_context_t vctx; + int rc; + vattr_t vap; + + VATTR_INIT(&vap); + VATTR_WANTED(&vap, va_size); + VATTR_WANTED(&vap, va_mode); + + vctx = vfs_context_create((vfs_context_t)0); + rc = vnode_getattr(filp->f_vnode, &vap, vctx); + (void) vfs_context_rele(vctx); + + if (rc) + return (rc); + + zfattr->zfa_size = vap.va_size; + zfattr->zfa_mode = vap.va_mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. + */ +int +zfs_file_fsync(zfs_file_t *filp, int flags) +{ + vfs_context_t vctx; + int rc; + + vctx = vfs_context_create((vfs_context_t)0); + rc = VNOP_FSYNC(filp->f_vnode, (flags == FSYNC), vctx); + (void) vfs_context_rele(vctx); + return (rc); +} + +/* + * fallocate - allocate or free space on disk + * + * fp - file pointer + * mode (non-standard options for hole punching etc) + * offset - offset to start allocating or freeing from + * len - length to free / allocate + * + * OPTIONAL + */ +int +zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) +{ + return (0); +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_offset); +} + +/* + * Request file pointer private data + * + * fp - pointer to file + * + * Returns pointer to file private data. + */ +extern kmutex_t zfsdev_state_lock; +dev_t zfsdev_get_dev(void); + +void * +zfs_file_private(zfs_file_t *fp) +{ + dev_t dev; + void *zs; + + dev = zfsdev_get_dev(); + printf("%s: fetching dev x%x\n", __func__, dev); + if (dev == 0) + return (NULL); + + mutex_enter(&zfsdev_state_lock); + zs = zfsdev_get_state(minor(dev), ZST_ALL); + mutex_exit(&zfsdev_state_lock); + printf("%s: searching minor %d %p\n", __func__, minor(dev), zs); + + return (zs); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success. 
+ * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (EOPNOTSUPP); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * fpp - pointer to file pointer + * + * Returns 0 on success EBADF on failure. + */ +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + *fpp = getf(fd); + if (*fpp == NULL) + return (EBADF); + return (0); +} + +/* + * Drop reference to file pointer + * + * fd - input file descriptor + */ +void +zfs_file_put(int fd) +{ + releasef(fd); +} diff --git a/module/os/macos/zfs/zfs_fuid_os.c b/module/os/macos/zfs/zfs_fuid_os.c new file mode 100644 index 0000000000..ebd09abd65 --- /dev/null +++ b/module/os/macos/zfs/zfs_fuid_os.c @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#endif +#include + +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); + + return ((uint64_t)id); +} diff --git a/module/os/macos/zfs/zfs_ioctl_os.c b/module/os/macos/zfs/zfs_ioctl_os.c new file mode 100644 index 0000000000..57ab905bd2 --- /dev/null +++ b/module/os/macos/zfs/zfs_ioctl_os.c @@ -0,0 +1,392 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013, 2020 Jorgen Lundman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +int zfs_major = 0; +int zfs_bmajor = 0; +static void *zfs_devnode = NULL; +#define ZFS_MAJOR -24 + +boolean_t +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_vfs != NULL); +} + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + int error = 0; + + if (*zfvp == NULL || (*zfvp)->z_vfs == NULL) + return (SET_ERROR(ESRCH)); + + error = vfs_busy((*zfvp)->z_vfs, LK_NOWAIT); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + return (error); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + vfs_unbusy(zfsvfs->z_vfs); +} + +static uint_t zfsdev_private_tsd; + +static int +zfsdev_state_init(dev_t dev) +{ + zfsdev_state_t *zs, *zsprev = NULL; + minor_t minor; + boolean_t newzs = B_FALSE; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + minor = minor(dev); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == -1) + break; + zsprev = zs; + } + + if (!zs) { + zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); + newzs = B_TRUE; + } + + /* Store this dev_t in tsd, so zfs_get_private() can retrieve it */ + tsd_set(zfsdev_private_tsd, (void *)(uintptr_t)dev); + + zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); + zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); + + /* + * In order to provide for lock-free concurrent read access + * to the minor list in zfsdev_get_state_impl(), new entries + * must be completely written before linking them into the + * list whereas existing entries are already linked; the last + * operation must be updating zs_minor (from -1 to the new + * value). 
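+	 * For a freshly allocated entry this means filling in every field
+	 * before linking it in via zsprev->zs_next; a recycled entry only
+	 * needs zs_minor stored last.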
+ */ + if (newzs) { + zs->zs_minor = minor; + zsprev->zs_next = zs; + } else { + zs->zs_minor = minor; + } + + return (0); +} + +dev_t +zfsdev_get_dev(void) +{ + return ((dev_t)tsd_get(zfsdev_private_tsd)); +} + +static int +zfsdev_state_destroy(dev_t dev) +{ + zfsdev_state_t *zs; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + tsd_set(zfsdev_private_tsd, NULL); + + zs = zfsdev_get_state(minor(dev), ZST_ALL); + + if (!zs) { + printf("%s: no cleanup for minor x%x\n", __func__, + minor(dev)); + return (0); + } + + ASSERT(zs != NULL); + if (zs->zs_minor != -1) { + zs->zs_minor = -1; + zfs_onexit_destroy(zs->zs_onexit); + zfs_zevent_destroy(zs->zs_zevent); + } + return (0); +} + +static int +zfsdev_open(dev_t dev, int flags, int devtype, struct proc *p) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + if (zfsdev_get_state(minor(dev), ZST_ALL)) { + mutex_exit(&zfsdev_state_lock); + return (0); + } + error = zfsdev_state_init(dev); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static int +zfsdev_release(dev_t dev, int flags, int devtype, struct proc *p) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfsdev_state_destroy(dev); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static int +zfsdev_ioctl(dev_t dev, ulong_t cmd, caddr_t arg, __unused int xflag, + struct proc *p) +{ + uint_t len, vecnum; + zfs_iocparm_t *zit; + zfs_cmd_t *zc; + int error, rc; + user_addr_t uaddr; + + /* Translate XNU ioctl to enum table: */ + len = IOCPARM_LEN(cmd); + vecnum = cmd - _IOWR('Z', ZFS_IOC_FIRST, zfs_iocparm_t); + zit = (void *)arg; + uaddr = (user_addr_t)zit->zfs_cmd; + + if (len != sizeof (zfs_iocparm_t)) { + /* + * printf("len %d vecnum: %d sizeof (zfs_cmd_t) %lu\n", + * len, vecnum, sizeof (zfs_cmd_t)); + */ + /* + * We can get plenty raw ioctl()s here, for exaple open() will + * cause spec_open() to issue DKIOCGETTHROTTLEMASK. + */ + return (EINVAL); + } + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + + error = zfsdev_ioctl_common(vecnum, zc, 0); + + rc = copyout(zc, uaddr, sizeof (*zc)); + + if (error == 0 && rc != 0) + error = -SET_ERROR(EFAULT); + + /* + * OSX must return(0) or XNU doesn't copyout(). Save the real + * rc to userland + */ + zit->zfs_ioc_error = error; + error = 0; + +out: + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); + +} + +/* for spa_iokit_dataset_proxy_create */ +#include +#include + +static int +zfs_ioc_osx_proxy_dataset(zfs_cmd_t *zc) +{ + int error; + const char *osname; + + /* XXX Get osname */ + osname = zc->zc_name; + + /* Create new virtual disk, and return /dev/disk name */ + error = zfs_osx_proxy_create(osname); + + if (!error) + error = zfs_osx_proxy_get_bsdname(osname, + zc->zc_value, sizeof (zc->zc_value)); + if (error) + printf("%s: Created virtual disk '%s' for '%s'\n", __func__, + zc->zc_value, osname); + + return (error); +} + +void +zfs_ioctl_init_os(void) +{ + /* APPLE Specific ioctls */ + zfs_ioctl_register_pool(ZFS_IOC_PROXY_DATASET, + zfs_ioc_osx_proxy_dataset, zfs_secpolicy_config, + B_FALSE, POOL_CHECK_NONE); +} + +/* ioctl handler for block device. 
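zfsdev_ioctl() above expects XNU to hand it a zfs_iocparm_t whose zfs_cmd member points at the real zfs_cmd_t left in user memory, and it always returns 0 so that XNU copies the parameter block back out; the real status travels in zfs_ioc_error. A hedged userland sketch of driving that protocol (field names follow the diff; the header path and exact zfs_iocparm_t layout may differ):

#include <sys/ioctl.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <sys/zfs_ioctl.h>	/* zfs_cmd_t, zfs_iocparm_t, ZFS_IOC_FIRST; path may differ */

static int
example_zfs_ioctl(int zfs_fd, unsigned vecnum, zfs_cmd_t *zc)
{
	zfs_iocparm_t parm;
	/* must be the inverse of the vecnum translation in zfsdev_ioctl() */
	unsigned long req = _IOWR('Z', ZFS_IOC_FIRST + vecnum, zfs_iocparm_t);

	memset(&parm, 0, sizeof (parm));
	parm.zfs_cmd = (uint64_t)(uintptr_t)zc;	/* kernel copyin()s from here */

	if (ioctl(zfs_fd, req, &parm) != 0)
		return (errno);			/* transport-level failure only */

	return (parm.zfs_ioc_error);		/* real status saved by the handler */
}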
Relay to zvol */ +static int +zfsdev_bioctl(dev_t dev, ulong_t cmd, caddr_t data, + __unused int flag, struct proc *p) +{ + return (zvol_os_ioctl(dev, cmd, data, 1, NULL, NULL)); +} + +static struct bdevsw zfs_bdevsw = { + .d_open = zvol_os_open, + .d_close = zvol_os_close, + .d_strategy = zvol_os_strategy, + .d_ioctl = zfsdev_bioctl, /* block ioctl handler */ + .d_dump = eno_dump, + .d_psize = zvol_os_get_volume_blocksize, + .d_type = D_DISK, +}; + +static struct cdevsw zfs_cdevsw = { + .d_open = zfsdev_open, + .d_close = zfsdev_release, + .d_read = zvol_os_read, + .d_write = zvol_os_write, + .d_ioctl = zfsdev_ioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_reserved_1 = eno_getc, + .d_reserved_2 = eno_putc, + .d_type = D_DISK +}; + +/* Callback to create a unique minor for each open */ +static int +zfs_devfs_clone(__unused dev_t dev, int action) +{ + static minor_t minorx; + + if (action == DEVFS_CLONE_ALLOC) { + mutex_enter(&zfsdev_state_lock); + minorx = zfsdev_minor_alloc(); + mutex_exit(&zfsdev_state_lock); + return (minorx); + } + return (-1); +} + +int +zfsdev_attach(void) +{ + dev_t dev; + + zfs_bmajor = bdevsw_add(-1, &zfs_bdevsw); + zfs_major = cdevsw_add_with_bdev(-1, &zfs_cdevsw, zfs_bmajor); + + if (zfs_major < 0) { + printf("ZFS: zfs_attach() failed to allocate a major number\n"); + return (-1); + } + + dev = makedev(zfs_major, 0); /* Get the device number */ + zfs_devnode = devfs_make_node_clone(dev, DEVFS_CHAR, UID_ROOT, + GID_WHEEL, 0666, zfs_devfs_clone, "zfs", 0); + if (!zfs_devnode) { + printf("ZFS: devfs_make_node() failed\n"); + return (-1); + } + + wrap_avl_init(); + wrap_unicode_init(); + wrap_nvpair_init(); + wrap_zcommon_init(); + wrap_icp_init(); + wrap_lua_init(); + + tsd_create(&zfsdev_private_tsd, NULL); + + kstat_osx_init(); + return (0); +} + +void +zfsdev_detach(void) +{ + kstat_osx_fini(); + + tsd_destroy(&zfsdev_private_tsd); + + wrap_lua_fini(); + wrap_icp_fini(); + wrap_zcommon_fini(); + wrap_nvpair_fini(); + wrap_unicode_fini(); + wrap_avl_fini(); + + if (zfs_devnode) { + devfs_remove(zfs_devnode); + zfs_devnode = NULL; + } + if (zfs_major) { + (void) cdevsw_remove(zfs_major, &zfs_cdevsw); + zfs_major = 0; + } +} diff --git a/module/os/macos/zfs/zfs_kstat_osx.c b/module/os/macos/zfs/zfs_kstat_osx.c new file mode 100644 index 0000000000..ea37f32963 --- /dev/null +++ b/module/os/macos/zfs/zfs_kstat_osx.c @@ -0,0 +1,883 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
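devfs_make_node_clone() above registers /dev/zfs as a cloning node, so zfs_devfs_clone() hands every open() a fresh minor and zfsdev_open() builds a private zfsdev_state entry (onexit and zevent cursors) keyed by that minor. An illustrative userland view of why that matters; nothing here is specific to the patch beyond the /dev/zfs node name:

#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int a = open("/dev/zfs", O_RDWR);
	int b = open("/dev/zfs", O_RDWR);	/* distinct minor, distinct state */

	/* each descriptor can now hold its own zevent position, onexit list */

	close(b);	/* zfsdev_release() tears down only b's minor */
	close(a);
	return (0);
}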
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014, 2020 Jorgen Lundman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +/* + * In Solaris the tunable are set via /etc/system. Until we have a load + * time configuration, we add them to writable kstat tunables. + * + * This table is more or less populated from IllumOS mdb zfs_params sources + * https://github.com/illumos/illumos-gate/blob/master/ + * usr/src/cmd/mdb/common/modules/zfs/zfs.c#L336-L392 + * + */ + + + +osx_kstat_t osx_kstat = { + { "spa_version", KSTAT_DATA_UINT64 }, + { "zpl_version", KSTAT_DATA_UINT64 }, + + { "active_vnodes", KSTAT_DATA_UINT64 }, + { "vnop_debug", KSTAT_DATA_UINT64 }, + { "reclaim_nodes", KSTAT_DATA_UINT64 }, + { "ignore_negatives", KSTAT_DATA_UINT64 }, + { "ignore_positives", KSTAT_DATA_UINT64 }, + { "create_negatives", KSTAT_DATA_UINT64 }, + { "force_formd_normalized", KSTAT_DATA_UINT64 }, + { "skip_unlinked_drain", KSTAT_DATA_UINT64 }, + { "use_system_sync", KSTAT_DATA_UINT64 }, + + { "zfs_arc_max", KSTAT_DATA_UINT64 }, + { "zfs_arc_min", KSTAT_DATA_UINT64 }, + { "zfs_arc_meta_limit", KSTAT_DATA_UINT64 }, + { "zfs_arc_meta_min", KSTAT_DATA_UINT64 }, + { "zfs_arc_grow_retry", KSTAT_DATA_UINT64 }, + { "zfs_arc_shrink_shift", KSTAT_DATA_UINT64 }, + { "zfs_arc_p_min_shift", KSTAT_DATA_UINT64 }, + { "zfs_arc_average_blocksize", KSTAT_DATA_UINT64 }, + + { "l2arc_write_max", KSTAT_DATA_UINT64 }, + { "l2arc_write_boost", KSTAT_DATA_UINT64 }, + { "l2arc_headroom", KSTAT_DATA_UINT64 }, + { "l2arc_headroom_boost", KSTAT_DATA_UINT64 }, + { "l2arc_feed_secs", KSTAT_DATA_UINT64 }, + { "l2arc_feed_min_ms", KSTAT_DATA_UINT64 }, + + { "max_active", KSTAT_DATA_UINT64 }, + { "sync_read_min_active", KSTAT_DATA_UINT64 }, + { "sync_read_max_active", KSTAT_DATA_UINT64 }, + { "sync_write_min_active", KSTAT_DATA_UINT64 }, + { "sync_write_max_active", KSTAT_DATA_UINT64 }, + { "async_read_min_active", KSTAT_DATA_UINT64 }, + { "async_read_max_active", KSTAT_DATA_UINT64 }, + { "async_write_min_active", KSTAT_DATA_UINT64 }, + { "async_write_max_active", KSTAT_DATA_UINT64 }, + { "scrub_min_active", KSTAT_DATA_UINT64 }, + { "scrub_max_active", KSTAT_DATA_UINT64 }, + { "async_write_min_dirty_pct", KSTAT_DATA_INT64 }, + { "async_write_max_dirty_pct", KSTAT_DATA_INT64 }, + { "aggregation_limit", KSTAT_DATA_INT64 }, + { "read_gap_limit", KSTAT_DATA_INT64 }, + { "write_gap_limit", KSTAT_DATA_INT64 }, + + {"arc_lotsfree_percent", KSTAT_DATA_INT64 }, + {"zfs_dirty_data_max", KSTAT_DATA_INT64 }, + {"zfs_delay_max_ns", KSTAT_DATA_INT64 }, + {"zfs_delay_min_dirty_percent", KSTAT_DATA_INT64 }, + {"zfs_delay_scale", KSTAT_DATA_INT64 }, + {"spa_asize_inflation", KSTAT_DATA_INT64 }, + {"zfs_prefetch_disable", KSTAT_DATA_INT64 }, + {"zfetch_max_streams", KSTAT_DATA_INT64 }, + {"zfetch_min_sec_reap", KSTAT_DATA_INT64 }, + {"zfetch_array_rd_sz", KSTAT_DATA_INT64 }, + {"zfs_default_bs", KSTAT_DATA_INT64 }, + {"zfs_default_ibs", KSTAT_DATA_INT64 }, + {"metaslab_aliquot", KSTAT_DATA_INT64 }, + {"spa_max_replication_override", KSTAT_DATA_INT64 }, + {"spa_mode_global", KSTAT_DATA_INT64 }, + {"zfs_flags", KSTAT_DATA_INT64 }, + {"zfs_txg_timeout", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_max", 
KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_size", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_bshift", KSTAT_DATA_INT64 }, + {"vdev_mirror_shift", KSTAT_DATA_INT64 }, + {"zfs_scrub_limit", KSTAT_DATA_INT64 }, + {"zfs_no_scrub_io", KSTAT_DATA_INT64 }, + {"zfs_no_scrub_prefetch", KSTAT_DATA_INT64 }, + {"fzap_default_block_shift", KSTAT_DATA_INT64 }, + {"zfs_immediate_write_sz", KSTAT_DATA_INT64 }, + {"zfs_read_chunk_size", KSTAT_DATA_INT64 }, + {"zfs_nocacheflush", KSTAT_DATA_INT64 }, + {"zil_replay_disable", KSTAT_DATA_INT64 }, + {"metaslab_df_alloc_threshold", KSTAT_DATA_INT64 }, + {"metaslab_df_free_pct", KSTAT_DATA_INT64 }, + {"zio_injection_enabled", KSTAT_DATA_INT64 }, + {"zvol_immediate_write_sz", KSTAT_DATA_INT64 }, + + { "l2arc_noprefetch", KSTAT_DATA_INT64 }, + { "l2arc_feed_again", KSTAT_DATA_INT64 }, + { "l2arc_norw", KSTAT_DATA_INT64 }, + + {"zfs_recover", KSTAT_DATA_INT64 }, + + {"zfs_free_bpobj_enabled", KSTAT_DATA_INT64 }, + + {"zfs_send_corrupt_data", KSTAT_DATA_UINT64 }, + {"zfs_send_queue_length", KSTAT_DATA_UINT64 }, + {"zfs_recv_queue_length", KSTAT_DATA_UINT64 }, + + {"zvol_inhibit_dev", KSTAT_DATA_UINT64 }, + {"zfs_send_set_freerecords_bit", KSTAT_DATA_UINT64 }, + + {"zfs_write_implies_delete_child", KSTAT_DATA_UINT64 }, + {"zfs_send_holes_without_birth_time", KSTAT_DATA_UINT64 }, + + {"dbuf_cache_max_bytes", KSTAT_DATA_UINT64 }, + + {"zfs_vdev_queue_depth_pct", KSTAT_DATA_UINT64 }, + {"zio_dva_throttle_enabled", KSTAT_DATA_UINT64 }, + + {"zfs_lua_max_instrlimit", KSTAT_DATA_UINT64 }, + {"zfs_lua_max_memlimit", KSTAT_DATA_UINT64 }, + + {"zfs_trim_extent_bytes_max", KSTAT_DATA_UINT64 }, + {"zfs_trim_extent_bytes_min", KSTAT_DATA_UINT64 }, + {"zfs_trim_metaslab_skip", KSTAT_DATA_UINT64 }, + {"zfs_trim_txg_batch", KSTAT_DATA_UINT64 }, + {"zfs_trim_queue_limit", KSTAT_DATA_UINT64 }, + + {"zfs_send_unmodified_spill_blocks", KSTAT_DATA_UINT64 }, + {"zfs_special_class_metadata_reserve_pct", KSTAT_DATA_UINT64 }, + + {"zfs_vdev_raidz_impl", KSTAT_DATA_STRING }, + {"icp_gcm_impl", KSTAT_DATA_STRING }, + {"icp_aes_impl", KSTAT_DATA_STRING }, + {"zfs_fletcher_4_impl", KSTAT_DATA_STRING }, + + {"zfs_expire_snapshot", KSTAT_DATA_UINT64 }, + {"zfs_admin_snapshot", KSTAT_DATA_UINT64 }, + {"zfs_auto_snapshot", KSTAT_DATA_UINT64 }, + + {"zfs_spa_discard_memory_limit", KSTAT_DATA_UINT64 }, + {"zfs_async_block_max_blocks", KSTAT_DATA_UINT64 }, + {"zfs_initialize_chunk_size", KSTAT_DATA_UINT64 }, + {"zfs_scan_suspend_progress", KSTAT_DATA_UINT64 }, + {"zfs_removal_suspend_progress", KSTAT_DATA_UINT64 }, + {"zfs_livelist_max_entries", KSTAT_DATA_UINT64 }, + + {"zfs_allow_redacted_dataset_mount", KSTAT_DATA_UINT64 }, + {"zfs_checksum_events_per_second", KSTAT_DATA_UINT64 }, + {"zfs_commit_timeout_pct", KSTAT_DATA_UINT64 }, + {"zfs_compressed_arc_enabled", KSTAT_DATA_UINT64 }, + {"zfs_condense_indirect_commit_entry_delay_ms", KSTAT_DATA_UINT64 }, + {"zfs_condense_min_mapping_bytes", KSTAT_DATA_UINT64 }, + {"zfs_deadman_checktime_ms", KSTAT_DATA_UINT64 }, + {"zfs_deadman_failmode", KSTAT_DATA_STRING }, + {"zfs_deadman_synctime_ms", KSTAT_DATA_UINT64 }, + {"zfs_deadman_ziotime_ms", KSTAT_DATA_UINT64 }, + {"zfs_disable_ivset_guid_check", KSTAT_DATA_UINT64 }, + {"zfs_initialize_value", KSTAT_DATA_UINT64 }, + {"zfs_keep_log_spacemaps_at_export", KSTAT_DATA_UINT64 }, + {"l2arc_rebuild_blocks_min_l2size", KSTAT_DATA_UINT64 }, + {"l2arc_rebuild_enabled", KSTAT_DATA_UINT64 }, + {"l2arc_trim_ahead", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_new_alloc", KSTAT_DATA_UINT64 }, + 
{"zfs_livelist_condense_sync_cancel", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_sync_pause", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_zthr_cancel", KSTAT_DATA_UINT64 }, + {"zfs_livelist_condense_zthr_pause", KSTAT_DATA_UINT64 }, + {"zfs_livelist_min_percent_shared", KSTAT_DATA_UINT64 }, + {"zfs_max_dataset_nesting", KSTAT_DATA_UINT64 }, + {"zfs_max_missing_tvds", KSTAT_DATA_UINT64 }, + {"metaslab_debug_load", KSTAT_DATA_UINT64 }, + {"metaslab_force_ganging", KSTAT_DATA_UINT64 }, + {"zfs_multihost_fail_intervals", KSTAT_DATA_UINT64 }, + {"zfs_multihost_import_intervals", KSTAT_DATA_UINT64 }, + {"zfs_multihost_interval", KSTAT_DATA_UINT64 }, + {"zfs_override_estimate_recordsize", KSTAT_DATA_UINT64 }, + {"zfs_remove_max_segment", KSTAT_DATA_UINT64 }, + {"zfs_resilver_min_time_ms", KSTAT_DATA_UINT64 }, + {"zfs_scan_legacy", KSTAT_DATA_UINT64 }, + {"zfs_scan_vdev_limit", KSTAT_DATA_UINT64 }, + {"zfs_slow_io_events_per_second", KSTAT_DATA_UINT64 }, + {"spa_load_verify_data", KSTAT_DATA_UINT64 }, + {"spa_load_verify_metadata", KSTAT_DATA_UINT64 }, + {"zfs_unlink_suspend_progress", KSTAT_DATA_UINT64 }, + {"zfs_vdev_min_ms_count", KSTAT_DATA_UINT64 }, + {"vdev_validate_skip", KSTAT_DATA_UINT64 }, + {"zfs_zevent_len_max", KSTAT_DATA_UINT64 }, + {"zio_slow_io_ms", KSTAT_DATA_UINT64 }, + +}; + + +extern void kstat_named_setstr(kstat_named_t *knp, const char *src); +extern int zfs_vdev_raidz_impl_set(const char *val); +extern int zfs_vdev_raidz_impl_get(char *buffer, int max); +extern int icp_gcm_impl_set(const char *val); +extern int icp_gcm_impl_get(char *buffer, int max); +extern int icp_aes_impl_set(const char *val); +extern int icp_aes_impl_get(char *buffer, int max); +extern int zfs_fletcher_4_impl_set(const char *val); +extern int zfs_fletcher_4_impl_get(char *buffer, int max); + +static char vdev_raidz_string[80] = { 0 }; +static char icp_gcm_string[80] = { 0 }; +static char icp_aes_string[80] = { 0 }; +static char zfs_fletcher_4_string[80] = { 0 }; + +static kstat_t *osx_kstat_ksp; + +#if !defined(__OPTIMIZE__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +extern kstat_t *arc_ksp; + +static int osx_kstat_update(kstat_t *ksp, int rw) +{ + osx_kstat_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + /* Darwin */ + + debug_vnop_osx_printf = ks->darwin_debug.value.ui64; + if (ks->darwin_debug.value.ui64 == 9119) + panic("ZFS: User requested panic\n"); + zfs_vnop_ignore_negatives = + ks->darwin_ignore_negatives.value.ui64; + zfs_vnop_ignore_positives = + ks->darwin_ignore_positives.value.ui64; + zfs_vnop_create_negatives = + ks->darwin_create_negatives.value.ui64; + zfs_vnop_force_formd_normalized_output = + ks->darwin_force_formd_normalized.value.ui64; + zfs_vnop_skip_unlinked_drain = + ks->darwin_skip_unlinked_drain.value.ui64; + zfs_vfs_sync_paranoia = + ks->darwin_use_system_sync.value.ui64; + + /* ARC */ + /* Upstream has this static, but we can find another way ... 
*/ + /* arc_kstat_update(ksp, rw); */ + if (arc_ksp != NULL && arc_ksp->ks_update != NULL) + arc_ksp->ks_update(ksp, rw); + arc_kstat_update_osx(ksp, rw); + + /* L2ARC */ + l2arc_write_max = ks->l2arc_write_max.value.ui64; + l2arc_write_boost = ks->l2arc_write_boost.value.ui64; + l2arc_headroom = ks->l2arc_headroom.value.ui64; + l2arc_headroom_boost = ks->l2arc_headroom_boost.value.ui64; + l2arc_feed_secs = ks->l2arc_feed_secs.value.ui64; + l2arc_feed_min_ms = ks->l2arc_feed_min_ms.value.ui64; + + l2arc_noprefetch = ks->l2arc_noprefetch.value.i64; + l2arc_feed_again = ks->l2arc_feed_again.value.i64; + l2arc_norw = ks->l2arc_norw.value.i64; + + /* vdev_queue */ + + zfs_vdev_max_active = + ks->zfs_vdev_max_active.value.ui64; + zfs_vdev_sync_read_min_active = + ks->zfs_vdev_sync_read_min_active.value.ui64; + zfs_vdev_sync_read_max_active = + ks->zfs_vdev_sync_read_max_active.value.ui64; + zfs_vdev_sync_write_min_active = + ks->zfs_vdev_sync_write_min_active.value.ui64; + zfs_vdev_sync_write_max_active = + ks->zfs_vdev_sync_write_max_active.value.ui64; + zfs_vdev_async_read_min_active = + ks->zfs_vdev_async_read_min_active.value.ui64; + zfs_vdev_async_read_max_active = + ks->zfs_vdev_async_read_max_active.value.ui64; + zfs_vdev_async_write_min_active = + ks->zfs_vdev_async_write_min_active.value.ui64; + zfs_vdev_async_write_max_active = + ks->zfs_vdev_async_write_max_active.value.ui64; + zfs_vdev_scrub_min_active = + ks->zfs_vdev_scrub_min_active.value.ui64; + zfs_vdev_scrub_max_active = + ks->zfs_vdev_scrub_max_active.value.ui64; + zfs_vdev_async_write_active_min_dirty_percent = + ks->zfs_vdev_async_write_active_min_dirty_percent.value.i64; + zfs_vdev_async_write_active_max_dirty_percent = + ks->zfs_vdev_async_write_active_max_dirty_percent.value.i64; + zfs_vdev_aggregation_limit = + ks->zfs_vdev_aggregation_limit.value.i64; + zfs_vdev_read_gap_limit = + ks->zfs_vdev_read_gap_limit.value.i64; + zfs_vdev_write_gap_limit = + ks->zfs_vdev_write_gap_limit.value.i64; + + arc_lotsfree_percent = + ks->arc_lotsfree_percent.value.i64; + zfs_dirty_data_max = + ks->zfs_dirty_data_max.value.i64; + zfs_delay_max_ns = + ks->zfs_delay_max_ns.value.i64; + zfs_delay_min_dirty_percent = + ks->zfs_delay_min_dirty_percent.value.i64; + zfs_delay_scale = + ks->zfs_delay_scale.value.i64; + spa_asize_inflation = + ks->spa_asize_inflation.value.i64; + zfs_prefetch_disable = + ks->zfs_prefetch_disable.value.i64; + zfetch_max_streams = + ks->zfetch_max_streams.value.i64; + zfetch_min_sec_reap = + ks->zfetch_min_sec_reap.value.i64; + zfetch_array_rd_sz = + ks->zfetch_array_rd_sz.value.i64; + zfs_default_bs = + ks->zfs_default_bs.value.i64; + zfs_default_ibs = + ks->zfs_default_ibs.value.i64; + metaslab_aliquot = + ks->metaslab_aliquot.value.i64; + spa_max_replication_override = + ks->spa_max_replication_override.value.i64; + spa_mode_global = + ks->spa_mode_global.value.i64; + zfs_flags = + ks->zfs_flags.value.i64; + zfs_txg_timeout = + ks->zfs_txg_timeout.value.i64; + zfs_vdev_cache_max = + ks->zfs_vdev_cache_max.value.i64; + zfs_vdev_cache_size = + ks->zfs_vdev_cache_size.value.i64; + zfs_no_scrub_io = + ks->zfs_no_scrub_io.value.i64; + zfs_no_scrub_prefetch = + ks->zfs_no_scrub_prefetch.value.i64; + fzap_default_block_shift = + ks->fzap_default_block_shift.value.i64; + zfs_immediate_write_sz = + ks->zfs_immediate_write_sz.value.i64; + zfs_read_chunk_size = + ks->zfs_read_chunk_size.value.i64; + zfs_nocacheflush = + ks->zfs_nocacheflush.value.i64; + zil_replay_disable = + ks->zil_replay_disable.value.i64; + 
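The KSTAT_WRITE handling above is what makes these tunables adjustable at runtime: each write copies the kstat value back into the matching kernel variable. On macOS the SPL publishes named kstats through sysctl, so from userland this surfaces roughly as below; the kstat.zfs.darwin.tunable prefix is an assumption about the kstat-to-sysctl bridge, derived from the kstat_create("zfs", 0, "tunable", "darwin", ...) call later in this file:

#include <sys/sysctl.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* name assumed: module "zfs", instance 0, class "darwin", name "tunable" */
	const char *name = "kstat.zfs.darwin.tunable.zfs_arc_max";
	uint64_t val = 0;
	size_t len = sizeof (val);

	if (sysctlbyname(name, &val, &len, NULL, 0) == 0)
		printf("zfs_arc_max = %llu\n", (unsigned long long)val);

	/* writing triggers the KSTAT_WRITE path above (needs root) */
	val = 4ULL * 1024 * 1024 * 1024;
	(void) sysctlbyname(name, NULL, NULL, &val, sizeof (val));
	return (0);
}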
metaslab_df_alloc_threshold = + ks->metaslab_df_alloc_threshold.value.i64; + metaslab_df_free_pct = + ks->metaslab_df_free_pct.value.i64; + zio_injection_enabled = + ks->zio_injection_enabled.value.i64; + zvol_immediate_write_sz = + ks->zvol_immediate_write_sz.value.i64; + + zfs_recover = + ks->zfs_recover.value.i64; + + zfs_free_bpobj_enabled = + ks->zfs_free_bpobj_enabled.value.i64; + + zfs_send_corrupt_data = + ks->zfs_send_corrupt_data.value.ui64; + zfs_send_queue_length = + ks->zfs_send_queue_length.value.ui64; + zfs_recv_queue_length = + ks->zfs_recv_queue_length.value.ui64; + + zvol_inhibit_dev = + ks->zvol_inhibit_dev.value.ui64; + zfs_send_set_freerecords_bit = + ks->zfs_send_set_freerecords_bit.value.ui64; + + zfs_write_implies_delete_child = + ks->zfs_write_implies_delete_child.value.ui64; + send_holes_without_birth_time = + ks->zfs_send_holes_without_birth_time.value.ui64; + + dbuf_cache_max_bytes = + ks->dbuf_cache_max_bytes.value.ui64; + + zfs_vdev_queue_depth_pct = + ks->zfs_vdev_queue_depth_pct.value.ui64; + + zio_dva_throttle_enabled = + (boolean_t)ks->zio_dva_throttle_enabled.value.ui64; + + zfs_lua_max_instrlimit = + ks->zfs_lua_max_instrlimit.value.ui64; + zfs_lua_max_memlimit = + ks->zfs_lua_max_memlimit.value.ui64; + + zfs_trim_extent_bytes_max = + ks->zfs_trim_extent_bytes_max.value.ui64; + zfs_trim_extent_bytes_min = + ks->zfs_trim_extent_bytes_min.value.ui64; + zfs_trim_metaslab_skip = + ks->zfs_trim_metaslab_skip.value.ui64; + zfs_trim_txg_batch = + ks->zfs_trim_txg_batch.value.ui64; + zfs_trim_queue_limit = + ks->zfs_trim_queue_limit.value.ui64; + + zfs_send_unmodified_spill_blocks = + ks->zfs_send_unmodified_spill_blocks.value.ui64; + zfs_special_class_metadata_reserve_pct = + ks->zfs_special_class_metadata_reserve_pct.value.ui64; + + // Check if string has changed (from KREAD), if so, update. 
+#if 0 + if (strcmp(vdev_raidz_string, + ks->zfs_vdev_raidz_impl.value.string.addr.ptr) != 0) + zfs_vdev_raidz_impl_set( + ks->zfs_vdev_raidz_impl.value.string.addr.ptr); + + if (strcmp(icp_gcm_string, + ks->icp_gcm_impl.value.string.addr.ptr) != 0) + icp_gcm_impl_set(ks->icp_gcm_impl.value.string.addr.ptr); + + if (strcmp(icp_aes_string, + ks->icp_aes_impl.value.string.addr.ptr) != 0) + icp_aes_impl_set(ks->icp_aes_impl.value.string.addr.ptr); + + if (strcmp(zfs_fletcher_4_string, + ks->zfs_fletcher_4_impl.value.string.addr.ptr) != 0) + zfs_fletcher_4_impl_set( + ks->zfs_fletcher_4_impl.value.string.addr.ptr); +#endif + + zfs_expire_snapshot = + ks->zfs_expire_snapshot.value.ui64; + zfs_admin_snapshot = + ks->zfs_admin_snapshot.value.ui64; + zfs_auto_snapshot = + ks->zfs_auto_snapshot.value.ui64; + + zfs_spa_discard_memory_limit = + ks->zfs_spa_discard_memory_limit.value.ui64; + zfs_async_block_max_blocks = + ks->zfs_async_block_max_blocks.value.ui64; + zfs_initialize_chunk_size = + ks->zfs_initialize_chunk_size.value.ui64; + zfs_scan_suspend_progress = + ks->zfs_scan_suspend_progress.value.ui64; + zfs_removal_suspend_progress = + ks->zfs_removal_suspend_progress.value.ui64; + zfs_livelist_max_entries = + ks->zfs_livelist_max_entries.value.ui64; + + zfs_allow_redacted_dataset_mount = + ks->zfs_allow_redacted_dataset_mount.value.ui64; + zfs_checksum_events_per_second = + ks->zfs_checksum_events_per_second.value.ui64; + zfs_commit_timeout_pct = + ks->zfs_commit_timeout_pct.value.ui64; + zfs_compressed_arc_enabled = + ks->zfs_compressed_arc_enabled.value.ui64; + zfs_condense_indirect_commit_entry_delay_ms = + ks->zfs_condense_indirect_commit_entry_delay_ms.value.ui64; + zfs_condense_min_mapping_bytes = + ks->zfs_condense_min_mapping_bytes.value.ui64; + zfs_deadman_checktime_ms = + ks->zfs_deadman_checktime_ms.value.ui64; + // string zfs_deadman_failmode = + // ks->zfs_deadman_failmode.value.ui64; + zfs_deadman_synctime_ms = + ks->zfs_deadman_synctime_ms.value.ui64; + zfs_deadman_ziotime_ms = + ks->zfs_deadman_ziotime_ms.value.ui64; + zfs_disable_ivset_guid_check = + ks->zfs_disable_ivset_guid_check.value.ui64; + zfs_initialize_value = + ks->zfs_initialize_value.value.ui64; + zfs_keep_log_spacemaps_at_export = + ks->zfs_keep_log_spacemaps_at_export.value.ui64; + l2arc_rebuild_blocks_min_l2size = + ks->l2arc_rebuild_blocks_min_l2size.value.ui64; + l2arc_rebuild_enabled = + ks->l2arc_rebuild_enabled.value.ui64; + l2arc_trim_ahead = ks->l2arc_trim_ahead.value.ui64; + zfs_livelist_condense_new_alloc = + ks->zfs_livelist_condense_new_alloc.value.ui64; + zfs_livelist_condense_sync_cancel = + ks->zfs_livelist_condense_sync_cancel.value.ui64; + zfs_livelist_condense_sync_pause = + ks->zfs_livelist_condense_sync_pause.value.ui64; + zfs_livelist_condense_zthr_cancel = + ks->zfs_livelist_condense_zthr_cancel.value.ui64; + zfs_livelist_condense_zthr_pause = + ks->zfs_livelist_condense_zthr_pause.value.ui64; + zfs_livelist_min_percent_shared = + ks->zfs_livelist_min_percent_shared.value.ui64; + zfs_max_dataset_nesting = + ks->zfs_max_dataset_nesting.value.ui64; + zfs_max_missing_tvds = + ks->zfs_max_missing_tvds.value.ui64; + metaslab_debug_load = ks->metaslab_debug_load.value.ui64; + metaslab_force_ganging = + ks->metaslab_force_ganging.value.ui64; + zfs_multihost_fail_intervals = + ks->zfs_multihost_fail_intervals.value.ui64; + zfs_multihost_import_intervals = + ks->zfs_multihost_import_intervals.value.ui64; + zfs_multihost_interval = + ks->zfs_multihost_interval.value.ui64; + 
zfs_override_estimate_recordsize = + ks->zfs_override_estimate_recordsize.value.ui64; + zfs_remove_max_segment = + ks->zfs_remove_max_segment.value.ui64; + zfs_resilver_min_time_ms = + ks->zfs_resilver_min_time_ms.value.ui64; + zfs_scan_legacy = ks->zfs_scan_legacy.value.ui64; + zfs_scan_vdev_limit = + ks->zfs_scan_vdev_limit.value.ui64; + zfs_slow_io_events_per_second = + ks->zfs_slow_io_events_per_second.value.ui64; + spa_load_verify_data = + ks->spa_load_verify_data.value.ui64; + spa_load_verify_metadata = + ks->spa_load_verify_metadata.value.ui64; + zfs_unlink_suspend_progress = + ks->zfs_unlink_suspend_progress.value.ui64; + zfs_vdev_min_ms_count = ks->zfs_vdev_min_ms_count.value.ui64; + vdev_validate_skip = ks->vdev_validate_skip.value.ui64; + zfs_zevent_len_max = ks->zfs_zevent_len_max.value.ui64; + zio_slow_io_ms = ks->zio_slow_io_ms.value.ui64; + + + } else { + + /* kstat READ */ + ks->spa_version.value.ui64 = SPA_VERSION; + ks->zpl_version.value.ui64 = ZPL_VERSION; + + /* Darwin */ + ks->darwin_active_vnodes.value.ui64 = vnop_num_vnodes; + ks->darwin_reclaim_nodes.value.ui64 = vnop_num_reclaims; + ks->darwin_debug.value.ui64 = debug_vnop_osx_printf; + ks->darwin_ignore_negatives.value.ui64 = + zfs_vnop_ignore_negatives; + ks->darwin_ignore_positives.value.ui64 = + zfs_vnop_ignore_positives; + ks->darwin_create_negatives.value.ui64 = + zfs_vnop_create_negatives; + ks->darwin_force_formd_normalized.value.ui64 = + zfs_vnop_force_formd_normalized_output; + ks->darwin_skip_unlinked_drain.value.ui64 = + zfs_vnop_skip_unlinked_drain; + ks->darwin_use_system_sync.value.ui64 = zfs_vfs_sync_paranoia; + + /* ARC */ + if (arc_ksp != NULL && arc_ksp->ks_update != NULL) + arc_ksp->ks_update(ksp, rw); + arc_kstat_update_osx(ksp, rw); + + /* L2ARC */ + ks->l2arc_write_max.value.ui64 = l2arc_write_max; + ks->l2arc_write_boost.value.ui64 = l2arc_write_boost; + ks->l2arc_headroom.value.ui64 = l2arc_headroom; + ks->l2arc_headroom_boost.value.ui64 = l2arc_headroom_boost; + ks->l2arc_feed_secs.value.ui64 = l2arc_feed_secs; + ks->l2arc_feed_min_ms.value.ui64 = l2arc_feed_min_ms; + + ks->l2arc_noprefetch.value.i64 = l2arc_noprefetch; + ks->l2arc_feed_again.value.i64 = l2arc_feed_again; + ks->l2arc_norw.value.i64 = l2arc_norw; + + /* vdev_queue */ + ks->zfs_vdev_max_active.value.ui64 = + zfs_vdev_max_active; + ks->zfs_vdev_sync_read_min_active.value.ui64 = + zfs_vdev_sync_read_min_active; + ks->zfs_vdev_sync_read_max_active.value.ui64 = + zfs_vdev_sync_read_max_active; + ks->zfs_vdev_sync_write_min_active.value.ui64 = + zfs_vdev_sync_write_min_active; + ks->zfs_vdev_sync_write_max_active.value.ui64 = + zfs_vdev_sync_write_max_active; + ks->zfs_vdev_async_read_min_active.value.ui64 = + zfs_vdev_async_read_min_active; + ks->zfs_vdev_async_read_max_active.value.ui64 = + zfs_vdev_async_read_max_active; + ks->zfs_vdev_async_write_min_active.value.ui64 = + zfs_vdev_async_write_min_active; + ks->zfs_vdev_async_write_max_active.value.ui64 = + zfs_vdev_async_write_max_active; + ks->zfs_vdev_scrub_min_active.value.ui64 = + zfs_vdev_scrub_min_active; + ks->zfs_vdev_scrub_max_active.value.ui64 = + zfs_vdev_scrub_max_active; + ks->zfs_vdev_async_write_active_min_dirty_percent.value.i64 = + zfs_vdev_async_write_active_min_dirty_percent; + ks->zfs_vdev_async_write_active_max_dirty_percent.value.i64 = + zfs_vdev_async_write_active_max_dirty_percent; + ks->zfs_vdev_aggregation_limit.value.i64 = + zfs_vdev_aggregation_limit; + ks->zfs_vdev_read_gap_limit.value.i64 = + zfs_vdev_read_gap_limit; + 
ks->zfs_vdev_write_gap_limit.value.i64 = + zfs_vdev_write_gap_limit; + + ks->arc_lotsfree_percent.value.i64 = + arc_lotsfree_percent; + ks->zfs_dirty_data_max.value.i64 = + zfs_dirty_data_max; + ks->zfs_delay_max_ns.value.i64 = + zfs_delay_max_ns; + ks->zfs_delay_min_dirty_percent.value.i64 = + zfs_delay_min_dirty_percent; + ks->zfs_delay_scale.value.i64 = + zfs_delay_scale; + ks->spa_asize_inflation.value.i64 = + spa_asize_inflation; + ks->zfs_prefetch_disable.value.i64 = + zfs_prefetch_disable; + ks->zfetch_max_streams.value.i64 = + zfetch_max_streams; + ks->zfetch_min_sec_reap.value.i64 = + zfetch_min_sec_reap; + ks->zfetch_array_rd_sz.value.i64 = + zfetch_array_rd_sz; + ks->zfs_default_bs.value.i64 = + zfs_default_bs; + ks->zfs_default_ibs.value.i64 = + zfs_default_ibs; + ks->metaslab_aliquot.value.i64 = + metaslab_aliquot; + ks->spa_max_replication_override.value.i64 = + spa_max_replication_override; + ks->spa_mode_global.value.i64 = + spa_mode_global; + ks->zfs_flags.value.i64 = + zfs_flags; + ks->zfs_txg_timeout.value.i64 = + zfs_txg_timeout; + ks->zfs_vdev_cache_max.value.i64 = + zfs_vdev_cache_max; + ks->zfs_vdev_cache_size.value.i64 = + zfs_vdev_cache_size; + ks->zfs_no_scrub_io.value.i64 = + zfs_no_scrub_io; + ks->zfs_no_scrub_prefetch.value.i64 = + zfs_no_scrub_prefetch; + ks->fzap_default_block_shift.value.i64 = + fzap_default_block_shift; + ks->zfs_immediate_write_sz.value.i64 = + zfs_immediate_write_sz; + ks->zfs_read_chunk_size.value.i64 = + zfs_read_chunk_size; + ks->zfs_nocacheflush.value.i64 = + zfs_nocacheflush; + ks->zil_replay_disable.value.i64 = + zil_replay_disable; + ks->metaslab_df_alloc_threshold.value.i64 = + metaslab_df_alloc_threshold; + ks->metaslab_df_free_pct.value.i64 = + metaslab_df_free_pct; + ks->zio_injection_enabled.value.i64 = + zio_injection_enabled; + ks->zvol_immediate_write_sz.value.i64 = + zvol_immediate_write_sz; + + ks->zfs_recover.value.i64 = + zfs_recover; + + ks->zfs_free_bpobj_enabled.value.i64 = + zfs_free_bpobj_enabled; + + ks->zfs_send_corrupt_data.value.ui64 = + zfs_send_corrupt_data; + ks->zfs_send_queue_length.value.ui64 = + zfs_send_queue_length; + ks->zfs_recv_queue_length.value.ui64 = + zfs_recv_queue_length; + + ks->zvol_inhibit_dev.value.ui64 = + zvol_inhibit_dev; + ks->zfs_send_set_freerecords_bit.value.ui64 = + zfs_send_set_freerecords_bit; + + ks->zfs_write_implies_delete_child.value.ui64 = + zfs_write_implies_delete_child; + ks->zfs_send_holes_without_birth_time.value.ui64 = + send_holes_without_birth_time; + + ks->dbuf_cache_max_bytes.value.ui64 = dbuf_cache_max_bytes; + + ks->zfs_vdev_queue_depth_pct.value.ui64 = + zfs_vdev_queue_depth_pct; + ks->zio_dva_throttle_enabled.value.ui64 = + (uint64_t)zio_dva_throttle_enabled; + + ks->zfs_lua_max_instrlimit.value.ui64 = zfs_lua_max_instrlimit; + ks->zfs_lua_max_memlimit.value.ui64 = zfs_lua_max_memlimit; + + ks->zfs_trim_extent_bytes_max.value.ui64 = + zfs_trim_extent_bytes_max; + ks->zfs_trim_extent_bytes_min.value.ui64 = + zfs_trim_extent_bytes_min; + ks->zfs_trim_metaslab_skip.value.ui64 = + zfs_trim_metaslab_skip; + ks->zfs_trim_txg_batch.value.ui64 = + zfs_trim_txg_batch; + ks->zfs_trim_queue_limit.value.ui64 = + zfs_trim_queue_limit; + + ks->zfs_send_unmodified_spill_blocks.value.ui64 = + zfs_send_unmodified_spill_blocks; + ks->zfs_special_class_metadata_reserve_pct.value.ui64 = + zfs_special_class_metadata_reserve_pct; +#if 0 + zfs_vdev_raidz_impl_get(vdev_raidz_string, + sizeof (vdev_raidz_string)); + kstat_named_setstr(&ks->zfs_vdev_raidz_impl, vdev_raidz_string); + + 
icp_gcm_impl_get(icp_gcm_string, sizeof (icp_gcm_string)); + kstat_named_setstr(&ks->icp_gcm_impl, icp_gcm_string); + + icp_aes_impl_get(icp_aes_string, sizeof (icp_aes_string)); + kstat_named_setstr(&ks->icp_aes_impl, icp_aes_string); + + zfs_fletcher_4_impl_get(zfs_fletcher_4_string, + sizeof (zfs_fletcher_4_string)); + kstat_named_setstr(&ks->zfs_fletcher_4_impl, + zfs_fletcher_4_string); +#endif + + ks->zfs_expire_snapshot.value.ui64 = zfs_expire_snapshot; + ks->zfs_admin_snapshot.value.ui64 = zfs_admin_snapshot; + ks->zfs_auto_snapshot.value.ui64 = zfs_auto_snapshot; + + ks->zfs_spa_discard_memory_limit.value.ui64 = + zfs_spa_discard_memory_limit; + ks->zfs_async_block_max_blocks.value.ui64 = + zfs_async_block_max_blocks; + ks->zfs_initialize_chunk_size.value.ui64 = + zfs_initialize_chunk_size; + ks->zfs_scan_suspend_progress.value.ui64 = + zfs_scan_suspend_progress; + ks->zfs_livelist_max_entries.value.ui64 = + zfs_livelist_max_entries; + + ks->zfs_allow_redacted_dataset_mount.value.ui64 = + zfs_allow_redacted_dataset_mount; + ks->zfs_checksum_events_per_second.value.ui64 = + zfs_checksum_events_per_second; + ks->zfs_commit_timeout_pct.value.ui64 = zfs_commit_timeout_pct; + ks->zfs_compressed_arc_enabled.value.ui64 = + zfs_compressed_arc_enabled; + ks->zfs_condense_indirect_commit_entry_delay_ms.value.ui64 = + zfs_condense_indirect_commit_entry_delay_ms; + ks->zfs_condense_min_mapping_bytes.value.ui64 = + zfs_condense_min_mapping_bytes; + ks->zfs_deadman_checktime_ms.value.ui64 = + zfs_deadman_checktime_ms; + + kstat_named_setstr(&ks->zfs_deadman_failmode, + zfs_deadman_failmode); + + ks->zfs_deadman_synctime_ms.value.ui64 = + zfs_deadman_synctime_ms; + ks->zfs_deadman_ziotime_ms.value.ui64 = zfs_deadman_ziotime_ms; + ks->zfs_disable_ivset_guid_check.value.ui64 = + zfs_disable_ivset_guid_check; + ks->zfs_initialize_value.value.ui64 = zfs_initialize_value; + ks->zfs_keep_log_spacemaps_at_export.value.ui64 = + zfs_keep_log_spacemaps_at_export; + ks->l2arc_rebuild_blocks_min_l2size.value.ui64 = + l2arc_rebuild_blocks_min_l2size; + ks->l2arc_rebuild_enabled.value.ui64 = l2arc_rebuild_enabled; + ks->l2arc_trim_ahead.value.ui64 = l2arc_trim_ahead; + ks->zfs_livelist_condense_new_alloc.value.ui64 = + zfs_livelist_condense_new_alloc; + ks->zfs_livelist_condense_sync_cancel.value.ui64 = + zfs_livelist_condense_sync_cancel; + ks->zfs_livelist_condense_sync_pause.value.ui64 = + zfs_livelist_condense_sync_pause; + ks->zfs_livelist_condense_zthr_cancel.value.ui64 = + zfs_livelist_condense_zthr_cancel; + ks->zfs_livelist_condense_zthr_pause.value.ui64 = + zfs_livelist_condense_zthr_pause; + ks->zfs_livelist_min_percent_shared.value.ui64 = + zfs_livelist_min_percent_shared; + ks->zfs_max_dataset_nesting.value.ui64 = + zfs_max_dataset_nesting; + ks->zfs_max_missing_tvds.value.ui64 = zfs_max_missing_tvds; + ks->metaslab_debug_load.value.ui64 = metaslab_debug_load; + ks->metaslab_force_ganging.value.ui64 = metaslab_force_ganging; + ks->zfs_multihost_fail_intervals.value.ui64 = + zfs_multihost_fail_intervals; + ks->zfs_multihost_import_intervals.value.ui64 = + zfs_multihost_import_intervals; + ks->zfs_multihost_interval.value.ui64 = zfs_multihost_interval; + ks->zfs_override_estimate_recordsize.value.ui64 = + zfs_override_estimate_recordsize; + ks->zfs_remove_max_segment.value.ui64 = zfs_remove_max_segment; + ks->zfs_resilver_min_time_ms.value.ui64 = + zfs_resilver_min_time_ms; + ks->zfs_scan_legacy.value.ui64 = zfs_scan_legacy; + ks->zfs_scan_vdev_limit.value.ui64 = zfs_scan_vdev_limit; + 
ks->zfs_slow_io_events_per_second.value.ui64 = + zfs_slow_io_events_per_second; + ks->spa_load_verify_data.value.ui64 = spa_load_verify_data; + ks->spa_load_verify_metadata.value.ui64 = + spa_load_verify_metadata; + ks->zfs_unlink_suspend_progress.value.ui64 = + zfs_unlink_suspend_progress; + ks->zfs_vdev_min_ms_count.value.ui64 = zfs_vdev_min_ms_count; + ks->vdev_validate_skip.value.ui64 = vdev_validate_skip; + ks->zfs_zevent_len_max.value.ui64 = zfs_zevent_len_max; + ks->zio_slow_io_ms.value.ui64 = zio_slow_io_ms; + } + + return (0); +} + + + +int +kstat_osx_init(void) +{ + osx_kstat_ksp = kstat_create("zfs", 0, "tunable", "darwin", + KSTAT_TYPE_NAMED, sizeof (osx_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + + if (osx_kstat_ksp != NULL) { + osx_kstat_ksp->ks_data = &osx_kstat; + osx_kstat_ksp->ks_update = osx_kstat_update; + kstat_install(osx_kstat_ksp); + } + + return (0); +} + +void +kstat_osx_fini(void) +{ + if (osx_kstat_ksp != NULL) { + kstat_delete(osx_kstat_ksp); + osx_kstat_ksp = NULL; + } +} diff --git a/module/os/macos/zfs/zfs_osx.cpp b/module/os/macos/zfs/zfs_osx.cpp new file mode 100644 index 0000000000..898ff897ec --- /dev/null +++ b/module/os/macos/zfs/zfs_osx.cpp @@ -0,0 +1,310 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013-2020, Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +// Define the superclass. 
+#define super IOService + +OSDefineMetaClassAndStructors(net_lundman_zfs_zvol, IOService) + +extern "C" { + +#include +#include +#include + +extern SInt32 zfs_active_fs_count; + +#ifdef DEBUG +#define ZFS_DEBUG_STR " (DEBUG mode)" +#else +#define ZFS_DEBUG_STR "" +#endif + +static char spl_gitrev[64] = ZFS_META_GITREV; + +SYSCTL_DECL(_zfs); +SYSCTL_NODE(, OID_AUTO, zfs, CTLFLAG_RD, 0, ""); +SYSCTL_STRING(_zfs, OID_AUTO, kext_version, + CTLFLAG_RD | CTLFLAG_LOCKED, + spl_gitrev, 0, "ZFS KEXT Version"); + + +extern kern_return_t _start(kmod_info_t *ki, void *data); +extern kern_return_t _stop(kmod_info_t *ki, void *data); + +__attribute__((visibility("default"))) KMOD_EXPLICIT_DECL(net.lundman.zfs, + "1.0.0", _start, _stop) +kmod_start_func_t *_realmain = 0; +kmod_stop_func_t *_antimain = 0; +int _kext_apple_cc = __APPLE_CC__; + +} // Extern "C" + +bool +net_lundman_zfs_zvol::init(OSDictionary* dict) +{ + bool res; + + /* Need an OSSet for open clients */ + _openClients = OSSet::withCapacity(1); + if (_openClients == NULL) { + dprintf("client OSSet failed"); + return (false); + } + + res = super::init(dict); + + // IOLog("ZFS::init\n"); + return (res); +} + +void +net_lundman_zfs_zvol::free(void) +{ + OSSafeReleaseNULL(_openClients); + + // IOLog("ZFS::free\n"); + super::free(); +} + +bool +net_lundman_zfs_zvol::isOpen(const IOService *forClient) const +{ + bool ret; + ret = IOService::isOpen(forClient); + return (ret); +} + +bool +net_lundman_zfs_zvol::handleOpen(IOService *client, + IOOptionBits options, void *arg) +{ + bool ret = true; + + dprintf(""); + + _openClients->setObject(client); + ret = _openClients->containsObject(client); + + return (ret); +} + +bool +net_lundman_zfs_zvol::handleIsOpen(const IOService *client) const +{ + bool ret; + + dprintf(""); + + ret = _openClients->containsObject(client); + + return (ret); +} + +void +net_lundman_zfs_zvol::handleClose(IOService *client, + IOOptionBits options) +{ + dprintf(""); + + if (_openClients->containsObject(client) == false) { + dprintf("not open"); + } + + _openClients->removeObject(client); +} + +IOService* +net_lundman_zfs_zvol::probe(IOService *provider, SInt32 *score) +{ + IOService *res = super::probe(provider, score); + return (res); +} + + +/* + * + * ************************************************************************ + * + * Kernel Module Load + * + * ************************************************************************ + * + */ + +bool +net_lundman_zfs_zvol::start(IOService *provider) +{ + bool res = super::start(provider); + + IOLog("ZFS: Loading module ... \n"); + + if (!res) + return (res); + + /* Fire up all SPL modules and threads */ + spl_start(NULL, NULL); + + /* registerService() allows zconfigd to match against the service */ + this->registerService(); + + /* + * hostid is left as 0 on OSX, and left to be set if developers wish to + * use it. If it is 0, we will hash the hardware.uuid into a 32 bit + * value and set the hostid. 
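The comment above describes how a stable hostid is derived when none is configured: hash the IOPlatformUUID string with 32-bit FNV-1a and store the result via the kern.hostid sysctl. A small userspace sketch of the same hash, useful for predicting the value a given machine ends up with (the UUID literal is only an example; fnv_32a_str()/FNV1_32A_INIT in the kext use the same construction):

#include <stdint.h>
#include <stdio.h>

/* 32-bit FNV-1a */
static uint32_t
fnv_32a_hash(const char *s)
{
	uint32_t h = 2166136261u;		/* FNV1_32A_INIT */
	while (*s != '\0') {
		h ^= (uint32_t)(unsigned char)*s++;
		h *= 16777619u;			/* 32-bit FNV prime */
	}
	return (h);
}

int
main(void)
{
	printf("hostid %08x\n",
	    fnv_32a_hash("01234567-89AB-CDEF-0123-456789ABCDEF"));
	return (0);
}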
+ */ + if (!zone_get_hostid(NULL)) { + uint32_t myhostid = 0; + IORegistryEntry *ioregroot = + IORegistryEntry::getRegistryRoot(); + if (ioregroot) { + IORegistryEntry *macmodel = + ioregroot->getChildEntry(gIOServicePlane); + + if (macmodel) { + OSObject *ioplatformuuidobj; + ioplatformuuidobj = + macmodel->getProperty(kIOPlatformUUIDKey); + if (ioplatformuuidobj) { + OSString *ioplatformuuidstr = + OSDynamicCast(OSString, + ioplatformuuidobj); + + myhostid = fnv_32a_str( + ioplatformuuidstr-> + getCStringNoCopy(), + FNV1_32A_INIT); + + sysctlbyname("kern.hostid", NULL, NULL, + &myhostid, sizeof (myhostid)); + printf("ZFS: hostid set to %08x from " + "UUID '%s'\n", myhostid, + ioplatformuuidstr-> + getCStringNoCopy()); + } + } + } + } + + /* Register ZFS KEXT Version sysctl - separate to kstats */ + sysctl_register_oid(&sysctl__zfs); + sysctl_register_oid(&sysctl__zfs_kext_version); + + /* Init LDI */ + int error = 0; + error = ldi_init(NULL); + if (error) { + IOLog("%s ldi_init error %d\n", __func__, error); + goto failure; + } + + /* Start ZFS itself */ + zfs_kmod_init(); + + /* Register fs with XNU */ + zfs_vfsops_init(); + + /* + * When is the best time to start the system_taskq? It is strictly + * speaking not used by SPL, but by ZFS. ZFS should really start it? + */ + system_taskq_init(); + + res = zfs_boot_init((IOService *)this); + + printf("ZFS: Loaded module v%s-%s%s, " + "ZFS pool version %s, ZFS filesystem version %s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, + SPA_VERSION_STRING, ZPL_VERSION_STRING); + + return (true); + +failure: + spl_stop(NULL, NULL); + sysctl_unregister_oid(&sysctl__zfs_kext_version); + sysctl_unregister_oid(&sysctl__zfs); + return (false); +} + +/* Here we are, at the end of all things */ +void +net_lundman_zfs_zvol::stop(IOService *provider) +{ + + zfs_boot_fini(); + + IOLog("ZFS: Attempting to unload ...\n"); + + super::stop(provider); + + system_taskq_fini(); + + zfs_vfsops_fini(); + + zfs_kmod_fini(); + + ldi_fini(); + + sysctl_unregister_oid(&sysctl__zfs_kext_version); + sysctl_unregister_oid(&sysctl__zfs); + + spl_stop(NULL, NULL); + + printf("ZFS: Unloaded module v%s-%s%s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); + + /* + * There is no way to ensure all threads have actually got to the + * thread_exit() call, before we exit here (and XNU unloads all + * memory for the KEXT). So we increase the odds of that happening + * by delaying a little bit before we return to XNU. Quite possibly + * the worst "solution" but Apple has not given any good options. + */ + delay(hz*5); +} diff --git a/module/os/macos/zfs/zfs_vfsops.c b/module/os/macos/zfs/zfs_vfsops.c new file mode 100644 index 0000000000..0487ebf870 --- /dev/null +++ b/module/os/macos/zfs/zfs_vfsops.c @@ -0,0 +1,2988 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ +/* Portions Copyright 2013,2020 Jorgen Lundman */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_comutil.h" + +#include +#include +#include +#include +#include +#include + +// #define dprintf kprintf + +unsigned int zfs_vnop_skip_unlinked_drain = 0; + +int zfs_module_start(kmod_info_t *ki, void *data); +int zfs_module_stop(kmod_info_t *ki, void *data); +extern int getzfsvfs(const char *dsname, zfsvfs_t **zfvp); + +void arc_os_init(void); +void arc_os_fini(void); + +/* + * AVL tree of hardlink entries, which we need to map for Finder. The va_linkid + * needs to be unique for each hardlink target, as well as, return the znode + * in vget(va_linkid). Unfortunately, the va_linkid is 32bit (lost in the + * syscall translation to userland struct). We sort the AVL tree by + * -> directory id + * -> z_id + * -> name + * + */ +static int hardlinks_compare(const void *arg1, const void *arg2) +{ + const hardlinks_t *node1 = arg1; + const hardlinks_t *node2 = arg2; + int value; + if (node1->hl_parent > node2->hl_parent) + return (1); + if (node1->hl_parent < node2->hl_parent) + return (-1); + if (node1->hl_fileid > node2->hl_fileid) + return (1); + if (node1->hl_fileid < node2->hl_fileid) + return (-1); + + value = strncmp(node1->hl_name, node2->hl_name, PATH_MAX); + if (value < 0) + return (-1); + if (value > 0) + return (1); + return (0); +} + +/* + * Lookup same information from linkid, to get at parentid, objid and name + */ +static int hardlinks_compare_linkid(const void *arg1, const void *arg2) +{ + const hardlinks_t *node1 = arg1; + const hardlinks_t *node2 = arg2; + if (node1->hl_linkid > node2->hl_linkid) + return (1); + if (node1->hl_linkid < node2->hl_linkid) + return (-1); + return (0); +} + +extern int +zfs_obtain_xattr(znode_t *, const char *, mode_t, cred_t *, vnode_t **, int); + + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our kext + * from being unloaded after a umount -f + */ +uint32_t zfs_active_fs_count = 0; + +extern void zfs_ioctl_init(void); +extern void zfs_ioctl_fini(void); + +static int +zfsvfs_parse_option(char *option, char *value, vfs_t *vfsp) +{ + if (!option || !*option) + return (0); + dprintf("parse '%s' '%s'\n", option?option:"", + value?value:""); + if (!strcasecmp(option, "readonly")) { + if (value && *value && + strcasecmp(value, "off") == 0) + vfs_clearflags(vfsp, (uint64_t)MNT_RDONLY); + else + vfs_setflags(vfsp, (uint64_t)MNT_RDONLY); + } + return (0); +} + +/* + * Parse the raw mntopts and return a vfs_t describing the options. 
+ */ +static int +zfsvfs_parse_options(char *mntopts, vfs_t *vfsp) +{ + int error = 0; + + if (mntopts != NULL) { + char *p, *t, *v; + char *keep; + + int len = strlen(mntopts) + 1; + keep = kmem_alloc(len, KM_SLEEP); + t = keep; + memcpy(t, mntopts, len); + + while (1) { + while (t && *t == ' ') t++; + + p = strpbrk(t, ","); + if (p) *p = 0; + + // find "=" + v = strpbrk(t, "="); + if (v) { + *v = 0; + v++; + while (*v == ' ') v++; + } + error = zfsvfs_parse_option(t, v, vfsp); + if (error) break; + if (!p) break; + t = &p[1]; + } + kmem_free(keep, len); + } + + return (error); +} + +int +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(vfs_isrdonly(zfsvfs->z_vfs))); +} + +/* + * The OS sync ignored by default, as ZFS handles internal periodic + * syncs. (As per illumos) Unfortunately, we can not tell the difference + * of when users run "sync" by hand. Sync is called on umount though. + */ +uint64_t zfs_vfs_sync_paranoia = 0; + +int +zfs_vfs_sync(struct mount *vfsp, __unused int waitfor, + __unused vfs_context_t context) +{ + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (spl_panicstr()) + return (0); + + /* Check if sysctl setting wants sync - and we are not unmounting */ + if (zfs_vfs_sync_paranoia == 0 && + !vfs_isunmount(vfsp)) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + dsl_pool_t *dp; + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (spl_system_inshutdown() && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == B_TRUE) { + zfsvfs->z_atime = B_TRUE; + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOATIME); + } else { + zfsvfs->z_atime = B_FALSE; + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOATIME); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + /* + * Apple does have MNT_NOUSERXATTR mount option, but unfortunately + * the VFS layer returns EACCESS if xattr access is attempted. + * Finder etc, will do so, even if filesystem capabilities is set + * without xattr, rendering the mount option useless. We no longer + * set it, and handle xattrs being disabled internally. 
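zfsvfs_parse_options() above splits the raw mntopts string on commas, then on '=', trimming leading spaces before handing each key/value pair to zfsvfs_parse_option(), which currently only acts on "readonly". A standalone sketch of that tokenizer run over a sample string, purely to show how the input is carved up:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[] = "readonly=on, atime=off,unknown";
	char *t = buf, *p, *v;

	while (1) {
		while (t && *t == ' ')
			t++;
		p = strpbrk(t, ",");		/* end of this option */
		if (p)
			*p = '\0';
		v = strpbrk(t, "=");		/* optional value */
		if (v) {
			*v++ = '\0';
			while (*v == ' ')
				v++;
		}
		printf("option '%s' value '%s'\n", t, v ? v : "");
		if (p == NULL)
			break;
		t = p + 1;
	}
	return (0);
}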
+ */ + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_xattr = B_FALSE; + // vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOUSERXATTR); + } else { + zfsvfs->z_xattr = B_TRUE; + // vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOUSERXATTR); + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + // zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_TRUE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_RDONLY); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_RDONLY); + } +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NODEV); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NODEV); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOSUID); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOSUID); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOEXEC); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOEXEC); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + zfsvfs->z_show_ctldir = newval; + cache_purgevfs(zfsvfs->z_vfs); +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + // zfsvfs_t *zfsvfs = arg; + // zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static void +finderbrowse_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_DONTBROWSE); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_DONTBROWSE); + } +} +static void +ignoreowner_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_IGNORE_OWNERSHIP); + } else { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_IGNORE_OWNERSHIP); + } +} + +static void +mimic_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + + if (newval == 0) { + strlcpy(vfsstatfs->f_fstypename, "zfs", MFSTYPENAMELEN); + } else { + strlcpy(vfsstatfs->f_fstypename, "hfs", MFSTYPENAMELEN); + } +} + +static int +zfs_register_callbacks(struct mount *vfsp) +{ + struct dsl_dataset *ds = NULL; + + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + 
boolean_t finderbrowse = B_FALSE; + boolean_t do_finderbrowse = B_FALSE; + boolean_t ignoreowner = B_FALSE; + boolean_t do_ignoreowner = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfs_fsprivate(vfsp); + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ +#define vfs_optionisset(X, Y, Z) (vfs_flags(X)&(Y)) + + if (vfs_optionisset(vfsp, MNT_RDONLY, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NODEV, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } + /* xnu SETUID, not IllumOS SUID */ + if (vfs_optionisset(vfsp, MNT_NOSUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NOUSERXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_DONTBROWSE, NULL)) { + finderbrowse = B_FALSE; + do_finderbrowse = B_TRUE; + } + if (vfs_optionisset(vfsp, MNT_IGNORE_OWNERSHIP, NULL)) { + ignoreowner = B_TRUE; + do_ignoreowner = B_TRUE; + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + // This appears to be PROP_PRIVATE, investigate if we want this + // ZOL calls this ACLTYPE + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_BROWSE), finderbrowse_changed_cb, zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_IGNOREOWNER), + ignoreowner_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_MIMIC), mimic_changed_cb, zfsvfs); + + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + if (do_finderbrowse) + finderbrowse_changed_cb(zfsvfs, finderbrowse); + if (do_ignoreowner) + ignoreowner_changed_cb(zfsvfs, ignoreowner); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Takes a dataset, a property, a value and that value's setpoint as + * found in the ZAP. Checks if the property has been changed in the vfs. + * If so, val and setpoint will be overwritten with updated content. + * Otherwise, they are left unchanged. + */ +int +zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) +{ + int error; + zfsvfs_t *zfvp; + mount_t vfsp; + objset_t *os; + uint64_t tmp = *val; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + if (dmu_objset_type(os) != DMU_OST_ZFS) + return (EINVAL); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + mutex_exit(&os->os_user_ptr_lock); + if (zfvp == NULL) + return (ESRCH); + + vfsp = zfvp->z_vfs; + + switch (zfs_prop) { + case ZFS_PROP_ATIME: +// if (vfsp->vfs_do_atime) +// tmp = vfsp->vfs_atime; + break; + case ZFS_PROP_RELATIME: +// if (vfsp->vfs_do_relatime) +// tmp = vfsp->vfs_relatime; + break; + case ZFS_PROP_DEVICES: +// if (vfsp->vfs_do_devices) +// tmp = vfsp->vfs_devices; + break; + case ZFS_PROP_EXEC: +// if (vfsp->vfs_do_exec) +// tmp = vfsp->vfs_exec; + break; + case ZFS_PROP_SETUID: +// if (vfsp->vfs_do_setuid) +// tmp = vfsp->vfs_setuid; + break; + case ZFS_PROP_READONLY: +// if (vfsp->vfs_do_readonly) +// tmp = vfsp->vfs_readonly; + break; + case ZFS_PROP_XATTR: +// if (vfsp->vfs_do_xattr) +// tmp = vfsp->vfs_xattr; + break; + case ZFS_PROP_NBMAND: +// if (vfsp->vfs_do_nbmand) +// tmp = vfsp->vfs_nbmand; + break; + default: + return (ENOENT); + } + + if (tmp != *val) { + (void) strlcpy(setpoint, "temporary", ZFS_MAX_DATASET_NAME_LEN); + *val = tmp; + } + return (0); +} + + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + /* Volume status "all ok" */ + zfsvfs->z_notification_conditions = 0; + zfsvfs->z_freespace_notify_warninglimit = 0; + zfsvfs->z_freespace_notify_dangerlimit = 0; + zfsvfs->z_freespace_notify_desiredlevel = 0; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. 
Pool must be upgraded to mount " + "this file system.\n", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + error = zfs_get_zplprop(os, ZFS_PROP_ACLMODE, &val); + if (error != 0) + return (error); + zfsvfs->z_acl_mode = (uint_t)val; + + zfs_get_zplprop(os, ZFS_PROP_LASTUNMOUNT, &val); + zfsvfs->z_last_unmount_time = val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); + if ((error == 0) && (val == ZFS_XATTR_SA)) + zfsvfs->z_xattr_sa = B_TRUE; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + return (0); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. 
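+ *
+ * Concretely, the dmu_objset_own() call below always asks for a
+ * read-only hold on the objset; whether the resulting mount is
+ * actually writable is decided later from the readonly property
+ * and mount flags via readonly_changed_cb().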
+ */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, B_TRUE, + zfsvfs, &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + return (error); +} + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + zfsvfs->z_ctldir_startid = ZFSCTL_INO_SNAPDIRS; + + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + + int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), + ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (int i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + rw_init(&zfsvfs->z_hardlinks_lock, NULL, RW_DEFAULT, NULL); + avl_create(&zfsvfs->z_hardlinks, hardlinks_compare, + sizeof (hardlinks_t), offsetof(hardlinks_t, hl_node)); + avl_create(&zfsvfs->z_hardlinks_linkid, hardlinks_compare_linkid, + sizeof (hardlinks_t), offsetof(hardlinks_t, hl_node_linkid)); + zfsvfs->z_rdonly = 0; + + mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + boolean_t readonly = vfs_isrdonly(zfsvfs->z_vfs); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_FALSE); + else + if (!zfs_vnop_skip_unlinked_drain) + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. 
+ * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } + } + + /* restore readonly bit */ + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_TRUE); + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i, size = zfsvfs->z_hold_size; + + dprintf("+zfsvfs_free\n"); + + zfs_fuid_destroy(zfsvfs); + + cv_destroy(&zfsvfs->z_drain_cv); + mutex_destroy(&zfsvfs->z_drain_lock); + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + kmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + kmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + + dprintf("ZFS: Unloading hardlink AVLtree: %lu\n", + avl_numnodes(&zfsvfs->z_hardlinks)); + void *cookie = NULL; + hardlinks_t *hardlink; + rw_destroy(&zfsvfs->z_hardlinks_lock); + while ((hardlink = avl_destroy_nodes(&zfsvfs->z_hardlinks_linkid, + &cookie))) { + } + cookie = NULL; + while ((hardlink = avl_destroy_nodes(&zfsvfs->z_hardlinks, &cookie))) { + kmem_free(hardlink, sizeof (*hardlink)); + } + avl_destroy(&zfsvfs->z_hardlinks); + avl_destroy(&zfsvfs->z_hardlinks_linkid); + + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + dprintf("-zfsvfs_free\n"); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { +#if 0 + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } +#endif + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + + +static int +zfs_domount(struct mount *vfsp, dev_t mount_dev, char *osname, char *options, + vfs_context_t ctx) +{ + int error = 0; + zfsvfs_t *zfsvfs; + uint64_t mimic = 0; + struct timeval tv; + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, B_FALSE, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + error = zfsvfs_parse_options(options, zfsvfs->z_vfs); + if (error) + goto out; + + zfsvfs->z_rdev = mount_dev; + + /* HFS 
sets this prior to mounting */ + vfs_setflags(vfsp, (uint64_t)((unsigned int)MNT_DOVOLFS)); + /* Advisory locking should be handled at the VFS layer */ + vfs_setlocklocal(vfsp); + + /* + * Record the mount time (for Spotlight) + */ + microtime(&tv); + zfsvfs->z_mount_time = tv.tv_sec; + + vfs_setfsprivate(vfsp, zfsvfs); + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + + error = dsl_prop_get_integer(osname, "com.apple.mimic", &mimic, NULL); + if (zfsvfs->z_rdev) { + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + vfsstatfs->f_fsid.val[0] = zfsvfs->z_rdev; + vfsstatfs->f_fsid.val[1] = vfs_typenum(vfsp); + } else { + // Otherwise, ask VFS to give us a random unique one. + vfs_getnewfsid(vfsp); + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + zfsvfs->z_rdev = vfsstatfs->f_fsid.val[0]; + } + + /* + * If we are readonly (ie, waiting for rootmount) we need to reply + * honestly, so launchd runs fsck_zfs and mount_zfs + */ + if (mimic) { + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + strlcpy(vfsstatfs->f_fstypename, "hfs", MFSTYPENAMELEN); + } + + /* + * Set features for file system. + */ + zfs_set_fuid_feature(zfsvfs); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + zfsvfs_t *fs_zfsvfs; + + dmu_fsname(osname, fsname); + error = getzfsvfs(fsname, &fs_zfsvfs); + if (error == 0) { + if (fs_zfsvfs->z_unmounted) + error = SET_ERROR(EINVAL); + vfs_unbusy(fs_zfsvfs->z_vfs); + } + if (error) { + printf("file system '%s' is unmounted : error %d\n", + fsname, + error); + goto out; + } + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, "xattr", &pval, + NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + zfsctl_mount_signal(osname, B_TRUE); + + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + vfs_setflags(vfsp, (uint64_t)((unsigned int)MNT_JOURNALED)); + + if ((vfs_flags(vfsp) & MNT_ROOTFS) != 0) { + /* Root FS */ + vfs_clearflags(vfsp, + (uint64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); + vfs_clearflags(vfsp, + (uint64_t)((unsigned int)MNT_IGNORE_OWNERSHIP)); + } + +#if 1 // Want .zfs or not + if (!zfsvfs->z_issnap) { + zfsctl_create(zfsvfs); + } +#endif + +out: + if (error) { + vfs_setfsprivate(vfsp, NULL); + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); + } +} + +/* + * zfs_vfs_mountroot + * Given a device vnode created by vfs_mountroot bdevvp, + * and with the root pool already imported, root mount the + * dataset specified in the pool's bootfs property. 
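+ *
+ * In outline: take the first imported pool from the namespace
+ * (spa_next), translate its bootfs object number to a dataset
+ * name with dsl_dsobj_to_dsname(), then zfs_domount() that
+ * dataset read-only; it is remounted read-write later once
+ * userland is up.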
+ * + * Inputs: + * mp: VFS mount struct + * devvp: device vnode, currently only used to retrieve the + * dev_t for the fsid. Could vnode_get, vnode_ref, vnode_put, + * with matching get/rele/put in zfs_vfs_umount, but this is + * already done by XNU as well. + * ctx: VFS context, unused. + * + * Return: + * 0 on success, positive int on failure. + */ +int +zfs_vfs_mountroot(struct mount *mp, struct vnode *devvp, vfs_context_t ctx) +{ + /* + * static int zfsrootdone = 0; + */ + zfsvfs_t *zfsvfs = NULL; + spa_t *spa = 0; + char *zfs_bootfs = 0; + dev_t dev = 0; + int error = EINVAL; + + printf("ZFS: %s\n", __func__); + ASSERT(mp); + ASSERT(devvp); + ASSERT(ctx); + if (!mp || !devvp || !ctx) { + cmn_err(CE_NOTE, "%s: missing one of mp %p devvp %p" + " or ctx %p", __func__, mp, devvp, ctx); + return (EINVAL); + } + + /* Look up bootfs variable from pool here */ + zfs_bootfs = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (!zfs_bootfs) { + cmn_err(CE_NOTE, "%s: bootfs alloc failed", + __func__); + return (ENOMEM); + } + + mutex_enter(&spa_namespace_lock); + spa = spa_next(NULL); + if (!spa) { + mutex_exit(&spa_namespace_lock); + cmn_err(CE_NOTE, "%s: no pool available", + __func__); + goto out; + } + + error = dsl_dsobj_to_dsname(spa_name(spa), + spa_bootfs(spa), zfs_bootfs); + if (error != 0) { + mutex_exit(&spa_namespace_lock); + cmn_err(CE_NOTE, "%s: bootfs to name error %d", + __func__, error); + goto out; + } + mutex_exit(&spa_namespace_lock); + + /* + * By setting the dev_t value in the mount vfsp, + * mount_zfs will be called with the /dev/diskN + * proxy, but we can leave the dataset name in + * the mountedfrom field + */ + dev = vnode_specrdev(devvp); + + printf("Setting readonly\n"); + + if ((error = zfs_domount(mp, dev, zfs_bootfs, NULL, ctx)) != 0) { + printf("zfs_domount: error %d\n", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfs_fsprivate(mp); + ASSERT(zfsvfs); + if (!zfsvfs) { + cmn_err(CE_NOTE, "missing zfsvfs"); + goto out; + } + + /* Set this mount to read-only */ + zfsvfs->z_rdonly = 1; + + /* + * Due to XNU mount flags, readonly gets set off for a short + * while, which means mimic will kick in if enabled. But we need + * to reply with true "zfs" until root has been remounted RW, so + * that launchd tries to run mount_zfs instead of mount_hfs + */ + mimic_changed_cb(zfsvfs, B_FALSE); + + /* + * Leave rootvp held. The root file system is never unmounted. + * + * XXX APPLE + * xnu will in fact call vfs_unmount on the root filesystem + * during shutdown/reboot. + */ + +out: + + if (zfs_bootfs) { + kmem_free(zfs_bootfs, MAXPATHLEN); + } + return (error); + +} + +/*ARGSUSED*/ +int +zfs_vfs_mount(struct mount *vfsp, vnode_t *mvp /* devvp */, + user_addr_t data, vfs_context_t context) +{ + char *osname = NULL; + char *options = NULL; + uint64_t flags = vfs_flags(vfsp); + int error = 0; + int rdonly = 0; + int mflag = 0; + char *proxy = NULL; + struct zfs_mount_args mnt_args; + size_t osnamelen = 0; + uint32_t cmdflags = 0; + + cmdflags = (uint32_t)vfs_flags(vfsp) & MNT_CMDFLAGS; + rdonly = vfs_isrdonly(vfsp); + + if (!data) { + /* + * From 10.12, if you set VFS_TBLCANMOUNTROOT, XNU will + * call vfs_mountroot if set (and we can not set it), OR + * call vfs_mount if not set. Since data is always passed NULL + * in this case, we know we are supposed to call mountroot. + */ + dprintf("ZFS: vfs_mount -> vfs_mountroot\n"); + return (zfs_vfs_mountroot(vfsp, mvp, context)); + } + + /* + * Get the objset name (the "special" mount argument). 
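+ *
+ * The fspec string from struct zfs_mount_args is copied in from
+ * userland (via a user32_addr_t when the caller is 32-bit), and
+ * if it names a /dev/diskN proxy device it is translated back to
+ * the dataset name with zfs_osx_proxy_get_osname() before the
+ * mount proceeds.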
+ */ + if (data) { + + // Clear the struct, so that "flags" is null if only given path. + bzero(&mnt_args, sizeof (mnt_args)); + + osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if (vfs_context_is64bit(context)) { + if ((error = ddi_copyin((void *)data, + (caddr_t)&mnt_args, sizeof (mnt_args), 0))) { + printf("%s: error on mnt_args copyin %d\n", + __func__, error); + goto out; + } + } else { + user32_addr_t tmp; + if ((error = ddi_copyin((void *)data, + (caddr_t)&tmp, sizeof (tmp), 0))) { + printf("%s: error on mnt_args copyin32 %d\n", + __func__, error); + goto out; + } + /* munge into LP64 addr */ + mnt_args.fspec = (char *)CAST_USER_ADDR_T(tmp); + } + + // Copy over the string + if ((error = ddi_copyinstr((const void *)mnt_args.fspec, osname, + MAXPATHLEN, &osnamelen))) { + printf("%s: error on osname copyin %d\n", + __func__, error); + if (!mvp) + goto out; + } + } + + proxy = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (!proxy) { + dprintf("%s proxy string alloc failed\n", __func__); + goto out; + } + *proxy = 0; + + /* + * Translate /dev/disk path into dataset name + * After this; + * "proxy" will have "/dev/disk" (IF given) + * "osname" has the dataset name as usual + */ + if (strncmp(osname, "/dev/disk", 9) == 0) { + strlcpy(proxy, osname, MAXPATHLEN); + error = zfs_osx_proxy_get_osname(osname, + osname, MAXPATHLEN); + if (error != 0) { + printf("%s couldn't get dataset from %s\n", + __func__, osname); + error = ENOENT; + goto out; + } + dprintf("%s got new osname %s\n", __func__, osname); + } + + if (mnt_args.struct_size == sizeof (mnt_args)) { + mflag = mnt_args.mflag; + options = kmem_alloc(mnt_args.optlen, KM_SLEEP); + error = ddi_copyin((const void *)mnt_args.optptr, + (caddr_t)options, mnt_args.optlen, 0); + } + + if (mflag & MS_RDONLY) { + dprintf("%s: adding MNT_RDONLY\n", __func__); + flags |= MNT_RDONLY; + } + + if (mflag & MS_OVERLAY) { + dprintf("%s: adding MNT_UNION\n", __func__); + flags |= MNT_UNION; + } + + if (mflag & MS_FORCE) { + dprintf("%s: adding MNT_FORCE\n", __func__); + flags |= MNT_FORCE; + } + + if (mflag & MS_REMOUNT) { + dprintf("%s: adding MNT_UPDATE on MS_REMOUNT\n", __func__); + flags |= MNT_UPDATE; + } + + vfs_setflags(vfsp, (uint64_t)flags); + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. 
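+ *
+ * Three remount cases are handled below: MNT_RELOAD after fsck
+ * (nothing to do), a downgrade to read-only, and an upgrade back
+ * to read-write; the last two toggle readonly_changed_cb() and
+ * re-register the property callbacks.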
+ */ + if (cmdflags & MNT_UPDATE) { + + if (cmdflags & MNT_RELOAD) { + printf("%s: reload after fsck\n", __func__); + error = 0; + goto out; + } + + /* refresh mount options */ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + ASSERT(zfsvfs); + + if (zfsvfs->z_rdonly == 0 && (flags & MNT_RDONLY || + vfs_isrdonly(vfsp))) { + /* downgrade */ + dprintf("%s: downgrade requested\n", __func__); + zfsvfs->z_rdonly = 1; + readonly_changed_cb(zfsvfs, B_TRUE); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + if (error) { + printf("%s: remount returned %d", + __func__, error); + } + } + + // if (zfsvfs->z_rdonly != 0 && vfs_iswriteupgrade(vfsp)) { + if (vfs_iswriteupgrade(vfsp)) { + /* upgrade */ + dprintf("%s: upgrade requested\n", __func__); + zfsvfs->z_rdonly = 0; + readonly_changed_cb(zfsvfs, B_FALSE); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + if (error) { + printf("%s: remount returned %d", + __func__, error); + } + } + + goto out; + } + + if (vfs_fsprivate(vfsp) != NULL) { + printf("already mounted\n"); + error = 0; + goto out; + } + + error = zfs_domount(vfsp, 0, osname, options, context); + if (error) { + printf("%s: zfs_domount returned %d\n", + __func__, error); + goto out; + } + + +out: + + if (error == 0) { + + /* Indicate to VFS that we support ACLs. */ + vfs_setextendedsecurity(vfsp); + + // Set /dev/disk name if we have one, otherwise, datasetname + vfs_mountedfrom(vfsp, proxy && *proxy ? proxy : osname); + + } + + if (error) + printf("zfs_vfs_mount: error %d\n", error); + + if (osname) + kmem_free(osname, MAXPATHLEN); + + if (proxy) + kmem_free(proxy, MAXPATHLEN); + + if (options) + kmem_free(options, mnt_args.optlen); + + return (error); +} + + +int +zfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, + __unused vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t log_blksize; + uint64_t log_blkcnt; + + // dprintf("vfs_getattr\n"); + + ZFS_ENTER(zfsvfs); + + /* + * Finder will show the old/incorrect size, we can force a sync of the + * pool to make it correct, but that has side effects which are + * undesirable. + */ + /* txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); */ + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + VFSATTR_RETURN(fsap, f_objcount, usedobjs); + VFSATTR_RETURN(fsap, f_maxobjcount, 0x7fffffffffffffff); + /* + * Carbon depends on f_filecount and f_dircount so + * make up some values based on total objects. + */ + VFSATTR_RETURN(fsap, f_filecount, usedobjs - (usedobjs / 4)); + VFSATTR_RETURN(fsap, f_dircount, usedobjs / 4); + + /* + * Model after HFS in working out if we should use the legacy size + * 512, or go to 4096. Note that XNU only likes those two + * blocksizes, so we don't use the ZFS recordsize + */ + log_blkcnt = (u_int64_t)((refdbytes + availbytes) >> SPA_MINBLOCKSHIFT); + log_blksize = (log_blkcnt > 0x000000007fffffff) ? + 4096 : (1 << SPA_MINBLOCKSHIFT); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + VFSATTR_RETURN(fsap, f_bsize, log_blksize); + VFSATTR_RETURN(fsap, f_iosize, zfsvfs->z_max_blksz); + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. 
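+ *
+ * log_blksize was chosen above the way HFS does it: with more
+ * than 0x7fffffff 512-byte blocks (roughly 1 TiB of space) the
+ * counts below are expressed in 4096-byte units, otherwise in
+ * the legacy 512-byte units.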
+ */ + VFSATTR_RETURN(fsap, f_blocks, + (u_int64_t)((refdbytes + availbytes) / log_blksize)); + VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)(availbytes / log_blksize)); + VFSATTR_RETURN(fsap, f_bavail, fsap->f_bfree); + VFSATTR_RETURN(fsap, f_bused, fsap->f_blocks - fsap->f_bfree); + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + VFSATTR_RETURN(fsap, f_ffree, (u_int64_t)MIN(availobjs, fsap->f_bfree)); + VFSATTR_RETURN(fsap, f_files, fsap->f_ffree + usedobjs); + + if (VFSATTR_IS_ACTIVE(fsap, f_fsid)) { + fsap->f_fsid.val[0] = zfsvfs->z_rdev; + fsap->f_fsid.val[1] = vfs_typenum(mp); + VFSATTR_SET_SUPPORTED(fsap, f_fsid); + } + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_HARDLINKS | // ZFS + VOL_CAP_FMT_SPARSE_FILES | // ZFS + VOL_CAP_FMT_2TB_FILESIZE | // ZFS + VOL_CAP_FMT_JOURNAL | VOL_CAP_FMT_JOURNAL_ACTIVE | // ZFS + VOL_CAP_FMT_SYMBOLICLINKS | // msdos.. + // ZFS has root times just fine + /* VOL_CAP_FMT_NO_ROOT_TIMES | */ + // Ask XNU to remember zero-runs, instead of writing + // zeros to it. + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_64BIT_OBJECT_IDS | + /* VOL_CAP_FMT_DECMPFS_COMPRESSION | */ + VOL_CAP_FMT_HIDDEN_FILES; + + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_ATTRLIST | // ZFS + VOL_CAP_INT_NFSEXPORT | // ZFS + VOL_CAP_INT_EXTENDED_SECURITY | // ZFS +#if NAMEDSTREAMS + VOL_CAP_INT_NAMEDSTREAMS | // ZFS +#endif + VOL_CAP_INT_EXTENDED_ATTR | // ZFS + VOL_CAP_INT_VOL_RENAME | // msdos.. + VOL_CAP_INT_ADVLOCK | + // ZFS does not yet have exchangedata (it's in a branch) + /* VOL_CAP_INT_EXCHANGEDATA| */ + // ZFS does not yet have copyfile + /* VOL_CAP_INT_COPYFILE| */ + // ZFS does not yet have allocate + /* VOL_CAP_INT_ALLOCATE| */ + VOL_CAP_INT_FLOCK; + + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = + 0; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED2] = + 0; + + /* + * This is the list of valid capabilities at time of + * compile. 
The valid list should have them all defined + * and the "capability" list above should enable only + * those we have implemented + */ + fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE | + VOL_CAP_FMT_OPENDENYMODES | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_64BIT_OBJECT_IDS | + VOL_CAP_FMT_NO_VOLUME_SIZES | + VOL_CAP_FMT_DECMPFS_COMPRESSION | + VOL_CAP_FMT_HIDDEN_FILES; + fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK | + VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_USERACCESS | +#if NAMEDSTREAMS + VOL_CAP_INT_NAMEDSTREAMS | +#endif + VOL_CAP_INT_MANLOCK; + + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED2] = 0; + + /* Check if we are case-sensitive */ + if (zfsvfs->z_case == ZFS_CASE_SENSITIVE) + fsap->f_capabilities.capabilities[ + VOL_CAPABILITIES_FORMAT] |= + VOL_CAP_FMT_CASE_SENSITIVE; + + /* Check if xattr is enabled */ + if (zfsvfs->z_xattr == B_TRUE) { + fsap->f_capabilities.capabilities[ + VOL_CAPABILITIES_INTERFACES] |= + VOL_CAP_INT_EXTENDED_ATTR; + } + + // Check if mimic is on + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + if (strcmp(vfsstatfs->f_fstypename, "hfs") == 0) { + fsap->f_capabilities.capabilities[ + VOL_CAPABILITIES_FORMAT] |= + VOL_CAP_FMT_DECMPFS_COMPRESSION; + } + + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + fsap->f_attributes.validattr.commonattr = + ATTR_CMN_NAME | + ATTR_CMN_DEVID | + ATTR_CMN_FSID | + ATTR_CMN_OBJTYPE | + ATTR_CMN_OBJTAG | + ATTR_CMN_OBJID | + ATTR_CMN_OBJPERMANENTID | + ATTR_CMN_PAROBJID | + /* ATTR_CMN_SCRIPT | */ + ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME | + ATTR_CMN_CHGTIME | + ATTR_CMN_ACCTIME | + /* ATTR_CMN_BKUPTIME | */ + ATTR_CMN_FNDRINFO | + ATTR_CMN_OWNERID | + ATTR_CMN_GRPID | + ATTR_CMN_ACCESSMASK | + ATTR_CMN_FLAGS | + ATTR_CMN_USERACCESS | + ATTR_CMN_EXTENDED_SECURITY | + ATTR_CMN_UUID | + ATTR_CMN_GRPUUID | +#ifdef ATTR_CMN_DOCUMENT_ID + ATTR_CMN_DOCUMENT_ID | +#endif +#ifdef ATTR_CMN_GEN_COUNT + ATTR_CMN_GEN_COUNT | +#endif + 0; + fsap->f_attributes.validattr.volattr = + ATTR_VOL_FSTYPE | + ATTR_VOL_SIGNATURE | + ATTR_VOL_SIZE | + ATTR_VOL_SPACEFREE | + ATTR_VOL_SPACEAVAIL | + ATTR_VOL_MINALLOCATION | + ATTR_VOL_ALLOCATIONCLUMP | + ATTR_VOL_IOBLOCKSIZE | + ATTR_VOL_OBJCOUNT | + ATTR_VOL_FILECOUNT | + ATTR_VOL_DIRCOUNT | + ATTR_VOL_MAXOBJCOUNT | + /* ATTR_VOL_MOUNTPOINT | */ + ATTR_VOL_NAME | + ATTR_VOL_MOUNTFLAGS | + /* ATTR_VOL_MOUNTEDDEVICE | */ + /* ATTR_VOL_ENCODINGSUSED | */ + ATTR_VOL_CAPABILITIES | + ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.validattr.dirattr = + ATTR_DIR_LINKCOUNT | + ATTR_DIR_ENTRYCOUNT | + ATTR_DIR_MOUNTSTATUS; + fsap->f_attributes.validattr.fileattr = + ATTR_FILE_LINKCOUNT | + ATTR_FILE_TOTALSIZE | + ATTR_FILE_ALLOCSIZE | + /* ATTR_FILE_IOBLOCKSIZE */ + ATTR_FILE_DEVTYPE | + /* ATTR_FILE_FORKCOUNT */ + /* ATTR_FILE_FORKLIST */ + 
ATTR_FILE_DATALENGTH | + ATTR_FILE_DATAALLOCSIZE | + ATTR_FILE_RSRCLENGTH | + ATTR_FILE_RSRCALLOCSIZE; + fsap->f_attributes.validattr.forkattr = 0; + fsap->f_attributes.nativeattr.commonattr = + ATTR_CMN_NAME | + ATTR_CMN_DEVID | + ATTR_CMN_FSID | + ATTR_CMN_OBJTYPE | + ATTR_CMN_OBJTAG | + ATTR_CMN_OBJID | + ATTR_CMN_OBJPERMANENTID | + ATTR_CMN_PAROBJID | + /* ATTR_CMN_SCRIPT | */ + ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME | + /* ATTR_CMN_CHGTIME | */ /* Supported but not native */ + ATTR_CMN_ACCTIME | + /* ATTR_CMN_BKUPTIME | */ + /* ATTR_CMN_FNDRINFO | */ + ATTR_CMN_OWNERID | /* Supported but not native */ + ATTR_CMN_GRPID | /* Supported but not native */ + ATTR_CMN_ACCESSMASK | /* Supported but not native */ + ATTR_CMN_FLAGS | + ATTR_CMN_USERACCESS | + ATTR_CMN_EXTENDED_SECURITY | + ATTR_CMN_UUID | + ATTR_CMN_GRPUUID | +#ifdef ATTR_CMN_DOCUMENT_ID + ATTR_CMN_DOCUMENT_ID | +#endif +#ifdef ATTR_CMN_GEN_COUNT + ATTR_CMN_GEN_COUNT | +#endif + 0; + fsap->f_attributes.nativeattr.volattr = + ATTR_VOL_FSTYPE | + ATTR_VOL_SIGNATURE | + ATTR_VOL_SIZE | + ATTR_VOL_SPACEFREE | + ATTR_VOL_SPACEAVAIL | + ATTR_VOL_MINALLOCATION | + ATTR_VOL_ALLOCATIONCLUMP | + ATTR_VOL_IOBLOCKSIZE | + ATTR_VOL_OBJCOUNT | + ATTR_VOL_FILECOUNT | + ATTR_VOL_DIRCOUNT | + ATTR_VOL_MAXOBJCOUNT | + /* ATTR_VOL_MOUNTPOINT | */ + ATTR_VOL_NAME | + ATTR_VOL_MOUNTFLAGS | + /* ATTR_VOL_MOUNTEDDEVICE | */ + /* ATTR_VOL_ENCODINGSUSED */ + ATTR_VOL_CAPABILITIES | + ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.nativeattr.dirattr = 0; + fsap->f_attributes.nativeattr.fileattr = + /* ATTR_FILE_LINKCOUNT | */ /* Supported but not native */ + ATTR_FILE_TOTALSIZE | + ATTR_FILE_ALLOCSIZE | + /* ATTR_FILE_IOBLOCKSIZE */ + ATTR_FILE_DEVTYPE | + /* ATTR_FILE_FORKCOUNT */ + /* ATTR_FILE_FORKLIST */ + ATTR_FILE_DATALENGTH | + ATTR_FILE_DATAALLOCSIZE | + ATTR_FILE_RSRCLENGTH | + ATTR_FILE_RSRCALLOCSIZE; + fsap->f_attributes.nativeattr.forkattr = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + if (VFSATTR_IS_ACTIVE(fsap, f_create_time)) { + char osname[MAXNAMELEN]; + uint64_t value; + + // Get dataset name + dmu_objset_name(zfsvfs->z_os, osname); + dsl_prop_get_integer(osname, "CREATION", + &value, NULL); + fsap->f_create_time.tv_sec = value; + fsap->f_create_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_create_time); + } + if (VFSATTR_IS_ACTIVE(fsap, f_modify_time)) { + timestruc_t now; + uint64_t mtime[2]; + + gethrestime(&now); + ZFS_TIME_ENCODE(&now, mtime); + // fsap->f_modify_time = mtime; + ZFS_TIME_DECODE(&fsap->f_modify_time, mtime); + + VFSATTR_SET_SUPPORTED(fsap, f_modify_time); + } + /* + * For Carbon compatibility, pretend to support this legacy/unused + * attribute + */ + if (VFSATTR_IS_ACTIVE(fsap, f_backup_time)) { + fsap->f_backup_time.tv_sec = 0; + fsap->f_backup_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_backup_time); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + char osname[MAXNAMELEN], *slash; + dmu_objset_name(zfsvfs->z_os, osname); + + slash = strrchr(osname, '/'); + if (slash) { + /* Advance past last slash */ + slash += 1; + } else { + /* Copy whole osname (pool root) */ + slash = osname; + } + strlcpy(fsap->f_vol_name, slash, MAXPATHLEN); + + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + dprintf("vfs_getattr: volume name '%s'\n", fsap->f_vol_name); + } + + /* If we are mimicking, we need userland know we are really ZFS */ + VFSATTR_RETURN(fsap, f_fssubtype, MNTTYPE_ZFS_SUBTYPE); + + /* + * According to joshade over at + * https://github.com/joshado/liberate-applefileserver/blob/ + * 
master/liberate.m + * the following values need to be returned for it to be considered + * by Apple's AFS. + */ + VFSATTR_RETURN(fsap, f_signature, 0x482b); /* "H+" in ascii */ + VFSATTR_RETURN(fsap, f_carbon_fsid, 0); + // Make up a UUID here, based on the name + if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) { + + char osname[MAXNAMELEN]; + int error; + // Get dataset name + dmu_objset_name(zfsvfs->z_os, osname); + dprintf("%s: osname [%s]\n", __func__, osname); + + if ((error = zfs_vfs_uuid_gen(osname, + fsap->f_uuid)) != 0) { + dprintf("%s uuid_gen error %d\n", __func__, error); + } else { + /* return f_uuid in fsap */ + VFSATTR_SET_SUPPORTED(fsap, f_uuid); + } + } + + uint64_t missing = 0; + missing = (fsap->f_active ^ (fsap->f_active & fsap->f_supported)); + if (missing != 0) { + dprintf("%s: asked %08llx reply %08llx missing %08llx\n", + __func__, fsap->f_active, fsap->f_supported, + missing); + } + + ZFS_EXIT(zfsvfs); + + return (0); +} + +int +zfs_vnode_lock(vnode_t *vp, int flags) +{ + int error; + + ASSERT(vp != NULL); + + error = vn_lock(vp, flags); + return (error); +} + +/* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data + * blocks but can't because they are all pinned by entries in these caches. + */ + +/* Get vnode for the root object of this mount */ +int +zfs_vfs_root(struct mount *mp, vnode_t **vpp, __unused vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + znode_t *rootzp = NULL; + int error; + + if (!zfsvfs) { + struct vfsstatfs *stat = 0; + if (mp) stat = vfs_statfs(mp); + if (stat) + printf("%s mp on %s from %s\n", __func__, + stat->f_mntonname, stat->f_mntfromname); + printf("%s no zfsvfs yet for mp\n", __func__); + return (EINVAL); + } + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + else + *vpp = NULL; + + ZFS_EXIT(zfsvfs); + + if (error == 0 && *vpp != NULL) + if (vnode_vtype(*vpp) != VDIR) { + panic("%s: not a directory\n", __func__); + } + + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + /* + * We have experienced deadlocks with dmu_recv_end happening between + * suspend_fs() and resume_fs(). Clearly something is not quite ready + * so we will wait for pools to be synced first. + * This is considered a temporary solution until we can work out + * the full issue. + */ + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* + * If someone has not already unmounted this file system, + * drain the iput_taskq to ensure all active references to the + * zfs_sb_t have been handled only then can it be safely destroyed. + */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but iputs run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. + * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. 
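+ *
+ * On macOS the wait below is deliberately limited to a single
+ * pass over the zrele taskq, since looping until the list is
+ * empty has been seen to hang the unmount.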
+ */ + int round = 0; + while (!list_empty(&zfsvfs->z_all_znodes)) { + taskq_wait_outstanding(dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + if (++round > 1 && !unmounting) + break; + break; /* Only loop once - osx can get stuck */ + } + } + + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ + cache_purgevfs(zfsvfs->z_parent->z_vfs); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + /* + * At this point there are no VFS ops active, and any new VFS ops + * will fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. We also grab an extra reference to all + * the remaining inodes so that the kernel does not attempt to free + * any inodes of a suspended fs. This can cause deadlocks since the + * zfs_resume_fs() process may involve starting threads, which might + * attempt to free unreferenced inodes to free up memory for the new + * thread. + */ + if (!unmounting) { + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (zp->z_sa_hdl) + zfs_znode_dmu_fini(zp); + if (VN_HOLD(ZTOV(zp)) == 0) { + vnode_ref(ZTOV(zp)); + zp->z_suspended = B_TRUE; + VN_RELE(ZTOV(zp)); + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + } + + /* + * If we are unmounting, set the unmounted flag and let new VFS ops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other VFS ops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + /* + * Evict cached data. We must write out any dirty data before + * disowning the dataset. 
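+ *
+ * The code below checks every open txg for dirty data, syncs the
+ * pool if the filesystem is writable and anything is pending,
+ * then evicts the dbufs and cancels any waiters on the dsl_dir.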
+ */ + objset_t *os = zfsvfs->z_os; + boolean_t os_dirty = B_FALSE; + for (int t = 0; t < TXG_SIZE; t++) { + if (dmu_objset_is_dirty(os, t)) { + os_dirty = B_TRUE; + break; + } + } + if (!zfs_is_readonly(zfsvfs) && os_dirty) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + } + dmu_objset_evict_dbufs(zfsvfs->z_os); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); + + return (0); +} + +int +zfs_vfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + objset_t *os; + char osname[MAXNAMELEN]; + int ret; + /* cred_t *cr = (cred_t *)vfs_context_ucred(context); */ + int destroyed_zfsctl = 0; + + dprintf("%s\n", __func__); + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* Save osname for later */ + dmu_objset_name(zfsvfs->z_os, osname); + + /* + * We might skip the sync called in the unmount path, since + * zfs_vfs_sync() is generally ignoring xnu's calls, and alas, + * mount_isforce() is set AFTER that sync call, so we can not + * detect unmount is inflight. But why not just sync now, it + * is safe. Optionally, sync if (mount_isforce()); + */ + spa_sync_allpools(); + + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots. + */ + cache_purgevfs(zfsvfs->z_parent->z_vfs); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + * + * Unfortunately, XNU will check for mounts in preflight, and + * simply not call us at all if snapshots are mounted. + * We expect userland to unmount snapshots now. + */ + + ret = vflush(mp, NULLVP, SKIPSYSTEM); + + if (mntflags & MNT_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * We must release ctldir before vflush on osx. + */ + if (zfsvfs->z_ctldir != NULL) { + destroyed_zfsctl = 1; + zfsctl_destroy(zfsvfs); + } + + /* + * Flush all the files. + */ + ret = vflush(mp, NULLVP, + (mntflags & MNT_FORCE) ? 
FORCECLOSE|SKIPSYSTEM : SKIPSYSTEM); + + if ((ret != 0) && !(mntflags & MNT_FORCE)) { + if (destroyed_zfsctl) + zfsctl_create(zfsvfs); + return (ret); + } + + /* If we are ourselves a snapshot */ + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + /* Wake up anyone waiting for unmount */ + zfsctl_mount_signal(osname, B_FALSE); + } + + if (!vfs_isrdonly(zfsvfs->z_vfs) && + spa_writeable(dmu_objset_spa(zfsvfs->z_os)) && + !(mntflags & MNT_FORCE)) { + /* Update the last-unmount time for Spotlight's next mount */ + timestruc_t now; + dmu_tx_t *tx; + int error; + uint64_t value; + + dprintf("ZFS: '%s' Updating spotlight LASTUNMOUNT property\n", + osname); + + gethrestime(&now); + zfsvfs->z_last_unmount_time = now.tv_sec; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + value = zfsvfs->z_last_unmount_time; + error = zap_update(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_prop_to_name(ZFS_PROP_LASTUNMOUNT), + 8, 1, + &value, tx); + dmu_tx_commit(tx); + } + dprintf("ZFS: '%s' set lastunmount to 0x%lx (%d)\n", + osname, zfsvfs->z_last_unmount_time, error); + } + + /* + * Last chance to dump unreferenced system files. + */ + (void) vflush(mp, NULLVP, FORCECLOSE); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + zfs_freevfs(zfsvfs->z_vfs); + + dprintf("zfs_osx_proxy_remove"); + zfs_osx_proxy_remove(osname); + + return (0); +} + +static int +zfs_vget_internal(zfsvfs_t *zfsvfs, ino64_t ino, vnode_t **vpp) +{ + znode_t *zp; + int err; + + printf("vget get %llu\n", ino); + + /* + * Check to see if we expect to find this in the hardlink avl tree of + * hashes. Use the MSB set high as indicator. + */ + hardlinks_t *findnode = NULL; + if ((1ULL<<31) & ino) { + hardlinks_t *searchnode; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + + dprintf("ZFS: vget looking for (%llx,%llu)\n", ino, ino); + + searchnode->hl_linkid = ino; + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks_linkid, searchnode, + &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + + kmem_free(searchnode, sizeof (hardlinks_t)); + + if (findnode) { + dprintf("ZFS: vget found (%llu, %llu, %u): '%s'\n", + findnode->hl_parent, + findnode->hl_fileid, findnode->hl_linkid, + findnode->hl_name); + // Lookup the actual zp instead + ino = findnode->hl_fileid; + } // findnode + } // MSB set + + + /* We can not be locked during zget. */ + if (!ino) { + dprintf("%s: setting ino from %lld to 2\n", __func__, ino); + ino = 2; + } + + err = zfs_zget(zfsvfs, ino, &zp); + + if (err) { + dprintf("zget failed %d\n", err); + return (err); + } + + /* Don't expose EA objects! */ + if (zp->z_pflags & ZFS_XATTR) { + err = ENOENT; + goto out; + } + if (zp->z_unlinked) { + err = EINVAL; + goto out; + } + + *vpp = ZTOV(zp); + + err = zfs_vnode_lock(*vpp, 0 /* flags */); + + /* + * Spotlight requires that vap->va_name() is set when returning + * from vfs_vget, so that vfs_getrealpath() can succeed in returning + * a path to mds. 
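+ *
+ * The name is taken from the first of these that applies: the
+ * dataset name for the root znode, the hardlink AVL entry found
+ * above, the znode's cached z_name_cache, or a reverse ZAP
+ * lookup of the parent directory via zap_value_search().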
+ */ + char *name = kmem_alloc(MAXPATHLEN + 2, KM_SLEEP); + + /* Root can't lookup in ZAP */ + if (zp->z_id == zfsvfs->z_root) { + + dmu_objset_name(zfsvfs->z_os, name); + dprintf("vget: set root '%s'\n", name); + vnode_update_identity(*vpp, NULL, name, + strlen(name), 0, VNODE_UPDATE_NAME); + + } else { + uint64_t parent; + + // if its a hardlink cache + if (findnode) { + + dprintf("vget: updating vnode to '%s' parent %llu\n", + findnode->hl_name, findnode->hl_parent); + + vnode_update_identity(*vpp, + NULL, findnode->hl_name, + strlen(findnode->hl_name), 0, + VNODE_UPDATE_NAME|VNODE_UPDATE_PARENT); + mutex_enter(&zp->z_lock); + strlcpy(zp->z_name_cache, findnode->hl_name, PATH_MAX); + zp->z_finder_parentid = findnode->hl_parent; + mutex_exit(&zp->z_lock); + + + // If we already have the name, cached in zfs_vnop_lookup + } else if (zp->z_name_cache[0]) { + dprintf("vget: cached name '%s'\n", zp->z_name_cache); + vnode_update_identity(*vpp, NULL, zp->z_name_cache, + strlen(zp->z_name_cache), 0, + VNODE_UPDATE_NAME); + + /* If needed, if findnode is set, update the parentid */ + + } else { + + /* Lookup name from ID, grab parent */ + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)) == 0); + + if (zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), name) == 0) { + + dprintf("vget: set name '%s'\n", name); + vnode_update_identity(*vpp, NULL, name, + strlen(name), 0, + VNODE_UPDATE_NAME); + } else { + dprintf("vget: unable to get name for %llu\n", + zp->z_id); + } // !zap_search + } + } // rootid + + kmem_free(name, MAXPATHLEN + 2); + +out: + + if (err != 0) { + VN_RELE(ZTOV(zp)); + *vpp = NULL; + } + + dprintf("vget return %d\n", err); + return (err); +} + +/* + * Get a vnode from a file id (ignoring the generation) + * + * Use by NFS Server (readdirplus) and VFS (build_path) + */ +int +zfs_vfs_vget(struct mount *mp, ino64_t ino, vnode_t **vpp, + __unused vfs_context_t context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + int error; + + printf("%s: %llu\n", __func__, ino); + + ZFS_ENTER(zfsvfs); + + /* We also need to handle (.zfs) and (.zfs/snapshot). */ + if ((ino == ZFSCTL_INO_ROOT) && (zfsvfs->z_ctldir != NULL)) { + if (VN_HOLD(zfsvfs->z_ctldir) == 0) { + *vpp = zfsvfs->z_ctldir; + error = 0; + } else { + error = ENOENT; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * This one is trickier, we have no reference to it, but it is + * in the all list. A little expensive to search list, but at + * least "snapshot" is infrequently accessed + * We also need to check if it is a ".zfs/snapshot/$name" entry - + * luckily we keep the "lowest" ID seen, so we only need to check + * when it is in the range. + */ + if (zfsvfs->z_ctldir != NULL) { + + /* + * Either it is the snapdir itself, or one of the snapshot + * directories inside it + */ + if ((ino == ZFSCTL_INO_SNAPDIR) || + ((ino >= zfsvfs->z_ctldir_startid) && + (ino <= ZFSCTL_INO_SNAPDIRS))) { + znode_t *zp; + + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (zp->z_id == ino) + break; + if (zp->z_id == ZFSCTL_INO_SHARES - ino) + break; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + error = ENOENT; + if (zp != NULL) { + if (VN_HOLD(ZTOV(zp)) == 0) { + *vpp = ZTOV(zp); + error = 0; + } + } + + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * On Mac OS X we always export the root directory id as 2. 
+ * So we don't expect to see the real root directory id + * from zfs_vfs_vget KPI (unless of course the real id was + * already 2). + */ + ino = INO_XNUTOZFS(ino, zfsvfs->z_root); + + error = zfs_vget_internal(zfsvfs, ino, vpp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +int +zfs_vfs_setattr(__unused struct mount *mp, __unused struct vfs_attr *fsap, + __unused vfs_context_t context) +{ + // 10a286 bits has an implementation of this: to set volume name. + return (ENOTSUP); +} + +/* + * NFS Server File Handle File ID + */ +typedef struct zfs_zfid { + uint8_t zf_object[8]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[8]; /* gen[i] = gen >> (8 * i) */ +} zfs_zfid_t; + +/* + * File handle to vnode pointer + */ +int +zfs_vfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, + vnode_t **vpp, __unused vfs_context_t context) +{ + dprintf("%s\n", __func__); + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + zfs_zfid_t *zfid = (zfs_zfid_t *)fhp; + znode_t *zp; + uint64_t obj_num = 0; + uint64_t fid_gen = 0; + uint64_t zp_gen; + int i; + int error; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fhlen < sizeof (zfs_zfid_t)) { + error = EINVAL; + goto out; + } + + /* + * Grab the object and gen numbers in an endian neutral manner + */ + for (i = 0; i < sizeof (zfid->zf_object); i++) + obj_num |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + + if ((error = zfs_zget(zfsvfs, obj_num, &zp))) { + goto out; + } + + zp_gen = zp->z_gen; + if (zp_gen == 0) + zp_gen = 1; + + if (zp->z_unlinked || zp_gen != fid_gen) { + vnode_put(ZTOV(zp)); + error = EINVAL; + goto out; + } + *vpp = ZTOV(zp); +out: + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Vnode pointer to File handle + * + * XXX Do we want to check the DSL sharenfs property? + */ +int +zfs_vfs_vptofh(vnode_t *vp, int *fhlenp, unsigned char *fhp, + __unused vfs_context_t context) +{ + dprintf("%s\n", __func__); + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + zfs_zfid_t *zfid = (zfs_zfid_t *)fhp; + znode_t *zp = VTOZ(vp); + uint64_t obj_num; + uint64_t zp_gen; + int i; + + if (*fhlenp < sizeof (zfs_zfid_t)) { + return (EOVERFLOW); + } + + ZFS_ENTER(zfsvfs); + + obj_num = zp->z_id; + zp_gen = zp->z_gen; + if (zp_gen == 0) + zp_gen = 1; + + /* + * Store the object and gen numbers in an endian neutral manner + */ + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(obj_num >> (8 * i)); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(zp_gen >> (8 * i)); + + *fhlenp = sizeof (zfs_zfid_t); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Reopen zfsvfs_t::z_os and release VOPs. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err, err2; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. 
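+ *
+ * The dataset is re-resolved to its objset under the pool
+ * config lock, and zfsvfs_init() then re-caches the on-disk
+ * state (SA layout, root object, quota objects) from it.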
+ */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + ds->ds_dir->dd_activity_cancelled = B_FALSE; + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active inodes with their + * dbufs. If a zfs_rezget() fails, then we unhash the inode + * and mark it stale. This prevents a collision if a new + * inode/object is created which must use the same inode + * number. The stale inode will be be released when the + * VFS prunes the dentry holding the remaining references + * on the stale inode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + err2 = zfs_rezget(zp); + if (err2) { + zp->z_is_stale = B_TRUE; + } + + /* see comment in zfs_suspend_fs() */ + if (zp->z_suspended) { + if (vnode_getwithref(ZTOV(zp)) == 0) { + vnode_rele(ZTOV(zp)); + zfs_zrele_async(zp); + zp->z_suspended = B_FALSE; + } + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (!vfs_isrdonly(zfsvfs->z_vfs) && !zfsvfs->z_unmounted) { + /* + * zfs_suspend_fs() could have interrupted freeing + * of dnodes. We need to restart this freeing so + * that we don't "leak" the space. + */ + zfs_unlinked_drain(zfsvfs); + } + + cache_purgevfs(zfsvfs->z_parent->z_vfs); + +bail: + /* release the VFS ops */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ + if (zfsvfs->z_os) + zfs_vfs_unmount(zfsvfs->z_vfs, 0, NULL); + } + return (err); +} + + +void +zfs_freevfs(struct mount *vfsp) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + + dprintf("+freevfs\n"); + + vfs_setfsprivate(vfsp, NULL); + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); + dprintf("-freevfs\n"); +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); + + /* Start arc_os - reclaim thread */ + arc_os_init(); + +} + +void +zfs_fini(void) +{ + arc_os_fini(); + zfsctl_fini(); + zfs_znode_fini(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. 
+ */ + zfs_vfs_unmount(zfsvfs->z_vfs, 0, NULL); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal(dmu_objset_spa(os), "upgrade", tx, + "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, + newvers, dmu_objset_id(os)); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLMODE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. 
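zfs_get_zplprop() resolves a property in three steps: a per-objset cached copy, then the value stored in the master node, then a built-in default. A user-space sketch of that fallback chain (struct prop_store and PROP_UNINITIALIZED are stand-ins for the objset fields, not names from the patch):

#include <stdint.h>
#include <stdio.h>

#define PROP_UNINITIALIZED  UINT64_MAX  /* stand-in for OBJSET_PROP_UNINITIALIZED */

struct prop_store {
    uint64_t cached;    /* per-objset cached copy */
    int has_stored;     /* does the master node hold a value? */
    uint64_t stored;    /* value stored on disk */
    uint64_t def;       /* default when nothing is set */
};

static uint64_t
get_prop(struct prop_store *ps)
{
    if (ps->cached != PROP_UNINITIALIZED)
        return (ps->cached);        /* fast path */
    uint64_t v = ps->has_stored ? ps->stored : ps->def;
    ps->cached = v;                 /* warm the cache for next time */
    return (v);
}

int
main(void)
{
    struct prop_store ver = { PROP_UNINITIALIZED, 0, 0, 5 /* default version */ };
    printf("version=%llu\n", (unsigned long long)get_prop(&ver));
    return (0);
}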
+ */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the coresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (vfs_isunmount(zfvp->z_vfs))) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} diff --git a/module/os/macos/zfs/zfs_vnops.c b/module/os/macos/zfs/zfs_vnops.c new file mode 100644 index 0000000000..de33bbaeae --- /dev/null +++ b/module/os/macos/zfs/zfs_vnops.c @@ -0,0 +1,4563 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2013, 2020 Jorgen Lundman + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int zfs_vnop_force_formd_normalized_output = 0; /* disabled by default */ + +#undef dprintf +#define dprintf printf + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. 
+ * + * (2) zrele() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call zrele() within a tx then use zfs_zrele_async(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * zrele(...); // release held znodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * zrele(...); // release held znodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* + * Virus scanning is unsupported. It would be possible to add a hook + * here to performance the required virus scan. This could be done + * entirely in the kernel or potentially as an update to invoke a + * scanning utility. + */ +static int +zfs_vscan(struct vnode *vp, cred_t *cr, int async) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_open(struct vnode *vp, int mode, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Honor ZFS_APPENDONLY file attribute */ + if ((mode & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & O_APPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Virus scan eligible files on open */ + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(zp->z_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (zfs_vscan(vp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +int +zfs_close(struct vnode *vp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(zp->z_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(zfs_vscan(vp, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +/* + * Lseek support for finding holes (cmd == SEEK_HOLE) and + * data (cmd == SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey_common(struct vnode *vp, int cmd, loff_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. 
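From user space this hole/data lookup is driven through lseek(2) with the SEEK_HOLE and SEEK_DATA whence values, where the platform defines them. A small sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
    if (argc < 2)
        return (1);
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0)
        return (1);
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
    off_t data = lseek(fd, 0, SEEK_DATA);   /* first data at or after offset 0 */
    off_t hole = lseek(fd, 0, SEEK_HOLE);   /* first hole at or after offset 0 */
    printf("data@%lld hole@%lld\n", (long long)data, (long long)hole);
#endif
    close(fd);
    return (0);
}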
If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +int +zfs_holey(struct vnode *vp, int cmd, loff_t *off) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_holey_common(vp, cmd, off); + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* SEEK_HOLE && SEEK_DATA */ + +#if defined(_KERNEL) +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +static void +update_pages(vnode_t *vp, int64_t start, int64_t len, + objset_t *os, uint64_t oid) +{ + znode_t *zp = VTOZ(vp); + int error = 0; + vm_offset_t vaddr = 0; + upl_t upl; + upl_page_info_t *pl = NULL; + int upl_size; + int upl_page; + off_t off; + + off = start & (PAGE_SIZE - 1); + start &= ~PAGE_MASK; + + upl_size = (off + len + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + // dprintf("update_pages: start 0x%llx len 0x%llx: 1st off x%llx\n", + // start, len, off); + /* + * Create a UPL for the current range and map its + * page list into the kernel virtual address space. + */ + error = ubc_create_upl(vp, start, upl_size, &upl, &pl, + UPL_FILE_IO | UPL_SET_LITE); + if ((error != KERN_SUCCESS) || !upl) { + printf("ZFS: update_pages failed to ubc_create_upl: %d\n", + error); + return; + } + + if (ubc_upl_map(upl, &vaddr) != KERN_SUCCESS) { + printf("ZFS: update_pages failed to ubc_upl_map: %d\n", + error); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + return; + } + for (upl_page = 0; len > 0; ++upl_page) { + uint64_t nbytes = MIN(PAGESIZE - off, len); + /* + * We don't want a new page to "appear" in the middle of + * the file update (because it may not get the write + * update data), so we grab a lock to block + * zfs_getpage(). + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (pl && upl_valid_page(pl, upl_page)) { + rw_exit(&zp->z_map_lock); + (void) dmu_read(os, oid, start+off, nbytes, + (void *)(vaddr+off), DMU_READ_PREFETCH); + + } else { // !upl_valid_page + rw_exit(&zp->z_map_lock); + } + vaddr += PAGE_SIZE; + start += PAGE_SIZE; + len -= nbytes; + off = 0; + } + + /* + * Unmap the page list and free the UPL. + */ + (void) ubc_upl_unmap(upl); + /* + * We want to abort here since due to dmu_write() + * we effectively didn't dirty any pages. + */ + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. 
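Both update_pages() above and mappedread() below round the requested byte range to page boundaries before creating the UPL, then walk it one page at a time. The arithmetic on its own, as a runnable user-space sketch (MY_PAGE_SIZE/MY_PAGE_MASK are local stand-ins, using the XNU convention that PAGE_MASK is the low-bit mask):

#include <stdio.h>
#include <stdint.h>

#define MY_PAGE_SIZE    4096ULL
#define MY_PAGE_MASK    (MY_PAGE_SIZE - 1)  /* XNU-style low-bit mask */

int
main(void)
{
    uint64_t start = 10000, len = 9000;

    uint64_t off = start & MY_PAGE_MASK;        /* offset within first page */
    uint64_t upl_start = start & ~MY_PAGE_MASK; /* round start down */
    uint64_t upl_size = (off + len + MY_PAGE_MASK) & ~MY_PAGE_MASK; /* round size up */

    printf("upl_start=%llu off=%llu upl_size=%llu\n",
        (unsigned long long)upl_start, (unsigned long long)off,
        (unsigned long long)upl_size);

    /* Walk the range one page at a time, as the copy loops do. */
    for (uint64_t left = len, o = off; left > 0; o = 0) {
        uint64_t nbytes = (MY_PAGE_SIZE - o < left) ? MY_PAGE_SIZE - o : left;
        printf("  chunk of %llu bytes\n", (unsigned long long)nbytes);
        left -= nbytes;
    }
    return (0);
}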
+ */ +static int +mappedread(struct vnode *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + int len = nbytes; + int error = 0; + vm_offset_t vaddr = 0; + upl_t upl; + upl_page_info_t *pl = NULL; + off_t upl_start; + int upl_size; + int upl_page; + off_t off; + + upl_start = uio_offset(uio); + off = upl_start & PAGE_MASK; + upl_start &= ~PAGE_MASK; + upl_size = (off + nbytes + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + /* + * Create a UPL for the current range and map its + * page list into the kernel virtual address space. + */ + error = ubc_create_upl(vp, upl_start, upl_size, &upl, &pl, + UPL_FILE_IO | UPL_SET_LITE); + if ((error != KERN_SUCCESS) || !upl) { + return (EIO); + } + + if (ubc_upl_map(upl, &vaddr) != KERN_SUCCESS) { + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + return (ENOMEM); + } + + for (upl_page = 0; len > 0; ++upl_page) { + uint64_t bytes = MIN(PAGE_SIZE - off, len); + if (pl && upl_valid_page(pl, upl_page)) { + uio_setrw(uio, UIO_READ); + error = uiomove((caddr_t)vaddr + off, bytes, UIO_READ, + uio); + } else { + error = dmu_read_uio(os, zp->z_id, uio, bytes); + } + + vaddr += PAGE_SIZE; + len -= bytes; + off = 0; + if (error) + break; + } + + /* + * Unmap the page list and free the UPL. + */ + (void) ubc_upl_unmap(upl); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + + return (error); +} +#endif /* _KERNEL */ + +unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ +unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: ip - inode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - O_SYNC flags; used to provide FRSYNC semantics. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * inode - atime updated if byte count > 0 + */ +/* ARGSUSED */ +int +zfs_read(struct vnode *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + boolean_t frsync = B_FALSE; + + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio_offset(uio) < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio_resid(uio) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef FRSYNC + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + * Only do this for non-snapshots. + * + * Some platforms do not support FRSYNC and instead map it + * to O_SYNC, which results in unnecessary calls to zil_commit. We + * only honor FRSYNC requests on platforms which support it. + */ + frsync = !!(ioflag & FRSYNC); +#endif + if (zfsvfs->z_log && + (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, + uio_offset(uio), uio_resid(uio), RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. 
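The read loop that follows never crosses a zfs_read_chunk_size boundary in a single DMU call; P2PHASE() gives the distance already consumed inside the current chunk, so the first piece is trimmed to end exactly on a chunk boundary. A runnable sketch of that carving:

#include <stdio.h>
#include <stdint.h>

#define P2PHASE(x, align)   ((x) & ((align) - 1))  /* align must be a power of two */

int
main(void)
{
    const uint64_t chunk = 1024 * 1024; /* matches the zfs_read_chunk_size default */
    uint64_t off = 3 * chunk - 1000;    /* request straddles a chunk boundary */
    uint64_t resid = 5000;

    while (resid > 0) {
        uint64_t nbytes = chunk - P2PHASE(off, chunk);
        if (nbytes > resid)
            nbytes = resid;
        printf("read %llu bytes at %llu\n",
            (unsigned long long)nbytes, (unsigned long long)off);
        off += nbytes;
        resid -= nbytes;
    }
    return (0);
}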
+ */ + if (uio_offset(uio) >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio_offset(uio) < zp->z_size); + ssize_t n = MIN(uio_resid(uio), zp->z_size - uio_offset(uio)); + + while (n > 0) { + ssize_t nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio_offset(uio), zfs_read_chunk_size)); + + if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { + error = mappedread(vp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } + +out: + zfs_rangelock_exit(lr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: ip - inode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - O_APPEND flag set if in append mode. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +int +zfs_write(struct vnode *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + ssize_t start_resid = uio_resid(uio); + rlim64_t limit = MAXOFFSET_T; + const iovec_t *aiov = NULL; + arc_buf_t *abuf = NULL; + int write_eof; + + /* + * Fasttrack empty write + */ + ssize_t n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + sa_bulk_attr_t bulk[4]; + int count = 0; + uint64_t mtime[2], ctime[2]; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM + */ + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && + (uio_offset(uio) < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Validate file offset + */ + offset_t woff = ioflag & O_APPEND ? zp->z_size : uio_offset(uio); + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + int max_blksz = zfsvfs->z_max_blksz; + xuio_t *xuio = NULL; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFAULT)); + } + + /* + * If in append mode, set the io offset pointer to eof. + */ + zfs_locked_range_t *lr; + if (ioflag & O_APPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. 
+ * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio_setoffset(uio, woff); + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + uint64_t end_size = MAX(zp->z_size, woff + n); + zilog_t *zilog = zfsvfs->z_log; + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio_offset(uio); + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, + zp->z_uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, + zp->z_gid) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + abuf = NULL; + if (xuio) { + + } else if (n >= max_blksz && woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If rangelock_enter() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
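Two small pieces of arithmetic from the write loop above, pulled into a standalone sketch: the test that decides whether a whole ARC buffer can be borrowed and filled outside the transaction, and the next-power-of-two block-size growth computed with highbit64() (reimplemented here only for illustration):

#include <stdint.h>
#include <stdio.h>

#define P2PHASE(x, align)   ((x) & ((align) - 1))

/* Index (1-based) of the highest set bit, as illumos highbit64() returns. */
static int
highbit64(uint64_t x)
{
    int h = 0;
    while (x != 0) {
        h++;
        x >>= 1;
    }
    return (h);
}

/* Can we borrow a whole ARC buffer and fill it before entering the tx? */
static int
is_full_block_write(uint64_t n, uint64_t woff, uint64_t fsize,
    uint64_t blksz, uint64_t max_blksz)
{
    return (n >= max_blksz && woff >= fsize &&
        P2PHASE(woff, max_blksz) == 0 && blksz == max_blksz);
}

int
main(void)
{
    /* 1 << highbit64(x) rounds a non-power-of-two block size up. */
    uint64_t blksz = 132096;
    printf("grown blksz = %llu\n",
        (unsigned long long)(1ULL << highbit64(blksz)));
    printf("full block? %d\n",
        is_full_block_write(131072, 262144, 262144, 131072, 131072));
    return (0);
}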
+ */ + ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + ssize_t tx_bytes = 0; + + if (woff + nbytes > zp->z_size) + vnode_pager_setsize(vp, woff + nbytes); + + /* + * This conditional is always true in OSX, it is kept so + * the sources look familiar to other platforms + */ + if (abuf == NULL) { + tx_bytes = uio_resid(uio); + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + tx_bytes -= uio_resid(uio); + } else { + tx_bytes = nbytes; + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { + dmu_return_arcbuf(abuf); + dmu_tx_commit(tx); + break; + } + } + ASSERT(tx_bytes <= uio_resid(uio)); + uioskip(uio, tx_bytes); + } + if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { + update_pages(vp, woff, tx_bytes, zfsvfs->z_os, + zp->z_id); + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + uint32_t uid = KUID_TO_SUID(zp->z_uid); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + zp->z_mode = newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio_offset(uio)) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio_offset(uio)); + ASSERT(error == 0); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + + ASSERT(tx_bytes == nbytes); + n -= nbytes; + } + + zfs_rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. 
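The size update above never takes a lock: it re-reads z_size and retries a compare-and-swap until the recorded size is at least the end of this write, so concurrent writers can only move it forward. The same idea expressed with portable C11 atomics:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t file_size;

static void
grow_size(uint64_t new_end)
{
    uint64_t cur = atomic_load(&file_size);
    /* Only move the size forward; lose the race gracefully. */
    while (cur < new_end &&
        !atomic_compare_exchange_weak(&file_size, &cur, new_end))
        ;   /* cur was reloaded by the failed CAS; retry */
}

int
main(void)
{
    atomic_store(&file_size, 1000);
    grow_size(4096);
    grow_size(2048);    /* no effect: another writer already went further */
    printf("size=%llu\n", (unsigned long long)atomic_load(&file_size));
    return (0);
}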
+ * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio_resid(uio) == start_resid) { + dprintf("%s: error resid %llu\n", __func__, uio_resid(uio)); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (O_SYNC | O_DSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * Write the bytes to a file. + * + * IN: zp - znode of file to be written to + * data - bytes to write + * len - number of bytes to write + * pos - offset to start writing at + * + * OUT: resid - remaining bytes to write + * + * RETURN: 0 if success + * positive error code if failure + * + * Timestamps: + * zp - ctime|mtime updated if byte count > 0 + */ +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *presid) +{ + int error = 0; + ssize_t resid; + + error = zfs_vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, + pos, UIO_SYSSPACE, IO_SYNC, RLIM64_INFINITY, NOCRED, &resid); + + if (error) { + return (SET_ERROR(error)); + } else if (presid == NULL) { + if (resid != 0) { + error = SET_ERROR(EIO); + } + } else { + *presid = resid; + } + return (error); +} + +/* + * Drop a reference on the passed inode asynchronously. This ensures + * that the caller will never drop the last reference on an inode in + * the current context. Doing so while holding open a tx could result + * in a deadlock if iput_final() re-enters the filesystem code. + */ +void +zfs_zrele_async(znode_t *zp) +{ + struct vnode *vp = ZTOV(zp); + objset_t *os = ITOZSB(vp)->z_os; + + ASSERT(os != NULL); + + if (vnode_iocount(vp) == 1) + VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), + (task_func_t *)vnode_put, vp, TQ_SLEEP) != TASKQID_INVALID); + else + zrele(zp); +} + +/* ARGSUSED */ +void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + return (SET_ERROR(ENOENT)); + } + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. 
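A toy illustration of that immediate-versus-indirect choice. The real cutoff is decided in the ZIL code and depends on more than payload length (log bias, slog devices), so the enum and threshold below are purely illustrative:

#include <stdint.h>
#include <stdio.h>

enum wr_state { WR_COPIED, WR_INDIRECT };

/* Hypothetical cutoff -- not a value taken from the patch. */
#define IMMEDIATE_WRITE_SZ  (32 * 1024)

static enum wr_state
choose_record(uint64_t len)
{
    return (len <= IMMEDIATE_WRITE_SZ ? WR_COPIED : WR_INDIRECT);
}

int
main(void)
{
    printf("4K  -> %s\n", choose_record(4096) == WR_COPIED ?
        "copied into the log record" : "synced, pointer logged");
    printf("1M  -> %s\n", choose_record(1 << 20) == WR_COPIED ?
        "copied into the log record" : "synced, pointer logged");
    return (0);
}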
+ */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_access(struct vnode *vp, int mode, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held inode reference for it. + * + * IN: zdp - znode of directory to search. + * nm - name of entry to lookup. + * flags - LOOKUP_XATTR set if looking for an attribute. + * cr - credentials of caller. + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: zpp - znode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, + cred_t *cr, int *direntflags, struct componentname *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + int error = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *zpp = NULL; + + /* + * OsX has separate vnops for XATTR activity + */ + + + if (!S_ISDIR(zdp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTDIR)); + } + + /* + * Check accessibility of directory. 
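For indirect writes, zfs_get_data() above locks the whole block containing the offset and re-checks the block size after the lock is held, retrying if it changed. The alignment step on its own, as a runnable sketch:

#include <stdint.h>
#include <stdio.h>

#define P2PHASE(x, align)   ((x) & ((align) - 1))
#define ISP2(x)             (((x) & ((x) - 1)) == 0)

/* Round a file offset down to the start of its block. */
static uint64_t
block_start(uint64_t offset, uint64_t blksz)
{
    uint64_t blkoff = ISP2(blksz) ? P2PHASE(offset, blksz) : offset;
    return (offset - blkoff);
}

int
main(void)
{
    /*
     * The caller locks [block_start, block_start + blksz), then re-reads
     * the block size; if it changed while waiting for the lock, it drops
     * the lock and recomputes with the new size.
     */
    printf("4K   block: lock at %llu\n",
        (unsigned long long)block_start(200000, 4096));
    printf("128K block: lock at %llu\n",
        (unsigned long long)block_start(200000, 131072));
    return (0);
}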
+ */ + + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the ip of the created or trunc'd file. + * + * IN: dzp - znode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - file flag. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated if new entry created + * zp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr(vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *zpp = NULL; + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + zhold(dzp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible igrab(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (strcmp(name, "..") == 0) + error = SET_ERROR(EISDIR); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + uint64_t projid = ZFS_DEFAULT_PROJID; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. 
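zfs_create() here, and zfs_remove()/zfs_mkdir()/zfs_rmdir() further below, all follow the top:/ERESTART retry shape described in the Big Rules comment: assign with TXG_NOWAIT while ZPL locks are held, and on ERESTART drop the locks, wait, and retry with TXG_NOTHROTTLE added. A runnable toy version with stubbed DMU calls (the *_stub names and errno value are illustrative only):

#include <stdio.h>

#define ERESTART        85  /* illustrative value */
#define TXG_NOWAIT      0x1
#define TXG_NOTHROTTLE  0x2

static int attempts;

/* Stubs standing in for the DMU: fail once with ERESTART, then succeed. */
static int dmu_tx_assign_stub(int flags) { (void)flags; return (attempts++ == 0 ? ERESTART : 0); }
static void dmu_tx_wait_stub(void) { }
static void dmu_tx_abort_stub(void) { }
static void dmu_tx_commit_stub(void) { }

int
main(void)
{
    int waited = 0;
    int error;

top:
    /* ...create the tx and declare the holds here... */
    error = dmu_tx_assign_stub((waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
    if (error) {
        /* ...drop ZPL locks here... */
        if (error == ERESTART) {
            waited = 1;
            dmu_tx_wait_stub();
            dmu_tx_abort_stub();
            goto top;
        }
        dmu_tx_abort_stub();
        return (error);
    }
    /* ...do the work, log it, then commit... */
    dmu_tx_commit_stub();
    printf("committed after %d assign attempt(s)\n", attempts);
    return (0);
}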
+ */ + + if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EINVAL); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + + error = dmu_tx_assign(tx, + (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + /* + * Failed, have zp but on OsX we don't have a vp, as it + * would have been attached below, and we've cleared out + * zp, signal then not to call zrele() on it. + */ + if (ZTOV(zp) == NULL) { + zfs_znode_free(zp); + zp = NULL; + } + + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, zfsvfs); + + } else { + int aflags = (flag & O_APPEND) ? V_APPEND : 0; + + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl) { + error = SET_ERROR(EEXIST); + goto out; + } + /* + * Can't open a directory for writing. + */ + if (S_ISDIR(zp->z_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. + */ + if (S_ISREG(zp->z_mode) && + (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + if (dl) { + zfs_dirent_unlock(dl); + dl = NULL; + } + error = zfs_freesp(zp, 0, 0, mode, TRUE); + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + zrele(zp); + } else { + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dzp - znode of directory to remove entry from. + * name - name of entry to remove. 
+ * cr - credentials of caller. + * flags - case flags. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime + * ip - ctime (if nlink > 0) + */ + +uint64_t null_xattr = 0; + +/*ARGSUSED*/ +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; + uint64_t links; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + struct componentname *realnmp = NULL; + struct componentname realnm = { 0 }; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + + realnm.cn_nameptr = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + realnm.cn_namelen = MAXPATHLEN; + realnmp = &realnm; + } + +top: + xattr_obj = 0; + xzp = NULL; + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp))) { + if (realnmp) + kmem_free(realnm.cn_nameptr, realnm.cn_namelen); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (S_ISDIR(zp->z_mode)) { + error = SET_ERROR(EPERM); + goto out; + } + + mutex_enter(&zp->z_lock); + may_delete_now = vnode_iocount(ZTOV(zp)) == 1 && + !(zp->z_is_mapped); + mutex_exit(&zp->z_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the inode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + obj = zp->z_id; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + goto top; + } + if (realnmp) + kmem_free(realnm.cn_nameptr, realnm.cn_namelen); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + vnode_iocount(ZTOV(zp)) == 1 && + !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + zfs_external_acl(zp) == acl_obj; + } + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(xzp->z_nlink, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; + links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT0(error); + } + /* + * Add to the unlinked set because a new reference could be + * taken concurrently resulting in a deferred destruction. + */ + zfs_unlinked_add(zp, tx); + mutex_exit(&zp->z_lock); + } else if (unlinked) { + mutex_exit(&zp->z_lock); + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + if (realnmp) + kmem_free(realnm.cn_nameptr, realnm.cn_namelen); + + zfs_dirent_unlock(dl); + + if (delete_now) + zrele(zp); + else + zfs_zrele_async(zp); + + if (xzp) + zfs_zrele_async(xzp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dzp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dzp - znode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * flags - case flags. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created directory. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime updated + * zpp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(S_ISDIR(vap->va_mode)); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (dirname == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr(vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ +top: + *zpp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + /* + * Now put new name in parent dir. 
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + +out: + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, zfsvfs); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (error != 0) { + zrele(zp); + } else { + } + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dzp - znode of directory to remove from. + * name - name of directory to be removed. + * cwd - inode of current working directory. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, + int flags) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (ZTOTYPE(zp) != VDIR) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (zp == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * Grab a lock on the directory to make sure that no one is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + goto top; + } + dmu_tx_abort(tx); + zrele(zp); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + zrele(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. + * + * IN: ip - inode of directory to read. + * ctx - directory entry context. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + int flags, int *a_numdirent) +{ + + znode_t *zp = VTOZ(vp); + boolean_t extended = (flags & VNODE_READDIR_EXTENDED); + struct direntry *eodp; /* Extended */ + struct dirent *odp; /* Standard */ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + int error = 0; + uint8_t prefetch; + uint8_t type; + int numdirent = 0; + char *bufptr; + boolean_t isdotdir = B_TRUE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + goto out; + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio_curriovlen(uio) <= 0) { + error = EINVAL; + goto out; + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + goto out; + } + + error = 0; + os = zfsvfs->z_os; + offset = uio_offset(uio); + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + bytes_wanted = uio_curriovlen(uio); + bufsize = (size_t)bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + bufptr = (char *)outbuf; + + /* + * Transform to file-system independent format + */ + + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + uint64_t *next = NULL; + size_t namelen; + int force_formd_normalized_output; + size_t nfdlen; + + /* + * Special case `.', `..', and `.zfs'. 
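Because ZAP cursors always return cookies with the low four bits clear, offsets 0, 1, and 2 are free to stand for ".", "..", and (at the root) ".zfs". A small sketch of a readdir loop that hands out the synthetic entries first and then switches to cursor cookies (the low-bits-clear step below is only a stand-in for zap_cursor_serialize()):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t offset = 0;    /* resume cookie handed back to the caller */
    int show_ctldir = 1;    /* the filesystem root also exposes .zfs */

    for (int i = 0; i < 6; i++) {
        if (offset == 0)
            printf("cookie %llu -> \".\"\n", (unsigned long long)offset);
        else if (offset == 1)
            printf("cookie %llu -> \"..\"\n", (unsigned long long)offset);
        else if (offset == 2 && show_ctldir)
            printf("cookie %llu -> \".zfs\"\n", (unsigned long long)offset);
        else
            /* Real entries come from the ZAP cursor. */
            printf("cookie %llu -> next ZAP entry\n",
                (unsigned long long)offset);

        if (offset > 2 || (offset == 2 && !show_ctldir))
            offset = ((offset >> 4) + 1) << 4;  /* keep low 4 bits clear */
        else
            offset += 1;
    }
    return (0);
}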
+ */ + if (offset == 0) { + (void) strlcpy(zap.za_name, ".", MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = (zp->z_id == zfsvfs->z_root) ? 2 : zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strlcpy(zap.za_name, "..", MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = (parent == zfsvfs->z_root) ? 2 : parent; + objnum = (zp->z_id == zfsvfs->z_root) ? 1 : objnum; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strlcpy(zap.za_name, ZFS_CTLDIR_NAME, + MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + + /* This is not a special case directory */ + isdotdir = B_FALSE; + + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + } + + /* emit start */ + +#define DIRENT_RECLEN(namelen, ext) \ + ((ext) ? \ + ((sizeof (struct direntry) + (namelen) - (MAXPATHLEN-1) + 7) & ~7) \ + : \ + ((sizeof (struct dirent) - (NAME_MAX+1)) + (((namelen)+1 + 7) &~ 7))) + + /* + * Check if name will fit. + * + * Note: non-ascii names may expand (3x) when converted to NFD + */ + namelen = strlen(zap.za_name); + + /* sysctl to force formD normalization of vnop output */ + if (zfs_vnop_force_formd_normalized_output && + !is_ascii_str(zap.za_name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + if (force_formd_normalized_output) + namelen = MIN(extended ? MAXPATHLEN-1 : MAXNAMLEN, + namelen * 3); + + reclen = DIRENT_RECLEN(namelen, extended); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = (EINVAL); + goto update; + } + break; + } + + if (extended) { + /* + * Add extended flag entry: + */ + eodp = (struct direntry *)bufptr; + /* NOTE: d_seekoff is the offset for the *next* entry */ + next = &(eodp->d_seekoff); + eodp->d_ino = INO_ZFSTOXNU(objnum, zfsvfs->z_root); + eodp->d_type = type; + + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. 
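+ * For example, NFC "é" (U+00E9, two bytes of UTF-8) decomposes to
+ * NFD "e" + U+0301 (three bytes); this expansion is why namelen was
+ * reserved at up to three times the stored length before reclen was
+ * computed above.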
+ */ + namelen = strlen(zap.za_name); + if (!force_formd_normalized_output || + utf8_normalizestr((const u_int8_t *)zap.za_name, + namelen, (u_int8_t *)eodp->d_name, &nfdlen, + MAXPATHLEN-1, UTF_DECOMPOSED) != 0) { + /* ASCII or normalization failed, copy zap */ + if ((namelen > 0)) + (void) bcopy(zap.za_name, eodp->d_name, + namelen + 1); + } else { + /* Normalization succeeded (in buffer) */ + namelen = nfdlen; + } + eodp->d_namlen = namelen; + eodp->d_reclen = reclen = + DIRENT_RECLEN(namelen, extended); + + } else { + /* + * Add normal entry: + */ + + odp = (struct dirent *)bufptr; + odp->d_ino = INO_ZFSTOXNU(objnum, zfsvfs->z_root); + odp->d_type = type; + + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. + */ + namelen = strlen(zap.za_name); + if (!force_formd_normalized_output || + utf8_normalizestr((const u_int8_t *)zap.za_name, + namelen, (u_int8_t *)odp->d_name, &nfdlen, + MAXNAMLEN, UTF_DECOMPOSED) != 0) { + /* ASCII or normalization failed, copy zap */ + if ((namelen > 0)) + (void) bcopy(zap.za_name, odp->d_name, + namelen + 1); + } else { + /* Normalization succeeded (in buffer). */ + namelen = nfdlen; + } + odp->d_namlen = namelen; + odp->d_reclen = reclen = + DIRENT_RECLEN(namelen, extended); + } + + outcount += reclen; + bufptr += reclen; + numdirent++; + + ASSERT(outcount <= bufsize); + + /* emit done */ + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + if (extended) + *next = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* All done, copy temporary buffer to userland */ + if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + /* + * Reset the pointer. + */ + offset = uio_offset(uio); + } + + +update: + zap_cursor_fini(&zc); + if (outbuf) { + kmem_free(outbuf, bufsize); + } + + if (error == ENOENT) + error = 0; + + uio_setoffset(uio, offset); + if (a_numdirent) + *a_numdirent = numdirent; + +out: + ZFS_EXIT(zfsvfs); + + dprintf("-zfs_readdir: num %d\n", numdirent); + + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +int +zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + vnode_t *vp = ZTOV(zp); + + if (zp->z_is_mapped /* && !(syncflag & FNODSYNC) */ && + vnode_isreg(vp) && !vnode_isswap(vp)) { + cluster_push(vp, /* waitdata ? IO_SYNC : */ 0); + } + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED && + !vnode_isrecycled(ZTOV(zp))) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + + return (0); +} + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If ATTR_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * ct - caller context + * + * OUT: vap - attribute values. 
+ * + * RETURN: 0 (always succeeds) + */ +int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint64_t links; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = /* (flags&ATTR_NOACLCHECK)?B_TRUE: */ B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + VERIFY3P(zp->z_zfsvfs, ==, vfs_fsprivate(vnode_mount(vp))); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vnode_isblk(vp) || vnode_ischr(vp)) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + mutex_enter(&zp->z_lock); + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; + vap->va_nodeid = zp->z_id; + if (vnode_isvroot((vp)) && zfs_show_ctldir(zp)) + links = zp->z_links + 1; + else + links = zp->z_links; + vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ + vap->va_size = zp->z_size; + if (vnode_isblk(vp) || vnode_ischr(vp)) + vap->va_rdev = zfs_cmpldev(rdev); + + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. 
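+ *
+ * Each optional attribute below follows the same pattern: if the
+ * caller requested it (XVA_ISSET_REQ), derive the value from
+ * z_pflags and flag it as valid in the return mask (XVA_SET_RTN).
+ * For example, XAT_HIDDEN is answered from the ZFS_HIDDEN flag.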
+ */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vnode_isreg(vp)) { + zfs_sa_get_scanstamp(zp, xvap); + } + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)); + ZFS_TIME_DECODE(&xoap->xoa_createtime, times); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_crtime, crtime); + + mutex_exit(&zp->z_lock); + + /* + * If we are told to ignore owners, we scribble over the + * uid and gid here unless root. + */ + if (((unsigned int)vfs_flags(zfsvfs->z_vfs)) & MNT_IGNORE_OWNERSHIP) { + if (kauth_cred_getuid(cr) != 0) { + vap->va_uid = UNKNOWNUID; + vap->va_gid = UNKNOWNGID; + } + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +#ifdef NOTSUREYET +/* + * For the operation of changing file's user/group/project, we need to + * handle not only the main object that is assigned to the file directly, + * but also the ones that are used by the file via hidden xattr directory. 
+ * + * Because the xattr directory may contains many EA entries, as to it may + * be impossible to change all of them via the transaction of changing the + * main object's user/group/project attributes. Then we have to change them + * via other multiple independent transactions one by one. It may be not good + * solution, but we have no better idea yet. + */ +static int +zfs_setattr_dir(znode_t *dzp) +{ + struct vnode *dxip = ZTOI(dzp); + struct vnode *xip = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + objset_t *os = zfsvfs->z_os; + zap_cursor_t zc; + zap_attribute_t zap; + zfs_dirlock_t *dl; + znode_t *zp; + dmu_tx_t *tx = NULL; + uint64_t uid, gid; + sa_bulk_attr_t bulk[4]; + int count; + int err; + + zap_cursor_init(&zc, os, dzp->z_id); + while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { + count = 0; + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + err = ENXIO; + break; + } + + err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, + ZEXISTS, NULL, NULL); + if (err == ENOENT) + goto next; + if (err) + break; + + xip = ZTOI(zp); + if (zp->z_uid == dzp->z_uid && + zp->z_gid == dzp->z_gid && + zp->z_projid == dzp->z_projid) + goto next; + + tx = dmu_tx_create(os); + if (!(zp->z_pflags & ZFS_PROJID)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + break; + + mutex_enter(&dzp->z_lock); + + if (zp->z_uid != dxzp->z_uid) { + zp->z_uid = dzp->z_uid; + uid = zfs_uid_read(dzp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + } + + if (KGID_TO_SGID(zp->z_gid) != KGID_TO_SGID(dxzp->z_gid)) { + zp->z_gid = dzp->z_gid; + gid = zfs_gid_read(dzp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + } + + if (zp->z_projid != dzp->z_projid) { + if (!(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, + sizeof (zp->z_pflags)); + } + + zp->z_projid = dzp->z_projid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), + NULL, &zp->z_projid, sizeof (zp->z_projid)); + } + + mutex_exit(&dzp->z_lock); + + if (likely(count > 0)) { + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + tx = NULL; + if (err != 0 && err != ENOENT) + break; + +next: + if (zp) { + zrele(zp); + zp = NULL; + zfs_dirent_unlock(dl); + } + zap_cursor_advance(&zc); + } + + if (tx) + dmu_tx_abort(tx); + if (zp) { + zrele(zp); + zfs_dirent_unlock(dl); + } + zap_cursor_fini(&zc); + + return (err == ENOENT ? 0 : err); +} +#endif + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If ATTR_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime updated, mtime updated if size changed. 
+ */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + struct vnode *vp; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t *tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2], atime[2], crtime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2 = 0; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t fuid_dirtied = B_FALSE; + boolean_t handle_eadir = B_FALSE; + sa_bulk_attr_t *bulk, *xattr_bulk; + int count = 0, xattr_count = 0, bulks = 9; + + if (mask == 0) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + vp = ZTOV(zp); + + /* + * If this is a xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + if (xoap != NULL && (mask & ATTR_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + } + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & ATTR_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & ATTR_SIZE && S_ISDIR(zp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & ATTR_SIZE && !S_ISREG(zp->z_mode) && !S_ISFIFO(zp->z_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); + xva_init(tmpxvattr); + + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || + ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } + + if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { + err = SET_ERROR(EPERM); + goto out3; + } + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. 
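+ *
+ * For example, a setattr carrying an atime or mtime that does not
+ * fit in a 32-bit time representation is rejected below with
+ * EOVERFLOW instead of being silently truncated.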
+ */ + if (mask & (ATTR_ATIME | ATTR_MTIME)) { + if (((mask & ATTR_ATIME) && + TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & ATTR_MTIME) && + TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (zfs_is_readonly(zfsvfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & ATTR_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (ATTR_ATIME|ATTR_MTIME) || + ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + B_FALSE, cr); + } + + if (mask & (ATTR_UID|ATTR_GID)) { + int idmask = (mask & (ATTR_UID|ATTR_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & ATTR_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & ATTR_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both ATTR_UID and ATTR_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (ATTR_UID|ATTR_GID)) && + take_owner && take_group) || + ((idmask == ATTR_UID) && take_owner) || + ((idmask == ATTR_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + B_FALSE, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + (void) secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (ATTR_UID|ATTR_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & ATTR_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
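+ *
+ * For example, a request to set XAT_NODUMP to the value the file
+ * already has is moved from xvap into tmpxvattr below, so no policy
+ * check is triggered for the no-op, and the bit is put back into
+ * xvap once the attributes have been applied.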
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!S_ISREG(zp->z_mode) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & ATTR_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, B_FALSE, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) + goto out3; + + trim_mask |= ATTR_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. 
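+ *
+ * Concretely: the bits covered by trim_mask are cleared from
+ * va_mask below, secpolicy_vnode_setattr() runs on what remains,
+ * and the saved bits are OR'ed back in afterwards so the caller
+ * still sees them as set.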
+ */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) + goto out3; + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { + handle_eadir = B_TRUE; + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); + if (err) + goto out2; + } + if (mask & ATTR_UID) { + new_kuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_kuid != zp->z_uid && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_kuid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & ATTR_GID) { + new_kgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); + if (new_kgid != zp->z_gid && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_kgid)) { + if (attrzp) + zrele(attrzp); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + zrele(attrzp); + err = EDQUOT; + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & ATTR_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & ATTR_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. 
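+ *
+ * sa_add_projid() below rewrites the object's SA layout so that a
+ * project ID slot exists at the expected offset; if it returns
+ * EEXIST the slot is already present and the error is ignored.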
+ */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&attrzp->z_acl_lock); + mutex_enter(&attrzp->z_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (ATTR_UID|ATTR_GID)) { + + if (mask & ATTR_UID) { + new_uid = new_kuid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & ATTR_GID) { + new_gid = new_kgid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & ATTR_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & ATTR_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3P(aclp, !=, NULL); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { + zp->z_atime_dirty = B_FALSE; + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, sizeof (atime)); + } + + if (mask & (ATTR_MTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (mask & (ATTR_CTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_ctime, ctime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + } + + if (mask & ATTR_CRTIME) { + ZFS_TIME_ENCODE(&vap->va_crtime, crtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, + crtime, sizeof (crtime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + if (attrzp && mask) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, &ctime, + sizeof (ctime)); + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & ATTR_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(S_ISREG(zp->z_mode)); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + if (err == 0 && xattr_count > 0) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (attrzp) + zrele(attrzp); + if (err == ERESTART) + goto top; + } else { + if (count > 0) + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + if (attrzp) { + zrele(attrzp); + } + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(tmpxvattr, sizeof (xvattr_t)); + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + zfs_zrele_async(zl->zl_znode); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = ZTOZSB(zp)->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. 
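+ *
+ * The branch below resolves the conflict: if we are a reader and
+ * the directory being examined has a higher object id than szp, we
+ * release everything collected so far via zfs_rename_unlock() and
+ * restart the walk from tdzp as a writer; otherwise we simply block
+ * in rw_enter() until the other rename drops the lock.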
+ */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (oidp == szp->z_id) /* We're a descendant of szp */ + return (SET_ERROR(EINVAL)); + + if (oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(ZTOZSB(zp), oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), + &oidp, sizeof (oidp)); + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdzp - Source directory containing the "old entry". + * snm - Old entry name. + * tdzp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdzp,tdzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, + cred_t *cr, int flags) +{ + znode_t *szp, *tzp; + zfsvfs_t *zfsvfs = ZTOZSB(sdzp); + zilog_t *zilog; + uint64_t addtime[2]; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr; + int error = 0; + int zflg = 0; + boolean_t waited = B_FALSE; + + if (snm == NULL || tnm == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; + + ZFS_VERIFY_ZP(tdzp); + + /* + * We check i_sb because snapshots and the ctldir must have different + * super blocks. + */ + // Can't we use zp->z_zfsvfs in place of zp->vp->v_vfs ? + if (VTOM(ZTOV(tdzp)) != VTOM(ZTOV(sdzp)) || + zfsctl_is_node(ZTOV(tdzp))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + /* + * First compare the two name arguments without + * considering any case folding. 
+ */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + zrele(tzp); + } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + zrele(szp); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow renames into our tree when the project + * IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto out; + + if (S_ISDIR(szp->z_mode)) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) + goto out; + } + + /* + * Does target exist? 
+ */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (S_ISDIR(szp->z_mode)) { + if (!S_ISDIR(tzp->z_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + } else { + if (S_ISDIR(tzp->z_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) + /* If renamex(VFS_RENAME_EXCL) is used, error out */ + if (flags & VFS_RENAME_EXCL) { + error = EEXIST; + goto out; + } +#endif + + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(szp); + if (tzp) + zrele(tzp); + goto top; + } + dmu_tx_abort(tx); + zrele(szp); + if (tzp) + zrele(tzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + /* + * If we moved an entry into a different directory + * (sdzp != tdzp) then we also need to update ADDEDTIME + * (ADDTIME) property for FinderInfo. We are already + * inside error == 0 conditional + */ + if ((sdzp != tdzp) && + zfsvfs->z_use_sa == B_TRUE) { + timestruc_t now; + gethrestime(&now); + ZFS_TIME_ENCODE(&now, addtime); + error = sa_update(szp->z_sa_hdl, + SA_ZPL_ADDTIME(zfsvfs), (void *)&addtime, + sizeof (addtime), tx); + } + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + + /* + * Update cached name - for vget, and access + * without calling vnop_lookup first - it is + * easier to clear it out and let getattr + * look it up if needed. + */ + if (tzp) { + mutex_enter(&tzp->z_lock); + tzp->z_name_cache[0] = 0; + mutex_exit(&tzp->z_lock); + } + if (szp) { + mutex_enter(&szp->z_lock); + szp->z_name_cache[0] = 0; + mutex_exit(&szp->z_lock); + } + + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. 
The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdl, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } else { + /* + * If we had removed the existing target, subsequent + * call to zfs_link_create() to add back the same entry + * but, the new dnode (szp) should not fail. + */ + ASSERT(tzp == NULL); + } + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + zrele(szp); + if (tzp) { + zrele(tzp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dzp - Directory to contain new symbolic link. + * name - Name of directory entry in dip. + * vap - Attributes of new entry. + * link - Name for new symlink entry. + * cr - credentials of caller. + * flags - case flags + * + * OUT: zpp - Znode for new symbolic link. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dip - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, + znode_t **zpp, cred_t *cr, int flags) +{ + znode_t *zp; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t len = strlen(link); + int error; + int zflg = ZNEW; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + boolean_t waited = B_FALSE; + + ASSERT(S_ISLNK(vap->va_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENAMETOOLONG)); + } + + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } +top: + *zpp = NULL; + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create a new object for the symlink. + * for version 4 ZPL datsets the symlink will be an SA attribute + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); + /* + * Insert the new object into the directory. + */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + } + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, zfsvfs); + + if (error == 0) { + *zpp = zp; + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } else { + zrele(zp); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by ip. + * + * IN: ip - inode of symbolic link + * uio - structure to contain the link path. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + */ +/* ARGSUSED */ +int +zfs_readlink(struct vnode *vp, uio_t *uio, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdzp referencing szp. + * + * IN: tdzp - Directory to contain new entry. + * szp - znode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * flags - case flags. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdzp - ctime|mtime updated + * szp - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, + int flags) +{ + struct vnode *svp = ZTOV(szp); + znode_t *tzp; + zfsvfs_t *zfsvfs = ZTOZSB(tdzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uint64_t parent; + uid_t owner; + boolean_t waited = B_FALSE; + boolean_t is_tmpfile = 0; + uint64_t txg; + + ASSERT(S_ISDIR(tdzp->z_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(tdzp); + zilog = zfsvfs->z_log; + +#ifdef __APPLE__ + if (VTOM(svp) != VTOM(ZTOV(tdzp))) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } +#endif + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. 
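+ *
+ * For instance, attempting to hard link a directory
+ * ("ln dir newname" on a ZFS mount) fails here with EPERM, the
+ * error POSIX mandates for link(2) on a directory.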
+ */ + if (vnode_isdir(svp)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + ZFS_VERIFY_ZP(szp); + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow hard link creation in our tree when the + * project IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(szp->z_uid), + cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + +top: + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); + if (is_tmpfile) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, tdzp); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + } else if (is_tmpfile) { + /* restore z_unlinked since when linking failed */ + szp->z_unlinked = B_TRUE; + } + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(struct vnode *vp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ITOZSB(vp); + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vnode_recycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. 
+ */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vnode_recycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + // boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t skipaclchk = B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + // boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t skipaclchk = B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: zp - znode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * zp - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. 
+ */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} diff --git a/module/os/macos/zfs/zfs_vnops_osx.c b/module/os/macos/zfs/zfs_vnops_osx.c new file mode 100644 index 0000000000..341b7e9463 --- /dev/null +++ b/module/os/macos/zfs/zfs_vnops_osx.c @@ -0,0 +1,5281 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013 Will Andrews + * Copyright (c) 2013, 2020 Jorgen Lundman + */ + +/* + * XXX GENERAL COMPATIBILITY ISSUES + * + * 'name' is a common argument, but in OS X (and FreeBSD), we need to pass + * the componentname pointer, so other things can use them. We should + * change the 'name' argument to be an opaque name pointer, and define + * OS-dependent macros that yield the desired results when needed. + * + * On OS X, VFS performs access checks before calling anything, so + * zfs_zaccess_* calls are not used. Not true on FreeBSD, though. Perhaps + * those calls should be conditionally #if 0'd? + * + * On OS X, VFS & I/O objects are often opaque, e.g. uio_t and struct vnode + * require using functions to access elements of an object. Should convert + * the Solaris code to use macros on other platforms. + * + * OS X and FreeBSD appear to use similar zfs-vfs interfaces; see Apple's + * comment in zfs_remove() about the fact that VFS holds the last ref while + * in Solaris it's the ZFS code that does. On FreeBSD, the code Apple + * refers to here results in a panic if the branch is actually taken. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + + + +#ifdef _KERNEL +#include +#include + +unsigned int debug_vnop_osx_printf = 0; +unsigned int zfs_vnop_ignore_negatives = 0; +unsigned int zfs_vnop_ignore_positives = 0; +unsigned int zfs_vnop_create_negatives = 1; +#endif + +#define DECLARE_CRED(ap) \ + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context) +#define DECLARE_CONTEXT(ap) \ + caller_context_t *ct = (caller_context_t *)(ap)->a_context +#define DECLARE_CRED_AND_CONTEXT(ap) \ + DECLARE_CRED(ap); \ + DECLARE_CONTEXT(ap) + +#undef dprintf +#define dprintf printf + +/* Empty FinderInfo struct */ +static u_int32_t emptyfinfo[8] = {0}; + +/* + * zfs vfs operations. 
+ */ +static struct vfsops zfs_vfsops_template = { + zfs_vfs_mount, + zfs_vfs_start, + zfs_vfs_unmount, + zfs_vfs_root, + zfs_vfs_quotactl, + zfs_vfs_getattr, + zfs_vfs_sync, + zfs_vfs_vget, + zfs_vfs_fhtovp, + zfs_vfs_vptofh, + zfs_vfs_init, + zfs_vfs_sysctl, + zfs_vfs_setattr, +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) + NULL, /* vfs_ioctl */ + NULL, /* vfs_vget_snapdir */ + NULL +#else + {NULL} +#endif +}; + +#define ZFS_VNOP_TBL_CNT 6 + +static struct vnodeopv_desc *zfs_vnodeop_opv_desc_list[ZFS_VNOP_TBL_CNT] = +{ + &zfs_dvnodeop_opv_desc, + &zfs_fvnodeop_opv_desc, + &zfs_symvnodeop_opv_desc, + &zfs_xdvnodeop_opv_desc, + &zfs_fifonodeop_opv_desc, + &zfs_ctldir_opv_desc, +}; + +static vfstable_t zfs_vfsconf; + +int +zfs_vnop_removexattr_int(zfsvfs_t *zfsvfs, znode_t *zp, const char *name, + cred_t *cr); + +int +zfs_vfs_init(__unused struct vfsconf *vfsp) +{ + return (0); +} + +int +zfs_vfs_start(__unused struct mount *mp, __unused int flags, + __unused vfs_context_t context) +{ + return (0); +} + +int +zfs_vfs_quotactl(__unused struct mount *mp, __unused int cmds, + __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context) +{ + dprintf("%s ENOTSUP\n", __func__); + return (ENOTSUP); +} + +static kmutex_t zfs_findernotify_lock; +static kcondvar_t zfs_findernotify_thread_cv; +static boolean_t zfs_findernotify_thread_exit; + +#define VNODE_EVENT_ATTRIB 0x00000008 + +static int +zfs_findernotify_callback(mount_t mp, __unused void *arg) +{ + /* Do some quick checks to see if it is ZFS */ + struct vfsstatfs *vsf = vfs_statfs(mp); + + // Filesystem ZFS? + if (vsf->f_fssubtype == MNTTYPE_ZFS_SUBTYPE) { + vfs_context_t kernelctx = spl_vfs_context_kernel(); + struct vnode *rootvp, *vp; + + /* + * Since potentially other filesystems could be using "our" + * fssubtype, and we don't always announce as "zfs" due to + * hfs-mimic requirements, we have to make extra care here to + * make sure this "mp" really is ZFS. + */ + zfsvfs_t *zfsvfs; + + zfsvfs = vfs_fsprivate(mp); + + /* + * The first entry in struct zfsvfs is the vfs ptr, so they + * should be equal if it is ZFS + */ + if (!zfsvfs || + (mp != zfsvfs->z_vfs)) + return (VFS_RETURNED); + + /* Guard against unmount */ + ZFS_ENTER_ERROR(zfsvfs, VFS_RETURNED); + + /* Check if space usage has changed enough to bother updating */ + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t delta; + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + if (availbytes >= zfsvfs->z_findernotify_space) { + delta = availbytes - zfsvfs->z_findernotify_space; + } else { + delta = zfsvfs->z_findernotify_space - availbytes; + } + +#define ZFS_FINDERNOTIFY_THRESHOLD (1ULL<<20) + + /* Under the limit ? 
*/ + if (delta <= ZFS_FINDERNOTIFY_THRESHOLD) goto out; + + /* Over the threshold, so notify Finder; remember the new value */ + zfsvfs->z_findernotify_space = availbytes; + + /* If old value is zero (first run), don't bother */ + if (availbytes == delta) + goto out; + + dprintf("ZFS: findernotify %p space delta %llu\n", mp, delta); + + // Grab the root vnode + if (!VFS_ROOT(mp, 0, &rootvp)) { + + struct componentname cn; + char *tmpname = ".fseventsd"; + + bzero(&cn, sizeof (cn)); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + // cn.cn_context = kernelctx; + cn.cn_pnbuf = tmpname; + cn.cn_pnlen = sizeof (tmpname); + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = strlen(tmpname); + + // Attempt to look up .fseventsd + if (!VOP_LOOKUP(rootvp, &vp, &cn, kernelctx)) { + + // Send the event to wake up Finder + struct vnode_attr vattr; + // Also calls VATTR_INIT + spl_vfs_get_notify_attributes(&vattr); + // Fill in vap + vnode_getattr(vp, &vattr, kernelctx); + // Send event + spl_vnode_notify(vp, VNODE_EVENT_ATTRIB, + &vattr); + + // Cleanup vp + vnode_put(vp); + + } // VNOP_LOOKUP + + // Cleanup rootvp + vnode_put(rootvp); + + } // VFS_ROOT + +out: + ZFS_EXIT(zfsvfs); + + } // SUBTYPE_ZFS + + return (VFS_RETURNED); +} + + +static void +zfs_findernotify_thread(void *notused) +{ + callb_cpr_t cpr; + + dprintf("ZFS: findernotify thread start\n"); + CALLB_CPR_INIT(&cpr, &zfs_findernotify_lock, callb_generic_cpr, FTAG); + + mutex_enter(&zfs_findernotify_lock); + while (!zfs_findernotify_thread_exit) { + + /* Sleep 32 seconds */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&zfs_findernotify_thread_cv, + &zfs_findernotify_lock, ddi_get_lbolt() + (hz<<5)); + CALLB_CPR_SAFE_END(&cpr, &zfs_findernotify_lock); + + if (!zfs_findernotify_thread_exit) + vfs_iterate(LK_NOWAIT, zfs_findernotify_callback, NULL); + + } + + zfs_findernotify_thread_exit = FALSE; + cv_broadcast(&zfs_findernotify_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops zfs_findernotify_lock */ + dprintf("ZFS: findernotify thread exit\n"); + thread_exit(); +} + +void +zfs_start_notify_thread(void) +{ + mutex_init(&zfs_findernotify_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfs_findernotify_thread_cv, NULL, CV_DEFAULT, NULL); + zfs_findernotify_thread_exit = FALSE; + (void) thread_create(NULL, 0, zfs_findernotify_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + + +void +zfs_stop_notify_thread(void) +{ + mutex_enter(&zfs_findernotify_lock); + zfs_findernotify_thread_exit = TRUE; + /* + * The findernotify thread will set zfs_findernotify_thread_exit back to + * FALSE when it is finished exiting; we're waiting for that.
+ */ + while (zfs_findernotify_thread_exit) { + cv_signal(&zfs_findernotify_thread_cv); + cv_wait(&zfs_findernotify_thread_cv, &zfs_findernotify_lock); + } + mutex_exit(&zfs_findernotify_lock); + mutex_destroy(&zfs_findernotify_lock); + cv_destroy(&zfs_findernotify_thread_cv); +} + +int +zfs_vfs_sysctl(int *name, __unused uint_t namelen, user_addr_t oldp, + size_t *oldlenp, user_addr_t newp, size_t newlen, + __unused vfs_context_t context) +{ +#if 0 + int error; + switch (name[0]) { + case ZFS_SYSCTL_FOOTPRINT: { + zfs_footprint_stats_t *footprint; + size_t copyinsize; + size_t copyoutsize; + int max_caches; + int act_caches; + + if (newp) { + return (EINVAL); + } + if (!oldp) { + *oldlenp = sizeof (zfs_footprint_stats_t); + return (0); + } + copyinsize = *oldlenp; + if (copyinsize < sizeof (zfs_footprint_stats_t)) { + *oldlenp = sizeof (zfs_footprint_stats_t); + return (ENOMEM); + } + footprint = kmem_alloc(copyinsize, KM_SLEEP); + + max_caches = copyinsize - sizeof (zfs_footprint_stats_t); + max_caches += sizeof (kmem_cache_stats_t); + max_caches /= sizeof (kmem_cache_stats_t); + + footprint->version = ZFS_FOOTPRINT_VERSION; + + footprint->memory_stats.current = zfs_footprint.current; + footprint->memory_stats.target = zfs_footprint.target; + footprint->memory_stats.highest = zfs_footprint.highest; + footprint->memory_stats.maximum = zfs_footprint.maximum; + + arc_get_stats(&footprint->arc_stats); + + kmem_cache_stats(&footprint->cache_stats[0], max_caches, + &act_caches); + footprint->caches_count = act_caches; + footprint->thread_count = zfs_threads; + + copyoutsize = sizeof (zfs_footprint_stats_t) + + ((act_caches - 1) * sizeof (kmem_cache_stats_t)); + + error = ddi_copyout(footprint, oldp, copyoutsize, 0); + + kmem_free(footprint, copyinsize); + + return (error); + } + + case ZFS_SYSCTL_CONFIG_DEBUGMSG: + error = sysctl_int(oldp, oldlenp, newp, newlen, + &zfs_msg_buf_enabled); + return (error); + + case ZFS_SYSCTL_CONFIG_zdprintf: +#ifdef ZFS_DEBUG + error = sysctl_int(oldp, oldlenp, newp, newlen, + &zfs_zdprintf_enabled); +#else + error = ENOTSUP; +#endif + return (error); + } +#endif + return (ENOTSUP); +} + +/* + * All these functions could be declared as 'static' but to assist with + * dtrace debugging, we do not. + */ + +int +zfs_vnop_open(struct vnop_open_args *ap) +#if 0 + struct vnop_open_args { + struct vnode *a_vp; + int a_mode; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + int err = 0; + + err = zfs_open(ap->a_vp, ap->a_mode, 0, cr); + + if (err) dprintf("zfs_open() failed %d\n", err); + return (err); +} + +int +zfs_vnop_close(struct vnop_close_args *ap) +#if 0 + struct vnop_close_args { + struct vnode *a_vp; + int a_fflag; + vfs_context_t a_context; + }; +#endif +{ +// int count = 1; +// int offset = 0; +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + + return (zfs_close(ap->a_vp, ap->a_fflag, cr)); +} + +int +zfs_vnop_ioctl(struct vnop_ioctl_args *ap) +#if 0 + struct vnop_ioctl_args { + struct vnode *a_vp; + ulong_t a_command; + caddr_t a_data; + int a_fflag; + kauth_cred_t a_cred; + struct proc *a_p; + }; +#endif +{ + /* OS X has no use for zfs_ioctl(). 
*/ + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + DECLARE_CRED_AND_CONTEXT(ap); + + dprintf("vnop_ioctl %08lx: VTYPE %d\n", ap->a_command, + vnode_vtype(ZTOV(zp))); + + ZFS_ENTER(zfsvfs); + if (IFTOVT((mode_t)zp->z_mode) == VFIFO) { + dprintf("ZFS: FIFO ioctl %02lx ('%lu' + %lu)\n", + ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff); + error = fifo_ioctl(ap); + error = 0; + ZFS_EXIT(zfsvfs); + goto out; + } + + if ((IFTOVT((mode_t)zp->z_mode) == VBLK) || + (IFTOVT((mode_t)zp->z_mode) == VCHR)) { + dprintf("ZFS: spec ioctl %02lx ('%lu' + %lu)\n", + ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff); + error = spec_ioctl(ap); + ZFS_EXIT(zfsvfs); + goto out; + } + ZFS_EXIT(zfsvfs); + + switch (ap->a_command) { + + /* ioctl supported by ZFS and POSIX */ + + case F_FULLFSYNC: + dprintf("%s F_FULLFSYNC\n", __func__); +#ifdef F_BARRIERFSYNC + case F_BARRIERFSYNC: + dprintf("%s F_BARRIERFSYNC\n", __func__); +#endif + error = zfs_fsync(VTOZ(ap->a_vp), /* flag */0, cr); + break; + + case F_CHKCLEAN: + dprintf("%s F_CHKCLEAN\n", __func__); + /* + * normally calls http://fxr.watson.org/fxr/source/bsd/ + * vfs/vfs_cluster.c?v=xnu-2050.18.24#L5839 + */ + /* XXX Why don't we? */ + off_t fsize = zp->z_size; + error = is_file_clean(ap->a_vp, fsize); + break; + + case F_RDADVISE: + dprintf("%s F_RDADVISE\n", __func__); + uint64_t file_size; + struct radvisory *ra; + int len; + + ra = (struct radvisory *)(ap->a_data); + + file_size = zp->z_size; + len = ra->ra_count; + + /* XXX Check request size */ + if (ra->ra_offset > file_size) { + dprintf("invalid request offset\n"); + error = EFBIG; + break; + } + + if ((ra->ra_offset + len) > file_size) { + len = file_size - ra->ra_offset; + dprintf("%s truncating F_RDADVISE from" + " %08x -> %08x\n", __func__, + ra->ra_count, len); + } + + /* + * Rather than advisory_read (which calls + * cluster_io->VNOP_BLOCKMAP), prefetch + * the level 0 metadata and level 1 data + * at the requested offset + length. + */ + // error = advisory_read(ap->a_vp, file_size, + // ra->ra_offset, len); + dmu_prefetch(zfsvfs->z_os, zp->z_id, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch(zfsvfs->z_os, zp->z_id, + 1, ra->ra_offset, len, + ZIO_PRIORITY_SYNC_READ); +#if 0 + { + const char *name = vnode_getname(ap->a_vp); + printf("%s F_RDADVISE: prefetch issued for " + "[%s](0x%016llx) (0x%016llx 0x%08x)\n", __func__, + (name ? 
name : ""), zp->z_id, + ra->ra_offset, len); + if (name) vnode_putname(name); + } +#endif + + break; + + case SPOTLIGHT_GET_MOUNT_TIME: + case SPOTLIGHT_IOC_GET_MOUNT_TIME: + case SPOTLIGHT_FSCTL_GET_MOUNT_TIME: + dprintf("%s SPOTLIGHT_GET_MOUNT_TIME\n", __func__); + *(uint32_t *)ap->a_data = zfsvfs->z_mount_time; + break; + case SPOTLIGHT_GET_UNMOUNT_TIME: + dprintf("%s SPOTLIGHT_GET_UNMOUNT_TIME\n", __func__); + *(uint32_t *)ap->a_data = zfsvfs->z_last_unmount_time; + break; + case SPOTLIGHT_FSCTL_GET_LAST_MTIME: + case SPOTLIGHT_IOC_GET_LAST_MTIME: + dprintf("%s SPOTLIGHT_FSCTL_GET_LAST_MTIME\n", + __func__); + *(uint32_t *)ap->a_data = zfsvfs->z_last_unmount_time; + break; + + case HFS_SET_ALWAYS_ZEROFILL: + dprintf("%s HFS_SET_ALWAYS_ZEROFILL\n", __func__); + /* Required by Spotlight search */ + break; + case HFS_EXT_BULKACCESS_FSCTL: + dprintf("%s HFS_EXT_BULKACCESS_FSCTL\n", __func__); + /* Required by Spotlight search */ + break; + + /* ioctl required to simulate HFS mimic behavior */ + case 0x80005802: + dprintf("%s 0x80005802 unknown\n", __func__); + /* unknown - from subsystem read, 'X', 2 */ + break; + + case HFS_GETPATH: + case HFSIOC_GETPATH: + dprintf("%s HFS_GETPATH\n", __func__); + { + struct vfsstatfs *vfsp; + struct vnode *file_vp; + ino64_t cnid; + int outlen; + char *bufptr; + int flags = 0; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(zfsvfs->z_vfs); + if (proc_suser(current_proc()) && + kauth_cred_getuid((kauth_cred_t)cr) != + vfsp->f_owner) { + error = EACCES; + goto out; + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(ap->a_vp)) { + error = EINVAL; + goto out; + } + + /* We are passed a string containing inode # */ + bufptr = (char *)ap->a_data; + cnid = strtoul(bufptr, NULL, 10); + if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) { + flags |= BUILDPATH_VOLUME_RELATIVE; + } + + if ((error = zfs_vfs_vget(zfsvfs->z_vfs, cnid, + &file_vp, (vfs_context_t)ct))) { + goto out; + } + error = build_path(file_vp, bufptr, MAXPATHLEN, + &outlen, flags, (vfs_context_t)ct); + vnode_put(file_vp); + + dprintf("ZFS: HFS_GETPATH done %d : '%s'\n", + error, error ? 
"" : bufptr); + } + break; + + case HFS_TRANSFER_DOCUMENT_ID: + case HFSIOC_TRANSFER_DOCUMENT_ID: + dprintf("%s HFS_TRANSFER_DOCUMENT_ID\n", __func__); + { + u_int32_t to_fd = *(u_int32_t *)ap->a_data; + file_t *to_fp; + struct vnode *to_vp; + znode_t *to_zp; + + to_fp = getf(to_fd); + if (to_fp == NULL) { + error = EBADF; + goto out; + } + + to_vp = getf_vnode(to_fp); + + if ((error = vnode_getwithref(to_vp))) { + releasef(to_fd); + goto out; + } + + /* Confirm it is inside our mount */ + if (((zfsvfs_t *)vfs_fsprivate( + vnode_mount(to_vp))) != zfsvfs) { + error = EXDEV; + goto transfer_out; + } + + to_zp = VTOZ(to_vp); + + /* Source should have UF_TRACKED */ + if (!(zp->z_pflags & ZFS_TRACKED)) { + dprintf("ZFS: source is not TRACKED\n"); + error = EINVAL; + /* dest should NOT have UF_TRACKED */ + } else if (to_zp->z_pflags & ZFS_TRACKED) { + dprintf("ZFS: dest already TRACKED\n"); + error = EEXIST; + /* should be valid types */ + } else if ( + (IFTOVT((mode_t)zp->z_mode) == VDIR) || + (IFTOVT((mode_t)zp->z_mode) == VREG) || + (IFTOVT((mode_t)zp->z_mode) == VLNK)) { + /* + * Make sure source has a document id + * - although it can't + */ + if (!zp->z_document_id) + zfs_setattr_generate_id(zp, 0, + NULL); + + /* transfer over */ + to_zp->z_document_id = + zp->z_document_id; + zp->z_document_id = 0; + to_zp->z_pflags |= ZFS_TRACKED; + zp->z_pflags &= ~ZFS_TRACKED; + + /* Commit to disk */ + zfs_setattr_set_documentid(to_zp, + B_TRUE); + zfs_setattr_set_documentid(zp, + B_TRUE); /* also update flags */ + dprintf("ZFS: Moved docid %u from " + "id %llu to id %llu\n", + to_zp->z_document_id, zp->z_id, + to_zp->z_id); + } +transfer_out: + vnode_put(to_vp); + releasef(to_fd); + } + break; + + + case F_MAKECOMPRESSED: + dprintf("%s F_MAKECOMPRESSED\n", __func__); + /* + * Not entirely sure what this does, but HFS comments + * include: "Make the file compressed; truncate & + * toggle BSD bits" + * makes compressed copy of allocated blocks + * shortens file to new length + * sets BSD bits to indicate per-file compression + * + * On HFS, locks cnode and compresses its data. ZFS + * inband compression makes this obsolete. + */ + if (vfs_isrdonly(zfsvfs->z_vfs) || + !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + error = EROFS; + goto out; + } + + /* Are there any other usecounts/FDs? */ + if (vnode_isinuse(ap->a_vp, 1)) { + error = EBUSY; + goto out; + } + + if (zp->z_pflags & ZFS_IMMUTABLE) { + error = EINVAL; + goto out; + } + + /* Return failure */ + error = EINVAL; + break; + + case HFS_PREV_LINK: + case HFS_NEXT_LINK: + case HFSIOC_PREV_LINK: + case HFSIOC_NEXT_LINK: + dprintf("%s HFS_PREV/NEXT_LINK\n", __func__); + { + /* + * Find sibling linkids with hardlinks. a_data points + * to the "current" linkid, and look up either prev + * or next (a_command) linkid. Return in a_data. + */ + uint32_t linkfileid; + struct vfsstatfs *vfsp; + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(zfsvfs->z_vfs); + if ((kauth_cred_getuid(cr) == 0) && + kauth_cred_getuid(cr) != vfsp->f_owner) { + error = EACCES; + goto out; + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(ap->a_vp)) { + error = EINVAL; + goto out; + } + linkfileid = *(uint32_t *)ap->a_data; + if (linkfileid < 16) { /* kHFSFirstUserCatalogNodeID */ + error = EINVAL; + goto out; + } + + /* + * Attempt to find the linkid in the hardlink_link + * AVL tree. If found, call to get prev or next. 
+ */ + hardlinks_t *searchnode, *findnode, *sibling; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_linkid = linkfileid; + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks_linkid, + searchnode, &loc); + kmem_free(searchnode, sizeof (hardlinks_t)); + + if (!findnode) { + rw_exit(&zfsvfs->z_hardlinks_lock); + *(uint32_t *)ap->a_data = 0; + dprintf("ZFS: HFS_NEXT_LINK/HFS_PREV_LINK %u " + "not found\n", linkfileid); + goto out; + } + + if (ap->a_command != HFS_NEXT_LINK) { + + while ((sibling = + AVL_NEXT(&zfsvfs->z_hardlinks_linkid, + findnode)) != NULL) { + if (findnode->hl_fileid == + sibling->hl_fileid) + break; + } + + } else { + + while ((sibling = + AVL_PREV(&zfsvfs->z_hardlinks_linkid, + findnode)) != NULL) { + if (findnode->hl_fileid == + sibling->hl_fileid) + break; + } + + } + rw_exit(&zfsvfs->z_hardlinks_lock); + + dprintf("ZFS: HFS_%s_LINK %u sibling %u\n", + (ap->a_command != HFS_NEXT_LINK) ? "NEXT" : "PREV", + linkfileid, + sibling ? sibling->hl_linkid : 0); + + // Did we get a new node? + if (sibling == NULL) { + *(uint32_t *)ap->a_data = 0; + goto out; + } + + *(uint32_t *)ap->a_data = sibling->hl_linkid; + error = 0; + } + break; + + case HFS_RESIZE_PROGRESS: + case HFSIOC_RESIZE_PROGRESS: + dprintf("%s HFS_RESIZE_PROGRESS\n", __func__); + /* fail as if requested of non-root fs */ + error = EINVAL; + break; + + case HFS_RESIZE_VOLUME: + case HFSIOC_RESIZE_VOLUME: + dprintf("%s HFS_RESIZE_VOLUME\n", __func__); + /* fail as if requested of non-root fs */ + error = EINVAL; + break; + + case HFS_CHANGE_NEXT_ALLOCATION: + case HFSIOC_CHANGE_NEXT_ALLOCATION: + dprintf("%s HFS_CHANGE_NEXT_ALLOCATION\n", __func__); + /* fail as if requested of non-root fs */ + error = EINVAL; + break; + + case HFS_CHANGE_NEXTCNID: + case HFSIOC_CHANGE_NEXTCNID: + dprintf("%s HFS_CHANGE_NEXTCNID\n", __func__); + /* FIXME : fail as though read only */ + error = EROFS; + break; + + case F_FREEZE_FS: + dprintf("%s F_FREEZE_FS\n", __func__); + /* Dont support freeze */ + error = ENOTSUP; + break; + + case F_THAW_FS: + dprintf("%s F_THAW_FS\n", __func__); + /* dont support fail as though insufficient privilege */ + error = EACCES; + break; + + case HFS_BULKACCESS_FSCTL: + case HFSIOC_BULKACCESS: + dprintf("%s HFS_BULKACCESS_FSCTL\n", __func__); + /* Respond as if HFS_STANDARD flag is set */ + error = EINVAL; + break; + + case HFS_FSCTL_GET_VERY_LOW_DISK: + case HFSIOC_GET_VERY_LOW_DISK: + dprintf("%s HFS_FSCTL_GET_VERY_LOW_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_dangerlimit; + break; + + case HFS_FSCTL_SET_VERY_LOW_DISK: + case HFSIOC_SET_VERY_LOW_DISK: + dprintf("%s HFS_FSCTL_SET_VERY_LOW_DISK\n", __func__); + if (*(uint32_t *)ap->a_data >= + zfsvfs->z_freespace_notify_warninglimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_dangerlimit = + *(uint32_t *)ap->a_data; + } + break; + + case HFS_FSCTL_GET_LOW_DISK: + case HFSIOC_GET_LOW_DISK: + dprintf("%s HFS_FSCTL_GET_LOW_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_warninglimit; + break; + + case HFS_FSCTL_SET_LOW_DISK: + case HFSIOC_SET_LOW_DISK: + dprintf("%s HFS_FSCTL_SET_LOW_DISK\n", __func__); + if (*(uint32_t *)ap->a_data >= + zfsvfs->z_freespace_notify_desiredlevel || + *(uint32_t *)ap->a_data <= + zfsvfs->z_freespace_notify_dangerlimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_warninglimit = + *(uint32_t *)ap->a_data; + } + break; + + case 
HFS_FSCTL_GET_DESIRED_DISK: + case HFSIOC_GET_DESIRED_DISK: + dprintf("%s HFS_FSCTL_GET_DESIRED_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_desiredlevel; + break; + + case HFS_FSCTL_SET_DESIRED_DISK: + case HFSIOC_SET_DESIRED_DISK: + dprintf("%s HFS_FSCTL_SET_DESIRED_DISK\n", __func__); + if (*(uint32_t *)ap->a_data <= + zfsvfs->z_freespace_notify_warninglimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_desiredlevel = + *(uint32_t *)ap->a_data; + } + break; + + case HFS_VOLUME_STATUS: + case HFSIOC_VOLUME_STATUS: + dprintf("%s HFS_VOLUME_STATUS\n", __func__); + /* For now we always reply "all ok" */ + *(uint32_t *)ap->a_data = + zfsvfs->z_notification_conditions; + break; + + case HFS_SET_BOOT_INFO: + dprintf("%s HFS_SET_BOOT_INFO\n", __func__); + /* + * ZFS booting is not supported, mimic selection + * of a non-root HFS volume + */ + *(uint32_t *)ap->a_data = 0; + error = EINVAL; + break; + case HFS_GET_BOOT_INFO: + { + u_int32_t vcbFndrInfo[8]; + printf("%s HFS_GET_BOOT_INFO\n", __func__); + /* + * ZFS booting is not supported, mimic selection + * of a non-root HFS volume + */ + memset(vcbFndrInfo, 0, sizeof (vcbFndrInfo)); + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + vcbFndrInfo[6] = vfsstatfs->f_fsid.val[0]; + vcbFndrInfo[7] = vfsstatfs->f_fsid.val[1]; + bcopy(vcbFndrInfo, ap->a_data, + sizeof (vcbFndrInfo)); + } + break; + case HFS_MARK_BOOT_CORRUPT: + dprintf("%s HFS_MARK_BOOT_CORRUPT\n", __func__); + /* + * ZFS booting is not supported, mimic selection + * of a non-root HFS volume + */ + *(uint32_t *)ap->a_data = 0; + error = EINVAL; + break; + + case HFS_FSCTL_GET_JOURNAL_INFO: + case HFSIOC_GET_JOURNAL_INFO: + dprintf("%s HFS_FSCTL_GET_JOURNAL_INFO\n", __func__); + /* + * XXX We're setting the mount as 'Journaled' + * so this might conflict + * Respond as though journal is empty/disabled + */ + { + struct hfs_journal_info *jip; + jip = (struct hfs_journal_info *)ap->a_data; + jip->jstart = 0; + jip->jsize = 0; + } + break; + + case HFS_DISABLE_METAZONE: + dprintf("%s HFS_DISABLE_METAZONE\n", __func__); + /* fail as though insufficient privs */ + error = EACCES; + break; + +#ifdef HFS_GET_FSINFO + case HFS_GET_FSINFO: + case HFSIOC_GET_FSINFO: + dprintf("%s HFS_GET_FSINFO\n", __func__); + break; +#endif + +#ifdef HFS_REPIN_HOTFILE_STATE + case HFS_REPIN_HOTFILE_STATE: + case HFSIOC_REPIN_HOTFILE_STATE: + dprintf("%s HFS_REPIN_HOTFILE_STATE\n", __func__); + break; +#endif + +#ifdef HFS_SET_HOTFILE_STATE + case HFS_SET_HOTFILE_STATE: + case HFSIOC_SET_HOTFILE_STATE: + dprintf("%s HFS_SET_HOTFILE_STATE\n", __func__); + break; +#endif + +#ifdef APFSIOC_GET_NEAR_LOW_DISK + case APFSIOC_GET_NEAR_LOW_DISK: + dprintf("%s APFSIOC_GET_NEAR_LOW_DISK\n", __func__); + *(uint32_t *)ap->a_data = + zfsvfs->z_freespace_notify_warninglimit; + break; +#endif + +#ifdef APFSIOC_SET_NEAR_LOW_DISK + case APFSIOC_SET_NEAR_LOW_DISK: + dprintf("%s APFSIOC_SET_NEAR_LOW_DISK\n", __func__); + if (*(uint32_t *)ap->a_data >= + zfsvfs->z_freespace_notify_desiredlevel || + *(uint32_t *)ap->a_data <= + zfsvfs->z_freespace_notify_dangerlimit) { + error = EINVAL; + } else { + zfsvfs->z_freespace_notify_warninglimit = + *(uint32_t *)ap->a_data; + } + break; +#endif + + /* End HFS mimic ioctl */ + + default: + dprintf("%s: Unknown ioctl %02lx ('%lu' + %lu)\n", + __func__, ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff); + error = ENOTTY; + } + +out: + if (error) { + dprintf("%s: failing ioctl: %02lx ('%lu' + %lu) returned 
%d\n", + __func__, ap->a_command, (ap->a_command&0xff00)>>8, + ap->a_command&0xff, error); + } + + return (error); +} + + +int +zfs_vnop_read(struct vnop_read_args *ap) +#if 0 + struct vnop_read_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; +#endif +{ + int ioflag = zfs_ioflags(ap->a_ioflag); + int error; + /* uint64_t resid; */ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + + /* resid = uio_resid(ap->a_uio); */ + error = zfs_read(ap->a_vp, ap->a_uio, ioflag, cr); + + if (error) dprintf("vnop_read %d\n", error); + return (error); +} + +int +zfs_vnop_write(struct vnop_write_args *ap) +#if 0 + struct vnop_write_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; +#endif +{ + int ioflag = zfs_ioflags(ap->a_ioflag); + int error; + DECLARE_CRED(ap); + + // dprintf("zfs_vnop_write(vp %p, offset 0x%llx size 0x%llx\n", + // ap->a_vp, uio_offset(ap->a_uio), uio_resid(ap->a_uio)); + + error = zfs_write(ap->a_vp, ap->a_uio, ioflag, cr); + + /* + * Mac OS X: pageout requires that the UBC file size be current. + * Possibly, we could update it only if size has changed. + */ + + /* if (tx_bytes != 0) { */ + if (!error) { + ubc_setsize(ap->a_vp, VTOZ(ap->a_vp)->z_size); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_access(struct vnop_access_args *ap) +#if 0 + struct vnop_access_args { + struct vnodeop_desc *a_desc; + struct vnode a_vp; + int a_action; + vfs_context_t a_context; + }; +#endif +{ + int error = ENOTSUP; + int action = ap->a_action; + int mode = 0; + DECLARE_CRED(ap); + + /* + * KAUTH_VNODE_READ_EXTATTRIBUTES, as well? + * KAUTH_VNODE_WRITE_EXTATTRIBUTES + */ + if (action & KAUTH_VNODE_READ_DATA) + mode |= VREAD; + if (action & KAUTH_VNODE_WRITE_DATA) + mode |= VWRITE; + if (action & KAUTH_VNODE_EXECUTE) + mode |= VEXEC; + + dprintf("vnop_access: action %04x -> mode %04x\n", action, mode); + error = zfs_access(ap->a_vp, mode, 0, cr); + + if (error) dprintf("%s: error %d\n", __func__, error); + return (error); +} + + +/* + * hard link references? + * Read the comment in zfs_getattr_znode_unlocked for the reason + * for this hackery. Since getattr(VA_NAME) is extremely common + * call in OSX, we opt to always save the name. We need to be careful + * as zfs_dirlook can return ctldir node as well (".zfs"). + * Hardlinks also need to be able to return the correct parentid. + */ +static void zfs_cache_name(struct vnode *vp, struct vnode *dvp, char *filename) +{ + znode_t *zp; + if (!vp || + !filename || + !filename[0] || + zfsctl_is_node(vp) || + !VTOZ(vp)) + return; + + // Only cache files, or we might end up caching "." + if (!vnode_isreg(vp)) + return; + + zp = VTOZ(vp); + + mutex_enter(&zp->z_lock); + + strlcpy(zp->z_name_cache, filename, + MAXPATHLEN); + + // If hardlink, remember the parentid. 
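/*
 * Both fields cached here are consumed by zfs_getattr_znode_unlocked():
 * getattr(VA_NAME) is a very common call on macOS, and hardlinked files
 * additionally need to report a correct parent id, which only the lookup
 * path knows reliably. A minimal consumer-side sketch, for illustration
 * only (vap handling assumed, not part of this change):
 *
 *	if (VATTR_IS_ACTIVE(vap, va_name) && zp->z_name_cache[0] != 0) {
 *		strlcpy(vap->va_name, zp->z_name_cache, MAXPATHLEN);
 *		VATTR_SET_SUPPORTED(vap, va_name);
 *	}
 */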
+ if (((zp->z_links > 1) || (zp->z_finder_hardlink)) && + (IFTOVT((mode_t)zp->z_mode) == VREG) && dvp) { + zp->z_finder_parentid = VTOZ(dvp)->z_id; + } + + mutex_exit(&zp->z_lock); +} + + +int +zfs_vnop_lookup(struct vnop_lookup_args *ap) +#if 0 + struct vnop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ + struct componentname *cnp = ap->a_cnp; + DECLARE_CRED(ap); + int error; + int negative_cache = 0; + znode_t *zp = NULL; + int direntflags = 0; + char filename[MAXNAMELEN]; + + *ap->a_vpp = NULL; /* In case we return an error */ + + /* + * Darwin uses namelen as an optimisation, for example it can be + * set to 5 for the string "alpha/beta" to look up "alpha". In this + * case we need to copy it out to null-terminate. + */ + bcopy(cnp->cn_nameptr, filename, cnp->cn_namelen); + filename[cnp->cn_namelen] = '\0'; + +#if 1 + /* + * cache_lookup() returns 0 for no-entry + * -1 for cache found (a_vpp set) + * ENOENT for negative cache + */ + error = cache_lookup(ap->a_dvp, ap->a_vpp, cnp); + if (error) { + /* We found a cache entry, positive or negative. */ + if (error == -1) { /* Positive entry? */ + if (!zfs_vnop_ignore_positives) { + error = 0; + goto exit; /* Positive cache, return it */ + } + /* Release iocount held by cache_lookup */ + vnode_put(*ap->a_vpp); + } + /* Negatives are only followed if not CREATE, from HFS+. */ + if (cnp->cn_nameiop != CREATE) { + if (!zfs_vnop_ignore_negatives) { + goto exit; /* Negative cache hit */ + } + negative_cache = 1; + } + } +#endif + + dprintf("+vnop_lookup '%s' %s\n", filename, + negative_cache ? "negative_cache":""); + + /* + * 'cnp' passed to us is 'readonly' as XNU does not expect a return + * name, but most likely expects it correct in getattr. + */ + struct componentname cn2; + cn2.cn_nameptr = filename; + cn2.cn_namelen = MAXNAMELEN; + cn2.cn_nameiop = cnp->cn_nameiop; + cn2.cn_flags = cnp->cn_flags; + + error = zfs_lookup(VTOZ(ap->a_dvp), filename, &zp, /* flags */ 0, cr, + &direntflags, &cn2); + /* flags can be LOOKUP_XATTR | FIGNORECASE */ + +#if 1 + /* + * It appears that VFS layer adds negative cache entries for us, so + * we do not need to add them here, or they are duplicated. + */ + if ((error == ENOENT) && zfs_vnop_create_negatives) { + if ((ap->a_cnp->cn_nameiop == CREATE || + ap->a_cnp->cn_nameiop == RENAME) && + (cnp->cn_flags & ISLASTCN)) { + error = EJUSTRETURN; + goto exit; + } + /* Insert name into cache (as non-existent) if appropriate. 
*/ + if ((cnp->cn_flags & MAKEENTRY) && + ap->a_cnp->cn_nameiop != CREATE) { + cache_enter(ap->a_dvp, NULL, ap->a_cnp); + dprintf("Negative-cache made for '%s'\n", + filename); + } + } /* ENOENT */ +#endif + +exit: + + if (error == 0 && (zp != NULL)) { + printf("back with zp %p: name '%s'\n", zp, filename); + + *ap->a_vpp = ZTOV(zp); + + zfs_cache_name(*ap->a_vpp, ap->a_dvp, filename); + + } + + dprintf("-vnop_lookup %d : dvp %llu '%s'\n", error, + VTOZ(ap->a_dvp)->z_id, filename); + + return (error); +} + +int +zfs_vnop_create(struct vnop_create_args *ap) +#if 0 + struct vnop_create_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + DECLARE_CRED(ap); + vcexcl_t excl; + int mode = 0; /* FIXME */ + int error; + znode_t *zp = NULL; + + dprintf("vnop_create: '%s'\n", cnp->cn_nameptr); + + /* + * extern int zfs_create(struct vnode *dvp, char *name, vattr_t *vap, + * int excl, int mode, struct vnode **vpp, cred_t *cr); + */ + excl = (vap->va_vaflags & VA_EXCLUSIVE) ? EXCL : NONEXCL; + + error = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, excl, mode, + &zp, cr, 0, NULL); + if (!error) { + cache_purge_negatives(ap->a_dvp); + *ap->a_vpp = ZTOV(zp); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + + +static int zfs_remove_hardlink(struct vnode *vp, struct vnode *dvp, char *name) +{ + /* + * Because we store hash of hardlinks in an AVLtree, we need to remove + * any entries in it upon deletion. Since it is complicated to know + * if an entry was a hardlink, we simply check if the avltree has the + * name. + */ + hardlinks_t *searchnode, *findnode; + avl_index_t loc; + + if (!vp || !VTOZ(vp)) + return (1); + if (!dvp || !VTOZ(dvp)) + return (1); + znode_t *zp = VTOZ(vp); + znode_t *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int ishardlink = 0; + + ishardlink = ((zp->z_links > 1) && + (IFTOVT((mode_t)zp->z_mode) == VREG)) ? 1 : 0; + if (zp->z_finder_hardlink) + ishardlink = 1; + + if (!ishardlink) + return (0); + + dprintf("ZFS: removing hash (%llu,%llu,'%s')\n", + dzp->z_id, zp->z_id, name); + + // Attempt to remove from hardlink avl, if its there + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = dzp->z_id == zfsvfs->z_root ? 2 : dzp->z_id; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, name, PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(searchnode, sizeof (hardlinks_t)); + + // Found it? remove it + if (findnode) { + rw_enter(&zfsvfs->z_hardlinks_lock, RW_WRITER); + avl_remove(&zfsvfs->z_hardlinks, findnode); + avl_remove(&zfsvfs->z_hardlinks_linkid, findnode); + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(findnode, sizeof (*findnode)); + dprintf("ZFS: removed hash '%s'\n", name); + mutex_enter(&zp->z_lock); + zp->z_name_cache[0] = 0; + zp->z_finder_parentid = 0; + mutex_exit(&zp->z_lock); + return (1); + } + return (0); +} + + +static int zfs_rename_hardlink(struct vnode *vp, struct vnode *tvp, + struct vnode *fdvp, struct vnode *tdvp, + char *from, char *to) +{ + /* + * Because we store hash of hardlinks in an AVLtree, we need to update + * any entries in it upon rename. Since it is complicated to know + * if an entry was a hardlink, we simply check if the avltree has the + * name. 
+ */ + hardlinks_t *searchnode, *findnode, *delnode; + avl_index_t loc; + uint64_t parent_fid, parent_tid; + int ishardlink = 0; + + if (!vp || !VTOZ(vp)) + return (0); + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ishardlink = ((zp->z_links > 1) && + (IFTOVT((mode_t)zp->z_mode) == VREG)) ? 1 : 0; + if (zp->z_finder_hardlink) + ishardlink = 1; + + if (!ishardlink) + return (0); + + if (!fdvp || !VTOZ(fdvp)) + return (0); + parent_fid = VTOZ(fdvp)->z_id; + parent_fid = parent_fid == zfsvfs->z_root ? 2 : parent_fid; + + if (!tdvp || !VTOZ(tdvp)) { + parent_tid = parent_fid; + } else { + parent_tid = VTOZ(tdvp)->z_id; + parent_tid = parent_tid == zfsvfs->z_root ? 2 : parent_tid; + } + + dprintf("ZFS: looking to rename hardlinks (%llu,%llu,%s)\n", + parent_fid, zp->z_id, from); + + + // Attempt to remove from hardlink avl, if its there + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = parent_fid; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, from, PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + + // Found it? update it + if (findnode) { + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_WRITER); + + // Technically, we do not need to re-do the _linkid AVL here. + avl_remove(&zfsvfs->z_hardlinks, findnode); + avl_remove(&zfsvfs->z_hardlinks_linkid, findnode); + + // If we already have a hashid for "to" and the rename + // presumably unlinked it, we need to remove it first. + searchnode->hl_parent = parent_tid; + strlcpy(searchnode->hl_name, to, PATH_MAX); + delnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + if (delnode) { + dprintf("ZFS: apparently %llu:'%s' exists, deleting\n", + parent_tid, to); + avl_remove(&zfsvfs->z_hardlinks, delnode); + avl_remove(&zfsvfs->z_hardlinks_linkid, delnode); + kmem_free(delnode, sizeof (*delnode)); + } + + dprintf("ZFS: renamed hash %llu (%llu:'%s' to %llu:'%s'): %s\n", + zp->z_id, + parent_fid, from, + parent_tid, to, + delnode ? "deleted":""); + + // Update source node to new hash, and name. 
+ findnode->hl_parent = parent_tid; + strlcpy(findnode->hl_name, to, PATH_MAX); + // zp->z_finder_parentid = parent_tid; + + avl_add(&zfsvfs->z_hardlinks, findnode); + avl_add(&zfsvfs->z_hardlinks_linkid, findnode); + + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(searchnode, sizeof (hardlinks_t)); + + return (1); + } + + kmem_free(searchnode, sizeof (hardlinks_t)); + return (0); +} + + +int +zfs_vnop_remove(struct vnop_remove_args *ap) +#if 0 + struct vnop_remove_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_remove: %p (%s)\n", ap->a_vp, ap->a_cnp->cn_nameptr); + + /* + * extern int zfs_remove ( struct vnode *dvp, char *name, cred_t *cr, + * caller_context_t *ct, int flags); + */ + error = zfs_remove(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, cr, + /* flags */0); + if (!error) { + cache_purge(ap->a_vp); + + zfs_remove_hardlink(ap->a_vp, + ap->a_dvp, + ap->a_cnp->cn_nameptr); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_mkdir(struct vnop_mkdir_args *ap) +#if 0 + struct vnop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_mkdir '%s'\n", ap->a_cnp->cn_nameptr); + +#if 0 + /* Let's deny OS X fseventd for now */ + if (ap->a_cnp->cn_nameptr && + strcmp(ap->a_cnp->cn_nameptr, ".fseventsd") == 0) + return (EINVAL); +#endif + +#if 0 + /* spotlight for now */ + if (ap->a_cnp->cn_nameptr && + strcmp(ap->a_cnp->cn_nameptr, ".Spotlight-V100") == 0) + return (EINVAL); +#endif + /* + * extern int zfs_mkdir(struct vnode *dvp, char *dirname, vattr_t *vap, + * struct vnode **vpp, cred_t *cr, caller_context_t *ct, int flags, + * vsecattr_t *vsecp); + */ + znode_t *zp = NULL; + error = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, ap->a_vap, + &zp, cr, /* flags */0, /* vsecp */NULL); + if (!error) { + *ap->a_vpp = ZTOV(zp); + cache_purge_negatives(ap->a_dvp); + vnode_update_identity(*ap->a_vpp, ap->a_dvp, + (const char *)ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen, + 0, VNODE_UPDATE_NAME); + + VERIFY3P(zp->z_zfsvfs, ==, + vfs_fsprivate(vnode_mount(*ap->a_vpp))); + + + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_rmdir(struct vnop_rmdir_args *ap) +#if 0 + struct vnop_rmdir_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_rmdir\n"); + + /* + * extern int zfs_rmdir(struct vnode *dvp, char *name, + * struct vnode *cwd, cred_t *cr, caller_context_t *ct, int flags); + */ + error = zfs_rmdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, + /* cwd */NULL, cr, /* flags */0); + if (!error) { + cache_purge(ap->a_vp); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_readdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + int error; + DECLARE_CRED(ap); + + dprintf("+readdir: %p\n", ap->a_vp); + + /* + * XXX This interface needs vfs_has_feature. 
+ * XXX zfs_readdir() also needs to grow support for passing back the + * number of entries (OS X/FreeBSD) and cookies (FreeBSD). However, + * it should be the responsibility of the OS caller to malloc/free + * space for that. + */ + + /* + * extern int zfs_readdir(struct vnode *vp, uio_t *uio, cred_t *cr, + * int *eofp, int flags, int *a_numdirent); + */ + *ap->a_numdirent = 0; + + error = zfs_readdir(ap->a_vp, ap->a_uio, cr, ap->a_eofflag, ap->a_flags, + ap->a_numdirent); + + /* .zfs dirs can be completely empty */ + if (*ap->a_numdirent == 0) + *ap->a_numdirent = 2; /* . and .. */ + + if (error) { + dprintf("-readdir %d (nument %d)\n", error, *ap->a_numdirent); + } + return (error); +} + +int +zfs_vnop_fsync(struct vnop_fsync_args *ap) +#if 0 + struct vnop_fsync_args { + struct vnode *a_vp; + int a_waitfor; + vfs_context_t a_context; + }; +#endif +{ + znode_t *zp = VTOZ(ap->a_vp); + zfsvfs_t *zfsvfs; +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int err; + + /* + * Check if this znode has already been synced, freed, and recycled + * by znode_pageout_func. + * + * XXX What is this? Substitute for Illumos vn_has_cached_data()? + */ + if (zp == NULL) + return (0); + + zfsvfs = zp->z_zfsvfs; + + if (!zfsvfs) + return (0); + + /* + * If we come here via vnode_create()->vclean() we can not end up in + * zil_commit() or we will deadlock. But we know that vnop_reclaim will + * be called next, so we just return success. + */ + if (vnode_isrecycled(ap->a_vp)) + return (0); + + err = zfs_fsync(VTOZ(ap->a_vp), /* flag */0, cr); + + if (err) dprintf("%s err %d\n", __func__, err); + + return (err); +} + +int +zfs_vnop_getattr(struct vnop_getattr_args *ap) +#if 0 + struct vnop_getattr_args { + struct vnode *a_vp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + int error; + DECLARE_CRED_AND_CONTEXT(ap); + + /* dprintf("+vnop_getattr zp %p vp %p\n", VTOZ(ap->a_vp), ap->a_vp); */ + + error = zfs_getattr(ap->a_vp, ap->a_vap, /* flags */0, cr, ct); + + if (error == 0) { + error = zfs_getattr_znode_unlocked(ap->a_vp, ap->a_vap); + } + if (error) + dprintf("-vnop_getattr '%p' %d\n", (ap->a_vp), error); + + return (error); +} + +int +zfs_vnop_setattr(struct vnop_setattr_args *ap) +#if 0 + struct vnop_setattr_args { + struct vnode *a_vp; + struct vnode_vattr *a_vap; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + vattr_t *vap = ap->a_vap; + uint_t mask = vap->va_mask; + int error = 0; + int hfscompression = 0; + znode_t *zp = VTOZ(ap->a_vp); + + /* Translate OS X requested mask to ZFS */ + mask = vap->va_mask; + + /* + * Both 'flags' and 'acl' can come to setattr, but without 'mode' set. + * However, ZFS assumes 'mode' is also set. We need to look up 'mode' in + * this case. + */ + if ((VATTR_IS_ACTIVE(vap, va_flags) || VATTR_IS_ACTIVE(vap, va_acl)) && + !VATTR_IS_ACTIVE(vap, va_mode)) { + uint64_t mode; + + mask |= ATTR_MODE; + + dprintf("fetching MODE for FLAGS or ACL\n"); + ZFS_ENTER(zp->z_zfsvfs); + ZFS_VERIFY_ZP(zp); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zp->z_zfsvfs), &mode, + sizeof (mode)); + vap->va_mode = mode; + ZFS_EXIT(zp->z_zfsvfs); + } + if (VATTR_IS_ACTIVE(vap, va_flags)) { + + /* + * If TRACKED is wanted, and not previously set, + * go set DocumentID + */ + if ((vap->va_flags & UF_TRACKED) && + !(zp->z_pflags & ZFS_TRACKED)) { + zfs_setattr_generate_id(zp, 0, NULL); + /* flags updated in vnops */ + zfs_setattr_set_documentid(zp, B_FALSE); + } + + /* If they are trying to turn on compression.. 
*/ + if (vap->va_flags & UF_COMPRESSED) { + zp->z_skip_truncate_undo_decmpfs = B_TRUE; + printf("setattr trying to set COMPRESSED!\n"); + } + /* Map OS X file flags to zfs file flags */ + zfs_setbsdflags(zp, vap->va_flags); + dprintf("OS X flags %08x changed to ZFS %04llx\n", + vap->va_flags, zp->z_pflags); + vap->va_flags = zp->z_pflags; + + } + + vap->va_mask = mask; + + /* + * If z_skip_truncate_undo_decmpfs is set, and they are trying to + * va_size == 0 (truncate), we undo the decmpfs work here. This is + * because we can not stop (no error, or !feature works) macOS from + * using decmpfs. + */ +#ifndef DECMPFS_XATTR_NAME +#define DECMPFS_XATTR_NAME "com.apple.decmpfs" +#endif + if ((VATTR_IS_ACTIVE(vap, va_total_size) || + VATTR_IS_ACTIVE(vap, va_data_size)) && + zp->z_skip_truncate_undo_decmpfs) { + zp->z_skip_truncate_undo_decmpfs = B_FALSE; + + printf("setattr setsize with compress attempted\n"); + + if (zfs_vnop_removexattr_int(zp->z_zfsvfs, zp, + DECMPFS_XATTR_NAME, NULL) == 0) { + /* Successfully deleted the XATTR - skip truncate */ + VATTR_CLEAR_ACTIVE(vap, va_total_size); + VATTR_CLEAR_ACTIVE(vap, va_data_size); + printf("setattr skipping truncate!\n"); + } + } + + error = zfs_setattr(VTOZ(ap->a_vp), ap->a_vap, /* flag */0, cr); + + dprintf("vnop_setattr: called on vp %p with mask %04x, err=%d\n", + ap->a_vp, mask, error); + + if (!error) { + /* If successful, tell OS X which fields ZFS set. */ + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + dprintf("ZFS: setattr new size %llx %llx\n", + vap->va_size, ubc_getsize(ap->a_vp)); + ubc_setsize(ap->a_vp, vap->va_size); + VATTR_SET_SUPPORTED(vap, va_data_size); + } + if (VATTR_IS_ACTIVE(vap, va_mode)) + VATTR_SET_SUPPORTED(vap, va_mode); + if (VATTR_IS_ACTIVE(vap, va_acl)) + VATTR_SET_SUPPORTED(vap, va_acl); + if (VATTR_IS_ACTIVE(vap, va_uid)) + VATTR_SET_SUPPORTED(vap, va_uid); + if (VATTR_IS_ACTIVE(vap, va_gid)) + VATTR_SET_SUPPORTED(vap, va_gid); + if (VATTR_IS_ACTIVE(vap, va_access_time)) + VATTR_SET_SUPPORTED(vap, va_access_time); + if (VATTR_IS_ACTIVE(vap, va_modify_time)) + VATTR_SET_SUPPORTED(vap, va_modify_time); + if (VATTR_IS_ACTIVE(vap, va_change_time)) + VATTR_SET_SUPPORTED(vap, va_change_time); + if (VATTR_IS_ACTIVE(vap, va_create_time)) + VATTR_SET_SUPPORTED(vap, va_create_time); + if (VATTR_IS_ACTIVE(vap, va_backup_time)) + VATTR_SET_SUPPORTED(vap, va_backup_time); + if (VATTR_IS_ACTIVE(vap, va_flags)) { + VATTR_SET_SUPPORTED(vap, va_flags); + } + + } + +#if 1 + uint64_t missing = 0; + missing = (vap->va_active ^ (vap->va_active & vap->va_supported)); + if (missing != 0) { + printf("vnop_setattr:: asked %08llx replied %08llx " + "missing %08llx\n", vap->va_active, + vap->va_supported, missing); + } +#endif + + if (error) + dprintf("ZFS: vnop_setattr return failure %d\n", error); + return (error); +} + +int +zfs_vnop_rename(struct vnop_rename_args *ap) +#if 0 + struct vnop_rename_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_rename\n"); + + /* + * extern int zfs_rename(struct vnode *sdvp, char *snm, + * struct vnode *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, + * int flags); + */ + error = zfs_rename(VTOZ(ap->a_fdvp), ap->a_fcnp->cn_nameptr, + VTOZ(ap->a_tdvp), ap->a_tcnp->cn_nameptr, cr, /* flags */0); + + if (!error) { + cache_purge_negatives(ap->a_fdvp); + 
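/*
 * The same purge is applied to the target directory below: the rename may
 * have made previously negative names (notably the target name) resolvable
 * again, so stale ENOENT entries must go. cache_purge(a_fvp) then drops the
 * source vnode's old name/identity, and zfs_rename_hardlink() keeps the
 * hardlink AVL bookkeeping in step with the new parent and name.
 */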
cache_purge_negatives(ap->a_tdvp); + cache_purge(ap->a_fvp); + + zfs_rename_hardlink(ap->a_fvp, ap->a_tvp, + ap->a_fdvp, ap->a_tdvp, + ap->a_fcnp->cn_nameptr, + ap->a_tcnp->cn_nameptr); + if (ap->a_tvp) { + cache_purge(ap->a_tvp); + } + +#ifdef __APPLE__ + /* + * After a rename, the VGET path /.vol/$fsid/$ino fails for + * a short period on hardlinks (until someone calls lookup). + * So until we can figure out exactly why this is, we drive + * a lookup here to ensure that vget will work + * (Finder/Spotlight). + */ + if (ap->a_fvp && VTOZ(ap->a_fvp) && + VTOZ(ap->a_fvp)->z_finder_hardlink) { + struct vnode *vp; + if (VOP_LOOKUP(ap->a_tdvp, &vp, ap->a_tcnp, + spl_vfs_context_kernel()) == 0) + vnode_put(vp); + } +#endif + + } + + if (error) dprintf("%s: error %d\n", __func__, error); + return (error); +} + +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) +int +zfs_vnop_renamex(struct vnop_renamex_args *ap) +#if 0 + struct vnop_renamex_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + struct vnode_attr *a_vap; // Reserved for future use + vfs_rename_flags_t a_flags; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + int error; + + dprintf("vnop_renamex\n"); + + /* + * extern int zfs_rename(struct vnode *sdvp, char *snm, + * struct vnode *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, + * int flags); + * + * Currently, hfs only supports one flag, VFS_RENAME_EXCL, so + * we will do the same. Since zfs_rename() only has logic for + * FIGNORECASE, passing VFS_RENAME_EXCL should be ok, if a bit + * hacky. + */ + error = zfs_rename(VTOZ(ap->a_fdvp), ap->a_fcnp->cn_nameptr, + VTOZ(ap->a_tdvp), ap->a_tcnp->cn_nameptr, cr, + (ap->a_flags&VFS_RENAME_EXCL)); + + if (!error) { + cache_purge_negatives(ap->a_fdvp); + cache_purge_negatives(ap->a_tdvp); + cache_purge(ap->a_fvp); + + zfs_rename_hardlink(ap->a_fvp, ap->a_tvp, + ap->a_fdvp, ap->a_tdvp, + ap->a_fcnp->cn_nameptr, + ap->a_tcnp->cn_nameptr); + if (ap->a_tvp) { + cache_purge(ap->a_tvp); + } + +#ifdef __APPLE__ + /* + * After a rename, the VGET path /.vol/$fsid/$ino fails for + * a short period on hardlinks (until someone calls lookup). + * So until we can figure out exactly why this is, we drive + * a lookup here to ensure that vget will work + * (Finder/Spotlight). + */ + if (ap->a_fvp && VTOZ(ap->a_fvp) && + VTOZ(ap->a_fvp)->z_finder_hardlink) { + struct vnode *vp; + if (VOP_LOOKUP(ap->a_tdvp, &vp, ap->a_tcnp, + spl_vfs_context_kernel()) == 0) + vnode_put(vp); + } +#endif + + } + + if (error) dprintf("%s: error %d\n", __func__, error); + return (error); +} +#endif // vnop_renamex_args + +int +zfs_vnop_symlink(struct vnop_symlink_args *ap) +#if 0 + struct vnop_symlink_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *a_vap; + char *a_target; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + int error; + + dprintf("vnop_symlink\n"); + + /* + * extern int zfs_symlink(struct vnode *dvp, struct vnode **vpp, + * char *name, vattr_t *vap, char *link, cred_t *cr); + */ + + /* OS X doesn't need to set vap->va_mode? 
*/ + znode_t *zp = NULL; + error = zfs_symlink(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, + ap->a_vap, ap->a_target, &zp, cr, 0); + if (!error) { + *ap->a_vpp = ZTOV(zp); + cache_purge_negatives(ap->a_dvp); + } else { + dprintf("%s: error %d\n", __func__, error); + } + /* XXX zfs_attach_vnode()? */ + return (error); +} + + +int +zfs_vnop_readlink(struct vnop_readlink_args *ap) +#if 0 + struct vnop_readlink_args { + struct vnode *vp; + struct uio *uio; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + + dprintf("vnop_readlink\n"); + + /* + * extern int zfs_readlink(struct vnode *vp, uio_t *uio, cred_t *cr, + * caller_context_t *ct); + */ + return (zfs_readlink(ap->a_vp, ap->a_uio, cr)); +} + +int +zfs_vnop_link(struct vnop_link_args *ap) +#if 0 + struct vnop_link_args { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + vfs_context_t a_context; + }; +#endif +{ +// DECLARE_CRED_AND_CONTEXT(ap); + DECLARE_CRED(ap); + int error; + + dprintf("vnop_link\n"); + + /* XXX Translate this inside zfs_link() instead. */ + if (vnode_mount(ap->a_vp) != vnode_mount(ap->a_tdvp)) { + dprintf("%s: vp and tdvp on different mounts\n", __func__); + return (EXDEV); + } + + /* + * XXX Understand why Apple made this comparison in so many places where + * others do not. + */ + if (ap->a_cnp->cn_namelen >= ZAP_MAXNAMELEN) { + dprintf("%s: name too long %d\n", __func__, + ap->a_cnp->cn_namelen); + return (ENAMETOOLONG); + } + + /* + * extern int zfs_link(struct vnode *tdvp, struct vnode *svp, + * char *name, cred_t *cr, caller_context_t *ct, int flags); + */ + + error = zfs_link(VTOZ(ap->a_tdvp), VTOZ(ap->a_vp), + ap->a_cnp->cn_nameptr, cr, 0); + if (!error) { + // Set source vnode to multipath too, zfs_get_vnode() + // handles the target + vnode_setmultipath(ap->a_vp); + cache_purge(ap->a_vp); + cache_purge_negatives(ap->a_tdvp); + } else { + dprintf("%s error %d\n", __func__, error); + } + + return (error); +} + +int +zfs_vnop_pagein(struct vnop_pagein_args *ap) +#if 0 + struct vnop_pagein_args { + struct vnode *a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_foffset; + size_t a_size; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + /* XXX Crib this from the Apple zfs_vnops.c. 
*/ + struct vnode *vp = ap->a_vp; + offset_t off = ap->a_f_offset; + size_t len = ap->a_size; + upl_t upl = ap->a_pl; + vm_offset_t upl_offset = ap->a_pl_offset; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + caddr_t vaddr = NULL; + /* vm_offset_t vaddr = NULL; */ + int flags = ap->a_flags; + int need_unlock = 0; + int error = 0; + uint64_t file_sz; + + dprintf("+vnop_pagein: %p/%p off 0x%llx size 0x%lx filesz 0x%llx\n", + zp, vp, off, len, zp->z_size); + + if (upl == (upl_t)NULL) + panic("zfs_vnop_pagein: no upl!"); + + if (len <= 0) { + dprintf("zfs_vnop_pagein: invalid size %ld", len); + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, 0); + return (EINVAL); + } + + ZFS_ENTER(zfsvfs); + + file_sz = zp->z_size; + + ASSERT(vn_has_cached_data(vp)); + /* ASSERT(zp->z_dbuf_held && zp->z_phys); */ + /* can't fault passed EOF */ + if ((off < 0) || (off >= file_sz) || + (len & PAGE_MASK) || (upl_offset & PAGE_MASK)) { + dprintf("passed EOF or size error\n"); + ZFS_EXIT(zfsvfs); + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + (UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY)); + return (EFAULT); + } + + /* + * If we already own the lock, then we must be page faulting in the + * middle of a write to this file (i.e., we are writing to this file + * using data from a mapped region of the file). + */ + if (!rw_write_held(&zp->z_map_lock)) { + rw_enter(&zp->z_map_lock, RW_WRITER); + need_unlock = TRUE; + } + + + if (ubc_upl_map(upl, (vm_offset_t *)&vaddr) != KERN_SUCCESS) { + dprintf("zfs_vnop_pagein: failed to ubc_upl_map"); + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, 0); + if (need_unlock) + rw_exit(&zp->z_map_lock); + ZFS_EXIT(zfsvfs); + return (ENOMEM); + } + + dprintf("vaddr %p with upl_off 0x%lx\n", vaddr, upl_offset); + vaddr += upl_offset; + + /* Can't read beyond EOF - but we need to zero those extra bytes. */ + if (off + len > file_sz) { + uint64_t newend = file_sz - off; + + dprintf("ZFS: pagein zeroing offset 0x%llx for 0x%llx bytes.\n", + newend, len - newend); + memset(&vaddr[newend], 0, len - newend); + len = newend; + } + /* + * Fill pages with data from the file. + */ + while (len > 0) { + uint64_t readlen; + + readlen = MIN(PAGESIZE, len); + + dprintf("pagein from off 0x%llx len 0x%llx into " + "address %p (len 0x%lx)\n", + off, readlen, vaddr, len); + + error = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, readlen, + (void *)vaddr, DMU_READ_PREFETCH); + if (error) { + printf("zfs_vnop_pagein: dmu_read err %d\n", error); + break; + } + off += readlen; + vaddr += readlen; + len -= readlen; + } + ubc_upl_unmap(upl); + + if (!(flags & UPL_NOCOMMIT)) { + if (error) + ubc_upl_abort_range(upl, upl_offset, ap->a_size, + (UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY)); + else + ubc_upl_commit_range(upl, upl_offset, ap->a_size, + (UPL_COMMIT_CLEAR_DIRTY | + UPL_COMMIT_FREE_ON_EMPTY)); + } + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + /* + * We can't grab the range lock for the page as reader which would stop + * truncation as this leads to deadlock. So we need to recheck the file + * size. 
+ */ + if (ap->a_f_offset >= file_sz) + error = EFAULT; + if (need_unlock) + rw_exit(&zp->z_map_lock); + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s error %d\n", __func__, error); + return (error); +} + + + + +static int +zfs_pageout(zfsvfs_t *zfsvfs, znode_t *zp, upl_t upl, vm_offset_t upl_offset, + offset_t off, size_t size, int flags) +{ + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t filesz; + int err = 0; + size_t len = size; + + dprintf("+vnop_pageout: %p/%p off 0x%llx len 0x%lx upl_off 0x%lx: " + "blksz 0x%x, z_size 0x%llx upl %p flags 0x%x\n", zp, ZTOV(zp), + off, len, upl_offset, zp->z_blksz, + zp->z_size, upl, flags); + + if (upl == (upl_t)NULL) { + dprintf("ZFS: vnop_pageout: failed on NULL upl\n"); + return (EINVAL); + } + /* + * We can't leave this function without either calling upl_commit or + * upl_abort. So use the non-error version. + */ + ZFS_ENTER_IFERROR(zfsvfs) { + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, + UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); + dprintf("ZFS: vnop_pageout: abort on z_unmounted\n"); + ZFS_EXIT(zfsvfs); + return (EIO); + } + + + ASSERT(vn_has_cached_data(ZTOV(zp))); + /* ASSERT(zp->z_dbuf_held); */ /* field no longer present in znode. */ + + if (len <= 0) { + if (!(flags & UPL_NOCOMMIT)) + (void) ubc_upl_abort(upl, + UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); + err = EINVAL; + goto exit; + } + if (vnode_vfsisrdonly(ZTOV(zp))) { + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + UPL_ABORT_FREE_ON_EMPTY); + err = EROFS; + goto exit; + } + + filesz = zp->z_size; /* get consistent copy of zp_size */ + + if (off < 0 || off >= filesz || (off & PAGE_MASK_64) || + (len & PAGE_MASK)) { + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + UPL_ABORT_FREE_ON_EMPTY); + err = EINVAL; + goto exit; + } + + uint64_t pgsize = roundup(filesz, PAGESIZE); + + /* Any whole pages beyond the end of the file while we abort */ + if ((size + off) > pgsize) { + printf("ZFS: pageout abort outside pages (rounded 0x%llx > " + "UPLlen 0x%llx\n", pgsize, size + off); + ubc_upl_abort_range(upl, pgsize, + pgsize - (size + off), + UPL_ABORT_FREE_ON_EMPTY); + } + + dprintf("ZFS: starting with size %lx\n", len); + +top: + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + /* + * can't push pages passed end-of-file + */ + filesz = zp->z_size; + if (off >= filesz) { + /* ignore all pages */ + err = 0; + goto out; + } else if (off + len > filesz) { +#if 0 + int npages = btopr(filesz - off); + page_t *trunc; + + page_list_break(&pp, &trunc, npages); + /* ignore pages past end of file */ + if (trunc) + pvn_write_done(trunc, flags); +#endif + len = filesz - off; + } + + tx = dmu_tx_create(zfsvfs->z_os); + if (!tx) { + printf("ZFS: zfs_vnops_osx: NULL TX encountered!\n"); + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort_range(upl, upl_offset, len, + UPL_ABORT_FREE_ON_EMPTY); + err = EINVAL; + goto exit; + } + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + if (err == ERESTART) { + zfs_rangelock_exit(lr); + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + goto out; + } + + caddr_t va; + + if (ubc_upl_map(upl, (vm_offset_t *)&va) != KERN_SUCCESS) { + err = EINVAL; + goto out; + } + + va += upl_offset; + while (len >= PAGESIZE) { + ssize_t sz = PAGESIZE; + + dprintf("pageout: dmu_write off 0x%llx size 0x%lx\n", off, sz); + + 
dmu_write(zfsvfs->z_os, zp->z_id, off, sz, va, tx); + va += sz; + off += sz; + len -= sz; + } + + /* + * The last, possibly partial block needs to have the data zeroed that + * would extend past the size of the file. + */ + if (len > 0) { + ssize_t sz = len; + + dprintf("pageout: dmu_writeX off 0x%llx size 0x%lx\n", off, sz); + dmu_write(zfsvfs->z_os, zp->z_id, off, sz, va, tx); + + va += sz; + off += sz; + len -= sz; + + /* + * Zero out the remainder of the PAGE that didn't fit within + * the file size. + */ + // bzero(va, PAGESIZE-sz); + // dprintf("zero last 0x%lx bytes.\n", PAGESIZE-sz); + + } + ubc_upl_unmap(upl); + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0, + NULL, NULL); + } + dmu_tx_commit(tx); + +out: + zfs_rangelock_exit(lr); + if (flags & UPL_IOSYNC) + zil_commit(zfsvfs->z_log, zp->z_id); + + if (!(flags & UPL_NOCOMMIT)) { + if (err) + ubc_upl_abort_range(upl, upl_offset, size, + (UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY)); + else + ubc_upl_commit_range(upl, upl_offset, size, + (UPL_COMMIT_CLEAR_DIRTY | + UPL_COMMIT_FREE_ON_EMPTY)); + } +exit: + ZFS_EXIT(zfsvfs); + if (err) dprintf("%s err %d\n", __func__, err); + return (err); +} + + + +int +zfs_vnop_pageout(struct vnop_pageout_args *ap) +#if 0 + struct vnop_pageout_args { + struct vnode *a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_foffset; + size_t a_size; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + int flags = ap->a_flags; + upl_t upl = ap->a_pl; + vm_offset_t upl_offset = ap->a_pl_offset; + size_t len = ap->a_size; + offset_t off = ap->a_f_offset; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = NULL; + int error; + + if (!zp || !zp->z_zfsvfs) { + if (!(flags & UPL_NOCOMMIT)) + ubc_upl_abort(upl, + (UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY)); + printf("ZFS: vnop_pageout: null zp or zfsvfs\n"); + return (ENXIO); + } + + zfsvfs = zp->z_zfsvfs; + + dprintf("+vnop_pageout: off 0x%llx len 0x%lx upl_off 0x%lx: " + "blksz 0x%x, z_size 0x%llx\n", off, len, upl_offset, zp->z_blksz, + zp->z_size); + + /* + * XXX Crib this too, although Apple uses parts of zfs_putapage(). + * Break up that function into smaller bits so it can be reused. + */ + error = zfs_pageout(zfsvfs, zp, upl, upl_offset, ap->a_f_offset, + len, flags); + + return (error); +} + + +static int bluster_pageout(zfsvfs_t *zfsvfs, znode_t *zp, upl_t upl, + upl_offset_t upl_offset, off_t f_offset, int size, + uint64_t filesize, int flags, caddr_t vaddr, + dmu_tx_t *tx) +{ + int io_size; + int rounded_size; + off_t max_size; + int is_clcommit = 0; + + if ((flags & UPL_NOCOMMIT) == 0) + is_clcommit = 1; + + /* + * If they didn't specify any I/O, then we are done... 
+ * we can't issue an abort because we don't know how + * big the upl really is + */ + if (size <= 0) { + dprintf("%s invalid size %d\n", __func__, size); + return (EINVAL); + } + + if (vnode_vfsisrdonly(ZTOV(zp))) { + if (is_clcommit) + ubc_upl_abort_range(upl, upl_offset, size, + UPL_ABORT_FREE_ON_EMPTY); + dprintf("%s: readonly fs\n", __func__); + return (EROFS); + } + + /* + * can't page-in from a negative offset + * or if we're starting beyond the EOF + * or if the file offset isn't page aligned + * or the size requested isn't a multiple of PAGE_SIZE + */ + if (f_offset < 0 || f_offset >= filesize || + (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) { + if (is_clcommit) + ubc_upl_abort_range(upl, upl_offset, size, + UPL_ABORT_FREE_ON_EMPTY); + dprintf("%s: invalid offset or size\n", __func__); + return (EINVAL); + } + max_size = filesize - f_offset; + + if (size < max_size) + io_size = size; + else + io_size = max_size; + + rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + if (size > rounded_size) { + if (is_clcommit) + ubc_upl_abort_range(upl, upl_offset + rounded_size, + size - rounded_size, UPL_ABORT_FREE_ON_EMPTY); + } + +#if 1 + if (f_offset + size > filesize) { + dprintf("ZFS: lowering size %u to %llu\n", + size, f_offset > filesize ? 0 : filesize - f_offset); + if (f_offset > filesize) + size = 0; + else + size = filesize - f_offset; + } +#endif + + dmu_write(zfsvfs->z_os, zp->z_id, f_offset, size, + &vaddr[upl_offset], tx); + + return (0); +} + + + + +/* + * In V2 of vnop_pageout, we are given a NULL upl, so that we can + * grab the file locks first, then request the upl to lock down pages. + */ +int +zfs_vnop_pageoutv2(struct vnop_pageout_args *ap) +#if 0 + struct vnop_pageout_args { + struct vnode *a_vp; + upl_t a_pl; + vm_offset_t a_pl_offset; + off_t a_foffset; + size_t a_size; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + int a_flags = ap->a_flags; + vm_offset_t a_pl_offset = ap->a_pl_offset; + size_t a_size = ap->a_size; + upl_t upl = ap->a_pl; + upl_page_info_t *pl; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = NULL; + int error = 0; + uint64_t filesize; + zfs_locked_range_t *lr; + dmu_tx_t *tx; + caddr_t vaddr = NULL; + int merror = 0; + + /* + * We can still get into this function as non-v2 style, by the default + * pager (ie, swap - when we eventually support it) + */ + if (upl) { + dprintf("ZFS: Relaying vnop_pageoutv2 to vnop_pageout\n"); + return (zfs_vnop_pageout(ap)); + } + + if (!zp || !zp->z_zfsvfs) { + printf("ZFS: vnop_pageout: null zp or zfsvfs\n"); + return (ENXIO); + } + + if (ZTOV(zp) == NULL) { + printf("ZFS: vnop_pageout: null vp\n"); + return (ENXIO); + } + + // XNU can call us with iocount == 0 && usecount == 0. Grab + // a ref now so the vp doesn't reclaim while we are in here. 
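+ // The iocount taken here is dropped with vnode_put() on every exit path below.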
+ if (vnode_get(ZTOV(zp)) != 0) { + printf("ZFS: vnop_pageout: vnode_ref failed.\n"); + return (ENXIO); + } + + mutex_enter(&zp->z_lock); + + sa_handle_t *z_sa_hdl; + z_sa_hdl = zp->z_sa_hdl; + if (!z_sa_hdl) { + mutex_exit(&zp->z_lock); + vnode_put(ZTOV(zp)); + printf("ZFS: vnop_pageout: null sa_hdl\n"); + return (ENXIO); + } + + zfsvfs = zp->z_zfsvfs; + + mutex_exit(&zp->z_lock); + + if (error) { + printf("ZFS: %s: can't hold_sa: %d\n", __func__, error); + vnode_put(ZTOV(zp)); + return (ENXIO); + } + + dprintf("+vnop_pageout2: off 0x%llx len 0x%lx upl_off 0x%lx: " + "blksz 0x%x, z_size 0x%llx\n", ap->a_f_offset, a_size, + a_pl_offset, zp->z_blksz, + zp->z_size); + + + /* Start the pageout request */ + /* + * We can't leave this function without either calling upl_commit or + * upl_abort. So use the non-error version. + */ + ZFS_ENTER_IFERROR(zfsvfs) { + dprintf("ZFS: vnop_pageoutv2: abort on z_unmounted\n"); + error = EIO; + goto exit_abort; + } + if (vfs_flags(zfsvfs->z_vfs) & MNT_RDONLY) { + dprintf("ZFS: vnop_pageoutv2: readonly\n"); + error = EROFS; + goto exit_abort; + } + ASSERT(vn_has_cached_data(ZTOV(zp))); + + lr = zfs_rangelock_enter(&zp->z_rangelock, ap->a_f_offset, a_size, + RL_WRITER); + + /* Grab UPL now */ + int request_flags; + + /* + * we're in control of any UPL we commit + * make sure someone hasn't accidentally passed in UPL_NOCOMMIT + */ + a_flags &= ~UPL_NOCOMMIT; + a_pl_offset = 0; + + if (a_flags & UPL_MSYNC) { + request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; + } else { + request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; + } + + error = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, + request_flags); + if (error || (upl == NULL)) { + dprintf("ZFS: Failed to create UPL! %d\n", error); + goto pageout_done; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, ap->a_f_offset, ap->a_size); + + // NULL z_sa_hdl + if (z_sa_hdl != NULL) + dmu_tx_hold_sa(tx, z_sa_hdl, B_FALSE); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + ubc_upl_abort(upl, (UPL_ABORT_ERROR|UPL_ABORT_FREE_ON_EMPTY)); + goto pageout_done; + } + + off_t f_offset; + int64_t offset; + int64_t isize; + int64_t pg_index; + + filesize = zp->z_size; /* get consistent copy of zp_size */ + + isize = ap->a_size; + f_offset = ap->a_f_offset; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0; ) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + dprintf("ZFS: failed on pg_index\n"); + dmu_tx_commit(tx); + ubc_upl_abort_range(upl, 0, isize, + UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + } + + dprintf("ZFS: isize %llu pg_index %llu\n", isize, pg_index); + /* + * initialize the offset variables before we touch the UPL. + * a_f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on. + * isize is the offset into the UPL of the last non-clean page. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + + offset = 0; + pg_index = 0; + while (isize > 0) { + int64_t xsize; + int64_t num_of_pages; + + // printf("isize %d for page %d\n", isize, pg_index); + + if (!upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_DIRTY, so it's possible + * to get back empty slots in the UPL. 
+ * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + if (!upl_dirty_page(pl, pg_index)) { + /* + * hfs has a call to panic here, but we trigger this + * *a lot* so unsure what is going on + */ + dprintf("zfs_vnop_pageoutv2: unforeseen clean page " + "@ index %lld for UPL %p\n", pg_index, upl); + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + continue; + } + + /* + * We know that we have at least one dirty page. + * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize > 0) { + if (!upl_dirty_page(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + + if (!vnode_isswap(vp)) { + off_t end_of_range; + + end_of_range = f_offset + xsize - 1; + if (end_of_range >= filesize) { + end_of_range = (off_t)(filesize - 1); + } + } + + // Map it if needed + if (!vaddr) { + if ((ubc_upl_map(upl, (vm_offset_t *)&vaddr) != + KERN_SUCCESS) || vaddr == NULL) { + error = EINVAL; + vaddr = NULL; + dprintf("ZFS: unable to map\n"); + goto out; + } + dprintf("ZFS: Mapped %p\n", vaddr); + } + + + dprintf("ZFS: bluster offset %lld fileoff %lld size %lld " + "filesize %lld\n", offset, f_offset, xsize, filesize); + merror = bluster_pageout(zfsvfs, zp, upl, offset, f_offset, + xsize, filesize, a_flags, vaddr, tx); + /* remember the first error */ + if ((error == 0) && (merror)) + error = merror; + + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + } // while isize + + /* finish off transaction */ + if (error == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, ap->a_f_offset, + a_size, 0, NULL, NULL); + } + dmu_tx_commit(tx); + + // unmap + if (vaddr) { + ubc_upl_unmap(upl); + vaddr = NULL; + } +out: + zfs_rangelock_exit(lr); + if (a_flags & UPL_IOSYNC) + zil_commit(zfsvfs->z_log, zp->z_id); + + if (error) + ubc_upl_abort(upl, (UPL_ABORT_ERROR|UPL_ABORT_FREE_ON_EMPTY)); + else + ubc_upl_commit_range(upl, 0, a_size, UPL_COMMIT_FREE_ON_EMPTY); + + upl = NULL; + + vnode_put(ZTOV(zp)); + + ZFS_EXIT(zfsvfs); + if (error) + dprintf("ZFS: pageoutv2 failed %d\n", error); + return (error); + +pageout_done: + zfs_rangelock_exit(lr); + +exit_abort: + dprintf("ZFS: pageoutv2 aborted %d\n", error); + // VERIFY(ubc_create_upl(vp, off, len, &upl, &pl, flags) == 0); + // ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + + vnode_put(ZTOV(zp)); + + if (zfsvfs) + ZFS_EXIT(zfsvfs); + return (error); +} + + + + + + +int +zfs_vnop_mmap(struct vnop_mmap_args *ap) +#if 0 + struct vnop_mmap_args { + struct vnode *a_vp; + int a_fflags; + kauth_cred_t a_cred; + struct proc *a_p; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs; + + if (!zp) + return (ENODEV); + + zfsvfs = zp->z_zfsvfs; + + dprintf("+vnop_mmap: %p\n", ap->a_vp); + + ZFS_ENTER(zfsvfs); + + if (!vnode_isreg(vp)) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + mutex_enter(&zp->z_lock); + zp->z_is_mapped = 1; + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + dprintf("-vnop_mmap\n"); + return (0); 
+} + +int +zfs_vnop_mnomap(struct vnop_mnomap_args *ap) +#if 0 + struct vnop_mnomap_args { + struct vnode *a_vp; + int a_fflags; + kauth_cred_t a_cred; + struct proc *a_p; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + dprintf("+vnop_mnomap: %p\n", ap->a_vp); + + ZFS_ENTER(zfsvfs); + + if (!vnode_isreg(vp)) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + mutex_enter(&zp->z_lock); + /* + * If a file has been mmapped even once, it needs to keep "z_is_mapped" + * set because it will potentially keep pages in the UPL cache we need + * to update on writes. We can either drop the UPL pages here, or simply + * keep updating both places on zfs_write(). + */ + /* zp->z_is_mapped = 0; */ + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + dprintf("-vnop_mnomap\n"); + return (0); +} + + + + +int +zfs_vnop_inactive(struct vnop_inactive_args *ap) +#if 0 + struct vnop_inactive_args { + struct vnode *a_vp; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + zfs_inactive(vp); + return (0); +} + + + +#ifdef _KERNEL +uint64_t vnop_num_reclaims = 0; +uint64_t vnop_num_vnodes = 0; +#endif + + +int +zfs_vnop_reclaim(struct vnop_reclaim_args *ap) +#if 0 + struct vnop_reclaim_args { + struct vnode *a_vp; + vfs_context_t a_context; + }; +#endif +{ + /* + * Care needs to be taken here: we may already have called reclaim + * from vnop_inactive, and if so, very little needs to be done. + */ + + struct vnode *vp = ap->a_vp; + znode_t *zp = NULL; + zfsvfs_t *zfsvfs = NULL; + + /* Destroy the vm object and flush associated pages. */ +#ifndef __APPLE__ + vnode_destroy_vobject(vp); +#endif + + /* Already been released? */ + zp = VTOZ(vp); + ASSERT(zp != NULL); + dprintf("+vnop_reclaim zp %p/%p type %d\n", zp, vp, vnode_vtype(vp)); + if (!zp) goto out; + + zfsvfs = zp->z_zfsvfs; + + if (!zfsvfs) { + printf("ZFS: vnop_reclaim with zfsvfs == NULL\n"); + return (0); + } + + if (zfsctl_is_node(vp)) { + printf("ZFS: vnop_reclaim with ctldir node\n"); + return (0); + } + + ZTOV(zp) = NULL; + + /* + * Purge old data structures associated with the znode.
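+ * Detach the vnode first (clear v_data, drop the fs ref), then free the + * znode itself via zfs_zinactive()/zfs_znode_free() below.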
+ */ + vnode_clearfsnode(vp); /* vp->v_data = NULL */ + vnode_removefsref(vp); /* ADDREF from vnode_create */ + atomic_dec_64(&vnop_num_vnodes); + + dprintf("+vnop_reclaim zp %p/%p unlinked %d unmount " + "%d sa_hdl %p\n", zp, vp, zp->z_unlinked, + zfsvfs->z_unmounted, zp->z_sa_hdl); + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + zfs_znode_free(zp); + } else { + zfs_zinactive(zp); + zfs_znode_free(zp); + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); + +#ifdef _KERNEL + atomic_inc_64(&vnop_num_reclaims); +#endif + +out: + return (0); +} + + + + + +int +zfs_vnop_mknod(struct vnop_mknod_args *ap) +#if 0 + struct vnop_mknod_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vnode_vattr *vap; + vfs_context_t a_context; + }; +#endif +{ + struct vnop_create_args create_ap; + int error; + + dprintf("%s\n", __func__); + + bzero(&create_ap, sizeof (struct vnop_create_args)); + + create_ap.a_dvp = ap->a_dvp; + create_ap.a_vpp = ap->a_vpp; + create_ap.a_cnp = ap->a_cnp; + create_ap.a_vap = ap->a_vap; + create_ap.a_context = ap->a_context; + + error = zfs_vnop_create(&create_ap); + if (error) dprintf("%s error %d\n", __func__, error); + return (error); +} + +int +zfs_vnop_allocate(struct vnop_allocate_args *ap) +#if 0 + struct vnop_allocate_args { + struct vnode *a_vp; + off_t a_length; + u_int32_t a_flags; + off_t *a_bytesallocated; + off_t a_offset; + vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs; + uint64_t wantedsize = 0, filesize = 0; + int err = 0; + + dprintf("%s %llu %d %llu %llu: '%s'\n", __func__, ap->a_length, + ap->a_flags, (ap->a_bytesallocated ? *ap->a_bytesallocated : 0), + ap->a_offset, zp->z_name_cache); + + /* + * This code has been reverted: + * https://github.com/openzfsonosx/zfs/issues/631 + * Most likely not correctly aligned, and too-large offsets. + */ + return (0); + + if (!zp || !zp->z_sa_hdl) + return (ENODEV); + +// *ap->a_bytesallocated = 0; + + if (!vnode_isreg(vp)) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + + filesize = zp->z_size; + wantedsize = ap->a_length; + + if (ap->a_flags & ALLOCATEFROMPEOF) + wantedsize += filesize; + else if (ap->a_flags & ALLOCATEFROMVOL) + /* blockhint = ap->a_offset / blocksize */ // yeah, no idea + printf("%s: help, allocatefromvolume set?\n", __func__); + + dprintf("%s: filesize %llu wantedsize %llu\n", __func__, + filesize, wantedsize); + + // If we are extending + if (wantedsize > filesize) { + + err = zfs_freesp(zp, wantedsize, 0, FWRITE, B_TRUE); + + // If we are truncating, Apple claims this code is never called. 
+ } else if (wantedsize < filesize) { + + printf("%s: file shrinking branch taken?\n", __func__); + + } + + if (!err) { + *(ap->a_bytesallocated) = wantedsize - filesize; + } + + ZFS_EXIT(zfsvfs); + dprintf("-%s: %d\n", __func__, err); + return (err); +} + +int +zfs_vnop_whiteout(struct vnop_whiteout_args *ap) +#if 0 + struct vnop_whiteout_args { + struct vnode *a_dvp; + struct componentname *a_cnp; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + dprintf("vnop_whiteout: ENOTSUP\n"); + + return (ENOTSUP); +} + +int +zfs_vnop_pathconf(struct vnop_pathconf_args *ap) +#if 0 + struct vnop_pathconf_args { + struct vnode *a_vp; + int a_name; + register_t *a_retval; + vfs_context_t a_context; + }; +#endif +{ + int32_t *valp = ap->a_retval; + int error = 0; + + dprintf("+vnop_pathconf a_name %d\n", ap->a_name); + + switch (ap->a_name) { + case _PC_LINK_MAX: + *valp = INT_MAX; + break; + case _PC_PIPE_BUF: + *valp = PIPE_BUF; + break; + case _PC_CHOWN_RESTRICTED: + *valp = 200112; /* POSIX */ + break; + case _PC_NO_TRUNC: + *valp = 200112; /* POSIX */ + break; + case _PC_NAME_MAX: + case _PC_NAME_CHARS_MAX: + *valp = ZAP_MAXNAMELEN - 1; /* 255 */ + break; + case _PC_PATH_MAX: + case _PC_SYMLINK_MAX: + *valp = PATH_MAX; /* 1024 */ + break; + case _PC_CASE_SENSITIVE: + { + znode_t *zp = VTOZ(ap->a_vp); + *valp = 1; + if (zp && zp->z_zfsvfs) { + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + *valp = (zfsvfs->z_case == ZFS_CASE_SENSITIVE) ? 1 : 0; + } + } + break; + case _PC_CASE_PRESERVING: + *valp = 1; + break; +/* + * OS X 10.6 does not define this. + */ +#ifndef _PC_XATTR_SIZE_BITS +#define _PC_XATTR_SIZE_BITS 26 +#endif +/* + * Even though ZFS has 64 bit limit on XATTR size, there would appear to be a + * limit in SMB2 that the bit size returned has to be 18, or we will get an + * error from most XATTR calls (STATUS_ALLOTTED_SPACE_EXCEEDED). + */ +#ifndef AD_XATTR_SIZE_BITS +#define AD_XATTR_SIZE_BITS 18 +#endif + case _PC_XATTR_SIZE_BITS: + *valp = AD_XATTR_SIZE_BITS; + break; + case _PC_FILESIZEBITS: + *valp = 64; + break; + default: + printf("ZFS: unknown pathconf %d called.\n", ap->a_name); + error = EINVAL; + } + + if (error) dprintf("%s vp %p : %d\n", __func__, ap->a_vp, error); + return (error); +} + +int +zfs_vnop_getxattr(struct vnop_getxattr_args *ap) +#if 0 + struct vnop_getxattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + char *a_name; + struct uio *a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct uio *uio = ap->a_uio; + struct componentname cn = { 0 }; + int error = 0; + int size = 0; + uint64_t resid = uio ? uio_resid(uio) : 0; + znode_t *xdzp = NULL, *xzp = NULL; + + dprintf("+getxattr vp %p: '%s'\n", ap->a_vp, ap->a_name); + + /* xattrs disabled? 
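+ * z_xattr is B_FALSE when xattr support is disabled for this + * filesystem, in which case we report ENOTSUP.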
*/ + if (zfsvfs->z_xattr == B_FALSE) { + return (ENOTSUP); + } + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_use_sa && zfsvfs->z_xattr_sa && zp->z_is_sa) { + char *value = NULL; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + if (!resid) { /* Lookup size */ + + rw_enter(&zp->z_xattr_lock, RW_READER); + size = zpl_xattr_get_sa(vp, ap->a_name, NULL, 0); + rw_exit(&zp->z_xattr_lock); + if (size > 0) { + *ap->a_size = size; + goto out; + } + } + + if (resid) { + value = kmem_alloc(resid, KM_SLEEP); + rw_enter(&zp->z_xattr_lock, RW_READER); + size = zpl_xattr_get_sa(vp, ap->a_name, value, resid); + rw_exit(&zp->z_xattr_lock); + + /* Finderinfo checks */ + if (!error && resid && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + + /* Must be 32 bytes */ + if (resid != sizeof (emptyfinfo) || + size != sizeof (emptyfinfo)) { + error = ERANGE; + kmem_free(value, resid); + goto out; + } + + /* If FinderInfo is empty > it doesn't exist */ + if (bcmp(value, emptyfinfo, + sizeof (emptyfinfo)) == 0) { + error = ENOATTR; + kmem_free(value, resid); + goto out; + } + + /* According to HFS zero out some fields */ + finderinfo_update((uint8_t *)value, zp); + } + + if (size > 0) + error = uiomove((const char *)value, size, 0, + uio); + + kmem_free(value, resid); + + goto out; + } + } + + /* Legacy xattr */ + + /* Grab the hidden attribute directory vnode. */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, 0))) { + goto out; + } + + cn.cn_namelen = strlen(ap->a_name) + 1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Lookup the attribute name. */ + if ((error = zfs_dirlook(xdzp, (char *)ap->a_name, &xzp, 0, NULL, + &cn))) { + goto out; + } + + /* + * If we are dealing with FinderInfo, we duplicate the UIO first + * so that we can uiomove to/from it to modify contents. + */ + if (!error && uio && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + ssize_t local_resid; + zfs_file_t zf; + u_int8_t finderinfo[32]; + static u_int32_t emptyfinfo[8] = {0}; + + /* Read the attribute data. */ + /* FinderInfo is 32 bytes */ + if ((user_size_t)uio_resid(uio) < 32) { + error = ERANGE; + goto out; + } + + /* Use the convenience wrappers to read to SYSSPACE */ + zf.f_vnode = ZTOV(xzp); + zf.f_fd = -1; + + error = zfs_file_pread(&zf, &finderinfo, + sizeof (finderinfo), 0ULL, &local_resid); + + if (local_resid != 0) { + error = ERANGE; + } else { + + /* Update size if requested */ + if (ap->a_size) + *ap->a_size = (size_t)sizeof (finderinfo); + + /* According to HFS we are to zero out some fields */ + finderinfo_update((uint8_t *)&finderinfo, zp); + + /* If Finder Info is empty then it doesn't exist. */ + if (bcmp(finderinfo, emptyfinfo, + sizeof (emptyfinfo)) == 0) { + error = ENOATTR; + } else { + + /* Copy out the data we just modified */ + error = uiomove((const char *)&finderinfo, + sizeof (finderinfo), 0, uio); + + } /* Not empty */ + } /* Correct size */ + + /* We are done */ + goto out; + } /* Is finder info */ + + /* If NOT finderinfo */ + + if (uio == NULL) { + + /* Query xattr size. 
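+ * A NULL uio means the caller only wants the attribute size, which is + * returned through ap->a_size.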
*/ + if (ap->a_size) { + mutex_enter(&xzp->z_lock); + *ap->a_size = (size_t)xzp->z_size; + mutex_exit(&xzp->z_lock); + } + + } else { + + /* Read xattr */ + error = zfs_read(ZTOV(xzp), uio, 0, cr); + + if (ap->a_size && uio) { + *ap->a_size = (size_t)resid - uio_resid(ap->a_uio); + } + + } + +out: + + if (error == ENOENT) + error = ENOATTR; + + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + if (xzp) { + zrele(xzp); + } + if (xdzp) { + zrele(xdzp); + } + + ZFS_EXIT(zfsvfs); + dprintf("-getxattr vp %p : %d size %lu: %s\n", ap->a_vp, error, + !error && ap->a_size ? *ap->a_size : 0, + ap->a_uio == NULL ? "sizelookup" : "xattrread"); + return (error); +} + +int +zfs_vnop_setxattr(struct vnop_setxattr_args *ap) +#if 0 + struct vnop_setxattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + char *a_name; + struct uio *a_uio; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + struct vnode *xvp = NULLVP; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct uio *uio = ap->a_uio; + int flag; + int error = 0; + znode_t *xdzp = NULL; + + dprintf("+setxattr vp %p '%s' (enabled: %d) resid %llu\n", ap->a_vp, + ap->a_name, zfsvfs->z_xattr, uio_resid(ap->a_uio)); + + /* xattrs disabled? */ + if (zfsvfs->z_xattr == B_FALSE) { + return (ENOTSUP); + } + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + + ZFS_ENTER(zfsvfs); + + if (strlen(ap->a_name) >= ZAP_MAXNAMELEN) { + error = ENAMETOOLONG; + goto out; + } + + if (ap->a_options & XATTR_CREATE) + flag = ZNEW; /* expect no pre-existing entry */ + else if (ap->a_options & XATTR_REPLACE) + flag = ZEXISTS; /* expect an existing entry */ + else + flag = 0; + + + /* Preferentially store the xattr as a SA for better performance */ + if (zfsvfs->z_use_sa && zfsvfs->z_xattr_sa && zp->z_is_sa) { + char *value; + uint64_t size; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + + /* New, expect it to not exist .. */ + if ((flag & ZNEW) && + (zpl_xattr_get_sa(vp, ap->a_name, NULL, 0) > 0)) { + error = EEXIST; + rw_exit(&zp->z_xattr_lock); + goto out; + } + + /* Replace, XATTR must exist .. */ + if ((flag & ZEXISTS) && + ((error = + zpl_xattr_get_sa(vp, ap->a_name, NULL, 0)) <= 0) && + error == -ENOENT) { + error = ENOATTR; + rw_exit(&zp->z_xattr_lock); + goto out; + } + + size = uio_resid(uio); + value = kmem_alloc(size, KM_SLEEP); + + size_t bytes; + + /* Copy in the xattr value */ + uiocopy((const char *)value, size, UIO_WRITE, + uio, &bytes); + + + /* Finderinfo checks */ + if (!error && bytes && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + + /* Must be 32 bytes */ + if (bytes != sizeof (emptyfinfo)) { + error = ERANGE; + rw_exit(&zp->z_xattr_lock); + kmem_free(value, size); + goto out; + } + + /* According to HFS we are to zero out some fields */ + finderinfo_update((uint8_t *)value, zp); + } + + error = zpl_xattr_set_sa(vp, ap->a_name, + value, bytes, + flag, cr); + rw_exit(&zp->z_xattr_lock); + kmem_free(value, size); + + goto out; + } + + /* Legacy xattr */ + + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, CREATE_XATTR_DIR))) { + goto out; + } + + /* Lookup or create the named attribute. */ + error = zpl_obtain_xattr(xdzp, ap->a_name, VTOZ(vp)->z_mode, cr, + &xvp, flag); + if (error) + goto out; + + /* Write the attribute data. 
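+ * The existing value is truncated away first (setxattr replaces), then + * the new contents are written with zfs_write().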
*/ + ASSERT(uio != NULL); + + /* OsX setxattr() replaces xattrs */ + error = zfs_freesp(VTOZ(xvp), 0, 0, VTOZ(vp)->z_mode, TRUE); + + /* Special case for Finderinfo */ + if (!error && uio && + bcmp(ap->a_name, XATTR_FINDERINFO_NAME, + sizeof (XATTR_FINDERINFO_NAME)) == 0) { + + u_int8_t finderinfo[32]; + + /* Read the attribute data. */ + /* FinderInfo is 32 bytes */ + if ((user_size_t)uio_resid(uio) < 32) { + error = ERANGE; + goto out; + } + + /* Copy in the finderinfo to our space */ + error = uiomove((const char *)&finderinfo, + sizeof (finderinfo), 0, uio); + if (error) + goto out; + + /* Zero out some fields, according to HFS */ + finderinfo_update((uint8_t *)&finderinfo, zp); + + /* + * TODO: + * When writing FINDERINFO, we need to replace the + * ADDEDTIME date with actual crtime and not let + * userland overwrite it. + */ + + /* Empty Finderinfo is non-existent. */ + if (bcmp(finderinfo, emptyfinfo, sizeof (emptyfinfo)) == 0) { + /* Attempt to delete it? */ + error = zfs_remove(xdzp, (char *)ap->a_name, cr, 0); + goto out; + } + + /* Build a new uio to call zfs_write() to make it go in txg */ + uio_t *luio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); + if (luio == NULL) { + error = ENOMEM; + goto out; + } + uio_addiov(luio, (user_addr_t)&finderinfo, sizeof (finderinfo)); + + error = zfs_write(xvp, luio, 0, cr); + + if (uio_resid(luio) != 0) + error = ERANGE; + + uio_free(luio); + + goto out; + } /* Finderinfo */ + + /* Write XATTR to disk */ + error = zfs_write(xvp, uio, 0, cr); + +out: + + if (error == ENOENT) + error = ENOATTR; + + if (xdzp) { + zrele(xdzp); + } + if (xvp) { + VN_RELE(xvp); + } + + ZFS_EXIT(zfsvfs); + dprintf("-setxattr vp %p: err %d: resid %llx\n", ap->a_vp, error, + uio_resid(ap->a_uio)); + return (error); +} + +int +zfs_vnop_removexattr_int(zfsvfs_t *zfsvfs, znode_t *zp, const char *name, + cred_t *cr) +{ + struct vnode *vp = ZTOV(zp); + struct componentname cn = { 0 }; + int error; + uint64_t xattr; + znode_t *xdzp = NULL, *xzp = NULL; + + dprintf("+removexattr_int vp %p '%s'\n", vp, name); + + ZFS_ENTER(zfsvfs); + + /* + * Recursive attributes are not allowed. + */ + if (zp->z_pflags & ZFS_XATTR) { + error = EINVAL; + goto out; + } + + if (zfsvfs->z_use_sa && zfsvfs->z_xattr_sa && zp->z_is_sa) { + nvlist_t *nvl; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + nvl = zp->z_xattr_cached; + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + + dprintf("ZFS: removexattr nvlist_remove said %d\n", error); + if (!error) { + /* Update the SA for adds, modss, and removals. */ + error = -zfs_sa_set_xattr(zp); + rw_exit(&zp->z_xattr_lock); + goto out; + } + rw_exit(&zp->z_xattr_lock); + } + + sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr, sizeof (xattr)); + if (xattr == 0) { + error = ENOATTR; + goto out; + } + + /* Grab the hidden attribute directory vnode. */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, 0))) { + goto out; + } + + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Lookup the attribute name. 
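+ * If the entry exists in the hidden xattr directory it is removed with + * zfs_remove() below; ENOENT is reported to the caller as ENOATTR.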
*/ + if ((error = zfs_dirlook(xdzp, (char *)name, &xzp, 0, NULL, + &cn))) { + if (error == ENOENT) + error = ENOATTR; + goto out; + } + + error = zfs_remove(xdzp, (char *)name, cr, /* flags */0); + +out: + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + + if (xzp) { + zrele(xzp); + } + if (xdzp) { + zrele(xdzp); + } + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s vp %p: error %d\n", __func__, vp, error); + return (error); +} + +int +zfs_vnop_removexattr(struct vnop_removexattr_args *ap) +#if 0 + struct vnop_removexattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + char *a_name; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + dprintf("+removexattr vp %p '%s'\n", ap->a_vp, ap->a_name); + + /* xattrs disabled? */ + if (zfsvfs->z_xattr == B_FALSE) { + return (ENOTSUP); + } + + return (zfs_vnop_removexattr_int(zfsvfs, zp, ap->a_name, cr)); +} + + +int +zfs_vnop_listxattr(struct vnop_listxattr_args *ap) +#if 0 + struct vnop_listxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct uio *uio = ap->a_uio; + zap_cursor_t zc; + zap_attribute_t za; + objset_t *os; + size_t size = 0; + char *nameptr; + char nfd_name[ZAP_MAXNAMELEN]; + size_t namelen; + int error = 0; + uint64_t xattr; + int force_formd_normalized_output; + znode_t *xdzp = NULL; + + dprintf("+listxattr vp %p: \n", ap->a_vp); + + /* xattrs disabled? */ + if (zfsvfs->z_xattr == B_FALSE) { + return (EINVAL); + } + + ZFS_ENTER(zfsvfs); + + /* + * Recursive attributes are not allowed. + */ + if (zp->z_pflags & ZFS_XATTR) { + error = EINVAL; + goto out; + } + + if (zfsvfs->z_use_sa && zp->z_is_sa && zp->z_xattr_cached) { + nvpair_t *nvp = NULL; + + rw_enter(&zp->z_xattr_lock, RW_READER); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + rw_exit(&zp->z_xattr_lock); + + while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != + NULL) { + ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); + + namelen = strlen(nvpair_name(nvp)) + 1; /* Null byte */ + + /* Just checking for space requirements? */ + if (uio == NULL) { + size += namelen; + } else { + if (namelen > uio_resid(uio)) { + error = ERANGE; + break; + } + dprintf("ZFS: listxattr '%s'\n", + nvpair_name(nvp)); + error = uiomove((caddr_t)nvpair_name(nvp), + namelen, UIO_READ, uio); + if (error) + break; + } + } /* while nvlist */ + } /* SA xattr */ + if (error) goto out; + + /* Do we even have any attributes? */ + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr, + sizeof (xattr)) || (xattr == 0)) { + goto out; /* all done */ + } + + /* Grab the hidden attribute directory vnode. */ + if (zfs_get_xattrdir(zp, &xdzp, cr, 0) != 0) { + goto out; + } + os = zfsvfs->z_os; + + for (zap_cursor_init(&zc, os, xdzp->z_id); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (xattr_protected(za.za_name)) + continue; /* skip */ + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. 
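+ * (utf8_normalizestr() below does the NFC to NFD conversion into + * nfd_name for names that are not plain ASCII.)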
+ */ + namelen = strlen(za.za_name); + + if (!is_ascii_str(za.za_name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + if (force_formd_normalized_output && + utf8_normalizestr((const u_int8_t *)za.za_name, namelen, + (u_int8_t *)nfd_name, &namelen, sizeof (nfd_name), + UTF_DECOMPOSED) == 0) { + nameptr = nfd_name; + } else { + nameptr = &za.za_name[0]; + } + ++namelen; /* account for NULL termination byte */ + if (uio == NULL) { + size += namelen; + } else { + if (namelen > uio_resid(uio)) { + error = ERANGE; + break; + } + error = uiomove((caddr_t)nameptr, namelen, UIO_READ, + uio); + if (error) + break; + } + } + zap_cursor_fini(&zc); +out: + if (uio == NULL) { + *ap->a_size = size; + } + if (xdzp) { + zrele(xdzp); + } + + ZFS_EXIT(zfsvfs); + if (error) { + dprintf("%s vp %p: error %d size %ld\n", __func__, + ap->a_vp, error, size); + } + return (error); +} + +#ifdef HAVE_NAMED_STREAMS +int +zfs_vnop_getnamedstream(struct vnop_getnamedstream_args *ap) +#if 0 + struct vnop_getnamedstream_args { + struct vnode *a_vp; + struct vnode **a_svpp; + char *a_name; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + struct vnode **svpp = ap->a_svpp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct componentname cn = { 0 }; + int error = ENOATTR; + znode_t *xdzp = NULL; + znode_t *xzp = NULL; + + dprintf("+getnamedstream vp %p '%s': op %u\n", ap->a_vp, ap->a_name, + ap->a_operation); + + *svpp = NULLVP; + + ZFS_ENTER(zfsvfs); + + /* + * Mac OS X only supports the "com.apple.ResourceFork" stream. + */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, + sizeof (XATTR_RESOURCEFORK_NAME)) != 0) + goto out; + + /* Only regular files */ + if (!vnode_isreg(vp)) { + error = EPERM; + goto out; + } + + /* Grab the hidden attribute directory vnode. */ + if (zfs_get_xattrdir(zp, &xdzp, cr, 0) != 0) + goto out; + + cn.cn_namelen = strlen(ap->a_name) + 1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Lookup the attribute name. */ + if ((error = zfs_dirlook(xdzp, (char *)ap->a_name, &xzp, 0, NULL, + &cn))) { + if (error == ENOENT) + error = ENOATTR; + } else { + *svpp = ZTOV(xzp); + } + + kmem_free(cn.cn_nameptr, cn.cn_namelen); + +out: + if (xdzp) + zrele(xdzp); + +#if 0 // Disabled, not sure it's required and empty vnodes are odd. + /* + * If the lookup is NS_OPEN, they are accessing "..namedfork/rsrc" + * to which we should return 0 with empty vp to empty file. + * See hfs_vnop_getnamedstream() + */ + if ((error == ENOATTR) && + ap->a_operation == NS_OPEN) { + + if ((error = zfs_get_xattrdir(zp, &xdvp, cr, + CREATE_XATTR_DIR)) == 0) { + /* Lookup or create the named attribute.
*/ + error = zpl_obtain_xattr(VTOZ(xdvp), ap->a_name, + VTOZ(vp)->z_mode, cr, ap->a_svpp, + ZNEW); + vnode_put(xdvp); + } + } +#endif + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s vp %p: error %d\n", __func__, ap->a_vp, error); + return (error); +} + +int +zfs_vnop_makenamedstream(struct vnop_makenamedstream_args *ap) +#if 0 + struct vnop_makenamedstream_args { + struct vnode *a_vp; + struct vnode **a_svpp; + char *a_name; + }; +#endif +{ + DECLARE_CRED(ap); + struct vnode *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct componentname cn; + struct vnode_attr vattr; + int error = 0; + znode_t *xdzp = NULL; + znode_t *xzp = NULL; + + dprintf("+makenamedstream vp %p: '%s'\n", ap->a_vp, ap->a_name); + + *ap->a_svpp = NULLVP; + + ZFS_ENTER(zfsvfs); + + /* Only regular files can have a resource fork stream. */ + if (!vnode_isreg(vp)) { + error = EPERM; + goto out; + } + + /* + * Mac OS X only supports the "com.apple.ResourceFork" stream. + */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, + sizeof (XATTR_RESOURCEFORK_NAME)) != 0) { + error = ENOATTR; + goto out; + } + + /* Grab the hidden attribute directory vnode. */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, CREATE_XATTR_DIR))) + goto out; + + bzero(&cn, sizeof (cn)); + cn.cn_nameiop = CREATE; + cn.cn_flags = ISLASTCN; + cn.cn_nameptr = (char *)ap->a_name; + cn.cn_namelen = strlen(cn.cn_nameptr); + + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_type, VREG); + VATTR_SET(&vattr, va_mode, VTOZ(vp)->z_mode & ~S_IFMT); + + error = zfs_create(xdzp, (char *)ap->a_name, &vattr, NONEXCL, + VTOZ(vp)->z_mode, &xzp, cr, 0, NULL); + + if (error == 0) + *ap->a_svpp = ZTOV(xzp); + +out: + if (xdzp) + zrele(xdzp); + + ZFS_EXIT(zfsvfs); + if (error) dprintf("%s vp %p: error %d\n", __func__, ap->a_vp, error); + return (error); +} + +int +zfs_vnop_removenamedstream(struct vnop_removenamedstream_args *ap) +#if 0 + struct vnop_removenamedstream_args { + struct vnode *a_vp; + struct vnode **a_svpp; + char *a_name; + }; +#endif +{ + struct vnode *svp = ap->a_svp; + znode_t *zp = VTOZ(svp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + + dprintf("zfs_vnop_removenamedstream: %p '%s'\n", + svp, ap->a_name); + ZFS_ENTER(zfsvfs); + + /* + * Mac OS X only supports the "com.apple.ResourceFork" stream. + */ + if (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, + sizeof (XATTR_RESOURCEFORK_NAME)) != 0) { + error = ENOATTR; + goto out; + } + + /* ### MISING CODE ### */ + /* + * It turns out that even though APPLE uses makenamedstream() to + * create a stream, for example compression, they use vnop_removexattr + * to delete it, so this appears not in use. + */ + dprintf("zfs_vnop_removenamedstream\n"); + error = EPERM; +out: + ZFS_EXIT(zfsvfs); + return (ENOTSUP); +} +#endif /* HAVE_NAMED_STREAMS */ + +/* + * The Darwin kernel's HFS+ appears to implement this by two methods, + * + * if (ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) is set + * ** Copy the data of the files over (including rsrc) + * + * if not set + * ** exchange FileID between the two nodes, copy over vnode information + * like that of *time records, uid/gid, flags, mode, linkcount, + * finderinfo, c_desc, c_attr, c_flag, and cache_purge(). 
+ * + * This call is deprecated in 10.8 + */ +int +zfs_vnop_exchange(struct vnop_exchange_args *ap) +#if 0 + struct vnop_exchange_args { + struct vnode *a_fvp; + struct vnode *a_tvp; + int a_options; + vfs_context_t a_context; + }; +#endif +{ + vnode_t *fvp = ap->a_fvp; + vnode_t *tvp = ap->a_tvp; + znode_t *fzp; + zfsvfs_t *zfsvfs; + + /* The files must be on the same volume. */ + if (vnode_mount(fvp) != vnode_mount(tvp)) { + dprintf("%s fvp and tvp not in same mountpoint\n", + __func__); + return (EXDEV); + } + + if (fvp == tvp) { + dprintf("%s fvp == tvp\n", __func__); + return (EINVAL); + } + + /* Only normal files can be exchanged. */ + if (!vnode_isreg(fvp) || !vnode_isreg(tvp)) { + dprintf("%s fvp or tvp is not a regular file\n", + __func__); + return (EINVAL); + } + + fzp = VTOZ(fvp); + zfsvfs = fzp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + + /* ADD MISSING CODE HERE */ + + ZFS_EXIT(zfsvfs); + printf("vnop_exchange: ENOTSUP\n"); + return (ENOTSUP); +} + +int +zfs_vnop_revoke(struct vnop_revoke_args *ap) +#if 0 + struct vnop_revoke_args { + struct vnode *a_vp; + int a_flags; + vfs_context_t a_context; + }; +#endif +{ + return (vn_revoke(ap->a_vp, ap->a_flags, ap->a_context)); +} + +int +zfs_vnop_blktooff(struct vnop_blktooff_args *ap) +#if 0 + struct vnop_blktooff_args { + struct vnode *a_vp; + daddr64_t a_lblkno; + off_t *a_offset; + }; +#endif +{ + dprintf("vnop_blktooff: 0\n"); + return (ENOTSUP); +} + +int +zfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) +#if 0 + struct vnop_offtoblk_args { + struct vnode *a_vp; + off_t a_offset; + daddr64_t *a_lblkno; + }; +#endif +{ + dprintf("+vnop_offtoblk\n"); + return (ENOTSUP); +} + +int +zfs_vnop_blockmap(struct vnop_blockmap_args *ap) +#if 0 + struct vnop_blockmap_args { + struct vnode *a_vp; + off_t a_foffset; + size_t a_size; + daddr64_t *a_bpn; + size_t *a_run; + void *a_poff; + int a_flags; +}; +#endif +{ + dprintf("+vnop_blockmap\n"); + return (ENOTSUP); + +#if 0 + znode_t *zp; + zfsvfs_t *zfsvfs; + + ASSERT(ap); + ASSERT(ap->a_vp); + ASSERT(ap->a_size); + + if (!ap->a_bpn) { + return (0); + } + + if (vnode_isdir(ap->a_vp)) { + return (ENOTSUP); + } + + zp = VTOZ(ap->a_vp); + if (!zp) + return (ENODEV); + + zfsvfs = zp->z_zfsvfs; + if (!zfsvfs) + return (ENODEV); + + /* Return full request size as contiguous */ + if (ap->a_run) { + // *ap->a_run = ap->a_size; + *ap->a_run = 0; + } + if (ap->a_poff) { + *((int *)(ap->a_poff)) = 0; + /* + * returning offset of -1 asks the + * caller to zero the ranges + */ + // *((int *)(ap->a_poff)) = -1; + } + *ap->a_bpn = 0; +// *ap->a_bpn = (daddr64_t)(ap->a_foffset / zfsvfs->z_max_blksz); + + dprintf("%s ret %lu %d %llu\n", __func__, + ap->a_size, *((int *)(ap->a_poff)), *((uint64_t *)(ap->a_bpn))); + + return (0); +#endif +} + +int +zfs_vnop_strategy(struct vnop_strategy_args *ap) +#if 0 + struct vnop_strategy_args { + struct buf *a_bp; + }; +#endif +{ + dprintf("vnop_strategy: 0\n"); + return (ENOTSUP); +} + +int +zfs_vnop_select(struct vnop_select_args *ap) +#if 0 + struct vnop_select_args { + struct vnode *a_vp; + int a_which; + int a_fflags; + kauth_cred_t a_cred; + void *a_wql; + struct proc *a_p; + }; +#endif +{ + dprintf("vnop_select: 1\n"); + return (1); +} + +#ifdef WITH_READDIRATTR +int +zfs_vnop_readdirattr(struct vnop_readdirattr_args *ap) +#if 0 + struct vnop_readdirattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct attrlist *a_alist; + struct uio *a_uio; + ulong_t a_maxcount; + ulong_t a_options; + ulong_t *a_newstate; + int *a_eofflag; + ulong_t *a_actualcount; 
+ vfs_context_t a_context; + }; +#endif +{ + struct vnode *vp = ap->a_vp; + struct attrlist *alp = ap->a_alist; + struct uio *uio = ap->a_uio; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zap_cursor_t zc; + zap_attribute_t zap; + attrinfo_t attrinfo; + int maxcount = ap->a_maxcount; + uint64_t offset = (uint64_t)uio_offset(uio); + u_int32_t fixedsize; + u_int32_t maxsize; + u_int32_t attrbufsize; + void *attrbufptr = NULL; + void *attrptr; + void *varptr; /* variable-length storage area */ + boolean_t user64 = vfs_context_is64bit(ap->a_context); + int prefetch = 0; + int error = 0; + +#if 0 + dprintf("+vnop_readdirattr\n"); +#endif + + *(ap->a_actualcount) = 0; + *(ap->a_eofflag) = 0; + + /* + * Check for invalid options or invalid uio. + */ + if (((ap->a_options & ~(FSOPT_NOINMEMUPDATE | FSOPT_NOFOLLOW)) != 0) || + (uio_resid(uio) <= 0) || (maxcount <= 0)) { + dprintf("%s invalid argument\n"); + return (EINVAL); + } + /* + * Reject requests for unsupported attributes. + */ + if ((alp->bitmapcount != ZFS_ATTR_BIT_MAP_COUNT) || + (alp->commonattr & ~ZFS_ATTR_CMN_VALID) || + (alp->dirattr & ~ZFS_ATTR_DIR_VALID) || + (alp->fileattr & ~ZFS_ATTR_FILE_VALID) || + (alp->volattr != 0 || alp->forkattr != 0)) { + dprintf("%s unsupported attr\n"); + return (EINVAL); + } + /* + * Check if we should prefetch znodes + */ + if ((alp->commonattr & ~ZFS_DIR_ENT_ATTRS) || + (alp->dirattr != 0) || (alp->fileattr != 0)) { + prefetch = TRUE; + } + + /* + * Setup a buffer to hold the packed attributes. + */ + fixedsize = sizeof (u_int32_t) + getpackedsize(alp, user64); + maxsize = fixedsize; + if (alp->commonattr & ATTR_CMN_NAME) + maxsize += ZAP_MAXNAMELEN + 1; + attrbufptr = (void*)kmem_alloc(maxsize, KM_SLEEP); + if (attrbufptr == NULL) { + dprintf("%s kmem_alloc failed\n"); + return (ENOMEM); + } + attrptr = attrbufptr; + varptr = (char *)attrbufptr + fixedsize; + + attrinfo.ai_attrlist = alp; + attrinfo.ai_varbufend = (char *)attrbufptr + maxsize; + attrinfo.ai_context = ap->a_context; + + ZFS_ENTER(zfsvfs); + + /* + * Initialize the zap iterator cursor. + */ + + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, zfsvfs->z_os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, zfsvfs->z_os, zp->z_id, offset); + } + + while (1) { + ino64_t objnum; + enum vtype vtype = VNON; + znode_t *tmp_zp = NULL; + + /* + * Note that the low 4 bits of the cookie returned by zap is + * always zero. This allows us to use the low nibble for + * "special" entries: + * We use 0 for '.', and 1 for '..' (ignored here). + * If this is the root of the filesystem, we use the offset 2 + * for the *'.zfs' directory. + */ + if (offset <= 1) { + offset = 2; + continue; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strlcpy(zap.za_name, ZFS_CTLDIR_NAME, + MAXNAMELEN); + objnum = ZFSCTL_INO_ROOT; + vtype = VDIR; + } else { + /* + * Grab next entry. 
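+ * zap_cursor_retrieve() returns ENOENT at end of directory, which is + * reported through a_eofflag.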
+ */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + *(ap->a_eofflag) = (error == ENOENT); + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + error = ENXIO; + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + vtype = DTTOVT(ZFS_DIRENT_TYPE(zap.za_first_integer)); + /* Check if vtype is MIA */ + if ((vtype == 0) && !prefetch && (alp->dirattr || + alp->fileattr || + (alp->commonattr & ATTR_CMN_OBJTYPE))) { + prefetch = 1; + } + } + + /* Grab znode if required */ + if (prefetch) { + dmu_prefetch(zfsvfs->z_os, objnum, 0, 0); + if ((error = zfs_zget(zfsvfs, objnum, &tmp_zp)) == 0) { + if (vtype == VNON) { + /* SA_LOOKUP? */ + vtype = IFTOVT(tmp_zp->z_mode); + } + } else { + tmp_zp = NULL; + error = ENXIO; + goto skip_entry; + /* + * Currently ".zfs" entry is skipped, as we have + * no methods to pack that into the attrs (all + * helper functions take znode_t *, and .zfs is + * not a znode_t *). Add dummy .zfs code here if + * it is desirable to show .zfs in Finder. + */ + } + } + + /* + * Setup for the next item's attribute list + */ + *((u_int32_t *)attrptr) = 0; /* byte count slot */ + attrptr = ((u_int32_t *)attrptr) + 1; /* fixed attr start */ + attrinfo.ai_attrbufpp = &attrptr; + attrinfo.ai_varbufpp = &varptr; + + /* + * Pack entries into attribute buffer. + */ + if (alp->commonattr) { + commonattrpack(&attrinfo, zfsvfs, tmp_zp, zap.za_name, + objnum, vtype, user64); + } + if (alp->dirattr && vtype == VDIR) { + dirattrpack(&attrinfo, tmp_zp); + } + if (alp->fileattr && vtype != VDIR) { + fileattrpack(&attrinfo, zfsvfs, tmp_zp); + } + /* All done with tmp znode. */ + if (prefetch && tmp_zp) { + vnode_put(ZTOV(tmp_zp)); + tmp_zp = NULL; + } + attrbufsize = ((char *)varptr - (char *)attrbufptr); + + /* + * Make sure there's enough buffer space remaining. + */ + if (uio_resid(uio) < 0 || + attrbufsize > (u_int32_t)uio_resid(uio)) { + break; + } else { + *((u_int32_t *)attrbufptr) = attrbufsize; + error = uiomove((caddr_t)attrbufptr, attrbufsize, + UIO_READ, uio); + if (error != 0) + break; + attrptr = attrbufptr; + /* Point to variable-length storage */ + varptr = (char *)attrbufptr + fixedsize; + *(ap->a_actualcount) += 1; + + /* + * Move to the next entry, fill in the previous offset. 
+ */ + skip_entry: + if ((offset > 2) || ((offset == 2) && + !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + /* Termination checks */ + if (--maxcount <= 0 || uio_resid(uio) < 0 || + (u_int32_t)uio_resid(uio) < (fixedsize + + ZAP_AVENAMELEN)) { + break; + } + } + } +update: + zap_cursor_fini(&zc); + + if (attrbufptr) { + kmem_free(attrbufptr, maxsize); + } + if (error == ENOENT) { + error = 0; + } + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + /* XXX newstate TBD */ + *ap->a_newstate = zp->z_atime[0] + zp->z_atime[1]; + uio_setoffset(uio, offset); + + ZFS_EXIT(zfsvfs); + dprintf("-readdirattr: error %d\n", error); + return (error); +} +#endif + + +#ifdef WITH_SEARCHFS +int +zfs_vnop_searchfs(struct vnop_searchfs_args *ap) +#if 0 + struct vnop_searchfs_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + void *a_searchparams1; + void *a_searchparams2; + struct attrlist *a_searchattrs; + ulong_t a_maxmatches; + struct timeval *a_timelimit; + struct attrlist *a_returnattrs; + ulong_t *a_nummatches; + ulong_t a_scriptcode; + ulong_t a_options; + struct uio *a_uio; + struct searchstate *a_searchstate; + vfs_context_t a_context; + }; +#endif +{ + printf("vnop_searchfs called, type %d\n", vnode_vtype(ap->a_vp)); + + *(ap->a_nummatches) = 0; + + return (ENOTSUP); +} +#endif + + + +/* + * Predeclare these here so that the compiler assumes that this is an "old + * style" function declaration that does not include arguments so that we won't + * get type mismatch errors in the initializations that follow. + */ +static int zfs_inval(void); +static int zfs_isdir(void); + +static int +zfs_inval() +{ + dprintf("ZFS: Bad vnop: returning EINVAL\n"); + return (EINVAL); +} + +static int +zfs_isdir() +{ + dprintf("ZFS: Bad vnop: returning EISDIR\n"); + return (EISDIR); +} + + +#define VOPFUNC int (*)(void *) + +/* Directory vnode operations template */ +int (**zfs_dvnodeops) (void *); +struct vnodeopv_entry_desc zfs_dvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_lookup_desc, (VOPFUNC)zfs_vnop_lookup}, + {&vnop_create_desc, (VOPFUNC)zfs_vnop_create}, + {&vnop_whiteout_desc, (VOPFUNC)zfs_vnop_whiteout}, + {&vnop_mknod_desc, (VOPFUNC)zfs_vnop_mknod}, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_read_desc, (VOPFUNC)zfs_isdir}, + {&vnop_write_desc, (VOPFUNC)zfs_isdir}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_select_desc, (VOPFUNC)zfs_isdir}, + {&vnop_bwrite_desc, (VOPFUNC)zfs_isdir}, + {&vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync}, + {&vnop_remove_desc, (VOPFUNC)zfs_vnop_remove}, + {&vnop_link_desc, (VOPFUNC)zfs_vnop_link}, + {&vnop_rename_desc, (VOPFUNC)zfs_vnop_rename}, +#if defined(MAC_OS_X_VERSION_10_12) && \ + (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) + {&vnop_renamex_desc, (VOPFUNC)zfs_vnop_renamex}, +#endif + {&vnop_mkdir_desc, (VOPFUNC)zfs_vnop_mkdir}, + {&vnop_rmdir_desc, (VOPFUNC)zfs_vnop_rmdir}, + {&vnop_symlink_desc, (VOPFUNC)zfs_vnop_symlink}, + {&vnop_readdir_desc, (VOPFUNC)zfs_vnop_readdir}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {&vnop_revoke_desc, (VOPFUNC)zfs_vnop_revoke}, + {&vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + 
{&vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + {&vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + {&vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, +#ifdef WITH_READDIRATTR + {&vnop_readdirattr_desc, (VOPFUNC)zfs_vnop_readdirattr}, +#endif +#ifdef WITH_SEARCHFS + {&vnop_searchfs_desc, (VOPFUNC)zfs_vnop_searchfs}, +#endif + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_dvnodeop_opv_desc = +{ &zfs_dvnodeops, zfs_dvnodeops_template }; + +/* Regular file vnode operations template */ +int (**zfs_fvnodeops) (void *); +struct vnodeopv_entry_desc zfs_fvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_whiteout_desc, (VOPFUNC)zfs_vnop_whiteout}, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_read_desc, (VOPFUNC)zfs_vnop_read}, + {&vnop_write_desc, (VOPFUNC)zfs_vnop_write}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_select_desc, (VOPFUNC)zfs_vnop_select}, + {&vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {&vnop_bwrite_desc, (VOPFUNC)zfs_inval}, + {&vnop_pagein_desc, (VOPFUNC)zfs_vnop_pagein}, +#if HAVE_PAGEOUT_V2 + {&vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageoutv2}, +#else + {&vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageout}, +#endif + {&vnop_mmap_desc, (VOPFUNC)zfs_vnop_mmap}, + {&vnop_mnomap_desc, (VOPFUNC)zfs_vnop_mnomap}, + {&vnop_blktooff_desc, (VOPFUNC)zfs_vnop_blktooff}, + {&vnop_offtoblk_desc, (VOPFUNC)zfs_vnop_offtoblk}, + {&vnop_blockmap_desc, (VOPFUNC)zfs_vnop_blockmap}, + {&vnop_strategy_desc, (VOPFUNC)zfs_vnop_strategy}, + {&vnop_allocate_desc, (VOPFUNC)zfs_vnop_allocate}, + {&vnop_revoke_desc, (VOPFUNC)zfs_vnop_revoke}, + {&vnop_exchange_desc, (VOPFUNC)zfs_vnop_exchange}, + {&vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + {&vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + {&vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + {&vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, +#ifdef HAVE_NAMED_STREAMS + {&vnop_getnamedstream_desc, (VOPFUNC)zfs_vnop_getnamedstream}, + {&vnop_makenamedstream_desc, (VOPFUNC)zfs_vnop_makenamedstream}, + {&vnop_removenamedstream_desc, (VOPFUNC)zfs_vnop_removenamedstream}, +#endif +#ifdef WITH_SEARCHFS + {&vnop_searchfs_desc, (VOPFUNC)zfs_vnop_searchfs}, +#endif + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_fvnodeop_opv_desc = +{ &zfs_fvnodeops, zfs_fvnodeops_template }; + +/* Symbolic link vnode operations template */ +int (**zfs_symvnodeops) (void *); +struct vnodeopv_entry_desc zfs_symvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_readlink_desc, (VOPFUNC)zfs_vnop_readlink}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {&vnop_revoke_desc, (VOPFUNC)zfs_vnop_revoke}, + {&vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + {&vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + 
{&vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + {&vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_symvnodeop_opv_desc = +{ &zfs_symvnodeops, zfs_symvnodeops_template }; + +/* Extended attribtue directory vnode operations template */ +int (**zfs_xdvnodeops) (void *); +struct vnodeopv_entry_desc zfs_xdvnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_lookup_desc, (VOPFUNC)zfs_vnop_lookup}, + {&vnop_create_desc, (VOPFUNC)zfs_vnop_create}, + {&vnop_whiteout_desc, (VOPFUNC)zfs_vnop_whiteout}, + {&vnop_mknod_desc, (VOPFUNC)zfs_inval}, + {&vnop_open_desc, (VOPFUNC)zfs_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfs_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfs_vnop_access}, + {&vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr}, + {&vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr}, + {&vnop_read_desc, (VOPFUNC)zfs_vnop_read}, + {&vnop_write_desc, (VOPFUNC)zfs_vnop_write}, + {&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_select_desc, (VOPFUNC)zfs_vnop_select}, + {&vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync}, + {&vnop_remove_desc, (VOPFUNC)zfs_vnop_remove}, + {&vnop_link_desc, (VOPFUNC)zfs_vnop_link}, + {&vnop_rename_desc, (VOPFUNC)zfs_vnop_rename}, + {&vnop_mkdir_desc, (VOPFUNC)zfs_inval}, + {&vnop_rmdir_desc, (VOPFUNC)zfs_vnop_rmdir}, + {&vnop_symlink_desc, (VOPFUNC)zfs_inval}, + {&vnop_readdir_desc, (VOPFUNC)zfs_vnop_readdir}, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_xdvnodeop_opv_desc = +{ &zfs_xdvnodeops, zfs_xdvnodeops_template }; + +/* Error vnode operations template */ +int (**zfs_evnodeops) (void *); +struct vnodeopv_entry_desc zfs_evnodeops_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim}, + {&vnop_pathconf_desc, (VOPFUNC)zfs_vnop_pathconf}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_evnodeop_opv_desc = +{ &zfs_evnodeops, zfs_evnodeops_template }; + +int (**zfs_fifonodeops)(void *); +struct vnodeopv_entry_desc zfs_fifonodeops_template[] = { + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, + { &vnop_create_desc, (VOPFUNC)fifo_create }, + { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, + { &vnop_open_desc, (VOPFUNC)fifo_open }, + { &vnop_close_desc, (VOPFUNC)fifo_close }, + { &vnop_getattr_desc, (VOPFUNC)zfs_vnop_getattr }, + { &vnop_setattr_desc, (VOPFUNC)zfs_vnop_setattr }, + { &vnop_read_desc, (VOPFUNC)fifo_read }, + { &vnop_write_desc, (VOPFUNC)fifo_write }, + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, + { &vnop_select_desc, (VOPFUNC)fifo_select }, + { &vnop_revoke_desc, (VOPFUNC)fifo_revoke }, + { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, + { &vnop_fsync_desc, (VOPFUNC)zfs_vnop_fsync }, + { &vnop_remove_desc, (VOPFUNC)fifo_remove }, + { &vnop_link_desc, (VOPFUNC)fifo_link }, + { &vnop_rename_desc, (VOPFUNC)fifo_rename }, + { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, + { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, + { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, + { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, + { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, + { &vnop_inactive_desc, (VOPFUNC)zfs_vnop_inactive }, + { &vnop_reclaim_desc, (VOPFUNC)zfs_vnop_reclaim }, + { &vnop_strategy_desc, (VOPFUNC)fifo_strategy }, + { &vnop_pathconf_desc, 
(VOPFUNC)fifo_pathconf }, + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, + { &vnop_bwrite_desc, (VOPFUNC)zfs_inval }, + { &vnop_pagein_desc, (VOPFUNC)zfs_vnop_pagein }, +#if HAVE_PAGEOUT_V2 + { &vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageoutv2 }, +#else + { &vnop_pageout_desc, (VOPFUNC)zfs_vnop_pageout }, +#endif + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, + { &vnop_blktooff_desc, (VOPFUNC)zfs_vnop_blktooff }, + { &vnop_offtoblk_desc, (VOPFUNC)zfs_vnop_offtoblk }, + { &vnop_blockmap_desc, (VOPFUNC)zfs_vnop_blockmap }, + { &vnop_getxattr_desc, (VOPFUNC)zfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)zfs_vnop_setxattr}, + { &vnop_removexattr_desc, (VOPFUNC)zfs_vnop_removexattr}, + { &vnop_listxattr_desc, (VOPFUNC)zfs_vnop_listxattr}, + { (struct vnodeop_desc *)NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_fifonodeop_opv_desc = + { &zfs_fifonodeops, zfs_fifonodeops_template }; + + +/* + * .zfs/snapdir vnops + */ +int (**zfs_ctldirops) (void *); +struct vnodeopv_entry_desc zfs_ctldir_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_lookup_desc, (VOPFUNC)zfsctl_vnop_lookup}, + {&vnop_getattr_desc, (VOPFUNC)zfsctl_vnop_getattr}, + {&vnop_readdir_desc, (VOPFUNC)zfsctl_vnop_readdir}, + {&vnop_mkdir_desc, (VOPFUNC)zfsctl_vnop_mkdir}, + {&vnop_rmdir_desc, (VOPFUNC)zfsctl_vnop_rmdir}, + /* We also need to define these for the top ones to work */ + {&vnop_open_desc, (VOPFUNC)zfsctl_vnop_open}, + {&vnop_close_desc, (VOPFUNC)zfsctl_vnop_close}, + {&vnop_access_desc, (VOPFUNC)zfsctl_vnop_access}, + {&vnop_inactive_desc, (VOPFUNC)zfsctl_vnop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_vnop_reclaim}, + {&vnop_revoke_desc, (VOPFUNC)err_revoke}, + {&vnop_fsync_desc, (VOPFUNC)nop_fsync}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfs_ctldir_opv_desc = +{ &zfs_ctldirops, zfs_ctldir_template }; + +/* + * Get new vnode for znode. + * + * This function uses zp->z_zfsvfs, zp->z_mode, zp->z_flags, zp->z_id and sets + * zp->z_vnode and zp->z_vid. 
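
The seven per-type operation tables defined above (directories, regular files, symlinks, extended-attribute directories, error vnodes, fifos, and the .zfs control directory) are handed to XNU as an array of vnodeopv_desc pointers. The sketch below shows one plausible way the zfs_vnodeop_opv_desc_list / ZFS_VNOP_TBL_CNT names referenced by zfs_vfsops_init() further down could be assembled; the actual list is not in this hunk, so treat the contents as an assumption.

/*
 * Illustrative sketch only (not taken from this patch): gather the
 * vnodeopv_desc tables for vfs_fsadd().  Registration fills in the
 * zfs_dvnodeops/zfs_fvnodeops/... vectors that zfs_znode_getvnode()
 * later installs through vnfs_vops.
 */
#define ZFS_VNOP_TBL_CNT	7

struct vnodeopv_desc *zfs_vnodeop_opv_desc_list[ZFS_VNOP_TBL_CNT] = {
	&zfs_dvnodeop_opv_desc,		/* directories */
	&zfs_fvnodeop_opv_desc,		/* regular files, VSOCK/VBLK/VCHR */
	&zfs_symvnodeop_opv_desc,	/* symlinks */
	&zfs_xdvnodeop_opv_desc,	/* extended attribute directories */
	&zfs_evnodeop_opv_desc,		/* error vnodes */
	&zfs_fifonodeop_opv_desc,	/* fifos */
	&zfs_ctldir_opv_desc,		/* .zfs snapdir control directory */
};
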
+ */ +int +zfs_znode_getvnode(znode_t *zp, zfsvfs_t *zfsvfs) +{ + struct vnode_fsparam vfsp; + struct vnode *vp = NULL; + + dprintf("getvnode zp %p with vp %p zfsvfs %p vfs %p\n", zp, vp, + zfsvfs, zfsvfs->z_vfs); + + if (zp->z_vnode) + panic("zp %p vnode already set\n", zp->z_vnode); + + bzero(&vfsp, sizeof (vfsp)); + vfsp.vnfs_str = "zfs"; + vfsp.vnfs_mp = zfsvfs->z_vfs; + vfsp.vnfs_vtype = IFTOVT((mode_t)zp->z_mode); + vfsp.vnfs_fsnode = zp; + vfsp.vnfs_flags = VNFS_ADDFSREF; + + /* Tag root directory */ + if (zp->z_id == zfsvfs->z_root) { + vfsp.vnfs_markroot = 1; + } + + switch (vfsp.vnfs_vtype) { + case VDIR: + if (zp->z_pflags & ZFS_XATTR) { + vfsp.vnfs_vops = zfs_xdvnodeops; + } else { + vfsp.vnfs_vops = zfs_dvnodeops; + } + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VBLK: + case VCHR: + { + uint64_t rdev; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), + &rdev, sizeof (rdev)) == 0); + + vfsp.vnfs_rdev = zfs_cmpldev(rdev); + } + /* FALLTHROUGH */ + case VSOCK: + vfsp.vnfs_vops = zfs_fvnodeops; + break; + case VFIFO: + vfsp.vnfs_vops = zfs_fifonodeops; + break; + case VREG: + vfsp.vnfs_vops = zfs_fvnodeops; + vfsp.vnfs_filesize = zp->z_size; + break; + case VLNK: + vfsp.vnfs_vops = zfs_symvnodeops; +#if 0 + vfsp.vnfs_filesize = ???; +#endif + break; + default: + vfsp.vnfs_vops = zfs_fvnodeops; + printf("ZFS: Warning, error-vnops selected: vtype %d\n", + vfsp.vnfs_vtype); + break; + } + + /* + * vnode_create() has a habit of calling both vnop_reclaim() and + * vnop_fsync(), which can create havok as we are already holding locks. + */ + + while (vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp) != 0) { + kpreempt(KPREEMPT_SYNC); + } + atomic_inc_64(&vnop_num_vnodes); + + printf("Assigned zp %p with vp %p zfsvfs %p\n", zp, vp, zp->z_zfsvfs); + + /* + * Unfortunately, when it comes to IOCTL_GET_BOOT_INFO and getting + * the volume finderinfo, XNU checks the tags, and only acts on + * HFS. So we have to set it to HFS on the root. It is pretty gross + * but until XNU adds supporting code.. + * The only place we use tags in ZFS is ctldir checking for VT_OTHER + */ + if (zp->z_id == zfsvfs->z_root) + vnode_settag(vp, VT_HFS); + else + vnode_settag(vp, VT_ZFS); + + zp->z_vid = vnode_vid(vp); + zp->z_vnode = vp; + + /* + * OS X Finder is hardlink agnostic, so we need to mark vp's that + * are hardlinks, so that it forces a lookup each time, ignoring + * the name cache. + */ + if ((zp->z_links > 1) && (IFTOVT((mode_t)zp->z_mode) == VREG)) + vnode_setmultipath(vp); + + return (0); +} + + +/* + * Called by taskq, to call zfs_znode_getvnode( vnode_create( - and + * attach vnode to znode. + */ +void +zfs_znode_asyncgetvnode_impl(void *arg) +{ + znode_t *zp = (znode_t *)arg; + VERIFY3P(zp, !=, NULL); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + VERIFY3P(zfsvfs, !=, NULL); + + // Attach vnode, done as different thread + zfs_znode_getvnode(zp, zfsvfs); + + // Wake up anyone blocked on us + mutex_enter(&zp->z_attach_lock); + taskq_init_ent(&zp->z_attach_taskq); + cv_broadcast(&zp->z_attach_cv); + mutex_exit(&zp->z_attach_lock); + +} + + +/* + * If the znode's vnode is not yet attached (zp->z_vnode == NULL) + * we call taskq_wait to wait for it to complete. + * We guarantee znode has a vnode at the return of function only + * when return is "0". On failure to wait, it returns -1, and caller + * may consider waiting by other means. 
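
A minimal caller-side sketch of the asynchronous attach protocol described below, assuming the zget_ext()/ZGET_FLAG_ASYNC entry point named in the comments (the exact zfs_zget_ext() signature is an assumption): holds obtained with ZGET_FLAG_ASYNC must be released with zfs_znode_asyncput(), which waits for the taskq attach to finish before dropping the iocount.

/*
 * Illustration only -- not part of this patch.  zfs_zget_ext() and its
 * argument order are assumed from the comments in this file.
 */
static int
example_async_hold(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_t *zp = NULL;
	int error;

	error = zfs_zget_ext(zfsvfs, obj, &zp, ZGET_FLAG_ASYNC);
	if (error != 0)
		return (error);

	/* ... work that does not need zp->z_vnode yet ... */

	zfs_znode_asyncput(zp);		/* not VN_RELE(ZTOV(zp)) */
	return (0);
}
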
+ */
+int
+zfs_znode_asyncwait(znode_t *zp)
+{
+	int ret = -1;
+	zfsvfs_t *zfsvfs;
+
+	if (zp == NULL)
+		return (ret);
+
+	zfsvfs = zp->z_zfsvfs;
+	if (zfsvfs == NULL)
+		return (ret);
+
+	ZFS_ENTER_IFERROR(zfsvfs)
+		goto out;
+
+	if (zfsvfs->z_os == NULL)
+		goto out;
+
+	// Work out if we need to block, that is, we have
+	// no vnode AND a taskq was launched. Unsure if we should
+	// look inside taskqent node like this.
+	mutex_enter(&zp->z_attach_lock);
+	if (zp->z_vnode == NULL &&
+	    zp->z_attach_taskq.tqent_func != NULL) {
+		// We need to block and wait for taskq to finish.
+		cv_wait(&zp->z_attach_cv, &zp->z_attach_lock);
+		ret = 0;
+	}
+	mutex_exit(&zp->z_attach_lock);
+
+out:
+	ZFS_EXIT(zfsvfs);
+	return (ret);
+}
+
+/*
+ * Called in place of VN_RELE() for the places that use ZGET_FLAG_ASYNC.
+ */
+void
+zfs_znode_asyncput_impl(znode_t *zp)
+{
+	// Make sure the other thread finished zfs_znode_getvnode();
+	// This may block, if waiting is required.
+	zfs_znode_asyncwait(zp);
+
+	// Safe to release now that it is attached.
+	VN_RELE(ZTOV(zp));
+}
+
+/*
+ * Called in place of VN_RELE() for the places that use ZGET_FLAG_ASYNC,
+ * where we also taskq it - as we come from reclaim.
+ */
+void
+zfs_znode_asyncput(znode_t *zp)
+{
+	dsl_pool_t *dp = dmu_objset_pool(zp->z_zfsvfs->z_os);
+	taskq_t *tq = dsl_pool_zrele_taskq(dp);
+
+	VERIFY3P(tq, !=, NULL);
+
+	VERIFY(taskq_dispatch(
+	    (taskq_t *)tq,
+	    (task_func_t *)zfs_znode_asyncput_impl, zp, TQ_SLEEP) != 0);
+}
+
+/*
+ * Attach a new vnode to the znode asynchronously. We do this using
+ * a taskq to call it, and then wait to release the iocount.
+ * Callers of zget_ext(..., ZGET_FLAG_ASYNC) will use
+ * zfs_znode_asyncput(zp) instead of VN_RELE(vp).
+ */
+int
+zfs_znode_asyncgetvnode(znode_t *zp, zfsvfs_t *zfsvfs)
+{
+	VERIFY(zp != NULL);
+	VERIFY(zfsvfs != NULL);
+
+	// We should not have a vnode here.
+	VERIFY3P(ZTOV(zp), ==, NULL);
+
+	dsl_pool_t *dp = dmu_objset_pool(zfsvfs->z_os);
+	taskq_t *tq = dsl_pool_zrele_taskq(dp);
+	VERIFY3P(tq, !=, NULL);
+
+	mutex_enter(&zp->z_attach_lock);
+	taskq_dispatch_ent(tq,
+	    (task_func_t *)zfs_znode_asyncgetvnode_impl,
+	    zp,
+	    TQ_SLEEP,
+	    &zp->z_attach_taskq);
+	mutex_exit(&zp->z_attach_lock);
+	return (0);
+}
+
+
+
+/*
+ * Maybe these should live in vfsops
+ */
+int
+zfs_vfsops_init(void)
+{
+	struct vfs_fsentry vfe;
+
+	/* Start thread to notify Finder of changes */
+	zfs_start_notify_thread();
+
+	vfe.vfe_vfsops = &zfs_vfsops_template;
+	vfe.vfe_vopcnt = ZFS_VNOP_TBL_CNT;
+	vfe.vfe_opvdescs = zfs_vnodeop_opv_desc_list;
+
+	strlcpy(vfe.vfe_fsname, "zfs", MFSNAMELEN);
+
+	/*
+	 * Note: must set VFS_TBLGENERICMNTARGS with VFS_TBLLOCALVOL
+	 * to suppress local mount argument handling.
+ */ + vfe.vfe_flags = VFS_TBLTHREADSAFE | VFS_TBLNOTYPENUM | VFS_TBLLOCALVOL | + VFS_TBL64BITREADY | VFS_TBLNATIVEXATTR | VFS_TBLGENERICMNTARGS | + VFS_TBLREADDIR_EXTENDED; + +#if HAVE_PAGEOUT_V2 + vfe.vfe_flags |= VFS_TBLVNOP_PAGEOUTV2; +#endif + +#ifdef VFS_TBLCANMOUNTROOT // From 10.12 + vfe.vfe_flags |= VFS_TBLCANMOUNTROOT; +#endif + + vfe.vfe_reserv[0] = 0; + vfe.vfe_reserv[1] = 0; + + if (vfs_fsadd(&vfe, &zfs_vfsconf) != 0) + return (KERN_FAILURE); + else + return (KERN_SUCCESS); +} + +int +zfs_vfsops_fini(void) +{ + + zfs_stop_notify_thread(); + + return (vfs_fsremove(zfs_vfsconf)); +} diff --git a/module/os/macos/zfs/zfs_vnops_osx_lib.c b/module/os/macos/zfs/zfs_vnops_osx_lib.c new file mode 100644 index 0000000000..7504d71013 --- /dev/null +++ b/module/os/macos/zfs/zfs_vnops_osx_lib.c @@ -0,0 +1,2232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013 Will Andrews + * Copyright (c) 2013, 2020 Jorgen Lundman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +extern int zfs_vnop_force_formd_normalized_output; /* disabled by default */ + +/* + * Unfortunately Apple defines "KAUTH_VNODE_ACCESS (1<<31)" which + * generates: "warning: signed shift result (0x80000000) sets the + * sign bit of the shift expression's type ('int') and becomes negative." + * So until they fix their define, we override it here. 
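
The sign-extension problem described in the comment above can be shown in isolation with a few lines of user-space C; this is purely an illustration, not part of the patch, and the fix that follows simply redefines the constant as an unsigned 64-bit value.

/*
 * Standalone illustration (not part of this patch): (1<<31) is evaluated
 * as a signed int, so storing it in a 64-bit rights mask sign-extends
 * into the upper 32 bits; 1ULL<<31 yields the intended single bit.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t as_int = (1 << 31);	/* 0xffffffff80000000 after sign-extension */
	uint64_t as_ull = (1ULL << 31);	/* 0x0000000080000000 */

	printf("%#llx vs %#llx\n",
	    (unsigned long long)as_int, (unsigned long long)as_ull);
	return (0);
}
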
+ */ + +#if KAUTH_VNODE_ACCESS == 0x80000000 +#undef KAUTH_VNODE_ACCESS +#define KAUTH_VNODE_ACCESS (1ULL<<31) +#endif + + + +int zfs_hardlink_addmap(znode_t *zp, uint64_t parentid, uint32_t linkid); + +/* Originally from illumos:uts/common/sys/vfs.h */ +typedef uint64_t vfs_feature_t; +#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */ +#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */ +#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */ +#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */ +#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */ +#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */ +#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */ +#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */ +#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */ +#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 /* Supports loaning buffers */ + +#define ZFS_SUPPORTED_VATTRS \ + (VNODE_ATTR_va_mode | \ + VNODE_ATTR_va_uid | \ + VNODE_ATTR_va_gid | \ + VNODE_ATTR_va_fsid | \ + VNODE_ATTR_va_fileid | \ + VNODE_ATTR_va_nlink | \ + VNODE_ATTR_va_data_size | \ + VNODE_ATTR_va_total_size | \ + VNODE_ATTR_va_rdev | \ + VNODE_ATTR_va_gen | \ + VNODE_ATTR_va_create_time | \ + VNODE_ATTR_va_access_time | \ + VNODE_ATTR_va_modify_time | \ + VNODE_ATTR_va_change_time | \ + VNODE_ATTR_va_backup_time | \ + VNODE_ATTR_va_flags | \ + VNODE_ATTR_va_parentid | \ + VNODE_ATTR_va_iosize | \ + VNODE_ATTR_va_filerev | \ + VNODE_ATTR_va_type | \ + VNODE_ATTR_va_encoding | \ + 0) + +// VNODE_ATTR_va_uuuid | +// VNODE_ATTR_va_guuid | + + + + + + + + +/* + * fnv_32a_str - perform a 32 bit Fowler/Noll/Vo FNV-1a hash on a string + * + * input: + * str - string to hash + * hval - previous hash value or 0 if first call + * + * returns: + * 32 bit hash as a static hash type + * + * NOTE: To use the recommended 32 bit FNV-1a hash, use FNV1_32A_INIT as the + * hval arg on the first call to either fnv_32a_buf() or fnv_32a_str(). + */ +uint32_t +fnv_32a_str(const char *str, uint32_t hval) +{ + unsigned char *s = (unsigned char *)str; /* unsigned string */ + + /* + * FNV-1a hash each octet in the buffer + */ + while (*s) { + + /* xor the bottom with the current octet */ + hval ^= (uint32_t)*s++; + + /* multiply by the 32 bit FNV magic prime mod 2^32 */ +#if defined(NO_FNV_GCC_OPTIMIZATION) + hval *= FNV_32_PRIME; +#else + hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + + (hval<<24); +#endif + } + + /* return our new hash value */ + return (hval); +} + +/* + * fnv_32a_buf - perform a 32 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * + * input: + * buf- start of buffer to hash + * len- length of buffer in octets + * hval- previous hash value or 0 if first call + * + * returns: + * 32 bit hash as a static hash type + * + * NOTE: To use the recommended 32 bit FNV-1a hash, use FNV1_32A_INIT as the + * hval arg on the first call to either fnv_32a_buf() or fnv_32a_str(). 
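
The #else branch of the hash loops here replaces the multiply by FNV_32_PRIME with shifts and adds: the prime 0x01000193 is 2^24 + 2^8 + 2^7 + 2^4 + 2^1 + 2^0, so the two forms are identical modulo 2^32. A small self-check, assuming the standard FNV-1a constants (offset basis 0x811c9dc5, prime 16777619); illustration only, not part of the patch.

/* Verify the shift/add form equals the multiply form for each octet. */
#include <stdint.h>
#include <assert.h>

int
main(void)
{
	uint32_t h = 0x811c9dc5;		/* FNV1_32A_INIT */
	const char *s = "example";

	while (*s) {
		uint32_t a = h, b = h;

		a ^= (uint32_t)*s;
		b ^= (uint32_t)*s++;
		a *= 16777619U;			/* FNV_32_PRIME */
		b += (b<<1) + (b<<4) + (b<<7) + (b<<8) + (b<<24);
		assert(a == b);
		h = a;
	}
	return (0);
}
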
+ */ +uint32_t +fnv_32a_buf(void *buf, size_t len, uint32_t hval) +{ + unsigned char *bp = (unsigned char *)buf; /* start of buffer */ + unsigned char *be = bp + len; /* beyond end of buffer */ + + /* + * FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (uint32_t)*bp++; + + /* multiply by the 32 bit FNV magic prime mod 2^32 */ +#if defined(NO_FNV_GCC_OPTIMIZATION) + hval *= FNV_32_PRIME; +#else + hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + + (hval<<24); +#endif + } + + /* return our new hash value */ + return (hval); +} + +int +zfs_getattr_znode_unlocked(struct vnode *vp, vattr_t *vap) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint64_t parent; + sa_bulk_attr_t bulk[4]; + int count = 0; +#ifdef VNODE_ATTR_va_addedtime + uint64_t addtime[2] = { 0 }; +#endif + int ishardlink = 0; + + // printf("getattr_osx\n"); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (VATTR_IS_ACTIVE(vap, va_acl)) { + // printf("want acl\n"); + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + + // dprintf("Calling getacl\n"); + if ((error = zfs_getacl(zp, &vap->va_acl, B_FALSE, NULL))) { + // dprintf("zfs_getacl returned error %d\n", error); + error = 0; + } else { + + VATTR_SET_SUPPORTED(vap, va_acl); + /* va_acl implies va_uuuid & va_guuid are supported. */ + VATTR_RETURN(vap, va_uuuid, kauth_null_guid); + VATTR_RETURN(vap, va_guuid, kauth_null_guid); + } + + } + + mutex_enter(&zp->z_lock); + + ishardlink = ((zp->z_links > 1) && + (IFTOVT((mode_t)zp->z_mode) == VREG)) ? 1 : 0; + if (zp->z_finder_hardlink == TRUE) + ishardlink = 1; + else if (ishardlink) + zp->z_finder_hardlink = TRUE; + + /* Work out which SA we need to fetch */ + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Unfortunately, sa_bulk_lookup does not let you handle optional + * SA entries - so have to look up the optionals individually. 
+ */ + error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count); + if (error) { + dprintf("ZFS: Warning: getattr failed sa_bulk_lookup: %d, " + "parent %llu, flags %llu\n", error, parent, zp->z_pflags); + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef VNODE_ATTR_va_addedtime + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + sa_lookup(zp->z_sa_hdl, SA_ZPL_ADDTIME(zfsvfs), + &addtime, sizeof (addtime)); + } +#endif + + /* + * On Mac OS X we always export the root directory id as 2 + */ + vap->va_fileid = INO_ZFSTOXNU(zp->z_id, zfsvfs->z_root); + + vap->va_data_size = zp->z_size; + vap->va_total_size = zp->z_size; + // vap->va_gen = zp->z_gen; + vap->va_gen = 0; +#if defined(DEBUG) || defined(ZFS_DEBUG) + if (zp->z_gen != 0) + dprintf("%s: va_gen %lld -> 0\n", __func__, zp->z_gen); +#endif + + if (vnode_isdir(vp)) { + vap->va_nlink = zp->z_size; + } else { + vap->va_nlink = zp->z_links; + } + + + /* + * Carbon compatibility, pretend to support this legacy attribute + */ + if (VATTR_IS_ACTIVE(vap, va_backup_time)) { + vap->va_backup_time.tv_sec = 0; + vap->va_backup_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_backup_time); + } + vap->va_flags = zfs_getbsdflags(zp); + /* + * On Mac OS X we always export the root directory id as 2 + * and its parent as 1 + */ + if (zp->z_id == zfsvfs->z_root) + vap->va_parentid = 1; + else if (parent == zfsvfs->z_root) + vap->va_parentid = 2; + else + vap->va_parentid = parent; + + // Hardlinks: Return cached parentid, make it 2 if root. + if (ishardlink && zp->z_finder_parentid) + vap->va_parentid = (zp->z_finder_parentid == zfsvfs->z_root) ? + 2 : zp->z_finder_parentid; + + vap->va_iosize = zp->z_blksz ? zp->z_blksz : zfsvfs->z_max_blksz; + // vap->va_iosize = 512; + VATTR_SET_SUPPORTED(vap, va_iosize); + + /* Don't include '.' and '..' in the number of entries */ + if (VATTR_IS_ACTIVE(vap, va_nchildren) && vnode_isdir(vp)) { + VATTR_RETURN(vap, va_nchildren, vap->va_nlink - 2); + } + + /* + * va_dirlinkcount is the count of directory hard links. When a file + * system does not support ATTR_DIR_LINKCOUNT, xnu will default to 1. + * Since we claim to support ATTR_DIR_LINKCOUNT both as valid and as + * native, we'll just return 1. We set 1 for this value in dirattrpack + * as well. If in the future ZFS actually supports directory hard links, + * we can return a real value. + */ + if (VATTR_IS_ACTIVE(vap, va_dirlinkcount) && vnode_isdir(vp)) { + VATTR_RETURN(vap, va_dirlinkcount, 1); + } + + + if (VATTR_IS_ACTIVE(vap, va_data_alloc) || + VATTR_IS_ACTIVE(vap, va_total_alloc)) { + uint32_t blksize; + u_longlong_t nblks; + sa_object_size(zp->z_sa_hdl, &blksize, &nblks); + vap->va_data_alloc = (uint64_t)512LL * (uint64_t)nblks; + vap->va_total_alloc = vap->va_data_alloc; + vap->va_supported |= VNODE_ATTR_va_data_alloc | + VNODE_ATTR_va_total_alloc; + } + + if (VATTR_IS_ACTIVE(vap, va_name)) { + vap->va_name[0] = 0; + + if (!vnode_isvroot(vp)) { + + /* + * Finder (Carbon) relies on getattr returning the + * correct name for hardlinks to work, so we store the + * lookup name in vnop_lookup if file references are + * high, then set the return name here. + * If we also want ATTR_CMN_* lookups to work, we need + * to set a unique va_linkid for each entry, and based + * on the linkid in the lookup, return the correct name. + * It is set in zfs_vnop_lookup(). + * Since zap_value_search is a slow call, we only use + * it if we have not cached the name in vnop_lookup. 
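
The root/parent id remapping applied above (the dataset root is always exported to XNU as fileid 2 and its parent as 1, matching what macOS expects from HFS-style volumes) can be summarised in a tiny helper. This is only an illustration of the rule; the hypothetical function below does not exist in the patch.

/* Illustration only: map a ZFS parent object id to the id macOS sees. */
static uint64_t
example_parentid_to_xnu(zfsvfs_t *zfsvfs, znode_t *zp, uint64_t parent)
{
	if (zp->z_id == zfsvfs->z_root)
		return (1);		/* parent of the root directory */
	if (parent == zfsvfs->z_root)
		return (2);		/* the root directory itself */
	return (parent);
}
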
+ */ + + // Cached name, from vnop_lookup + if (ishardlink && + zp->z_name_cache[0]) { + + strlcpy(vap->va_name, zp->z_name_cache, + MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + + } else if (zp->z_name_cache[0]) { + + strlcpy(vap->va_name, zp->z_name_cache, + MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + + } else { + + // Go find the name. + if (zap_value_search(zfsvfs->z_os, parent, + zp->z_id, ZFS_DIRENT_OBJ(-1ULL), + vap->va_name) == 0) { + VATTR_SET_SUPPORTED(vap, va_name); + // Might as well keep this name too. + strlcpy(zp->z_name_cache, vap->va_name, + MAXPATHLEN); + } // zap_value_search + + } + + dprintf("getattr: %p return name '%s':%04llx\n", vp, + vap->va_name, vap->va_linkid); + + + } else { + /* + * The vroot objects must return a unique name for + * Finder to be able to distringuish between mounts. + * For this reason we simply return the fullname, + * from the statfs mountedfrom + * + * dataset mountpoint + * foo /bar + * As we used to return "foo" to ATTR_CMN_NAME of + * "/bar" we change this to return "bar" as expected. + */ + char *r, *osname; + osname = vfs_statfs(zfsvfs->z_vfs)->f_mntonname; + r = strrchr(osname, '/'); + strlcpy(vap->va_name, + r ? &r[1] : osname, + MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + dprintf("getattr root returning '%s'\n", vap->va_name); + } + } + + if (VATTR_IS_ACTIVE(vap, va_linkid)) { + + /* + * Apple needs a little extra care with HARDLINKs. All hardlink + * targets return the same va_fileid (POSIX) but also return + * a unique va_linkid. This we generate by hashing the (unique) + * name and store as va_linkid. However, Finder will call + * vfs_vget() with linkid and expect to receive the correct link + * target, so we need to add it to the AVL z_hardlinks. + */ + if (ishardlink) { + hardlinks_t *searchnode, *findnode; + avl_index_t loc; + + // If we don't have a linkid, make one. + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = vap->va_parentid; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, zp->z_name_cache, + PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, + &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + kmem_free(searchnode, sizeof (hardlinks_t)); + + if (!findnode) { + static uint32_t zfs_hardlink_sequence = + 1ULL<<31; + uint32_t id; + + id = atomic_inc_32_nv(&zfs_hardlink_sequence); + + zfs_hardlink_addmap(zp, vap->va_parentid, id); + VATTR_RETURN(vap, va_linkid, id); + + } else { + VATTR_RETURN(vap, va_linkid, + findnode->hl_linkid); + } + + } else { // !ishardlink - use same as fileid + + VATTR_RETURN(vap, va_linkid, vap->va_fileid); + + } + + } // active linkid + + if (VATTR_IS_ACTIVE(vap, va_filerev)) { + VATTR_RETURN(vap, va_filerev, 0); + } + if (VATTR_IS_ACTIVE(vap, va_fsid)) { + VATTR_RETURN(vap, va_fsid, zfsvfs->z_rdev); + } + if (VATTR_IS_ACTIVE(vap, va_type)) { + VATTR_RETURN(vap, va_type, vnode_vtype(ZTOV(zp))); + } + if (VATTR_IS_ACTIVE(vap, va_encoding)) { + VATTR_RETURN(vap, va_encoding, kTextEncodingMacUnicode); + } +#ifdef VNODE_ATTR_va_addedtime + /* + * ADDEDTIME should come from finderinfo according to hfs_attrlist.c + * in ZFS we can use crtime, and add logic to getxattr finderinfo to + * copy the ADDEDTIME into the structure. 
See vnop_getxattr + */ + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + /* Lookup the ADDTIME if it exists, if not, use CRTIME */ + if ((addtime[0] == 0) && (addtime[1])) { + dprintf("ZFS: ADDEDTIME using crtime %llu (error %d)\n", + vap->va_crtime.tv_sec, error); + vap->va_addedtime.tv_sec = vap->va_crtime.tv_sec; + vap->va_addedtime.tv_nsec = vap->va_crtime.tv_nsec; + } else { + dprintf("ZFS: ADDEDTIME using addtime %llu\n", + addtime[0]); + ZFS_TIME_DECODE(&vap->va_addedtime, addtime); + } + VATTR_SET_SUPPORTED(vap, va_addedtime); + } +#endif +#ifdef VNODE_ATTR_va_fsid64 + if (VATTR_IS_ACTIVE(vap, va_fsid64)) { + vap->va_fsid64.val[0] = + vfs_statfs(zfsvfs->z_vfs)->f_fsid.val[0]; + vap->va_fsid64.val[1] = vfs_typenum(zfsvfs->z_vfs); + VATTR_SET_SUPPORTED(vap, va_fsid64); + } +#endif +#ifdef VNODE_ATTR_va_write_gencount + if (VATTR_IS_ACTIVE(vap, va_write_gencount)) { + if (!zp->z_write_gencount) + atomic_inc_64(&zp->z_write_gencount); + VATTR_RETURN(vap, va_write_gencount, + (uint32_t)zp->z_write_gencount); + } +#endif + +#ifdef VNODE_ATTR_va_document_id + if (VATTR_IS_ACTIVE(vap, va_document_id)) { + + if (!zp->z_document_id) { + zfs_setattr_generate_id(zp, parent, vap->va_name); + } + + VATTR_RETURN(vap, va_document_id, zp->z_document_id); + } +#endif /* VNODE_ATTR_va_document_id */ + + +#if 0 // Issue #192 + if (VATTR_IS_ACTIVE(vap, va_uuuid)) { + kauth_cred_uid2guid(zp->z_uid, &vap->va_uuuid); + } + if (VATTR_IS_ACTIVE(vap, va_guuid)) { + kauth_cred_gid2guid(zp->z_gid, &vap->va_guuid); + } +#endif + + if (ishardlink) { + dprintf("ZFS:getattr(%s,%llu,%llu) parent %llu: cache_parent " + "%llu: va_nlink %u\n", VATTR_IS_ACTIVE(vap, va_name) ? + vap->va_name : zp->z_name_cache, + vap->va_fileid, + VATTR_IS_ACTIVE(vap, va_linkid) ? vap->va_linkid : 0, + vap->va_parentid, + zp->z_finder_parentid, + vap->va_nlink); + } + + vap->va_supported |= ZFS_SUPPORTED_VATTRS; + uint64_t missing = 0; + missing = (vap->va_active ^ (vap->va_active & vap->va_supported)); + if (missing != 0) { + dprintf("vnop_getattr:: asked %08llx replied %08llx " + " missing %08llx\n", + vap->va_active, vap->va_supported, + missing); + } + + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + return (error); +} + +boolean_t +vfs_has_feature(vfs_t *vfsp, vfs_feature_t vfsft) +{ + + switch (vfsft) { + case VFSFT_CASEINSENSITIVE: + case VFSFT_NOCASESENSITIVE: + return (B_TRUE); + default: + return (B_FALSE); + } +} + +int +zfs_access_native_mode(struct vnode *vp, int *mode, cred_t *cr, + caller_context_t *ct) +{ + int accmode = *mode & (VREAD|VWRITE|VEXEC /* |VAPPEND */); + int error = 0; + int flag = 0; // FIXME + + if (accmode != 0) + error = zfs_access(vp, accmode, flag, cr); + + *mode &= ~(accmode); + + return (error); +} + +int +zfs_ioflags(int ap_ioflag) +{ + int flags = 0; + + if (ap_ioflag & IO_APPEND) + flags |= FAPPEND; + if (ap_ioflag & IO_NDELAY) + flags |= FNONBLOCK; + if (ap_ioflag & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +int +zfs_vnop_ioctl_fullfsync(struct vnode *vp, vfs_context_t ct, zfsvfs_t *zfsvfs) +{ + int error; + + error = zfs_fsync(VTOZ(vp), /* syncflag */ 0, NULL); + if (error) + return (error); + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + else + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + return (0); +} + +uint32_t +zfs_getbsdflags(znode_t *zp) +{ + uint32_t bsdflags = 0; + uint64_t zflags = zp->z_pflags; + + if (zflags & ZFS_NODUMP) + bsdflags |= UF_NODUMP; + if (zflags & ZFS_UIMMUTABLE) + bsdflags |= UF_IMMUTABLE; + if (zflags & 
ZFS_UAPPENDONLY) + bsdflags |= UF_APPEND; + if (zflags & ZFS_OPAQUE) + bsdflags |= UF_OPAQUE; + if (zflags & ZFS_HIDDEN) + bsdflags |= UF_HIDDEN; + if (zflags & ZFS_TRACKED) + bsdflags |= UF_TRACKED; + if (zflags & ZFS_COMPRESSED) + bsdflags |= UF_COMPRESSED; + + if (zflags & ZFS_SIMMUTABLE) + bsdflags |= SF_IMMUTABLE; + if (zflags & ZFS_SAPPENDONLY) + bsdflags |= SF_APPEND; + /* + * Due to every file getting archive set automatically, and OSX + * don't let you move/copy it as a user, we disable archive connection + * for now + * if (zflags & ZFS_ARCHIVE) + * bsdflags |= SF_ARCHIVED; + */ + dprintf("getbsd changing zfs %08lx to osx %08lx\n", + zflags, bsdflags); + return (bsdflags); +} + +void +zfs_setbsdflags(znode_t *zp, uint32_t bsdflags) +{ + uint64_t zflags; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), + &zflags, sizeof (zflags)) == 0); + + if (bsdflags & UF_NODUMP) + zflags |= ZFS_NODUMP; + else + zflags &= ~ZFS_NODUMP; + + if (bsdflags & UF_IMMUTABLE) + zflags |= ZFS_UIMMUTABLE; + else + zflags &= ~ZFS_UIMMUTABLE; + + if (bsdflags & UF_APPEND) + zflags |= ZFS_UAPPENDONLY; + else + zflags &= ~ZFS_UAPPENDONLY; + + if (bsdflags & UF_OPAQUE) + zflags |= ZFS_OPAQUE; + else + zflags &= ~ZFS_OPAQUE; + + if (bsdflags & UF_HIDDEN) + zflags |= ZFS_HIDDEN; + else + zflags &= ~ZFS_HIDDEN; + + if (bsdflags & UF_TRACKED) + zflags |= ZFS_TRACKED; + else + zflags &= ~ZFS_TRACKED; + + if (bsdflags & UF_COMPRESSED) + zflags |= ZFS_COMPRESSED; + else + zflags &= ~ZFS_COMPRESSED; + + /* + * if (bsdflags & SF_ARCHIVED) + * zflags |= ZFS_ARCHIVE; + * else + * zflags &= ~ZFS_ARCHIVE; + */ + if (bsdflags & SF_IMMUTABLE) + zflags |= ZFS_SIMMUTABLE; + else + zflags &= ~ZFS_SIMMUTABLE; + + if (bsdflags & SF_APPEND) + zflags |= ZFS_SAPPENDONLY; + else + zflags &= ~ZFS_SAPPENDONLY; + + zp->z_pflags = zflags; + dprintf("setbsd changing osx %08lx to zfs %08lx\n", + bsdflags, zflags); + + /* + * (void )sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), + * (void *)&zp->z_pflags, sizeof (uint64_t), tx); + */ +} + +/* + * Lookup/Create an extended attribute entry. + * + * Input arguments: + * dzp - znode for hidden attribute directory + * name - name of attribute + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * + * Output arguments: + * vpp - pointer to the vnode for the entry (NULL if there isn't one) + * + * Return value: 0 on success or errno value on failure. + */ +int +zpl_obtain_xattr(znode_t *dzp, const char *name, mode_t mode, cred_t *cr, + vnode_t **vpp, int flag) +{ + znode_t *xzp = NULL; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + struct vnode_attr vattr; + int error; + struct componentname cn = { 0 }; + zfs_acl_ids_t acl_ids; + + /* zfs_dirent_lock() expects a component name */ + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_type, VREG); + VATTR_SET(&vattr, va_mode, mode & ~S_IFMT); + + if ((error = zfs_acl_ids_create(dzp, 0, + &vattr, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = (char *)kmem_zalloc(cn.cn_namelen, KM_SLEEP); + +top: + /* Lock the attribute entry name. */ + if ((error = zfs_dirent_lock(&dl, dzp, (char *)name, &xzp, flag, + NULL, &cn))) { + goto out; + } + /* If the name already exists, we're done. 
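
zfs_getbsdflags() and zfs_setbsdflags() above translate between chflags(2)-style UF_ and SF_ bits and the ZFS z_pflags word. A hedged usage sketch follows; the helper name is hypothetical and not part of the patch.

/*
 * Illustration only: the two helpers above are intended to be inverses
 * for the bits they map, e.g. UF_HIDDEN <-> ZFS_HIDDEN (SF_ARCHIVED is
 * deliberately left unmapped, as noted in the comment above).
 */
static void
example_hide_in_finder(znode_t *zp)
{
	uint32_t bsd = zfs_getbsdflags(zp);

	zfs_setbsdflags(zp, bsd | UF_HIDDEN);
	ASSERT(zp->z_pflags & ZFS_HIDDEN);
}
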
*/ + if (xzp != NULL) { + zfs_dirent_unlock(dl); + goto out; + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, (char *)name); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + +#if 1 // FIXME + if (dzp->z_pflags & ZFS_INHERIT_ACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + } +#endif + zfs_sa_upgrade_txholds(tx, dzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + goto out; + } + + zfs_mknode(dzp, &vattr, tx, cr, 0, &xzp, &acl_ids); + + /* + * ASSERT(xzp->z_id == zoid); + */ + (void) zfs_link_create(dl, xzp, tx, ZNEW); + zfs_log_create(zilog, tx, TX_CREATE, dzp, xzp, (char *)name, + NULL /* vsecp */, 0 /* acl_ids.z_fuidp */, &vattr); + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(xzp, zfsvfs); + + zfs_dirent_unlock(dl); +out: + zfs_acl_ids_free(&acl_ids); + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + + /* The REPLACE error if doesn't exist is ENOATTR */ + if ((flag & ZEXISTS) && (error == ENOENT)) + error = ENOATTR; + + if (xzp) + *vpp = ZTOV(xzp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. + */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. 
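
Per the header comment of zpl_obtain_xattr() above, the flag argument selects the create semantics: ZNEW fails with EEXIST if the attribute entry already exists, while ZEXISTS fails (remapped to ENOATTR) if it does not. Two hypothetical callers, for illustration only; the real vnop_getxattr/vnop_setxattr callers are elsewhere in the port.

/* Illustration only -- these wrappers are not part of the patch. */
static int
example_open_xattr_for_read(znode_t *dzp, const char *name, cred_t *cr,
    vnode_t **vpp)
{
	/* must already exist, otherwise ENOATTR */
	return (zpl_obtain_xattr(dzp, name, 0600, cr, vpp, ZEXISTS));
}

static int
example_create_xattr(znode_t *dzp, const char *name, cred_t *cr,
    vnode_t **vpp)
{
	/* create; fail with EEXIST if it is already there */
	return (zpl_obtain_xattr(dzp, name, 0600, cr, vpp, ZNEW));
}
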
+ */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permission is never set by default + */ + if (mask & ACE_DELETE) + return (1); + + /* + * Child delete permission should be accompanied by write + */ + if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + + return (0); +} + + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + if (isdir) + write_mask |= ACE_DELETE_CHILD; + + masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + +void commonattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp, + const char *name, ino64_t objnum, enum vtype vtype, + boolean_t user64) +{ + attrgroup_t commonattr = aip->ai_attrlist->commonattr; + void *attrbufptr = *aip->ai_attrbufpp; + void *varbufptr = *aip->ai_varbufpp; + struct mount *mp = zfsvfs->z_vfs; + cred_t *cr = (cred_t *)vfs_context_ucred(aip->ai_context); + finderinfo_t finderinfo; + + /* + * We should probably combine all the sa_lookup into a bulk + * lookup operand. 
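
As a worked example of acl_trivial_access_masks() just above (illustration only; the function below is hypothetical): a non-directory with mode 0644 produces no deny entries and no owner-only allow0 entry, just the three owner/group/everyone allow masks.

/*
 * For mode 0644, isdir == B_FALSE:
 *   deny1 = deny2 = allow0 = 0
 *   owner    = base owner bits | ACE_READ_DATA | ACE_WRITE_DATA | ACE_APPEND_DATA
 *   group    = base bits       | ACE_READ_DATA
 *   everyone = base bits       | ACE_READ_DATA
 * where "base bits" are the READ_ACL/READ_ATTRIBUTES/READ_NAMED_ATTRS/
 * SYNCHRONIZE constants (plus the WRITE_ACL/OWNER/ATTRIBUTES set for owner).
 */
static void
example_trivial_masks(void)
{
	trivial_acl_t masks;

	acl_trivial_access_masks(0644, B_FALSE, &masks);
	ASSERT0(masks.deny1);
	ASSERT0(masks.deny2);
	ASSERT0(masks.allow0);
	ASSERT(masks.owner & ACE_WRITE_DATA);
	ASSERT(masks.everyone & ACE_READ_DATA);
	ASSERT(!(masks.everyone & ACE_WRITE_DATA));
}
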
+ */ + + finderinfo.fi_flags = 0; + + if (ATTR_CMN_NAME & commonattr) { + nameattrpack(aip, name, strlen(name)); + attrbufptr = *aip->ai_attrbufpp; + varbufptr = *aip->ai_varbufpp; + } + if (ATTR_CMN_DEVID & commonattr) { + *((dev_t *)attrbufptr) = vfs_statfs(mp)->f_fsid.val[0]; + attrbufptr = ((dev_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FSID & commonattr) { + *((fsid_t *)attrbufptr) = vfs_statfs(mp)->f_fsid; + attrbufptr = ((fsid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJTYPE & commonattr) { + *((fsobj_type_t *)attrbufptr) = vtype; + attrbufptr = ((fsobj_type_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJTAG & commonattr) { + *((fsobj_tag_t *)attrbufptr) = VT_ZFS; + attrbufptr = ((fsobj_tag_t *)attrbufptr) + 1; + } + /* + * Note: ATTR_CMN_OBJID is lossy (only 32 bits). + */ + if ((ATTR_CMN_OBJID | ATTR_CMN_OBJPERMANENTID) & commonattr) { + u_int32_t fileid; + /* + * On Mac OS X we always export the root directory id as 2 + */ + fileid = (objnum == zfsvfs->z_root) ? 2 : objnum; + + if (ATTR_CMN_OBJID & commonattr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = fileid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJPERMANENTID & commonattr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = fileid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + } + /* + * Note: ATTR_CMN_PAROBJID is lossy (only 32 bits). + */ + if (ATTR_CMN_PAROBJID & commonattr) { + uint64_t parentid; + + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parentid, sizeof (parentid)) == 0); + + /* + * On Mac OS X we always export the root + * directory id as 2 and its parent as 1 + */ + if (zp && zp->z_id == zfsvfs->z_root) + parentid = 1; + else if (parentid == zfsvfs->z_root) + parentid = 2; + + ASSERT(parentid != 0); + + ((fsobj_id_t *)attrbufptr)->fid_objno = (uint32_t)parentid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_SCRIPT & commonattr) { + *((text_encoding_t *)attrbufptr) = kTextEncodingMacUnicode; + attrbufptr = ((text_encoding_t *)attrbufptr) + 1; + } + if (ATTR_CMN_CRTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)) == 0); + if (user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_MODTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_MTIME(zfsvfs), + times, sizeof (times)) == 0); + if (user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_CHGTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_CTIME(zfsvfs), + times, sizeof (times)) == 0); + if (user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_ACCTIME & commonattr) { + uint64_t times[2]; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + times, sizeof (times)) == 0); + if 
(user64) { + ZFS_TIME_DECODE((timespec_user64_t *)attrbufptr, + times); + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ZFS_TIME_DECODE((timespec_user32_t *)attrbufptr, + times); + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_BKUPTIME & commonattr) { + /* legacy attribute -- just pass zero */ + if (user64) { + ((timespec_user64_t *)attrbufptr)->tv_sec = 0; + ((timespec_user64_t *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((timespec_user64_t *)attrbufptr) + 1; + } else { + ((timespec_user32_t *)attrbufptr)->tv_sec = 0; + ((timespec_user32_t *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((timespec_user32_t *)attrbufptr) + 1; + } + } + if (ATTR_CMN_FNDRINFO & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &val, sizeof (val)) == 0); + getfinderinfo(zp, cr, &finderinfo); + /* Shadow ZFS_HIDDEN to Finder Info's invisible bit */ + if (val & ZFS_HIDDEN) { + finderinfo.fi_flags |= + OSSwapHostToBigConstInt16(kIsInvisible); + } + bcopy(&finderinfo, attrbufptr, sizeof (finderinfo)); + attrbufptr = (char *)attrbufptr + 32; + } + if (ATTR_CMN_OWNERID & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_UID(zfsvfs), + &val, sizeof (val)) == 0); + *((uid_t *)attrbufptr) = val; + attrbufptr = ((uid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_GRPID & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs), + &val, sizeof (val)) == 0); + *((gid_t *)attrbufptr) = val; + attrbufptr = ((gid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_ACCESSMASK & commonattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + &val, sizeof (val)) == 0); + *((u_int32_t *)attrbufptr) = val; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FLAGS & commonattr) { + // TODO, sa_lookup of ZPL_FLAGS + u_int32_t flags = zfs_getbsdflags(zp); + + /* Shadow Finder Info's invisible bit to UF_HIDDEN */ + if ((ATTR_CMN_FNDRINFO & commonattr) && + (OSSwapBigToHostInt16(finderinfo.fi_flags) & kIsInvisible)) + flags |= UF_HIDDEN; + + *((u_int32_t *)attrbufptr) = flags; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_USERACCESS & commonattr) { + u_int32_t user_access = 0; + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &val, sizeof (val)) == 0); + + user_access = getuseraccess(zp, aip->ai_context); + + /* Also consider READ-ONLY file system. 
*/ + if (vfs_flags(mp) & MNT_RDONLY) { + user_access &= ~W_OK; + } + + /* Locked objects are not writable either */ + if ((val & ZFS_IMMUTABLE) && + (vfs_context_suser(aip->ai_context) != 0)) { + user_access &= ~W_OK; + } + + *((u_int32_t *)attrbufptr) = user_access; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FILEID & commonattr) { + /* + * On Mac OS X we always export the root directory id as 2 + */ + if (objnum == zfsvfs->z_root) + objnum = 2; + + *((u_int64_t *)attrbufptr) = objnum; + attrbufptr = ((u_int64_t *)attrbufptr) + 1; + } + if (ATTR_CMN_PARENTID & commonattr) { + uint64_t parentid; + + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parentid, sizeof (parentid)) == 0); + + /* + * On Mac OS X we always export the root + * directory id as 2 and its parent as 1 + */ + if (zp && zp->z_id == zfsvfs->z_root) + parentid = 1; + else if (parentid == zfsvfs->z_root) + parentid = 2; + + ASSERT(parentid != 0); + + *((u_int64_t *)attrbufptr) = parentid; + attrbufptr = ((u_int64_t *)attrbufptr) + 1; + } + + *aip->ai_attrbufpp = attrbufptr; + *aip->ai_varbufpp = varbufptr; +} + +void +dirattrpack(attrinfo_t *aip, znode_t *zp) +{ + attrgroup_t dirattr = aip->ai_attrlist->dirattr; + void *attrbufptr = *aip->ai_attrbufpp; + + if (ATTR_DIR_LINKCOUNT & dirattr) { + *((u_int32_t *)attrbufptr) = 1; /* no dir hard links */ + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_DIR_ENTRYCOUNT & dirattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &val, sizeof (val)) == 0); + *((u_int32_t *)attrbufptr) = (uint32_t)val; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_DIR_MOUNTSTATUS & dirattr && zp) { + vnode_t *vp = ZTOV(zp); + + if (vp != NULL && vnode_mountedhere(vp) != NULL) + *((u_int32_t *)attrbufptr) = DIR_MNTSTATUS_MNTPOINT; + else + *((u_int32_t *)attrbufptr) = 0; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + *aip->ai_attrbufpp = attrbufptr; +} + +void +fileattrpack(attrinfo_t *aip, zfsvfs_t *zfsvfs, znode_t *zp) +{ + attrgroup_t fileattr = aip->ai_attrlist->fileattr; + void *attrbufptr = *aip->ai_attrbufpp; + void *varbufptr = *aip->ai_varbufpp; + uint64_t allocsize = 0; + cred_t *cr = (cred_t *)vfs_context_ucred(aip->ai_context); + + if ((ATTR_FILE_ALLOCSIZE | ATTR_FILE_DATAALLOCSIZE) & fileattr && zp) { + uint32_t blksize; + u_longlong_t nblks; + + sa_object_size(zp->z_sa_hdl, &blksize, &nblks); + allocsize = (uint64_t)512LL * (uint64_t)nblks; + } + if (ATTR_FILE_LINKCOUNT & fileattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &val, sizeof (val)) == 0); + *((u_int32_t *)attrbufptr) = val; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_TOTALSIZE & fileattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &val, sizeof (val)) == 0); + *((off_t *)attrbufptr) = val; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_ALLOCSIZE & fileattr) { + *((off_t *)attrbufptr) = allocsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_IOBLOCKSIZE & fileattr && zp) { + *((u_int32_t *)attrbufptr) = + zp->z_blksz ? 
zp->z_blksz : zfsvfs->z_max_blksz; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DEVTYPE & fileattr) { + uint64_t mode, val = 0; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + &mode, sizeof (mode)) == 0); + sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), + &val, sizeof (val)); + if (S_ISBLK(mode) || S_ISCHR(mode)) + *((u_int32_t *)attrbufptr) = (u_int32_t)val; + else + *((u_int32_t *)attrbufptr) = 0; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DATALENGTH & fileattr) { + uint64_t val; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &val, sizeof (val)) == 0); + *((off_t *)attrbufptr) = val; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DATAALLOCSIZE & fileattr) { + *((off_t *)attrbufptr) = allocsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if ((ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE) & fileattr) { + uint64_t rsrcsize = 0; + uint64_t xattr; + + if (!sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr, sizeof (xattr)) && + xattr) { + znode_t *xdzp = NULL, *xzp = NULL; + struct componentname cn = { 0 }; + char *name = NULL; + + name = spa_strdup(XATTR_RESOURCEFORK_NAME); + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + /* Grab the hidden attribute directory vnode. */ + if (zfs_get_xattrdir(zp, &xdzp, cr, 0) == 0 && + zfs_dirlook(xdzp, name, &xzp, 0, NULL, + &cn) == 0) { + rsrcsize = xzp->z_size; + } + spa_strfree(name); + kmem_free(cn.cn_nameptr, cn.cn_namelen); + + if (xzp) + zrele(xzp); + if (xdzp) + zrele(xdzp); + } + if (ATTR_FILE_RSRCLENGTH & fileattr) { + *((off_t *)attrbufptr) = rsrcsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_RSRCALLOCSIZE & fileattr) { + *((off_t *)attrbufptr) = roundup(rsrcsize, 512); + attrbufptr = ((off_t *)attrbufptr) + 1; + } + } + *aip->ai_attrbufpp = attrbufptr; + *aip->ai_varbufpp = varbufptr; +} + +void +nameattrpack(attrinfo_t *aip, const char *name, int namelen) +{ + void *varbufptr; + struct attrreference *attr_refptr; + u_int32_t attrlen; + size_t nfdlen, freespace; + int force_formd_normalized_output; + + varbufptr = *aip->ai_varbufpp; + attr_refptr = (struct attrreference *)(*aip->ai_attrbufpp); + + freespace = (char *)aip->ai_varbufend - (char *)varbufptr; + /* + * Mac OS X: non-ascii names are UTF-8 NFC on disk + * so convert to NFD before exporting them. + */ + + if (zfs_vnop_force_formd_normalized_output && + !is_ascii_str(name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + namelen = strlen(name); + if (!force_formd_normalized_output || + utf8_normalizestr((const u_int8_t *)name, namelen, + (u_int8_t *)varbufptr, &nfdlen, + freespace, UTF_DECOMPOSED) != 0) { + /* ASCII or normalization failed, just copy zap name. */ + strncpy((char *)varbufptr, name, MIN(freespace, namelen+1)); + } else { + /* Normalization succeeded (already in buffer). */ + namelen = nfdlen; + } + attrlen = namelen + 1; + attr_refptr->attr_dataoffset = (char *)varbufptr - (char *)attr_refptr; + attr_refptr->attr_length = attrlen; + /* + * Advance beyond the space just allocated and + * round up to the next 4-byte boundary: + */ + varbufptr = ((char *)varbufptr) + attrlen + ((4 - (attrlen & 3)) & 3); + ++attr_refptr; + + *aip->ai_attrbufpp = attr_refptr; + *aip->ai_varbufpp = varbufptr; +} + +int +getpackedsize(struct attrlist *alp, boolean_t user64) +{ + attrgroup_t attrs; + int timespecsize; + int size = 0; + + timespecsize = user64 ? 
sizeof (timespec_user64_t) : + sizeof (timespec_user32_t); + + if ((attrs = alp->commonattr) != 0) { + if (attrs & ATTR_CMN_NAME) + size += sizeof (struct attrreference); + if (attrs & ATTR_CMN_DEVID) + size += sizeof (dev_t); + if (attrs & ATTR_CMN_FSID) + size += sizeof (fsid_t); + if (attrs & ATTR_CMN_OBJTYPE) + size += sizeof (fsobj_type_t); + if (attrs & ATTR_CMN_OBJTAG) + size += sizeof (fsobj_tag_t); + if (attrs & ATTR_CMN_OBJID) + size += sizeof (fsobj_id_t); + if (attrs & ATTR_CMN_OBJPERMANENTID) + size += sizeof (fsobj_id_t); + if (attrs & ATTR_CMN_PAROBJID) + size += sizeof (fsobj_id_t); + if (attrs & ATTR_CMN_SCRIPT) + size += sizeof (text_encoding_t); + if (attrs & ATTR_CMN_CRTIME) + size += timespecsize; + if (attrs & ATTR_CMN_MODTIME) + size += timespecsize; + if (attrs & ATTR_CMN_CHGTIME) + size += timespecsize; + if (attrs & ATTR_CMN_ACCTIME) + size += timespecsize; + if (attrs & ATTR_CMN_BKUPTIME) + size += timespecsize; + if (attrs & ATTR_CMN_FNDRINFO) + size += 32 * sizeof (u_int8_t); + if (attrs & ATTR_CMN_OWNERID) + size += sizeof (uid_t); + if (attrs & ATTR_CMN_GRPID) + size += sizeof (gid_t); + if (attrs & ATTR_CMN_ACCESSMASK) + size += sizeof (u_int32_t); + if (attrs & ATTR_CMN_FLAGS) + size += sizeof (u_int32_t); + if (attrs & ATTR_CMN_USERACCESS) + size += sizeof (u_int32_t); + if (attrs & ATTR_CMN_FILEID) + size += sizeof (u_int64_t); + if (attrs & ATTR_CMN_PARENTID) + size += sizeof (u_int64_t); + /* + * Also add: + * ATTR_CMN_GEN_COUNT (|FSOPT_ATTR_CMN_EXTENDED) + * ATTR_CMN_DOCUMENT_ID (|FSOPT_ATTR_CMN_EXTENDED) + * ATTR_CMN_EXTENDED_SECURITY + * ATTR_CMN_UUID + * ATTR_CMN_GRPUUID + * ATTR_CMN_FULLPATH + * ATTR_CMN_ADDEDTIME + * ATTR_CMN_ERROR + * ATTR_CMN_DATA_PROTECT_FLAGS + */ + } + if ((attrs = alp->dirattr) != 0) { + if (attrs & ATTR_DIR_LINKCOUNT) + size += sizeof (u_int32_t); + if (attrs & ATTR_DIR_ENTRYCOUNT) + size += sizeof (u_int32_t); + if (attrs & ATTR_DIR_MOUNTSTATUS) + size += sizeof (u_int32_t); + } + if ((attrs = alp->fileattr) != 0) { + if (attrs & ATTR_FILE_LINKCOUNT) + size += sizeof (u_int32_t); + if (attrs & ATTR_FILE_TOTALSIZE) + size += sizeof (off_t); + if (attrs & ATTR_FILE_ALLOCSIZE) + size += sizeof (off_t); + if (attrs & ATTR_FILE_IOBLOCKSIZE) + size += sizeof (u_int32_t); + if (attrs & ATTR_FILE_DEVTYPE) + size += sizeof (u_int32_t); + if (attrs & ATTR_FILE_DATALENGTH) + size += sizeof (off_t); + if (attrs & ATTR_FILE_DATAALLOCSIZE) + size += sizeof (off_t); + if (attrs & ATTR_FILE_RSRCLENGTH) + size += sizeof (off_t); + if (attrs & ATTR_FILE_RSRCALLOCSIZE) + size += sizeof (off_t); + } + return (size); +} + + +void +getfinderinfo(znode_t *zp, cred_t *cr, finderinfo_t *fip) +{ + znode_t *xdzp = NULL; + znode_t *xzp = NULL; + struct uio *auio = NULL; + struct componentname cn = { 0 }; + int error; + uint64_t xattr = 0; + char *name = NULL; + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zp->z_zfsvfs), + &xattr, sizeof (xattr)) || + (xattr == 0)) { + goto nodata; + } + + auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + if (auio == NULL) { + goto nodata; + } + uio_addiov(auio, CAST_USER_ADDR_T(fip), sizeof (finderinfo_t)); + + /* + * Grab the hidden attribute directory vnode. 
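
A small worked example of getpackedsize() above, assuming the usual Darwin sizes for the fixed-width attribute types; this is an illustration, not output from the patch.

/*
 * For alp->commonattr = ATTR_CMN_NAME | ATTR_CMN_OBJTYPE | ATTR_CMN_FILEID
 * and no dir/file attrs, getpackedsize() returns:
 *
 *   sizeof (struct attrreference)   8   (ATTR_CMN_NAME)
 *   sizeof (fsobj_type_t)           4   (ATTR_CMN_OBJTYPE)
 *   sizeof (u_int64_t)              8   (ATTR_CMN_FILEID)
 *   --------------------------------------
 *                                  20   bytes of fixed attribute data
 *
 * The variable-length name bytes go into the separate "var" buffer and are
 * accounted for by nameattrpack() above, not by getpackedsize().
 */
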
+ * + * XXX - switch to embedded Finder Info when it becomes available + */ + if ((error = zfs_get_xattrdir(zp, &xdzp, cr, 0))) { + goto out; + } + + name = spa_strdup(XATTR_FINDERINFO_NAME); + cn.cn_namelen = strlen(name)+1; + cn.cn_nameptr = kmem_zalloc(cn.cn_namelen, KM_SLEEP); + + if ((error = zfs_dirlook(xdzp, name, &xzp, 0, NULL, &cn))) { + goto out; + } + error = dmu_read_uio(zp->z_zfsvfs->z_os, xzp->z_id, auio, + sizeof (finderinfo_t)); +out: + if (name) + spa_strfree(name); + if (cn.cn_nameptr) + kmem_free(cn.cn_nameptr, cn.cn_namelen); + if (auio) + uio_free(auio); + if (xzp) + zrele(xzp); + if (xdzp) + zrele(xdzp); + if (error == 0) + return; +nodata: + bzero(fip, sizeof (finderinfo_t)); +} + +#define KAUTH_DIR_WRITE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | \ + KAUTH_VNODE_ADD_SUBDIRECTORY | \ + KAUTH_VNODE_DELETE_CHILD) + +#define KAUTH_DIR_READ (KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY) + +#define KAUTH_DIR_EXECUTE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH) + +#define KAUTH_FILE_WRITE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA) + +#define KAUTH_FILE_READ (KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA) + +#define KAUTH_FILE_EXECUTE (KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE) + +/* + * Compute the same user access value as getattrlist(2) + */ +u_int32_t +getuseraccess(znode_t *zp, vfs_context_t ctx) +{ + vnode_t *vp; + u_int32_t user_access = 0; + zfs_acl_phys_t acl_phys; + int error; + /* Only take the expensive vnode_authorize path when we have an ACL */ + + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys)); + + if (error || acl_phys.z_acl_count == 0) { + kauth_cred_t cred = vfs_context_ucred(ctx); + uint64_t obj_uid; + uint64_t obj_mode; + + /* User id 0 (root) always gets access. */ + if (!vfs_context_suser(ctx)) { + return (R_OK | W_OK | X_OK); + } + + sa_lookup(zp->z_sa_hdl, SA_ZPL_UID(zp->z_zfsvfs), + &obj_uid, sizeof (obj_uid)); + sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zp->z_zfsvfs), + &obj_mode, sizeof (obj_mode)); + + // obj_uid = pzp->zp_uid; + obj_mode = obj_mode & MODEMASK; + if (obj_uid == UNKNOWNUID) { + obj_uid = kauth_cred_getuid(cred); + } + if ((obj_uid == kauth_cred_getuid(cred)) || + (obj_uid == UNKNOWNUID)) { + return (((u_int32_t)obj_mode & S_IRWXU) >> 6); + } + /* Otherwise, settle for 'others' access. */ + return ((u_int32_t)obj_mode & S_IRWXO); + } + vp = ZTOV(zp); + if (vnode_isdir(vp)) { + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_WRITE, ctx) == 0) + user_access |= W_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_READ, ctx) == 0) + user_access |= R_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_EXECUTE, ctx) == 0) + user_access |= X_OK; + } else { + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_WRITE, ctx) == 0) + user_access |= W_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_READ, ctx) == 0) + user_access |= R_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_EXECUTE, ctx) == 0) + user_access |= X_OK; + } + return (user_access); +} + + + +static unsigned char fingerprint[] = {0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, + 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef}; + +/* + * Convert "Well Known" GUID to enum type. 
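
kauth_wellknown_guid() below and nfsacl_set_wellknown() are inverses over Apple's "well known" GUIDs: the shared 12-byte fingerprint above plus a big-endian trailing word (0x0a owner, 0x10 group, 0x0c everybody, 0xfffffffe nobody). A round-trip sketch, assuming the usual XNU values of the KAUTH_WKG_* enum; illustration only, not part of the patch.

/* Build the "everybody" GUID and map it back to its enum value. */
static void
example_wellknown_guid(void)
{
	guid_t g;

	nfsacl_set_wellknown(KAUTH_WKG_EVERYBODY, &g);
	ASSERT3S(kauth_wellknown_guid(&g), ==, KAUTH_WKG_EVERYBODY);
}
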
+ */ +int +kauth_wellknown_guid(guid_t *guid) +{ + uint32_t last = 0; + + if (memcmp(fingerprint, guid->g_guid, sizeof (fingerprint))) + return (KAUTH_WKG_NOT); + + last = BE_32(*((u_int32_t *)&guid->g_guid[12])); + + switch (last) { + case 0x0c: + return (KAUTH_WKG_EVERYBODY); + case 0x0a: + return (KAUTH_WKG_OWNER); + case 0x10: + return (KAUTH_WKG_GROUP); + case 0xFFFFFFFE: + return (KAUTH_WKG_NOBODY); + } + + return (KAUTH_WKG_NOT); +} + + +/* + * Set GUID to "well known" guid, based on enum type + */ +void +nfsacl_set_wellknown(int wkg, guid_t *guid) +{ + /* + * All WKGs begin with the same 12 bytes. + */ + bcopy(fingerprint, (void *)guid, 12); + /* + * The final 4 bytes are our code (in network byte order). + */ + switch (wkg) { + case 4: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0x0000000c); + break; + case 3: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0xfffffffe); + break; + case 1: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0x0000000a); + break; + case 2: + *((u_int32_t *)&guid->g_guid[12]) = BE_32(0x00000010); + }; +} + + +/* + * Convert Darwin ACL list, into ZFS ACL "aces" list. + */ +void +aces_from_acl(ace_t *aces, int *nentries, struct kauth_acl *k_acl, + int *seen_type) +{ + int i; + ace_t *ace; + guid_t *guidp; + kauth_ace_rights_t ace_rights; + uid_t who; + uint32_t mask = 0; + uint16_t flags = 0; + uint16_t type = 0; + u_int32_t ace_flags; + int wkg; + int err = 0; + + *nentries = k_acl->acl_entrycount; + + // bzero(aces, sizeof (*aces) * *nentries); + + // *nentries = aclp->acl_cnt; + + for (i = 0; i < *nentries; i++) { + // entry = &(aclp->acl_entry[i]); + + flags = 0; + mask = 0; + + ace = &(aces[i]); + + /* Note Mac OS X GUID is a 128-bit identifier */ + guidp = &k_acl->acl_ace[i].ace_applicable; + + who = -1; + wkg = kauth_wellknown_guid(guidp); + + switch (wkg) { + case KAUTH_WKG_OWNER: + flags |= ACE_OWNER; + if (seen_type) *seen_type |= ACE_OWNER; + break; + case KAUTH_WKG_GROUP: + flags |= ACE_GROUP|ACE_IDENTIFIER_GROUP; + if (seen_type) *seen_type |= ACE_GROUP; + break; + case KAUTH_WKG_EVERYBODY: + flags |= ACE_EVERYONE; + if (seen_type) *seen_type |= ACE_EVERYONE; + break; + + case KAUTH_WKG_NOBODY: + default: + /* Try to get a uid from supplied guid */ + err = kauth_cred_guid2uid(guidp, &who); + if (err) { + err = kauth_cred_guid2gid(guidp, &who); + if (!err) { + flags |= ACE_IDENTIFIER_GROUP; + } + } + if (err) { + *nentries = 0; + return; + } + + } // switch + + ace->a_who = who; + + ace_rights = k_acl->acl_ace[i].ace_rights; + if (ace_rights & KAUTH_VNODE_READ_DATA) + mask |= ACE_READ_DATA; + if (ace_rights & KAUTH_VNODE_WRITE_DATA) + mask |= ACE_WRITE_DATA; + if (ace_rights & KAUTH_VNODE_APPEND_DATA) + mask |= ACE_APPEND_DATA; + if (ace_rights & KAUTH_VNODE_READ_EXTATTRIBUTES) + mask |= ACE_READ_NAMED_ATTRS; + if (ace_rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) + mask |= ACE_WRITE_NAMED_ATTRS; + if (ace_rights & KAUTH_VNODE_EXECUTE) + mask |= ACE_EXECUTE; + if (ace_rights & KAUTH_VNODE_DELETE_CHILD) + mask |= ACE_DELETE_CHILD; + if (ace_rights & KAUTH_VNODE_READ_ATTRIBUTES) + mask |= ACE_READ_ATTRIBUTES; + if (ace_rights & KAUTH_VNODE_WRITE_ATTRIBUTES) + mask |= ACE_WRITE_ATTRIBUTES; + if (ace_rights & KAUTH_VNODE_DELETE) + mask |= ACE_DELETE; + if (ace_rights & KAUTH_VNODE_READ_SECURITY) + mask |= ACE_READ_ACL; + if (ace_rights & KAUTH_VNODE_WRITE_SECURITY) + mask |= ACE_WRITE_ACL; + if (ace_rights & KAUTH_VNODE_TAKE_OWNERSHIP) + mask |= ACE_WRITE_OWNER; + if (ace_rights & KAUTH_VNODE_SYNCHRONIZE) + mask |= ACE_SYNCHRONIZE; + ace->a_access_mask = mask; 
+ + ace_flags = k_acl->acl_ace[i].ace_flags; + if (ace_flags & KAUTH_ACE_FILE_INHERIT) + flags |= ACE_FILE_INHERIT_ACE; + if (ace_flags & KAUTH_ACE_DIRECTORY_INHERIT) + flags |= ACE_DIRECTORY_INHERIT_ACE; + if (ace_flags & KAUTH_ACE_LIMIT_INHERIT) + flags |= ACE_NO_PROPAGATE_INHERIT_ACE; + if (ace_flags & KAUTH_ACE_ONLY_INHERIT) + flags |= ACE_INHERIT_ONLY_ACE; + ace->a_flags = flags; + + switch (ace_flags & KAUTH_ACE_KINDMASK) { + case KAUTH_ACE_PERMIT: + type = ACE_ACCESS_ALLOWED_ACE_TYPE; + break; + case KAUTH_ACE_DENY: + type = ACE_ACCESS_DENIED_ACE_TYPE; + break; + case KAUTH_ACE_AUDIT: + type = ACE_SYSTEM_AUDIT_ACE_TYPE; + break; + case KAUTH_ACE_ALARM: + type = ACE_SYSTEM_ALARM_ACE_TYPE; + break; + } + ace->a_type = type; + dprintf(" ACL: %d type %04x, mask %04x, flags %04x, who %d\n", + i, type, mask, flags, who); + } + +} + +void +finderinfo_update(uint8_t *finderinfo, znode_t *zp) +{ + u_int8_t *finfo = NULL; + struct timespec va_crtime; + + /* Advance finfo by 16 bytes to the 2nd half of the finderinfo */ + finfo = (u_int8_t *)finderinfo + 16; + + /* Don't expose a symlink's private type/creator. */ + if (IFTOVT((mode_t)zp->z_mode) == VLNK) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* hfs_xattr.c hfs_zero_hidden_fields() */ + if ((IFTOVT((mode_t)zp->z_mode) == VREG) || + (IFTOVT((mode_t)zp->z_mode) == VLNK)) { + struct FndrExtendedFileInfo *extinfo = + (struct FndrExtendedFileInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + + if (IFTOVT((mode_t)zp->z_mode) == VDIR) { + struct FndrExtendedDirInfo *extinfo = + (struct FndrExtendedDirInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + +} + + + +int +zpl_xattr_set_sa(struct vnode *vp, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + nvlist_t *nvl; + size_t sa_size; + int error; + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + if (value == NULL) { + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + if (error == -ENOENT) + return (error); + // error = zpl_xattr_set_dir(vp, name, NULL, 0, flags, cr); + } else { + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) + return (-EFBIG); + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + return (error); + + if (sa_size > DXATTR_MAX_SA_SIZE) + return (-EFBIG); + error = -nvlist_add_byte_array(nvl, name, + (uchar_t *)value, size); + if (error) + return (error); + } + + /* Update the SA for additions, modifications, and removals. 
*/ + if (!error) + error = -zfs_sa_set_xattr(zp); + + ASSERT3S(error, <=, 0); + + return (error); +} + +int +zpl_xattr_get_sa(struct vnode *vp, const char *name, void *value, size_t size) +{ + znode_t *zp = VTOZ(vp); + uchar_t *nv_value; + uint_t nv_size; + int error = 0; + +#ifdef __LINUX__ + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); +#endif + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, + &nv_value, &nv_size); + if (error) + return (error); + + if (!size) + return (nv_size); + if (size < nv_size) + return (-ERANGE); + + memcpy(value, nv_value, nv_size); + + return (nv_size); +} + + + +/* + * Document ID. Persistant IDs that can survive "safe saving". + * 'revisiond' appears to use fchflags(UF_TRACKED) on files/dirs + * that it wishes to use DocumentIDs with. Here, we will lookup + * if an entry already has a DocumentID stored in SA, but if not, + * hash the DocumentID for (PARENTID + filename) and return it. + * In vnop_setattr for UF_TRACKED, we will store the DocumentID to + * disk. + * Although it is not entirely clear which situations we should handle + * we do handle: + * + * Case 1: + * "file.txt" gets chflag(UF_TRACKED) and DocumentID set. + * "file.txt" is renamed to "file.tmp". DocumentID is kept. + * "file.txt" is re-created, DocumentID remains same, but not saved. + * + * Case 2: + * "file.txt" gets chflag(UF_TRACKED) and DocumentID set. + * "file.txt" is moved to another directory. DocumentID is kept. + * + * It is interesting to note that HFS+ has "tombstones" which is + * created when a UF_TRACKED entry is unlinked, or, renamed. + * Then if a new entry is created with same PARENT+name, and matching + * tombstone is found, will inherit the DocumentID, and UF_TRACKED flag. + * + * We may need to implement this as well. + * + * If "name" or "parent" is known, pass it along, or it needs to look it up. + * + */ +void +zfs_setattr_generate_id(znode_t *zp, uint64_t val, char *name) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + char *nameptr = NULL; + char *filename = NULL; + uint64_t parent = val; + int error = 0; + uint64_t docid = 0; + + if (!zp->z_document_id && zp->z_sa_hdl) { + + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DOCUMENTID(zfsvfs), + &docid, sizeof (docid)); + if (!error && docid) { + zp->z_document_id = docid; + return; + } + + /* Have name? */ + if (name && *name) { + nameptr = name; + } else { + /* Do we have parent? */ + if (!parent) { + VERIFY(sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, + sizeof (parent)) == 0); + } + /* Lookup filename */ + filename = kmem_zalloc(MAXPATHLEN + 2, KM_SLEEP); + if (zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), filename) == 0) { + + nameptr = filename; + // Might as well keep this name too. + strlcpy(zp->z_name_cache, filename, + MAXPATHLEN); + } + } + + zp->z_document_id = fnv_32a_buf(&parent, sizeof (parent), + FNV1_32A_INIT); + if (nameptr) + zp->z_document_id = + fnv_32a_str(nameptr, zp->z_document_id); + + if (filename) + kmem_free(filename, MAXPATHLEN + 2); + } // !document_id +} + +/* + * setattr asked for UF_TRACKED to be set, which means we will make sure + * we have a hash made (includes getting filename) and stored in SA. 
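+ * + * Illustratively, the identifier is the hash computed in + * zfs_setattr_generate_id() above: + * + *	docid = fnv_32a_buf(&parent, sizeof (parent), FNV1_32A_INIT); + *	docid = fnv_32a_str(filename, docid);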
+ */ +int +zfs_setattr_set_documentid(znode_t *zp, boolean_t update_flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + dmu_tx_t *tx; + int count = 0; + sa_bulk_attr_t bulk[2]; + + dprintf("ZFS: vnop_setattr(UF_TRACKED) obj %llu : documentid %08u\n", + zp->z_id, + zp->z_document_id); + + /* Write the new documentid to SA */ + if ((zfsvfs->z_use_sa == B_TRUE) && + !vfs_isrdonly(zfsvfs->z_vfs) && + spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + + uint64_t docid = zp->z_document_id; // 32->64 + + if (update_flags == B_TRUE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DOCUMENTID(zfsvfs), NULL, + &docid, sizeof (docid)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + + if (error) + dprintf("ZFS: sa_update(SA_ZPL_DOCUMENTID) failed %d\n", + error); + + } // if z_use_sa && !readonly + + return (error); +} + +int +zfs_hardlink_addmap(znode_t *zp, uint64_t parentid, uint32_t linkid) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + hardlinks_t *searchnode, *findnode; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + searchnode->hl_parent = parentid; + searchnode->hl_fileid = zp->z_id; + strlcpy(searchnode->hl_name, zp->z_name_cache, PATH_MAX); + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_WRITER); + findnode = avl_find(&zfsvfs->z_hardlinks, searchnode, &loc); + kmem_free(searchnode, sizeof (hardlinks_t)); + if (!findnode) { + // Add hash entry + zp->z_finder_hardlink = TRUE; + findnode = kmem_alloc(sizeof (hardlinks_t), KM_SLEEP); + + findnode->hl_parent = parentid; + findnode->hl_fileid = zp->z_id; + strlcpy(findnode->hl_name, zp->z_name_cache, PATH_MAX); + + findnode->hl_linkid = linkid; + + avl_add(&zfsvfs->z_hardlinks, findnode); + avl_add(&zfsvfs->z_hardlinks_linkid, findnode); + dprintf("ZFS: Inserted new hardlink node (%llu,%llu,'%s') " + "<-> (%x,%u)\n", + findnode->hl_parent, + findnode->hl_fileid, findnode->hl_name, + findnode->hl_linkid, findnode->hl_linkid); + } + rw_exit(&zfsvfs->z_hardlinks_lock); + + return (findnode ? 
1 : 0); +} + +/* dst buffer must be at least UUID_PRINTABLE_STRING_LENGTH bytes */ + +int +zfs_vfs_uuid_unparse(uuid_t uuid, char *dst) +{ + if (!uuid || !dst) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + snprintf(dst, UUID_PRINTABLE_STRING_LENGTH, "%02X%02X%02X%02X-" + "%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); + + return (0); +} + +int +zfs_vfs_uuid_gen(const char *osname, uuid_t uuid) +{ + MD5_CTX md5c; + /* namespace (generated by uuidgen) */ + /* 50670853-FBD2-4EC3-9802-73D847BF7E62 */ + char namespace[16] = {0x50, 0x67, 0x08, 0x53, /* - */ + 0xfb, 0xd2, /* - */ 0x4e, 0xc3, /* - */ + 0x98, 0x02, /* - */ + 0x73, 0xd8, 0x47, 0xbf, 0x7e, 0x62}; + + /* Validate arguments */ + if (!osname || !uuid || strlen(osname) == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + /* + * UUID version 3 (MD5) namespace variant: + * hash namespace (uuid) together with name + */ + MD5Init(&md5c); + MD5Update(&md5c, &namespace, sizeof (namespace)); + MD5Update(&md5c, osname, strlen(osname)); + MD5Final(uuid, &md5c); + + /* + * To make UUID version 3, twiddle a few bits: + * xxxxxxxx-xxxx-Mxxx-Nxxx-xxxxxxxxxxxx + * [uint32]-[uin-t32]-[uin-t32][uint32] + * M should be 0x3 to indicate uuid v3 + * N should be 0x8, 0x9, 0xa, or 0xb + */ + uuid[6] = (uuid[6] & 0x0F) | 0x30; + uuid[8] = (uuid[8] & 0x3F) | 0x80; + + /* Print all caps */ + // dprintf("%s UUIDgen: [%s](%ld)->" + dprintf("%s UUIDgen: [%s](%ld) -> " + "[%02X%02X%02X%02X-%02X%02X-%02X%02X-" + "%02X%02X-%02X%02X%02X%02X%02X%02X]\n", + __func__, osname, strlen(osname), + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); + + return (0); +} + +int +uio_prefaultpages(ssize_t n, struct uio *uio) +{ + return (0); +} + +/* No #pragma weaks here! */ +void +dmu_buf_add_ref(dmu_buf_t *db, void *tag) +{ + dbuf_add_ref((dmu_buf_impl_t *)db, tag); +} + +boolean_t +dmu_buf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t object, + uint64_t blkid, void *tag) +{ + return (dbuf_try_add_ref(db, os, object, blkid, tag)); +} diff --git a/module/os/macos/zfs/zfs_znode.c b/module/os/macos/zfs/zfs_znode.c new file mode 100644 index 0000000000..f890035a89 --- /dev/null +++ b/module/os/macos/zfs/zfs_znode.c @@ -0,0 +1,2351 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2007-2009 Apple Inc. All rights reserved. + * Use is subject to license terms. 
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska */ +/* Portions Copyright 2013 Jorgen Lundman */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +#ifndef __APPLE__ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof (znode_t), + "sizeof (znode_t)"); +#endif +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag); + + +// #define dprintf printf + + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +/* + * This is used by the test suite so that it can delay znodes from being + * freed in order to inspect the unlinked set. + */ +int zfs_unlink_suspend_progress = 0; + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ + +kmem_cache_t *znode_cache = NULL; +static kmem_cache_t *znode_hold_cache = NULL; +unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. + */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + +/*ARGSUSED*/ +#if 0 // unused function +static void +znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) +{ + /* + * We should never drop all dbuf refs without first clearing + * the eviction callback. + */ + panic("evicting znode %p\n", user_ptr); +} +#endif + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; + +/* + * XXX: We cannot use this function as a cache constructor, because + * there is one global cache for all file systems and we need + * to pass vfsp here, which is not possible, because argument + * 'cdrarg' is defined at kmem_cache_create() time. 
+ */ +/*ARGSUSED*/ +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + + mutex_init(&zp->z_attach_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zp->z_attach_cv, NULL, CV_DEFAULT, NULL); + + zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; + zp->z_xattr_cached = NULL; + zp->z_xattr_parent = 0; + zp->z_moved = 0; + zp->z_skip_truncate_undo_decmpfs = B_FALSE; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(ZTOV(zp) == NULL); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_lock); + rw_destroy(&zp->z_map_lock); + rw_destroy(&zp->z_parent_lock); + rw_destroy(&zp->z_name_lock); + mutex_destroy(&zp->z_acl_lock); + rw_destroy(&zp->z_xattr_lock); + zfs_rangelock_fini(&zp->z_rangelock); + mutex_destroy(&zp->z_attach_lock); + cv_destroy(&zp->z_attach_cv); + + ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); + ASSERT(zp->z_xattr_cached == NULL); +} + +static int +zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_hold_t *zh = buf; + + mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); + zfs_refcount_create(&zh->zh_refcount); + zh->zh_obj = ZFS_NO_OBJECT; + + return (0); +} + +static void +zfs_znode_hold_cache_destructor(void *buf, void *arg) +{ + znode_hold_t *zh = buf; + + mutex_destroy(&zh->zh_lock); + zfs_refcount_destroy(&zh->zh_refcount); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache. The KMC_SLAB hint is used in order that it be + * backed by kmalloc() when on the Linux slab in order that any + * wait_on_bit() operations on the related inode operate properly. + */ + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, + zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, + NULL, 0); + + ASSERT(znode_hold_cache == NULL); + znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", + sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, + zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + + if (znode_hold_cache) + kmem_cache_destroy(znode_hold_cache); + znode_hold_cache = NULL; +} + +/* + * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to + * serialize access to a znode and its SA buffer while the object is being + * created or destroyed. This kind of locking would normally reside in the + * znode itself but in this case that's impossible because the znode and SA + * buffer may not yet exist. Therefore the locking is handled externally + * with an array of mutexs and AVLs trees which contain per-object locks. + * + * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted + * in to the correct AVL tree and finally the per-object lock is held. In + * zfs_znode_hold_exit() the process is reversed. The per-object lock is + * released, removed from the AVL tree and destroyed if there are no waiters. 
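+ * + * Illustrative usage, as seen in zfs_zget_ext() and zfs_mknode() below: + * + *	zh = zfs_znode_hold_enter(zfsvfs, obj); + *	... create, look up or destroy the znode and its SA buffer ... + *	zfs_znode_hold_exit(zfsvfs, zh);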
+ * + * This scheme has two important properties: + * + * 1) No memory allocations are performed while holding one of the z_hold_locks. + * This ensures evict(), which can be called from direct memory reclaim, will + * never block waiting on a z_hold_locks which just happens to have hashed + * to the same index. + * + * 2) All locks used to serialize access to an object are per-object and never + * shared. This minimizes lock contention without creating a large number + * of dedicated locks. + * + * On the downside it does require znode_lock_t structures to be frequently + * allocated and freed. However, because these are backed by a kmem cache + * and very short lived this cost is minimal. + */ +int +zfs_znode_hold_compare(const void *a, const void *b) +{ + const znode_hold_t *zh_a = (const znode_hold_t *)a; + const znode_hold_t *zh_b = (const znode_hold_t *)b; + + return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); +} + +boolean_t +zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t held; + + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; + mutex_exit(&zfsvfs->z_hold_locks[i]); + + return (held); +} + +static znode_hold_t * +zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, *zh_new, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t found = B_FALSE; + + zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); + zh_new->zh_obj = obj; + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + if (likely(zh == NULL)) { + zh = zh_new; + avl_add(&zfsvfs->z_hold_trees[i], zh); + } else { + ASSERT3U(zh->zh_obj, ==, obj); + found = B_TRUE; + } + zfs_refcount_add(&zh->zh_refcount, NULL); + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (found == B_TRUE) + kmem_cache_free(znode_hold_cache, zh_new); + + ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_enter(&zh->zh_lock); + + return (zh); +} + +static void +zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) +{ + int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); + boolean_t remove = B_FALSE; + + ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_exit(&zh->zh_lock); + + mutex_enter(&zfsvfs->z_hold_locks[i]); + if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { + avl_remove(&zfsvfs->z_hold_trees[i], zh); + remove = B_TRUE; + } + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (remove == B_TRUE) + kmem_cache_free(znode_hold_cache, zh); +} + +int +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + int error = 0; +#if 0 // FIXME, uses vnode struct, not ptr + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + struct vnode *vp, *vnode; + znode_t *zp; + + vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + sharezp->z_moved = 0; + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; + + sharezp->z_vnode = vnode; + vnode.v_data = sharezp; + + vp = ZTOV(sharezp); + vp->v_type = VDIR; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + 
zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + ZTOV(sharezp)->v_data = NULL; + ZTOV(sharezp)->v_count = 0; + ZTOV(sharezp)->v_holdcnt = 0; + zp->z_vnode = NULL; + sa_handle_destroy(sharezp->z_sa_hdl); + sharezp->z_vnode = NULL; + kmem_cache_free(znode_cache, sharezp); +#endif + return (error); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, + sa_handle_t *sa_hdl) +{ + ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + mutex_exit(&zp->z_lock); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || + RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +#if 0 // Until we need it ? +static void +zfs_vnode_destroy(struct vnode *vp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + if (vp != NULL) { + znode_t *zp = VTOZ(vp); + + if (zp != NULL) { + mutex_enter(&zfsvfs->z_znodes_lock); + if (list_link_active(&zp->z_link_node)) { + list_remove(&zfsvfs->z_all_znodes, zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); + } + + vnode_clearfsnode(vp); + vnode_put(vp); + vnode_recycle(vp); + } +} +#endif + +/* + * Construct a new znode/vnode and intialize. 
+ * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + struct vnode *vp; + uint64_t mode; + uint64_t parent; + sa_bulk_attr_t bulk[11]; + int count = 0; + uint64_t projid = ZFS_DEFAULT_PROJID; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + ASSERT(zp->z_dirlocks == NULL); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. + */ + zp->z_vnode = NULL; + zp->z_sa_hdl = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + zp->z_is_mapped = 0; + zp->z_is_ctldir = 0; + zp->z_vid = 0; + zp->z_uid = 0; + zp->z_gid = 0; + zp->z_size = 0; + zp->z_name_cache[0] = 0; + zp->z_finder_parentid = 0; + zp->z_finder_hardlink = FALSE; + + taskq_init_ent(&zp->z_attach_taskq); + + vp = ZTOV(zp); /* Does nothing in OSX */ + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, 8); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; + printf("znode_alloc: sa_bulk_lookup failed - aborting\n"); + kmem_cache_free(znode_cache, zp); + return (NULL); + } + + zp->z_projid = projid; + zp->z_mode = mode; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + return (zp); +} + + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. + * + * OUT: zpp - allocated znode + * + * OS X implementation notes: + * + * The caller of zfs_mknode() is expected to call zfs_znode_getvnode() + * AFTER the dmu_tx_commit() is performed. This prevents deadlocks + * since vnode_create can indirectly attempt to clean a dirty vnode. 
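+ * + * Illustrative caller-side ordering (sketch only, error handling omitted): + * + *	zfs_mknode(dzp, vap, tx, cr, flag, &zp, &acl_ids); + *	dmu_tx_commit(tx); + *	zfs_znode_getvnode(zp, zfsvfs);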
+ * + * The current list of callers includes: + * zfs_vnop_create + * zfs_vnop_mkdir + * zfs_vnop_symlink + * zfs_obtain_xattr + * zfs_make_xattrdir + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t projid = ZFS_DEFAULT_PROJID; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + dmu_buf_t *db; + timestruc_t now; + uint64_t gen, obj; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + int err = 0; + znode_hold_t *zh; + + ASSERT(vap && (vap->va_mask & (ATTR_TYPE|ATTR_MODE)) == + (ATTR_TYPE|ATTR_MODE)); + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + zh = zfs_znode_hold_enter(zfsvfs, obj); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { + /* + * With ZFS_PROJID flag, we can easily know whether there is + * project ID stored on disk or not. See zfs_space_delta_cb(). + */ + if (obj_type != DMU_OT_ZNODE && + dmu_objset_projectquota_enabled(zfsvfs->z_os)) + pflags |= ZFS_PROJID; + + /* + * Inherit project ID from parent if required. 
+ */ + projid = zfs_inherit_projid(dzp); + if (dzp->z_pflags & ZFS_PROJINHERIT) + pflags |= ZFS_PROJINHERIT; + } + + /* + * No execs denied will be deterimed when zfs_mode_compute() is called. + */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & ATTR_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & ATTR_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + pflags & ZFS_PROJID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), + NULL, &projid, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), 
NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + /* + * We must not hold any locks while calling vnode_create inside + * zfs_znode_alloc(), as it may call either of vnop_reclaim, or + * vnop_fsync. If it is not enough to just release ZFS_OBJ_HOLD + * we will have to attach the vnode after the dmu_commit like + * maczfs does, in each vnop caller. + */ + do { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + } while (*zpp == NULL); + + VERIFY(*zpp != NULL); + VERIFY(dzp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + (*zpp)->z_projid = projid; + + if (vap->va_mask & ATTR_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx); + ASSERT(err == 0); + } + + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + zfs_znode_hold_exit(zfsvfs, zh); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. 
+ */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + ×, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + return (zfs_zget_ext(zfsvfs, obj_num, zpp, 0)); +} + +int +zfs_zget_ext(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp, + int flags) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + znode_hold_t *zh; + struct vnode *vp = NULL; + sa_handle_t *hdl; + uint32_t vid; + int err; + + dprintf("+zget %llu\n", obj_num); + + *zpp = NULL; + +again: + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + 
zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + ASSERT3P(zp, !=, NULL); + + mutex_enter(&zp->z_lock); + + /* + * Since zp may disappear after we unlock below, + * we save a copy of vp and it's vid + */ + vid = zp->z_vid; + vp = ZTOV(zp); + + /* + * Since we do immediate eviction of the z_dbuf, we + * should never find a dbuf with a znode that doesn't + * know about the dbuf. + */ + ASSERT3U(zp->z_id, ==, obj_num); + + /* + * OS X can return the znode when the file is unlinked + * in order to support the sync of open-unlinked files + */ + if (!(flags & ZGET_FLAG_UNLINKED) && zp->z_unlinked) { + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (ENOENT); + } + + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + + /* + * We are racing zfs_znode_getvnode() and we got here first, we + * need to let it get ahead + */ + if (!vp) { + + // Wait until attached, if we can. + if ((flags & ZGET_FLAG_ASYNC) && + zfs_znode_asyncwait(zp) == 0) { + dprintf("%s: waited on z_vnode OK\n", __func__); + } else { + dprintf("%s: async racing attach\n", __func__); + // Could be zp is being torn down, idle a bit, + // and retry. This branch is rarely executed. + kpreempt(KPREEMPT_SYNC); + } + goto again; + } + + /* + * Due to vnode_create() -> zfs_fsync() -> zil_commit() -> + * zget() -> vnode_getwithvid() -> deadlock. Unsure why + * vnode_getwithvid() ends up sleeping in msleep() but + * vnode_get() does not. + * As we can deadlock here using vnode_getwithvid() we will use + * the simpler vnode_get() in the ASYNC cases. We verify the + * vids match below. + */ + if ((flags & ZGET_FLAG_ASYNC)) + err = vnode_get(vp); + else + err = vnode_getwithvid(vp, vid); + + if (err != 0) { + dprintf("ZFS: vnode_get() returned %d\n", err); + kpreempt(KPREEMPT_SYNC); + goto again; + } + + /* + * Since we had to drop all of our locks above, make sure + * that we have the vnode and znode we had before. + */ + mutex_enter(&zp->z_lock); + if ((vid != zp->z_vid) || (vp != ZTOV(zp))) { + mutex_exit(&zp->z_lock); + /* + * Release the wrong vp from vnode_getwithvid(). + */ + VN_RELE(vp); + dprintf("ZFS: the vids do not match part 1\n"); + goto again; + } + if (vnode_vid(vp) != zp->z_vid) + printf("ZFS: the vids do not match\n"); + mutex_exit(&zp->z_lock); + + *zpp = zp; + + return (0); + } // if vnode != NULL + + /* + * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. 
+ */ + + zp = NULL; + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + zfs_znode_hold_exit(zfsvfs, zh); + dprintf("zget returning %d\n", err); + return (err); + } + + printf("zget create: %llu setting to %p\n", obj_num, zp); + *zpp = zp; + + // Spawn taskq to attach while we are locked + if (flags & ZGET_FLAG_ASYNC) { + zfs_znode_asyncgetvnode(zp, zfsvfs); + } + + zfs_znode_hold_exit(zfsvfs, zh); + + /* Attach a vnode to our new znode */ + if (!(flags & ZGET_FLAG_ASYNC)) { + zfs_znode_getvnode(zp, zfsvfs); + } + + dprintf("zget returning %d\n", err); + return (err); +} + + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + struct vnode *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + uint64_t projid = ZFS_DEFAULT_PROJID; + znode_hold_t *zh; + + if (zp->z_is_ctldir) + return (0); + + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + rw_exit(&zp->z_xattr_lock); + + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, + sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), + &projid, 8); + if (err != 0 && err != ENOENT) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(err)); + } + } + + zp->z_projid = projid; + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + /* + * XXXPJD: Not sure how is that possible, but under heavy + * zfs recv -F load it happens that z_gen is the same, but + * vnode type is different than znode type. 
This would mean + * that for example regular file was replaced with directory + * which has the same object number. + */ + vp = ZTOV(zp); + if (vp != NULL && + vnode_vtype(vp) != IFTOVT((mode_t)zp->z_mode)) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (EIO); + } + + zp->z_blksz = doi.doi_data_block_size; + if (vp != NULL) { + vn_pages_remove(vp, 0, 0); + if (zp->z_size != size) + vnode_pager_setsize(vp, zp->z_size); + } + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatical removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. + */ + zp->z_unlinked = (zp->z_links == 0); + if (zp->z_unlinked) + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + znode_hold_t *zh; + + zh = zfs_znode_hold_enter(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + znode_hold_t *zh; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + zh = zfs_znode_hold_enter(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. + */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + + if (!(vfs_isrdonly(zfsvfs->z_vfs)) && + !zfs_unlink_suspend_progress) { + mutex_exit(&zp->z_lock); + zfs_znode_hold_exit(zfsvfs, zh); + zfs_rmnode(zp); + return; + } + } + + mutex_exit(&zp->z_lock); + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + mutex_enter(&zfsvfs->z_znodes_lock); + zp->z_vnode = NULL; + zp->z_zfsvfs = NULL; + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); /* XXX */ + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + ASSERT(zp->z_sa_hdl == NULL); + + kmem_cache_free(znode_cache, zp); +} + + +/* + * Prepare to update znode time stamps. + * + * IN: zp - znode requiring timestamp update + * flag - ATTR_MTIME, ATTR_CTIME, ATTR_ATIME flags + * have_tx - true of caller is creating a new txg + * + * OUT: zp - new atime (via underlying inode's i_atime) + * mtime - new mtime + * ctime - new ctime + * + * NOTE: The arguments are somewhat redundant. 
The following condition + * is always true: + * + * have_tx == !(flag & ATTR_ATIME) + */ +void +zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) +{ + timestruc_t now; + + gethrestime(&now); + + if (have_tx) { /* will sa_bulk_update happen really soon? */ + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & ATTR_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } + + if (flag & ATTR_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & ATTR_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); +} + +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, + zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT(error == 0); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +#ifdef sun +/* + * This is a dummy interface used when pvn_vplist_dirty() should *not* + * be calling back into the fs for a putpage(). E.g.: when truncating + * a file, the pages being "thrown away* don't need to be written out. + */ +/* ARGSUSED */ +static int +zfs_no_putpage(struct vnode *vp, page_t *pp, u_offset_t *offp, size_t *lenp, + int flags, cred_t *cr) +{ + ASSERT(0); + return (0); +} +#endif /* sun */ + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
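+ * (Illustratively: a 96K block would grow to at most the next + * power of two, 128K, capped by the requested end offset.)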
+ */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, + sizeof (zp->z_size), tx)); + + vnode_pager_setsize(ZTOV(zp), end); + + zfs_rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free block in the middle of a file, + * but only at the end of a file, so this code path should + * never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); + } + +#ifdef _LINUX + /* + * Zero partial page cache entries. This must be done under a + * range lock in order to keep the ARC and page cache in sync. + */ + if (zp->z_is_mapped) { + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + + /* first possible full page in hole */ + first_page = (off + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* last page of hole */ + last_page = (off + len) >> PAGE_CACHE_SHIFT; + + /* offset of first_page */ + first_page_offset = first_page << PAGE_CACHE_SHIFT; + /* offset of last_page */ + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* truncate whole pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(ZTOI(zp)->i_mapping, + first_page_offset, last_page_offset - 1); + } + + /* truncate sub-page ranges */ + if (first_page > last_page) { + /* entire punched area within a single page */ + zfs_zero_partial_page(zp, off, len); + } else { + /* beginning of punched area at the end of a page */ + page_len = first_page_offset - off; + if (page_len > 0) + zfs_zero_partial_page(zp, off, page_len); + + /* end of punched area at the beginning of a page */ + page_len = off + len - last_page_offset; + if (page_len > 0) + zfs_zero_partial_page(zp, last_page_offset, + page_len); + } + } +#endif + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + struct vnode *vp = ZTOV(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. 
+ */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ +// struct vnode *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if (vnode_isfifo(ZTOV(zp))) + return (0); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + goto out; + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + goto out; +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + + error = 0; + +out: + + return (error); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs; + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int size; + int error; + int i; + znode_t *rootzp = NULL; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). 
+ */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + VATTR_NULL(&vattr); + vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + rootzp->z_vnode = NULL; +#ifndef __APPLE__ + vnode.v_type = VDIR; + vnode.v_data = rootzp; + rootzp->z_vnode = &vnode; +#endif + + zfsvfs = kmem_alloc(sizeof (zfsvfs_t), KM_SLEEP); +#ifdef __APPLE__ + bzero(zfsvfs, sizeof (zfsvfs_t)); +#endif + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
+ */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = kmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + rootzp->z_zfsvfs = zfsvfs; + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + rootzp->z_sa_hdl = NULL; + rootzp->z_vnode = NULL; + kmem_cache_free(znode_cache, rootzp); + + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT(error == 0); + + list_destroy(&zfsvfs->z_all_znodes); + mutex_destroy(&zfsvfs->z_znodes_lock); + + kmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + kmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if (((doi.doi_bonus_type != DMU_OT_SA) && + (doi.doi_bonus_type != DMU_OT_ZNODE)) || + ((doi.doi_bonus_type == DMU_OT_ZNODE) && + (doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. 
+ */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return ((EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + uint64_t pobj = 0; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir = 0; + + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) snprintf(component + 1, MAXNAMELEN+1, + ""); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), + component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 
0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} diff --git a/module/os/macos/zfs/zio_crypt.c b/module/os/macos/zfs/zio_crypt.c new file mode 100644 index 0000000000..7523844c07 --- /dev/null +++ b/module/os/macos/zfs/zio_crypt.c @@ -0,0 +1,1995 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. 
This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authenication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. 
In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearrannged + * the order of level 0 blocks. Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends. Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpretted. + * + * At the objset level, we maintain 2 seperate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintianed in the metadnode's bps. The second, is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaindata as + * well all that we end up with is a duplicate of the original ciphertext we + * already had. As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaindata. We use an HMAC + * here so that a reproducible checksum of the plaindata is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. 
+ */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + crypto_destroy_ctx_template(key->zk_hmac_tmpl); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uint_t keydata_len; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + 
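An illustrative aside, not part of the patch: the 400,000,000-block salt-rotation limit defined above (ZFS_KEY_MAX_SALT_USES_DEFAULT) can be sanity-checked against the birthday-bound approximation quoted in the same comment. A minimal standalone sketch in plain C (needs libm, e.g. cc check_salt_limit.c -lm); the file name and printed wording are hypothetical:

#include <math.h>
#include <stdio.h>

/*
 * p(n) ~= 1 - e^(-n*(n-1)/(2*2^96)), so for a small target probability p,
 * n ~= sqrt(2 * 2^96 * ln(1/(1 - p))).
 */
int
main(void)
{
	const double iv_space = ldexp(1.0, 96);	/* 2^96 possible 96-bit IVs */
	const double p = 1e-12;			/* one-in-a-trillion target */
	double n = sqrt(2.0 * iv_space * -log1p(-p));	/* -log1p(-p) == ln(1/(1-p)) */

	/*
	 * Prints a value just over 3.98e8, consistent with the ~398,065,730
	 * figure in the comment above and below the 400,000,000 default.
	 */
	printf("blocks per salt for p <= 1e-12: %.0f\n", n);
	return (0);
}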
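A second aside, also not part of the patch: the CONSIDERATIONS FOR DEDUP paragraph in the block comment above states that the first 64 bits of the plaintext HMAC stand in for the random salt and the next 96 bits become the IV. The sketch below shows only that split; the function name and the locally defined length constants (8-byte salt, 12-byte IV, 64-byte SHA512 digest) are illustrative and taken from that description. zio_crypt_generate_iv_salt_dedup(), added further down in this file, performs the same split on the real digest.

#include <stdint.h>
#include <string.h>

#define	DEDUP_SALT_LEN		8	/* 64-bit salt */
#define	DEDUP_IV_LEN		12	/* 96-bit IV */
#define	DEDUP_DIGEST_LEN	64	/* SHA512-HMAC output length */

/* Derive the dedup salt and IV from an HMAC of the plaintext block. */
void
dedup_salt_iv_from_digest(const uint8_t digest[DEDUP_DIGEST_LEN],
    uint8_t salt[DEDUP_SALT_LEN], uint8_t iv[DEDUP_IV_LEN])
{
	memcpy(salt, digest, DEDUP_SALT_LEN);
	memcpy(iv, digest + DEDUP_SALT_LEN, DEDUP_IV_LEN);
}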
key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. + */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech; + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + /* destroy the old context template and create the new one */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the cphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. 
+ */ +static int +zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, + crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, + uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) +{ + int ret; + crypto_data_t plaindata, cipherdata; + CK_AES_CCM_PARAMS ccmp; + CK_AES_GCM_PARAMS gcmp; + crypto_mechanism_t mech; + zio_crypt_info_t crypt_info; + uint_t plain_full_len; + uint64_t maclen; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); + + /* lookup the encryption info */ + crypt_info = zio_crypt_table[crypt]; + + /* the mac will always be the last iovec_t in the cipher uio */ + user_addr_t mac; + uio_getiov(cuio, uio_iovcnt(cuio) - 1, &mac, &maclen); + + ASSERT(maclen <= ZIO_DATA_MAC_LEN); + + /* setup encryption mechanism (same as crypt) */ + mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); + + /* + * Strangely, the ICP requires that plain_full_len must include + * the MAC length when decrypting, even though the UIO does not + * need to have the extra space allocated. + */ + if (encrypt) { + plain_full_len = datalen; + } else { + plain_full_len = datalen + maclen; + } + + /* + * setup encryption params (currently only AES CCM and AES GCM + * are supported) + */ + if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { + ccmp.ulNonceSize = ZIO_DATA_IV_LEN; + ccmp.ulAuthDataSize = auth_len; + ccmp.authData = authbuf; + ccmp.ulMACSize = maclen; + ccmp.nonce = ivbuf; + ccmp.ulDataSize = plain_full_len; + + mech.cm_param = (char *)(&ccmp); + mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); + } else { + gcmp.ulIvLen = ZIO_DATA_IV_LEN; + gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); + gcmp.ulAADLen = auth_len; + gcmp.pAAD = authbuf; + gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); + gcmp.pIv = ivbuf; + + mech.cm_param = (char *)(&gcmp); + mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + } + + /* populate the cipher and plain data structs. 
*/ + plaindata.cd_format = CRYPTO_DATA_UIO; + plaindata.cd_offset = 0; + plaindata.cd_uio = puio; + plaindata.cd_miscdata = NULL; + plaindata.cd_length = plain_full_len; + + cipherdata.cd_format = CRYPTO_DATA_UIO; + cipherdata.cd_offset = 0; + cipherdata.cd_uio = cuio; + cipherdata.cd_miscdata = NULL; + cipherdata.cd_length = datalen + maclen; + + /* perform the actual encryption */ + if (encrypt) { + ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + } else { + ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); + ret = SET_ERROR(ECKSUM); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uio_t *puio = NULL, *cuio = NULL; + uint64_t aad[3]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + puio = uio_create(2, 0, UIO_SYSSPACE, UIO_READ); + if (puio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cuio = uio_create(3, 0, UIO_SYSSPACE, UIO_WRITE); + if (cuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + /* initialize uio_ts */ + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_master_keydata, + keydata_len)); + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_hmac_keydata, + SHA512_HMAC_KEYLEN)); + + VERIFY0(uio_addiov(cuio, (user_addr_t)keydata_out, keydata_len)); + VERIFY0(uio_addiov(cuio, (user_addr_t)hmac_keydata_out, + SHA512_HMAC_KEYLEN)); + VERIFY0(uio_addiov(cuio, (user_addr_t)mac, WRAPPING_MAC_LEN)); + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. 
+ */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, + puio, cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + + return (0); + +error: + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uio_t *puio = NULL, *cuio = NULL; + uint64_t aad[3]; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + puio = uio_create(2, 0, UIO_SYSSPACE, UIO_WRITE); + if (puio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cuio = uio_create(3, 0, UIO_SYSSPACE, UIO_READ); + if (cuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + /* initialize uio_ts */ + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_master_keydata, + keydata_len)); + VERIFY0(uio_addiov(puio, (user_addr_t)key->zk_hmac_keydata, + SHA512_HMAC_KEYLEN)); + + VERIFY0(uio_addiov(cuio, (user_addr_t)keydata, keydata_len)); + VERIFY0(uio_addiov(cuio, (user_addr_t)hmac_keydata, + SHA512_HMAC_KEYLEN)); + VERIFY0(uio_addiov(cuio, (user_addr_t)mac, WRAPPING_MAC_LEN)); + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, + puio, cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. 
+ */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + + return (0); + +error: + if (puio) uio_free(puio); + if (cuio) uio_free(cuio); + zio_crypt_key_destroy(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + int ret; + crypto_mechanism_t mech; + crypto_data_t in_data, digest_data; + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + /* initialize sha512-hmac mechanism and crypto data */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the crypto data */ + in_data.cd_format = CRYPTO_DATA_RAW; + in_data.cd_offset = 0; + in_data.cd_length = datalen; + in_data.cd_raw.iov_base = (char *)data; + in_data.cd_raw.iov_len = in_data.cd_length; + + digest_data.cd_format = CRYPTO_DATA_RAW; + digest_data.cd_offset = 0; + digest_data.cd_length = SHA512_DIGEST_LENGTH; + digest_data.cd_raw.iov_base = (char *)raw_digestbuf; + digest_data.cd_raw.iov_len = digest_data.cd_length; + + /* generate the hmac */ + ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, + &digest_data, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); + +error: + bzero(digestbuf, digestlen); + return (ret); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. 
+ */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. 
+ */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpretted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. 
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + int ret; + uint_t bab_len; + blkptr_auth_buf_t bab; + crypto_data_t cd; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + cd.cd_length = bab_len; + cd.cd_raw.iov_base = (char *)&bab; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + return (0); + +error: + return (ret); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + crypto_data_t cd; + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + cd.cd_length = sizeof (tmp_dncore); + cd.cd_raw.iov_base = (char *)adnp; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 seperate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * the metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. 
During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + crypto_mechanism_t mech; + crypto_context_t ctx; + crypto_data_t cd; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* calculate the portable MAC from the portable fields and metadnode */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_portable_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user and group accounting. If these + * objects are not present, the local MAC is zeroed out. 
+ */ + if (osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the user accounting dnodes */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_local_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ +#ifdef _KERNEL + if (uio) uio_free(uio); +#endif +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. + */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. 
+ */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t **puio, + uio_t **cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uio_t *srcuio = NULL, *dstuio = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src == 0) + nr_src = 1; + if (nr_dst == 0) + nr_dst = 1; + + /* allocate the uio to hold iovecs */ + if (nr_src != 0) { + srcuio = uio_create(nr_src, 0, UIO_SYSSPACE, UIO_READ); + if (srcuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dstuio = uio_create(nr_dst, 0, UIO_SYSSPACE, UIO_WRITE); + if (dstuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. 
+ */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + nr_iovecs = 0; + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if exists. If the bp does exist we want to + * authenticate it. + */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + + VERIFY0(uio_addiov(srcuio, + (user_addr_t)slrp + sizeof (lr_t), + crypt_len)); + VERIFY0(uio_addiov(dstuio, + (user_addr_t)dlrp + sizeof (lr_t), + crypt_len)); + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + + VERIFY0(uio_addiov(srcuio, + (user_addr_t)slrp + sizeof (lr_write_t), + crypt_len)); + VERIFY0(uio_addiov(dstuio, + (user_addr_t)dlrp + sizeof (lr_write_t), + crypt_len)); + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + VERIFY0(uio_addiov(srcuio, + (user_addr_t)slrp + sizeof (lr_t), + crypt_len)); + VERIFY0(uio_addiov(dstuio, + (user_addr_t)dlrp + sizeof (lr_t), + crypt_len)); + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + *puio = srcuio; + *cuio = dstuio; + } else { + *puio = dstuio; + *cuio = srcuio; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (srcuio) uio_free(srcuio); + if (dstuio) uio_free(dstuio); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks. 
+ */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t **puio, uio_t **cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + struct uio *src_uio = NULL, *dst_uio = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. + */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src == 0) + nr_src = 1; + if (nr_dst == 0) + nr_dst = 1; + + if (nr_src != 0) { + src_uio = uio_create(nr_src, 0, UIO_SYSSPACE, UIO_READ); + if (src_uio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + ASSERT(src_uio != NULL); + + if (nr_dst != 0) { + dst_uio = uio_create(nr_dst, 0, UIO_SYSSPACE, UIO_WRITE); + if (dst_uio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + ASSERT(dst_uio != NULL); + + nr_iovecs = 0; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those out values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t. 
The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). + */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_src); + ASSERT3U(nr_iovecs, <, nr_dst); + VERIFY0(uio_addiov(src_uio, (user_addr_t)DN_BONUS(dnp), + crypt_len)); + VERIFY0(uio_addiov(dst_uio, + (user_addr_t)DN_BONUS(&ddnp[i]), + crypt_len)); + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + *puio = src_uio; + *cuio = dst_uio; + } else { + *puio = dst_uio; + *cuio = src_uio; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + zio_crypt_destroy_uio(src_uio); + zio_crypt_destroy_uio(dst_uio); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + return (ret); +} + +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t **puio, uio_t **cuio, + uint_t *enc_len) +{ + int ret = 0; + + *puio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + *cuio = uio_create(2, 0, UIO_SYSSPACE, UIO_WRITE); + if (*puio == NULL || *cuio == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + VERIFY0(uio_addiov(*puio, (user_addr_t)plainbuf, datalen)); + VERIFY0(uio_addiov(*cuio, (user_addr_t)cipherbuf, datalen)); + + *enc_len = datalen; + + return (0); + +error: + zio_crypt_destroy_uio(*puio); + zio_crypt_destroy_uio(*cuio); + + *enc_len = 0; + return (ret); +} + + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. 
+ */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t **puio, uio_t **cuio, uint_t *enc_len, + uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + ASSERT(puio != NULL); + ASSERT(cuio != NULL); + + /* populate the uios */ +#ifdef __APPLE__ + + VERIFY0(uio_addiov(*cuio, (user_addr_t)mac, ZIO_DATA_MAC_LEN)); + +#else // !APPLE + + puio->uio_segflg = UIO_SYSSPACE; + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + +#endif // !APPLE + + return (0); + +error: + return (ret); +} + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + /* + * We have to delay the allocation call uio_create() until we know + * how many iovecs we want (as max). + */ + uio_t *puio = NULL, *cuio = NULL; + uint_t enc_len, auth_len; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + crypto_ctx_template_t tmpl; + uint8_t *authbuf = NULL; + + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + + if (ret != 0) + return (ret); + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t. 
+ */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = key->zk_current_tmpl; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + VERIFY(puio != NULL); + VERIFY(cuio != NULL); + + /* perform the encryption / decryption */ + ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, + puio, cuio, authbuf, auth_len); + + if (ret != 0) + goto error; + + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(puio); + zio_crypt_destroy_uio(cuio); + + return (0); + +error: + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + + zio_crypt_destroy_uio(puio); + zio_crypt_destroy_uio(cuio); + return (ret); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (ret); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* BEGIN CSTYLED */ +module_param(zfs_key_max_salt_uses, ulong, 0644); +MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " + "can be used for generating encryption keys before it is rotated"); +/* END CSTYLED */ +#endif diff --git a/module/os/macos/zfs/zvolIO.cpp b/module/os/macos/zfs/zvolIO.cpp new file mode 100644 index 0000000000..96a9aefe58 --- /dev/null +++ b/module/os/macos/zfs/zvolIO.cpp @@ -0,0 +1,1192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013-2020, Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + + +/* + * ZVOL Device + */ +#define DEBUG +#if defined(DEBUG) || defined(ZFS_DEBUG) +#ifdef dprintf +#undef dprintf +#endif +#define dprintf(fmt, ...) do { \ + IOLog("zvolIO %s " fmt "\n", __func__, ##__VA_ARGS__); \ + _NOTE(CONSTCOND) } while (0) +#else +#ifndef dprintf +#define dprintf(fmt, ...) do { } while (0); +#endif +#endif /* if DEBUG or ZFS_DEBUG */ + +// #define dprintf IOLog + +// Define the superclass +#define super IOBlockStorageDevice + +#define ZVOL_BSIZE DEV_BSIZE + +static const char *ZVOL_PRODUCT_NAME_PREFIX = "ZVOL "; + +/* Wrapper for zvol_state pointer to IOKit device */ +typedef struct zvol_iokit { + net_lundman_zfs_zvol_device *dev; +} zvol_iokit_t; + +OSDefineMetaClassAndStructors(net_lundman_zfs_zvol_device, IOBlockStorageDevice) + +bool +net_lundman_zfs_zvol_device::init(zvol_state_t *c_zv, + OSDictionary *properties) +{ + zvol_iokit_t *iokitdev = NULL; + + dprintf("zvolIO_device:init\n"); + + if (!c_zv || c_zv->zv_zso->zvo_iokitdev != NULL) { + dprintf("zvol %s invalid c_zv\n", __func__); + return (false); + } + + if ((iokitdev = (zvol_iokit_t *)kmem_alloc(sizeof (zvol_iokit_t), + KM_SLEEP)) == NULL) { + printf("zvol %s wrapper alloc failed\n", __func__); + return (false); + } + + if (super::init(properties) == false) { + printf("zvol %s super init failed\n", __func__); + kmem_free(iokitdev, sizeof (zvol_iokit_t)); + return (false); + } + + /* Store reference to zvol_state_t in the iokitdev */ + zv = c_zv; + /* Store reference to iokitdev in zvol_state_t */ + iokitdev->dev = this; + + /* Assign to zv once completely initialized */ + zv->zv_zso->zvo_iokitdev = iokitdev; + + /* Apply the name from the full dataset path */ + if (strlen(zv->zv_name) != 0) { + setName(zv->zv_name); + } + + return (true); +} + +bool +net_lundman_zfs_zvol_device::attach(IOService* provider) +{ + OSDictionary *protocolCharacteristics = 0; + OSDictionary *deviceCharacteristics = 0; + OSDictionary *storageFeatures = 0; + OSBoolean *unmapFeature = 0; + const OSSymbol *propSymbol = 0; + OSString *dataString = 0; + OSNumber *dataNumber = 0; + + char product_name[strlen(ZVOL_PRODUCT_NAME_PREFIX) + MAXPATHLEN + 1]; + + if (!provider) { + dprintf("ZVOL attach missing provider\n"); + return (false); + } + + if (super::attach(provider) == false) + return (false); + + /* + * We want to set some additional properties for ZVOLs, in + * particular, "Virtual Device", and type "File" + * (or is Internal better?) + * + * Finally "Generic" type. 
+ * + * These properties are defined in *protocol* characteristics + */ + + protocolCharacteristics = OSDictionary::withCapacity(3); + + if (!protocolCharacteristics) { + IOLog("failed to create dict for protocolCharacteristics.\n"); + return (true); + } + + propSymbol = OSSymbol::withCString( + kIOPropertyPhysicalInterconnectTypeVirtual); + + if (!propSymbol) { + IOLog("could not create interconnect type string\n"); + return (true); + } + protocolCharacteristics->setObject( + kIOPropertyPhysicalInterconnectTypeKey, propSymbol); + + propSymbol->release(); + propSymbol = 0; + + propSymbol = OSSymbol::withCString(kIOPropertyInterconnectFileKey); + if (!propSymbol) { + IOLog("could not create interconnect location string\n"); + return (true); + } + protocolCharacteristics->setObject( + kIOPropertyPhysicalInterconnectLocationKey, propSymbol); + + propSymbol->release(); + propSymbol = 0; + + setProperty(kIOPropertyProtocolCharacteristicsKey, + protocolCharacteristics); + + protocolCharacteristics->release(); + protocolCharacteristics = 0; + + /* + * We want to set some additional properties for ZVOLs, in + * particular, physical block size (volblocksize) of the + * underlying ZVOL, and 'logical' block size presented by + * the virtual disk. Also set physical bytes per sector. + * + * These properties are defined in *device* characteristics + */ + + deviceCharacteristics = OSDictionary::withCapacity(3); + + if (!deviceCharacteristics) { + IOLog("failed to create dict for deviceCharacteristics.\n"); + return (true); + } + + /* Set this device to be an SSD, for priority and VM paging */ + propSymbol = OSSymbol::withCString( + kIOPropertyMediumTypeSolidStateKey); + if (!propSymbol) { + IOLog("could not create medium type string\n"); + return (true); + } + deviceCharacteristics->setObject(kIOPropertyMediumTypeKey, + propSymbol); + + propSymbol->release(); + propSymbol = 0; + + /* Set logical block size to ZVOL_BSIZE (512b) */ + dataNumber = OSNumber::withNumber(ZVOL_BSIZE, + 8 * sizeof (ZVOL_BSIZE)); + + deviceCharacteristics->setObject(kIOPropertyLogicalBlockSizeKey, + dataNumber); + + dprintf("logicalBlockSize %llu\n", + dataNumber->unsigned64BitValue()); + + dataNumber->release(); + dataNumber = 0; + + /* Set physical block size to match volblocksize property */ + dataNumber = OSNumber::withNumber(zv->zv_volblocksize, + 8 * sizeof (zv->zv_volblocksize)); + + deviceCharacteristics->setObject(kIOPropertyPhysicalBlockSizeKey, + dataNumber); + + dprintf("physicalBlockSize %llu\n", + dataNumber->unsigned64BitValue()); + + dataNumber->release(); + dataNumber = 0; + + /* Set physical bytes per sector to match volblocksize property */ + dataNumber = OSNumber::withNumber((uint64_t)(zv->zv_volblocksize), + 8 * sizeof (uint64_t)); + + deviceCharacteristics->setObject(kIOPropertyBytesPerPhysicalSectorKey, + dataNumber); + + dprintf("physicalBytesPerSector %llu\n", + dataNumber->unsigned64BitValue()); + + dataNumber->release(); + dataNumber = 0; + + /* Publish the Device / Media name */ + (void) snprintf(product_name, sizeof (product_name), "%s%s", + ZVOL_PRODUCT_NAME_PREFIX, zv->zv_name); + dataString = OSString::withCString(product_name); + deviceCharacteristics->setObject(kIOPropertyProductNameKey, dataString); + dataString->release(); + dataString = 0; + + /* Apply these characteristics */ + setProperty(kIOPropertyDeviceCharacteristicsKey, + deviceCharacteristics); + + deviceCharacteristics->release(); + deviceCharacteristics = 0; + + /* + * ZVOL unmap support + * + * These properties are defined in 
IOStorageFeatures + */ + + storageFeatures = OSDictionary::withCapacity(1); + if (!storageFeatures) { + IOLog("failed to create dictionary for storageFeatures.\n"); + return (true); + } + + /* Set unmap feature */ + unmapFeature = OSBoolean::withBoolean(true); + storageFeatures->setObject(kIOStorageFeatureUnmap, unmapFeature); + unmapFeature->release(); + unmapFeature = 0; + + /* Apply these storage features */ + setProperty(kIOStorageFeaturesKey, storageFeatures); + storageFeatures->release(); + storageFeatures = 0; + + + /* + * Set transfer limits: + * + * Maximum transfer size (bytes) + * Maximum transfer block count + * Maximum transfer block size (bytes) + * Maximum transfer segment count + * Maximum transfer segment size (bytes) + * Minimum transfer segment size (bytes) + * + * We will need to establish safe defaults for all / per volblocksize + * + * Example: setProperty(kIOMinimumSegmentAlignmentByteCountKey, 1, 1); + */ + + /* + * Finally "Generic" type, set as a device property. Tried setting this + * to the string "ZVOL" however the OS does not recognize it as a block + * storage device. This would probably be possible by extending the + * IOBlockStorage Device / Driver relationship. + */ + + setProperty(kIOBlockStorageDeviceTypeKey, + kIOBlockStorageDeviceTypeGeneric); + + return (true); +} + +int +net_lundman_zfs_zvol_device::renameDevice(void) +{ + OSDictionary *deviceDict; + OSString *nameStr; + char *newstr; + int len; + + /* Length of string and null terminating character */ + len = strlen(ZVOL_PRODUCT_NAME_PREFIX) + strlen(zv->zv_name) + 1; + newstr = (char *)kmem_alloc(len, KM_SLEEP); + if (!newstr) { + dprintf("%s string alloc failed\n", __func__); + return (ENOMEM); + } + + /* Append prefix and dsl name */ + snprintf(newstr, len, "%s%s", ZVOL_PRODUCT_NAME_PREFIX, zv->zv_name); + nameStr = OSString::withCString(newstr); + kmem_free(newstr, len); + + if (!nameStr) { + dprintf("%s couldn't allocate name string\n", __func__); + return (ENOMEM); + } + + /* Fetch current device characteristics dictionary */ + deviceDict = OSDynamicCast(OSDictionary, + getProperty(kIOPropertyDeviceCharacteristicsKey)); + if (!deviceDict || (deviceDict = + OSDictionary::withDictionary(deviceDict)) == NULL) { + dprintf("couldn't clone device characteristics\n"); + /* Allocate new dict */ + if (!deviceDict && + (deviceDict = OSDictionary::withCapacity(1)) == NULL) { + dprintf("%s OSDictionary alloc failed\n", __func__); + nameStr->release(); + return (ENOMEM); + } + + } + + /* Add or replace the product name */ + if (deviceDict->setObject(kIOPropertyProductNameKey, + nameStr) == false) { + dprintf("%s couldn't set product name\n", __func__); + nameStr->release(); + deviceDict->release(); + return (ENXIO); + } + nameStr->release(); + nameStr = 0; + + /* Set IORegistry property */ + if (setProperty(kIOPropertyDeviceCharacteristicsKey, + deviceDict) == false) { + dprintf("%s couldn't set IORegistry property\n", __func__); + deviceDict->release(); + return (ENXIO); + } + deviceDict->release(); + deviceDict = 0; + + /* Apply the name from the full dataset path */ + setName(zv->zv_name); + + return (0); +} + +int +net_lundman_zfs_zvol_device::offlineDevice(void) +{ + IOService *client; + + if ((client = this->getClient()) == NULL) { + return (ENOENT); + } + + /* Ask IOBlockStorageDevice to offline media */ + if (client->message(kIOMessageMediaStateHasChanged, + this, (void *)kIOMediaStateOffline) != kIOReturnSuccess) { + dprintf("%s failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +int 
+net_lundman_zfs_zvol_device::onlineDevice(void) +{ + IOService *client; + + if ((client = this->getClient()) == NULL) { + return (ENOENT); + } + + /* Ask IOBlockStorageDevice to online media */ + if (client->message(kIOMessageMediaStateHasChanged, + this, (void *)kIOMediaStateOnline) != kIOReturnSuccess) { + dprintf("%s failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +int +net_lundman_zfs_zvol_device::refreshDevice(void) +{ + IOService *client; + + if ((client = this->getClient()) == NULL) { + return (ENOENT); + } + + /* Ask IOBlockStorageDevice to reset the media params */ + if (client->message(kIOMessageMediaParametersHaveChanged, + this) != kIOReturnSuccess) { + dprintf("%s failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +int +net_lundman_zfs_zvol_device::getBSDName(void) +{ + IORegistryEntry *ioregdevice = 0; + OSObject *bsdnameosobj = 0; + OSString* bsdnameosstr = 0; + + ioregdevice = OSDynamicCast(IORegistryEntry, this); + + if (!ioregdevice) + return (-1); + + bsdnameosobj = ioregdevice->getProperty(kIOBSDNameKey, + gIOServicePlane, kIORegistryIterateRecursively); + + if (!bsdnameosobj) + return (-1); + + bsdnameosstr = OSDynamicCast(OSString, bsdnameosobj); + + IOLog("zvol: bsd name is '%s'\n", + bsdnameosstr->getCStringNoCopy()); + + if (!zv) + return (-1); + + zv->zv_zso->zvo_bsdname[0] = 'r'; // for 'rdiskX'. + strlcpy(&zv->zv_zso->zvo_bsdname[1], + bsdnameosstr->getCStringNoCopy(), + sizeof (zv->zv_zso->zvo_bsdname)-1); + /* + * IOLog("name assigned '%s'\n", zv->zv_zso->zvo_bsdname); + */ + + return (0); +} + +void +net_lundman_zfs_zvol_device::detach(IOService *provider) +{ + super::detach(provider); +} + +void +net_lundman_zfs_zvol_device::clearState(void) +{ + zv = NULL; +} + +bool +net_lundman_zfs_zvol_device::handleOpen(IOService *client, + IOOptionBits options, void *argument) +{ + IOStorageAccess access = (uintptr_t)argument; + bool ret = false; + int openflags = 0; + + if (super::handleOpen(client, options, argument) == false) + return (false); + + /* Device terminating? */ + if (zv == NULL || + zv->zv_zso == NULL || + zv->zv_zso->zvo_iokitdev == NULL) + return (false); + + if (access & kIOStorageAccessReaderWriter) { + openflags = FWRITE | ZVOL_EXCL; + } else { + openflags = FREAD; + } + + /* + * Don't use 'zv' until it has been verified by zvol_os_open_zv() + * and returned as opened, then it holds an open count and can be + * used. + */ + if (zvol_os_open_zv(zv, zv->zv_zso->zvo_openflags, 0, NULL) == 0) { + ret = true; + } else { + openflags = FREAD; + if (zvol_os_open_zv(zv, FREAD /* ZVOL_EXCL */, 0, NULL) == 0) { + ret = true; + } + } + + if (ret) + zv->zv_zso->zvo_openflags = openflags; + + + dprintf("Open %s (openflags %llx)\n", (ret ? "done" : "failed"), + ret ? zv->zv_zso->zvo_openflags : 0); + + if (ret == false) + super::handleClose(client, options); + + return (ret); +} + +void +net_lundman_zfs_zvol_device::handleClose(IOService *client, + IOOptionBits options) +{ + super::handleClose(client, options); + + /* Terminating ? 
*/ + if (zv == NULL || + zv->zv_zso == NULL || + zv->zv_zso->zvo_iokitdev == NULL) + return; + + zvol_os_close_zv(zv, zv->zv_zso->zvo_openflags, 0, NULL); + +} + +IOReturn +net_lundman_zfs_zvol_device::doAsyncReadWrite( + IOMemoryDescriptor *buffer, UInt64 block, UInt64 nblks, + IOStorageAttributes *attributes, IOStorageCompletion *completion) +{ + IODirection direction; + IOByteCount actualByteCount; + struct iomem iomem; + iomem.buf = NULL; + + // Return errors for incoming I/O if we have been terminated. + if (isInactive() == true) { + dprintf("asyncReadWrite notActive fail\n"); + return (kIOReturnNotAttached); + } + + // These variables are set in zvol_first_open(), which should have been + // called already. + if (!zv->zv_dn) { + dprintf("asyncReadWrite no zvol dnode\n"); + return (kIOReturnNotAttached); + } + + // Ensure the start block is within the disk capacity. + if ((block)*(ZVOL_BSIZE) >= zv->zv_volsize) { + dprintf("asyncReadWrite start block outside volume\n"); + return (kIOReturnBadArgument); + } + + // Shorten the read, if beyond the end + if (((block + nblks)*(ZVOL_BSIZE)) > zv->zv_volsize) { + dprintf("asyncReadWrite block shortening needed\n"); + return (kIOReturnBadArgument); + } + + // Get the buffer direction, whether this is a read or a write. + direction = buffer->getDirection(); + if ((direction != kIODirectionIn) && (direction != kIODirectionOut)) { + dprintf("asyncReadWrite kooky direction\n"); + return (kIOReturnBadArgument); + } + + // dprintf("%s offset @block %llu numblocks %llu: blksz %u\n", + // direction == kIODirectionIn ? "Read" : "Write", + // block, nblks, (ZVOL_BSIZE)); + + /* Perform the read or write operation through the transport driver. */ + actualByteCount = (nblks*(ZVOL_BSIZE)); + + iomem.buf = buffer; + + /* Make sure we don't go away while the command is being executed */ + /* Open should be holding a retain */ + + if (direction == kIODirectionIn) { + + if (zvol_os_read_zv(zv, (block*(ZVOL_BSIZE)), + actualByteCount, &iomem)) { + + actualByteCount = 0; + } + + } else { + + if (zvol_os_write_zv(zv, (block*(ZVOL_BSIZE)), + actualByteCount, &iomem)) { + actualByteCount = 0; + } + + } + + /* Open should be holding a retain */ + iomem.buf = NULL; // overkill + + if (actualByteCount != nblks*(ZVOL_BSIZE)) + dprintf("Read/Write operation failed\n"); + + // Call the completion function. 
+ (completion->action)(completion->target, completion->parameter, + kIOReturnSuccess, actualByteCount); + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doDiscard(UInt64 block, UInt64 nblks) +{ + dprintf("doDiscard called with block, nblks (%llu, %llu)\n", + block, nblks); + uint64_t bytes = 0; + uint64_t off = 0; + + /* Convert block/nblks to offset/bytes */ + off = block * ZVOL_BSIZE; + bytes = nblks * ZVOL_BSIZE; + dprintf("calling zvol_unmap with offset, bytes (%llu, %llu)\n", + off, bytes); + + if (zvol_os_unmap(zv, off, bytes) == 0) + return (kIOReturnSuccess); + else + return (kIOReturnError); +} + + +IOReturn +net_lundman_zfs_zvol_device::doUnmap(IOBlockStorageDeviceExtent *extents, + UInt32 extentsCount, UInt32 options = 0) +{ + UInt32 i = 0; + IOReturn result; + + dprintf("doUnmap called with (%u) extents and options (%u)\n", + (uint32_t)extentsCount, (uint32_t)options); + + if (options > 0 || !extents) { + return (kIOReturnUnsupported); + } + + for (i = 0; i < extentsCount; i++) { + + result = doDiscard(extents[i].blockStart, + extents[i].blockCount); + + if (result != kIOReturnSuccess) { + return (result); + } + } + + return (kIOReturnSuccess); +} + +UInt32 +net_lundman_zfs_zvol_device::doGetFormatCapacities(UInt64* capacities, + UInt32 capacitiesMaxCount) const +{ + dprintf("formatCap\n"); + + /* + * Ensure that the array is sufficient to hold all our formats + * (we require one element). + */ + if ((capacities != NULL) && (capacitiesMaxCount < 1)) + return (0); + /* Error, return an array size of 0. */ + + /* + * The caller may provide a NULL array if it wishes to query the number + * of formats that we support. + */ + if (capacities != NULL) + capacities[0] = zv->zv_volsize; + + dprintf("returning capacity[0] size %llu\n", zv->zv_volsize); + + return (1); +} + +char * +net_lundman_zfs_zvol_device::getProductString(void) +{ + dprintf("getProduct %p\n", zv); + + if (zv) + return (zv->zv_name); + + return ((char *)"ZVolume"); +} + +IOReturn +net_lundman_zfs_zvol_device::reportBlockSize(UInt64 *blockSize) +{ + dprintf("reportBlockSize %llu\n", *blockSize); + + if (blockSize) *blockSize = (ZVOL_BSIZE); + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportMaxValidBlock(UInt64 *maxBlock) +{ + dprintf("reportMaxValidBlock %llu\n", *maxBlock); + + if (maxBlock) *maxBlock = ((zv->zv_volsize / (ZVOL_BSIZE)) - 1); + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportMediaState(bool *mediaPresent, + bool *changedState) +{ + dprintf("reportMediaState\n"); + if (mediaPresent) *mediaPresent = true; + if (changedState) *changedState = false; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportPollRequirements(bool *pollRequired, + bool *pollIsExpensive) +{ + dprintf("reportPollReq\n"); + if (pollRequired) *pollRequired = false; + if (pollIsExpensive) *pollIsExpensive = false; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportRemovability(bool *isRemovable) +{ + dprintf("reportRemova\n"); + if (isRemovable) *isRemovable = false; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doEjectMedia(void) +{ + dprintf("ejectMedia\n"); +/* XXX */ + // Only 10.6 needs special work to eject + // if ((version_major == 10) && (version_minor == 8)) + // destroyBlockStorageDevice(zvol); + // } + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doFormatMedia(UInt64 byteCapacity) +{ + dprintf("doFormat\n"); + return 
(kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doLockUnlockMedia(bool doLock) +{ + dprintf("doLockUnlock\n"); + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::doSynchronizeCache(void) +{ + dprintf("doSync\n"); + if (zv && zv->zv_zilog) { + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } + return (kIOReturnSuccess); +} + +char * +net_lundman_zfs_zvol_device::getVendorString(void) +{ + dprintf("getVendor\n"); + return ((char *)"ZVOL"); +} + +char * +net_lundman_zfs_zvol_device::getRevisionString(void) +{ + dprintf("getRevision\n"); + return ((char *)ZFS_META_VERSION); +} + +char * +net_lundman_zfs_zvol_device::getAdditionalDeviceInfoString(void) +{ + dprintf("getAdditional\n"); + return ((char *)"ZFS Volume"); +} + +IOReturn +net_lundman_zfs_zvol_device::reportEjectability(bool *isEjectable) +{ + dprintf("reportEjecta\n"); + /* + * Which do we prefer? If you eject it, you can't get volume back until + * you import it again. + */ + + if (isEjectable) *isEjectable = false; + return (kIOReturnSuccess); +} + +/* XXX deprecated function */ +IOReturn +net_lundman_zfs_zvol_device::reportLockability(bool *isLockable) +{ + dprintf("reportLocka\n"); + if (isLockable) *isLockable = true; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::reportWriteProtection(bool *isWriteProtected) +{ + dprintf("reportWritePro: %d\n", *isWriteProtected); + + if (!isWriteProtected) + return (kIOReturnSuccess); + + if (zv && (zv->zv_flags & ZVOL_RDONLY)) + *isWriteProtected = true; + else + *isWriteProtected = false; + + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::getWriteCacheState(bool *enabled) +{ + dprintf("getCacheState\n"); + if (enabled) *enabled = true; + return (kIOReturnSuccess); +} + +IOReturn +net_lundman_zfs_zvol_device::setWriteCacheState(bool enabled) +{ + dprintf("setWriteCache\n"); + return (kIOReturnSuccess); +} + +extern "C" { + +/* C interfaces */ +int +zvolCreateNewDevice(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol; + ZFSPool *pool_proxy; + spa_t *spa; + dprintf("%s\n", __func__); + + /* We must have a valid zvol_state_t */ + if (!zv || !zv->zv_objset) { + dprintf("%s missing zv or objset\n", __func__); + return (EINVAL); + } + + /* We need the spa to get the pool proxy */ + if ((spa = dmu_objset_spa(zv->zv_objset)) == NULL) { + dprintf("%s couldn't get spa\n", __func__); + return (EINVAL); + } + if (spa->spa_iokit_proxy == NULL || + (pool_proxy = spa->spa_iokit_proxy->proxy) == NULL) { + dprintf("%s missing IOKit pool proxy\n", __func__); + return (EINVAL); + } + + zvol = new net_lundman_zfs_zvol_device; + + /* Validate creation, initialize and attach */ + if (!zvol || zvol->init(zv) == false || + zvol->attach(pool_proxy) == false) { + dprintf("%s device creation failed\n", __func__); + if (zvol) zvol->release(); + return (ENOMEM); + } + /* Start the service */ + if (zvol->start(pool_proxy) == false) { + dprintf("%s device start failed\n", __func__); + zvol->detach(pool_proxy); + zvol->release(); + return (ENXIO); + } + + /* Open pool_proxy provider */ + if (pool_proxy->open(zvol) == false) { + dprintf("%s open provider failed\n", __func__); + zvol->stop(pool_proxy); + zvol->detach(pool_proxy); + zvol->release(); + return (ENXIO); + } + /* Is retained by provider */ + zvol->release(); + zvol = 0; + + return (0); +} + +int +zvolRegisterDevice(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol; + OSDictionary *matching; + IOService *service = 0; + IOMedia *media = 0; + OSString *nameStr = 0, 
*bsdName = 0; + uint64_t timeout = (5ULL * kSecondScale); + bool ret = false; + + if (!zv || !zv->zv_zso->zvo_iokitdev || zv->zv_name[0] == 0) { + dprintf("%s missing zv, iokitdev, or name\n", __func__); + return (EINVAL); + } + + if ((zvol = zv->zv_zso->zvo_iokitdev->dev) == NULL) { + dprintf("%s couldn't get zvol device\n", __func__); + return (EINVAL); + } + + if (!zvol->getVendorString()) { + return (EINVAL); + } + + /* Create matching string and dictionary */ + { + char str[MAXNAMELEN]; + snprintf(str, MAXNAMELEN, "%s %s Media", + zvol->getVendorString(), zv->zv_name); + if ((nameStr = OSString::withCString(str)) == NULL) { + dprintf("%s problem with name string\n", __func__); + return (ENOMEM); + } + } + matching = IOService::serviceMatching("IOMedia"); + if (!matching || !matching->setObject(gIONameMatchKey, nameStr)) { + dprintf("%s couldn't get matching dictionary\n", __func__); + return (ENOMEM); + } + + /* Register device for service matching */ + zvol->registerService(kIOServiceAsynchronous); + + /* Wait for upper layer BSD client */ + dprintf("%s waiting for IOMedia\n", __func__); + /* Wait for up to 5 seconds */ + service = IOService::waitForMatchingService(matching, timeout); + dprintf("%s %s service\n", __func__, (service ? "got" : "no")); + + if (!service) { + dprintf("%s couldn't get matching service\n", __func__); + return (false); + } + + dprintf("%s casting to IOMedia\n", __func__); + media = OSDynamicCast(IOMedia, service); + + if (!media) { + dprintf("%s no IOMedia\n", __func__); + service->release(); + return (false); + } + + dprintf("%s getting IOBSDNameKey\n", __func__); + bsdName = OSDynamicCast(OSString, + media->getProperty(kIOBSDNameKey)); + + if (bsdName) { + const char *str = bsdName->getCStringNoCopy(); + dprintf("%s Got bsd name [%s]\n", + __func__, str); + zv->zv_zso->zvo_bsdname[0] = 'r'; + snprintf(zv->zv_zso->zvo_bsdname+1, + sizeof (zv->zv_zso->zvo_bsdname)-1, + "%s", str); + dprintf("%s zvol bsdname set to %s\n", __func__, + zv->zv_zso->zvo_bsdname); +// zvol_add_symlink(zv, zv->zv_zso->zvo_bsdname+1, +// zv->zv_zso->zvo_bsdname); + ret = true; + } else { + dprintf("%s couldn't get BSD Name\n", __func__); + } + + /* Release retain held by waitForMatchingService */ + service->release(); + + printf("%s complete\n", __func__); + return (ret); +} + +/* Struct passed in will be freed before returning */ +void * +zvolRemoveDevice(zvol_iokit_t *iokitdev) +{ + net_lundman_zfs_zvol_device *zvol; + dprintf("%s\n", __func__); + + if (!iokitdev) { + dprintf("%s missing argument\n", __func__); + return (NULL); + } + + zvol = iokitdev->dev; + /* Free the wrapper struct */ + kmem_free(iokitdev, sizeof (zvol_iokit_t)); + + if (zvol == NULL) { + dprintf("%s couldn't get IOKit handle\n", __func__); + return (NULL); + } + + /* Mark us as terminating */ + zvol->clearState(); + + return (zvol); +} + +/* + * zvolRemoveDevice continued.. + * terminate() will block and we can deadlock, so it is issued as a + * separate thread. Done from zvol_os.c as it is easier in C. 
+ */ +int +zvolRemoveDeviceTerminate(void *arg) +{ + net_lundman_zfs_zvol_device *zvol = (net_lundman_zfs_zvol_device *)arg; + + /* Terminate */ + if (zvol->terminate(kIOServiceTerminate|kIOServiceAsynchronous| + kIOServiceRequired) == false) { + IOLog("%s terminate failed\n", __func__); + } + + return (0); +} + +/* Called with zv->zv_name already updated */ +int +zvolRenameDevice(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol = NULL; + int error; + + if (!zv || strnlen(zv->zv_name, 1) == 0) { + dprintf("%s missing argument\n", __func__); + return (EINVAL); + } + + if ((zvol = zv->zv_zso->zvo_iokitdev->dev) == NULL) { + dprintf("%s couldn't get zvol device\n", __func__); + return (EINVAL); + } + + /* Set IORegistry name and property */ + if ((error = zvol->renameDevice()) != 0) { + dprintf("%s renameDevice error %d\n", __func__, error); + return (error); + } + + /* + * XXX This works, but if there is a volume mounted on + * the zvol at the time it is uncleanly ejected. + * We just need to add diskutil unmount to `zfs rename`, + * like zpool export. + */ + /* Inform clients of this device that name has changed */ + if (zvol->offlineDevice() != 0 || + zvol->onlineDevice() != 0) { + dprintf("%s media reset failed\n", __func__); + return (ENXIO); + } + + return (0); +} + +/* Called with zvol volsize already updated */ +int +zvolSetVolsize(zvol_state_t *zv) +{ + net_lundman_zfs_zvol_device *zvol; + int error; + + dprintf("%s\n", __func__); + + if (!zv || !zv->zv_zso->zvo_iokitdev) { + dprintf("%s invalid zvol\n", __func__); + return (EINVAL); + } + + /* Cast to correct type */ + if ((zvol = zv->zv_zso->zvo_iokitdev->dev) == NULL) { + dprintf("%s couldn't cast IOKit handle\n", __func__); + return (ENXIO); + } + /* + * XXX This works fine, even if volume is mounted, + * but only tested expanding the zvol and only with + * GPT/APM/MBR partition map (not volume on whole-zvol). + */ + /* Inform clients of this device that size has changed */ + if ((error = zvol->refreshDevice()) != 0) { + dprintf("%s refreshDevice error %d\n", __func__, error); + return (error); + } + + return (0); +} + +uint64_t +zvolIO_kit_read(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len) +{ + IOByteCount done; + // IOLog("zvolIO_kit_read offset %p count %llx to offset %llx\n", + // address, len, offset); + ASSERT(iomem && address && len > 0); + + done = iomem->buf->writeBytes(offset, (void *)address, len); + + return (done); +} + +uint64_t +zvolIO_kit_write(struct iomem *iomem, uint64_t offset, + char *address, uint64_t len) +{ + IOByteCount done; + // IOLog("zvolIO_kit_write offset %p count %llx to offset %llx\n", + // address, len, offset); + ASSERT(iomem && address && len > 0); + + done = iomem->buf->readBytes(offset, (void *)address, len); + + return (done); +} + +} /* extern "C" */ diff --git a/module/os/macos/zfs/zvol_os.c b/module/os/macos/zfs/zvol_os.c new file mode 100644 index 0000000000..1c985598c4 --- /dev/null +++ b/module/os/macos/zfs/zvol_os.c @@ -0,0 +1,1024 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020 by Jorgen Lundman. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static uint32_t zvol_major = ZVOL_MAJOR; + +unsigned int zvol_request_sync = 0; +unsigned int zvol_prefetch_bytes = (128 * 1024); +unsigned long zvol_max_discard_blocks = 16384; +unsigned int zvol_threads = 8; + +taskq_t *zvol_taskq; + +typedef struct zv_request { + zvol_state_t *zv; + + void (*zv_func)(void *); + void *zv_arg; + + taskq_ent_t ent; +} zv_request_t; + +int +dmu_read_iokit_dnode(dnode_t *dn, uint64_t *offset, + uint64_t position, uint64_t *size, struct iomem *iomem); +int +dmu_write_iokit_dnode(dnode_t *dn, uint64_t *offset, uint64_t position, + uint64_t *size, struct iomem *iomem, dmu_tx_t *tx); + + +static void +zvol_os_spawn_cb(void *param) +{ + zv_request_t *zvr = (zv_request_t *)param; + + zvr->zv_func(zvr->zv_arg); + + kmem_free(zvr, sizeof (zv_request_t)); +} + +static void +zvol_os_spawn(void (*func)(void *), void *arg) +{ + zv_request_t *zvr; + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv_arg = arg; + zvr->zv_func = func; + + taskq_init_ent(&zvr->ent); + + taskq_dispatch_ent(zvol_taskq, + zvol_os_spawn_cb, zvr, 0, &zvr->ent); +} + +/* + * Given a path, return TRUE if path is a ZVOL. + */ +static boolean_t +zvol_os_is_zvol(const char *device) +{ +#if 0 + struct stat stbf; + + // Stat device, get major/minor, match zv + if (stat(device, &stbf) == 0) { + if (S_ISBLK(stbf.st_mode) || S_ISCHR(stbf.st_mode)) { + dev_t dev = makedevice(stbf.st_major, stbf.st_minor); + + zvol_state_t *zv; + zv = zvol_find_by_dev(dev); + if (zv != NULL) { + mutex_exit(&zv->zv_state_lock); + return (B_TRUE); + } + } + } +#endif + return (B_FALSE); +} + +/* + * Make sure zv is still in the list (not freed) and if it is + * grab the locks in the correct order. + * Can we rely on list_link_active() instead of looping list? 
+ */ +static int +zvol_os_verify_and_lock(zvol_state_t *node) +{ + zvol_state_t *zv; + + rw_enter(&zvol_state_lock, RW_READER); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + mutex_enter(&zv->zv_state_lock); + if (zv == node) { + + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + return (1); + } + mutex_exit(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + return (0); +} + +static void +zvol_os_register_device_cb(void *param) +{ + zvol_state_t *zv = (zvol_state_t *)param; + + if (zvol_os_verify_and_lock(zv) == 0) + return; + + zvolRegisterDevice(zv); + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); +} + +int +zvol_os_write(dev_t dev, struct uio *uio, int p) +{ + return (ENOTSUP); +} + +int +zvol_os_read(dev_t dev, struct uio *uio, int p) +{ + return (ENOTSUP); +} + +int +zvol_os_write_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem) +{ + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + boolean_t sync; + uint64_t offset = 0; + uint64_t bytes; + uint64_t off; + + if (zv == NULL) + return (ENXIO); + + /* Some requests are just for flush and nothing else. */ + if (count == 0) + return (0); + + volsize = zv->zv_volsize; + if (count > 0 && + (position >= volsize)) + return (EIO); + + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } + + dprintf("zvol_write_iokit(position %llu offset " + "0x%llx bytes 0x%llx)\n", position, offset, count); + + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + /* Lock the entire range */ + lr = zfs_rangelock_enter(&zv->zv_rangelock, position, count, + RL_WRITER); + + /* Iterate over (DMU_MAX_ACCESS/2) segments */ + while (count > 0 && (position + offset) < volsize) { + /* bytes for this segment */ + bytes = MIN(count, DMU_MAX_ACCESS >> 1); + off = offset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + /* don't write past the end */ + if (bytes > volsize - (position + off)) + bytes = volsize - (position + off); + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + + error = dmu_write_iokit_dnode(zv->zv_dn, &offset, + position, &bytes, iomem, tx); + + if (error == 0) { + count -= MIN(count, + (DMU_MAX_ACCESS >> 1)) + bytes; + zvol_log_write(zv, tx, offset, bytes, sync); + } + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + rw_exit(&zv->zv_suspend_lock); + + return (error); +} + +int +zvol_os_read_zv(zvol_state_t *zv, uint64_t position, + uint64_t count, struct iomem *iomem) +{ + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + uint64_t offset = 0; + + if (zv == NULL) + return (ENXIO); + + volsize = zv->zv_volsize; + if (count > 0 && + (position >= volsize)) + return (EIO); + + rw_enter(&zv->zv_suspend_lock, 
RW_READER);
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, position, count,
+ RL_READER);
+
+ while (count > 0 && (position+offset) < volsize) {
+ uint64_t bytes = MIN(count, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - (position + offset))
+ bytes = volsize - (position + offset);
+
+ dprintf("%s %llu offset %llu len %llu bytes %llu\n",
+ "zvol_read_iokit: position",
+ position, offset, count, bytes);
+
+ error = dmu_read_iokit_dnode(zv->zv_dn, &offset, position,
+ &bytes, iomem);
+
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
+ break;
+ }
+ count -= MIN(count, DMU_MAX_ACCESS >> 1) - bytes;
+ }
+ zfs_rangelock_exit(lr);
+
+ rw_exit(&zv->zv_suspend_lock);
+ return (error);
+}
+
+int
+zvol_os_unmap(zvol_state_t *zv, uint64_t off, uint64_t bytes)
+{
+ zfs_locked_range_t *lr = NULL;
+ dmu_tx_t *tx = NULL;
+ int error = 0;
+ uint64_t end = off + bytes;
+
+ if (zv == NULL)
+ return (ENXIO);
+
+ /*
+ * XNU's wipefs_wipe() will issue one giant unmap for the entire
+ * device, for example:
+ * zfs create -V 8g BOOM/vol
+ * zvolIO doDiscard calling zvol_unmap with offset, bytes (0, 858992)
+ * Such an unmap would both take too long and be unnecessary. We will
+ * ignore any unmaps deemed "too large".
+ */
+ if ((off == 0ULL) &&
+ (zv->zv_volsize > (1ULL << 24)) && /* 16MB slop */
+ (bytes >= (zv->zv_volsize - (1ULL << 24))))
+ return (0);
+
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+
+ off = P2ROUNDUP(off, zv->zv_volblocksize);
+ end = P2ALIGN(end, zv->zv_volblocksize);
+
+ if (end > zv->zv_volsize) /* don't write past the end */
+ end = zv->zv_volsize;
+
+ if (off >= end) {
+ /* Return success; the caller does not need to know */
+ goto out;
+ }
+
+ bytes = end - off;
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, bytes, RL_WRITER);
+
+ tx = dmu_tx_create(zv->zv_objset);
+
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+
+ zvol_log_truncate(zv, tx, off, bytes, B_TRUE);
+
+ dmu_tx_commit(tx);
+
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, off, bytes);
+ }
+
+ zfs_rangelock_exit(lr);
+
+ if (error == 0) {
+ /*
+ * If the 'sync' property is set to 'always' then
+ * treat this as a synchronous operation
+ * (i.e. commit to zil).
+ */
+ if (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) {
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+ }
+
+out:
+ rw_exit(&zv->zv_suspend_lock);
+ return (error);
+}
+
+int
+zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ zv->zv_volsize = volsize;
+ return (0);
+}
+
+static void
+zvol_os_clear_private_cb(void *param)
+{
+ zvolRemoveDeviceTerminate(param);
+}
+
+static void
+zvol_os_clear_private(zvol_state_t *zv)
+{
+ void *term;
+
+ printf("%s\n", __func__);
+ /* We can do all removal work, except call terminate.
*/ + term = zvolRemoveDevice(zv->zv_zso->zvo_iokitdev); + if (term == NULL) + return; + + zv->zv_zso->zvo_iokitdev = NULL; + + /* Call terminate in the background */ + zvol_os_spawn(zvol_os_clear_private_cb, term); + +} + +/* + * Find a zvol_state_t given the full major+minor dev_t. If found, + * return with zv_state_lock taken, otherwise, return (NULL) without + * taking zv_state_lock. + */ +static zvol_state_t * +zvol_os_find_by_dev(dev_t dev) +{ + zvol_state_t *zv; + + printf("%s\n", __func__); + + rw_enter(&zvol_state_lock, RW_READER); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + mutex_enter(&zv->zv_state_lock); + if (zv->zv_zso->zvo_dev == dev) { + rw_exit(&zvol_state_lock); + return (zv); + } + mutex_exit(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + + return (NULL); +} + +void +zvol_os_validate_dev(zvol_state_t *zv) +{ + ASSERT3U(MINOR(zv->zv_zso->zvo_dev) & ZVOL_MINOR_MASK, ==, 0); +} + +/* + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. + */ +static zvol_state_t * +zvol_os_alloc(dev_t dev, const char *name) +{ + zvol_state_t *zv; + struct zvol_state_os *zso; + uint64_t volmode; + + printf("%s\n", __func__); + if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) + return (NULL); + + printf("%s 2\n", __func__); + if (volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + + printf("%s 3\n", __func__); + if (volmode == ZFS_VOLMODE_NONE) + return (NULL); + + printf("%s 4\n", __func__); + zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); + zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_zso = zso; + + list_link_init(&zv->zv_next); + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + + zv->zv_open_count = 0; + strlcpy(zv->zv_name, name, MAXNAMELEN); + + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + + return (zv); +#if 0 +out_kmem: + kmem_free(zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + return (NULL); +#endif +} + +/* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + * At this time, the structure is not opened by anyone, is taken off + * the zvol_state_list, and has its private data set to NULL. + * The zvol_state_lock is dropped. + * + */ +static void +zvol_os_free(zvol_state_t *zv) +{ + printf("%s\n", __func__); + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count == 0); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + mutex_destroy(&zv->zv_state_lock); + + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); +} + + + +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. + */ +static int +zvol_os_create_minor(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + unsigned minor = 0; + int error = 0; + uint64_t hash = zvol_name_hash(name); + + printf("%s\n", __func__); + + if (zvol_inhibit_dev) + return (0); + + // minor? 
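+ /* If a zvol_state_t already exists for this name, bail with EEXIST */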
+ zv = zvol_find_by_name_hash(name, hash, RW_NONE); + if (zv) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(EEXIST)); + } + + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + zv = zvol_os_alloc(makedevice(zvol_major, minor), name); + if (zv == NULL) { + error = SET_ERROR(EAGAIN); + goto out_dmu_objset_disown; + } + zv->zv_hash = hash; + + if (dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + // set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); + + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(dmu_objset_zil(os), B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + + /* Create the IOKit zvol while owned */ + if ((error = zvolCreateNewDevice(zv)) != 0) { + dprintf("%s zvolCreateNewDevice error %d\n", + __func__, error); + } + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + rw_exit(&zvol_state_lock); + + /* Register (async) IOKit zvol after disown and unlock */ + /* The callback will release the mutex */ + zvol_os_spawn(zvol_os_register_device_cb, zv); + + } else { + + } + + printf("%s complete\n", __func__); + return (error); +} + + +static void zvol_os_rename_device_cb(void *param) +{ + zvol_state_t *zv = (zvol_state_t *)param; + if (zvol_os_verify_and_lock(zv) == 0) + return; + zvolRenameDevice(zv); + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); +} + +static void +zvol_os_rename_minor(zvol_state_t *zv, const char *newname) +{ + // int readonly = get_disk_ro(zv->zv_zso->zvo_disk); + + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + zvol_os_spawn(zvol_os_rename_device_cb, zv); + + /* + * The block device's read-only state is briefly changed causing + * a KOBJ_CHANGE uevent to be issued. This ensures udev detects + * the name change and fixes the symlinks. This does not change + * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never + * changes. This would normally be done using kobject_uevent() but + * that is a GPL-only symbol which is why we need this workaround. 
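+ * (Comment inherited from the Linux implementation; on macOS the
+ * set_disk_ro() calls below remain commented out.)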
*/ + // set_disk_ro(zv->zv_zso->zvo_disk, !readonly); + // set_disk_ro(zv->zv_zso->zvo_disk, readonly); +} + +static void +zvol_os_set_disk_ro(zvol_state_t *zv, int flags) +{ + // set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) +{ + // set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +int +zvol_os_open_zv(zvol_state_t *zv, int flag, int otyp, struct proc *p) +{ + int error = 0; + + printf("%s\n", __func__); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zvol_os_verify_and_lock(zv) == 0) + return (SET_ERROR(ENOENT)); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); + + if (zv->zv_open_count == 0) { + error = zvol_first_open(zv, !(flag & FWRITE)); + if (error) + goto out_mutex; + } + + if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + error = EROFS; + goto out_open_count; + } + + zv->zv_open_count++; + + mutex_exit(&zv->zv_state_lock); + + rw_exit(&zv->zv_suspend_lock); + + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); + +out_mutex: + mutex_exit(&zv->zv_state_lock); + + rw_exit(&zv->zv_suspend_lock); + if (error == EINTR) { + error = ERESTART; + schedule(); + } + return (SET_ERROR(error)); +} + +int +zvol_os_open(dev_t devp, int flag, int otyp, struct proc *p) +{ + zvol_state_t *zv; + int error = 0; + + printf("%s\n", __func__); + + if (!getminor(devp)) + return (0); + + zv = zvol_os_find_by_dev(devp); + if (zv == NULL) { + return (SET_ERROR(ENXIO)); + } + + error = zvol_os_open_zv(zv, flag, otyp, p); + + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(error)); +} + +int +zvol_os_close_zv(zvol_state_t *zv, int flag, int otyp, struct proc *p) +{ + printf("%s\n", __func__); + + if (zvol_os_verify_and_lock(zv) == 0) + return (SET_ERROR(ENOENT)); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); + + zv->zv_open_count--; + + if (zv->zv_open_count == 0) + zvol_last_close(zv); + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + + return (0); +} + +int +zvol_os_close(dev_t dev, int flag, int otyp, struct proc *p) +{ + zvol_state_t *zv; + int error = 0; + + printf("%s\n", __func__); + + if (!getminor(dev)) + return (0); + + zv = zvol_os_find_by_dev(dev); + if (zv == NULL) { + return (SET_ERROR(ENXIO)); + } + + error = zvol_os_close_zv(zv, flag, otyp, p); + + mutex_exit(&zv->zv_state_lock); + return (0); +} + +void +zvol_os_strategy(struct buf *bp) +{ + +} + +int +zvol_os_get_volume_blocksize(dev_t dev) +{ + /* XNU can only handle two sizes. 
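+ * Report the logical size (DEV_BSIZE) here; the volblocksize is
+ * reported separately via DKIOCGETPHYSICALBLOCKSIZE.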
*/ + return (DEV_BSIZE); +} + +int +zvol_os_ioctl(dev_t dev, unsigned long cmd, caddr_t data, int isblk, + cred_t *cr, int *rvalp) +{ + int error = 0; + u_int32_t *f; + u_int64_t *o; + zvol_state_t *zv = NULL; + + printf("%s\n", __func__); + + if (!getminor(dev)) + return (ENXIO); + + zv = zvol_os_find_by_dev(dev); + + if (zv == NULL) { + dprintf("zv is NULL\n"); + return (ENXIO); + } + + f = (u_int32_t *)data; + o = (u_int64_t *)data; + + switch (cmd) { + + case DKIOCGETMAXBLOCKCOUNTREAD: + dprintf("DKIOCGETMAXBLOCKCOUNTREAD\n"); + *o = 32; + break; + + case DKIOCGETMAXBLOCKCOUNTWRITE: + dprintf("DKIOCGETMAXBLOCKCOUNTWRITE\n"); + *o = 32; + break; + case DKIOCGETMAXSEGMENTCOUNTREAD: + dprintf("DKIOCGETMAXSEGMENTCOUNTREAD\n"); + *o = 32; + break; + + case DKIOCGETMAXSEGMENTCOUNTWRITE: + dprintf("DKIOCGETMAXSEGMENTCOUNTWRITE\n"); + *o = 32; + break; + + case DKIOCGETBLOCKSIZE: + dprintf("DKIOCGETBLOCKSIZE: %llu\n", + zv->zv_volblocksize); + *f = zv->zv_volblocksize; + break; + + case DKIOCSETBLOCKSIZE: + dprintf("DKIOCSETBLOCKSIZE %u\n", *f); + + if (!isblk) { + /* We can only do this for a block device */ + error = ENODEV; + break; + } + + if (zvol_check_volblocksize(zv->zv_name, + (uint64_t)*f)) { + error = EINVAL; + break; + } + + /* set the new block size */ + zv->zv_volblocksize = (uint64_t)*f; + dprintf("setblocksize changed: %llu\n", + zv->zv_volblocksize); + break; + + case DKIOCISWRITABLE: + dprintf("DKIOCISWRITABLE\n"); + if (zv && (zv->zv_flags & ZVOL_RDONLY)) + *f = 0; + else + *f = 1; + break; +#ifdef DKIOCGETBLOCKCOUNT32 + case DKIOCGETBLOCKCOUNT32: + dprintf("DKIOCGETBLOCKCOUNT32: %u\n", + (uint32_t)(zv->zv_volsize / zv->zv_volblocksize)); + *f = (uint32_t)(zv->zv_volsize / zv->zv_volblocksize); + break; +#endif + + case DKIOCGETBLOCKCOUNT: + dprintf("DKIOCGETBLOCKCOUNT: %llu\n", + zv->zv_volsize / zv->zv_volblocksize); + *o = (uint64_t)zv->zv_volsize / zv->zv_volblocksize; + break; + + case DKIOCGETBASE: + dprintf("DKIOCGETBASE\n"); + /* + * What offset should we say? 
+ * 0 is ok for FAT but to HFS + */ + *o = zv->zv_volblocksize * 0; + break; + + case DKIOCGETPHYSICALBLOCKSIZE: + dprintf("DKIOCGETPHYSICALBLOCKSIZE\n"); + *f = zv->zv_volblocksize; + break; + +#ifdef DKIOCGETTHROTTLEMASK + case DKIOCGETTHROTTLEMASK: + dprintf("DKIOCGETTHROTTLEMASK\n"); + *o = 0; + break; +#endif + + case DKIOCGETMAXBYTECOUNTREAD: + *o = SPA_MAXBLOCKSIZE; + break; + + case DKIOCGETMAXBYTECOUNTWRITE: + *o = SPA_MAXBLOCKSIZE; + break; +#ifdef DKIOCUNMAP + case DKIOCUNMAP: + dprintf("DKIOCUNMAP\n"); + *f = 1; + break; +#endif + + case DKIOCGETFEATURES: + *f = 0; + break; + +#ifdef DKIOCISSOLIDSTATE + case DKIOCISSOLIDSTATE: + dprintf("DKIOCISSOLIDSTATE\n"); + *f = 0; + break; +#endif + + case DKIOCISVIRTUAL: + *f = 1; + break; + + case DKIOCGETMAXSEGMENTBYTECOUNTREAD: + *o = 32 * zv->zv_volblocksize; + break; + + case DKIOCGETMAXSEGMENTBYTECOUNTWRITE: + *o = 32 * zv->zv_volblocksize; + break; + + case DKIOCSYNCHRONIZECACHE: + dprintf("DKIOCSYNCHRONIZECACHE\n"); + break; + + default: + dprintf("unknown ioctl: ENOTTY\n"); + error = ENOTTY; + break; + } + + mutex_exit(&zv->zv_state_lock); + + return (SET_ERROR(error)); +} + +const static zvol_platform_ops_t zvol_macos_ops = { + .zv_free = zvol_os_free, + .zv_rename_minor = zvol_os_rename_minor, + .zv_create_minor = zvol_os_create_minor, + .zv_update_volsize = zvol_os_update_volsize, + .zv_clear_private = zvol_os_clear_private, + .zv_is_zvol = zvol_os_is_zvol, + .zv_set_disk_ro = zvol_os_set_disk_ro, + .zv_set_capacity = zvol_os_set_capacity, +}; + +int +zvol_init(void) +{ + int threads = MIN(MAX(zvol_threads, 1), 1024); + + zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, + threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (zvol_taskq == NULL) { + return (-ENOMEM); + } + + zvol_init_impl(); + zvol_register_ops(&zvol_macos_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); + taskq_destroy(zvol_taskq); +} diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 7eb6b2ff9c..af74b28d1e 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -221,7 +221,8 @@ zfs_mod_supported_feature(const char *name) * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. */ -#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) || defined(__APPLE__) +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || \ + defined(__FreeBSD__) || defined(__APPLE__) return (B_TRUE); #else return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name)); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 1bce6f55f6..6b000471bc 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -515,8 +515,9 @@ zfs_prop_init(void) "RSNAPS"); #ifdef __APPLE__ - zprop_register_index(ZFS_PROP_BROWSE, "com.apple.browse", 1,PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "COM.APPLE.BROWSE", boolean_table); + zprop_register_index(ZFS_PROP_BROWSE, "com.apple.browse", 1, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "COM.APPLE.BROWSE", + boolean_table); zprop_register_index(ZFS_PROP_IGNOREOWNER, "com.apple.ignoreowner", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "COM.APPLE.IGNOREOWNER", boolean_table); @@ -648,7 +649,7 @@ zfs_prop_init(void) * that we don't have to change the values of the zfs_prop_t enum, or * have NULL pointers in the zfs_prop_table[]. 
*/ -#if defined (__FreeBSD__) || defined (__APPLE__) +#if defined(__FreeBSD__) || defined(__APPLE__) zprop_register_impl(ZFS_PROP_ACLTYPE, "acltype", PROP_TYPE_INDEX, ZFS_ACLTYPE_OFF, NULL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c index bc02716cb0..debec63433 100644 --- a/module/zcommon/zprop_common.c +++ b/module/zcommon/zprop_common.c @@ -77,7 +77,8 @@ zfs_mod_supported_prop(const char *name, zfs_type_t type) * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. */ -#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) || defined(__APPLE__) +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || \ + defined(__FreeBSD__) || defined(__APPLE__) return (B_TRUE); #else return (zfs_mod_supported(type == ZFS_TYPE_POOL ? diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index db5ae8d8c8..af9b3cc230 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1385,7 +1385,7 @@ extern uint64_t zvolIO_kit_write(struct iomem *iomem, uint64_t offset, int dmu_read_iokit_dnode(dnode_t *dn, uint64_t *offset, - uint64_t position, uint64_t *size, struct iomem *iomem) + uint64_t position, uint64_t *size, struct iomem *iomem) { int err; @@ -1551,12 +1551,12 @@ dmu_write_iokit_dnode(dnode_t *dn, uint64_t *offset, uint64_t position, int err = 0; int i; - err = dmu_buf_hold_array_by_dnode(dn, *offset+position, *size, + err = dmu_buf_hold_array_by_dnode(dn, *offset+position, *size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - while(*size > 0) { + while (*size > 0) { for (i = 0; i < numbufs; i++) { int tocpy; @@ -1569,7 +1569,8 @@ dmu_write_iokit_dnode(dnode_t *dn, uint64_t *offset, uint64_t position, bufoff = (position + *offset) - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, *size); - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + ASSERT(i == 0 || i == numbufs-1 || + tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 68a4d9d907..2101d8b135 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1930,8 +1930,8 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, } -#if !defined (__OPTIMIZE__) && defined (__APPLE__) -#warning "dsl_scan_visitbp is known to panic during scrubs without optimize (-O)" +#if !defined(__OPTIMIZE__) && defined(__APPLE__) && defined(_KERNEL) +#warning "dsl_scan_visitbp will panic during scrubs without optimize (-O)" #endif /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9d6854380c..37ea7124d0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1262,7 +1262,7 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); -#if defined (_KERNEL) && defined (__APPLE__) +#if defined(_KERNEL) && defined(__APPLE__) spa_activate_os(spa); #endif @@ -1400,7 +1400,7 @@ spa_deactivate(spa_t *spa) spa->spa_did = 0; } -#if defined (_KERNEL) && defined (__APPLE__) +#if defined(_KERNEL) && defined(__APPLE__) spa_deactivate_os(spa); #endif @@ -5923,7 +5923,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; -#if defined (__APPLE__) && defined (_KERNEL) +#if defined(__APPLE__) && defined(_KERNEL) spa_create_os(spa); #endif @@ -6112,7 +6112,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, 
uint64_t flags) zvol_create_minors_recursive(pool); -#if defined (__APPLE__) && defined (_KERNEL) +#if defined(__APPLE__) && defined(_KERNEL) spa_create_os(spa); #endif @@ -6352,7 +6352,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } export_spa: -#if defined (__APPLE__) && defined (_KERNEL) +#if defined(__APPLE__) && defined(_KERNEL) spa_export_os(spa); #endif diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index f453caf5c8..79f467b2b7 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1084,7 +1084,8 @@ zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) subclass, spa, NULL, NULL, NULL, 0, 0); - if (ereport == NULL) return; + if (ereport == NULL) + return; VERIFY0(nvlist_add_string(ereport, "snapshot_name", name)); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index dea9ffe4ca..f1e8d0cd51 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -257,14 +257,16 @@ zfs_xattr_owner_unlinked(znode_t *zp) if (tzp != zp) zrele(tzp); #elif __APPLE__ + VERIFY(ZTOV(zp) != NULL); if (VN_HOLD(ZTOV(zp)) == 0) { /* - * if zp is XATTR node, keep walking up via z_xattr_parent until we - * get the owner + * if zp is XATTR node, keep walking up via z_xattr_parent + * until we get the owner */ while (zp->z_pflags & ZFS_XATTR) { ASSERT3U(zp->z_xattr_parent, !=, 0); - if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) { + if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, + &dzp) != 0) { unlinked = 1; break; } diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 5b215c0470..83be2ab817 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -71,7 +71,7 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, bzero(vap, sizeof (*vap)); vap->va_mask = (uint_t)mask; vap->va_mode = mode; -#if defined (__FreeBSD__) || defined (__APPLE__) +#if defined(__FreeBSD__) || defined(__APPLE__) vap->va_type = IFTOVT(mode); #endif vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 9f67112148..e7ca31fcba 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -66,9 +66,9 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {"ZPL_DACL_ACES", 0, SA_ACL, 0}, {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0}, {"ZPL_PROJID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, -#if defined (__APPLE__) - {"ZPL_ADDTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, - {"ZPL_DOCUMENTID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, +#if defined(__APPLE__) + {"ZPL_ADDTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_DOCUMENTID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, #endif {NULL, 0, 0, 0} }; diff --git a/scripts/cmd-macos.sh b/scripts/cmd-macos.sh new file mode 100755 index 0000000000..a80592b3a5 --- /dev/null +++ b/scripts/cmd-macos.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +if test x"$#" = "x0" ; then + printf "You need to supply a command.\n" + exit 1 +fi + +cmd=$1 +shift + +READLINK=`which greadlink 2>/dev/null` +if test x$READLINK = "x" ; then + READLINK=`which readlink 2>/dev/null` +fi + +if ! test x$READLINK = "x" ; then + $READLINK -f . > /dev/null 2>&1 + if ! test x$? = "x0" ; then + unset READLINK + else + CANONICALIZE="$READLINK -f" + fi +fi + +if test x$READLINK = "x" ; then + REALPATH=`which grealpath 2>/dev/null` + if test x$REALPATH = "x" ; then + REALPATH=`which realpath 2>/dev/null` + fi + if test x$REALPATH = "x" ; then + CANONICALIZE=readlink + else + CANONICALIZE=$REALPATH + fi +fi + +topdir=`dirname "$($CANONICALIZE "$0")"` + +if test x$topdir = x"." 
; then + if ! test -f zfs.release.in ; then + printf "cd into the zfs source directory or install GNU readlink or realpath.\n" + printf "Homebrew: brew install coreutils\n" + printf "MacPorts: port install coreutils\n" + printf "Gentoo Prefix: emerge sys-apps/coreutils\n" + exit 1 + fi +fi + +topdir=$topdir/../ + +for lib in nvpair uutil zpool zfs zfs_core diskmgt; do + export DYLD_LIBRARY_PATH=$topdir/lib/lib${lib}/.libs:$DYLD_LIBRARY_PATH +done +for c in zdb zfs zpool ztest; do + export PATH=${topdir}/cmd/${c}/.libs:$PATH +done + +#echo PATH=$PATH +#echo DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH +exec ${topdir}/cmd/$cmd/.libs/$cmd "$@" diff --git a/scripts/debug-macos.sh b/scripts/debug-macos.sh new file mode 100755 index 0000000000..1f049df437 --- /dev/null +++ b/scripts/debug-macos.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +if test x"$#" = "x0" ; then + printf "You need to supply a command.\n" + exit 1 +fi + +cmd=$1 +shift + +READLINK=`which greadlink 2>/dev/null` +if test x$READLINK = "x" ; then + READLINK=`which readlink 2>/dev/null` +fi + +if ! test x$READLINK = "x" ; then + $READLINK -f . > /dev/null 2>&1 + if ! test x$? = "x0" ; then + unset READLINK + else + CANONICALIZE="$READLINK -f" + fi +fi + +if test x$READLINK = "x" ; then + REALPATH=`which grealpath 2>/dev/null` + if test x$REALPATH = "x" ; then + REALPATH=`which realpath 2>/dev/null` + fi + if test x$REALPATH = "x" ; then + CANONICALIZE=readlink + else + CANONICALIZE=$REALPATH + fi +fi + +topdir=`dirname "$($CANONICALIZE "$0")"` + +if test x$topdir = x"." ; then + if ! test -f zfs.release.in ; then + printf "cd into the zfs source directory or install GNU readlink or realpath.\n" + printf "Homebrew: brew install coreutils\n" + printf "MacPorts: port install coreutils\n" + printf "Gentoo Prefix: emerge sys-apps/coreutils\n" + exit 1 + fi +fi + +topdir=$topdir/../ + +for lib in nvpair uutil zpool zfs zfs_core diskmgt; do + export DYLD_LIBRARY_PATH=$topdir/lib/lib${lib}/.libs:$DYLD_LIBRARY_PATH +done +for c in zdb zfs zpool ztest; do + export PATH=${topdir}/cmd/${c}/.libs:$PATH +done + +#echo PATH=$PATH +#echo DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH +exec lldb ${topdir}/cmd/$cmd/.libs/$cmd "$@" diff --git a/scripts/load_macos.sh b/scripts/load_macos.sh new file mode 100755 index 0000000000..2e8a1880b7 --- /dev/null +++ b/scripts/load_macos.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Expected to be run from the root of the source tree, as root; +# ./scripts/load_macos.sh +# +# Copies compiled zfs.kext to /tmp/ and prepares the requirements +# for load. +# + +rsync -ar module/os/macos/zfs/zfs.kext/ /tmp/zfs.kext/ + +chown -R root:wheel /tmp/zfs.kext + +kextload -v /tmp/zfs.kext || kextutil /tmp/zfs.kext + +# log stream --source --predicate 'sender == "zfs"' --style compact +
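+#
+# Verify it loaded, and unload when finished testing:
+# kextstat | grep -i zfs
+# kextunload /tmp/zfs.kext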